Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/acl.c | 295
-rw-r--r-- fs/9p/acl.h | 8
-rw-r--r-- fs/9p/fid.c | 1
-rw-r--r-- fs/9p/v9fs.c | 1
-rw-r--r-- fs/9p/vfs_addr.c | 5
-rw-r--r-- fs/9p/vfs_dentry.c | 1
-rw-r--r-- fs/9p/vfs_dir.c | 3
-rw-r--r-- fs/9p/vfs_file.c | 1
-rw-r--r-- fs/9p/vfs_inode.c | 1
-rw-r--r-- fs/9p/vfs_inode_dotl.c | 5
-rw-r--r-- fs/9p/vfs_super.c | 1
-rw-r--r-- fs/9p/xattr.c | 11
-rw-r--r-- fs/9p/xattr.h | 2
-rw-r--r-- fs/affs/file.c | 2
-rw-r--r-- fs/afs/cmservice.c | 8
-rw-r--r-- fs/afs/dir.c | 3
-rw-r--r-- fs/afs/file.c | 7
-rw-r--r-- fs/afs/fs_probe.c | 5
-rw-r--r-- fs/afs/internal.h | 12
-rw-r--r-- fs/afs/rxrpc.c | 34
-rw-r--r-- fs/afs/volume.c | 6
-rw-r--r-- fs/afs/write.c | 87
-rw-r--r-- fs/aio.c | 4
-rw-r--r-- fs/attr.c | 74
-rw-r--r-- fs/bad_inode.c | 4
-rw-r--r-- fs/binfmt_elf.c | 303
-rw-r--r-- fs/binfmt_elf_fdpic.c | 11
-rw-r--r-- fs/binfmt_misc.c | 8
-rw-r--r-- fs/btrfs/acl.c | 3
-rw-r--r-- fs/btrfs/acl.h | 2
-rw-r--r-- fs/btrfs/inode.c | 8
-rw-r--r-- fs/btrfs/ioctl.c | 4
-rw-r--r-- fs/btrfs/ordered-data.c | 6
-rw-r--r-- fs/btrfs/send.c | 11
-rw-r--r-- fs/cachefiles/io.c | 77
-rw-r--r-- fs/ceph/acl.c | 3
-rw-r--r-- fs/ceph/addr.c | 6
-rw-r--r-- fs/ceph/caps.c | 27
-rw-r--r-- fs/ceph/dir.c | 2
-rw-r--r-- fs/ceph/file.c | 30
-rw-r--r-- fs/ceph/inode.c | 12
-rw-r--r-- fs/ceph/ioctl.c | 2
-rw-r--r-- fs/ceph/locks.c | 28
-rw-r--r-- fs/ceph/mdsmap.c | 2
-rw-r--r-- fs/ceph/super.h | 8
-rw-r--r-- fs/char_dev.c | 15
-rw-r--r-- fs/cifs/Makefile | 2
-rw-r--r-- fs/cifs/cifs_debug.c | 8
-rw-r--r-- fs/cifs/cifs_dfs_ref.c | 255
-rw-r--r-- fs/cifs/cifs_ioctl.h | 2
-rw-r--r-- fs/cifs/cifs_spnego.c | 2
-rw-r--r-- fs/cifs/cifsacl.c | 141
-rw-r--r-- fs/cifs/cifsencrypt.c | 1
-rw-r--r-- fs/cifs/cifsfs.c | 18
-rw-r--r-- fs/cifs/cifsfs.h | 8
-rw-r--r-- fs/cifs/cifsglob.h | 94
-rw-r--r-- fs/cifs/cifspdu.h | 50
-rw-r--r-- fs/cifs/cifsproto.h | 40
-rw-r--r-- fs/cifs/cifssmb.c | 206
-rw-r--r-- fs/cifs/connect.c | 915
-rw-r--r-- fs/cifs/dfs.c | 543
-rw-r--r-- fs/cifs/dfs.h | 46
-rw-r--r-- fs/cifs/dfs_cache.c | 277
-rw-r--r-- fs/cifs/dfs_cache.h | 2
-rw-r--r-- fs/cifs/dir.c | 21
-rw-r--r-- fs/cifs/dns_resolve.c | 49
-rw-r--r-- fs/cifs/dns_resolve.h | 4
-rw-r--r-- fs/cifs/file.c | 42
-rw-r--r-- fs/cifs/fs_context.c | 24
-rw-r--r-- fs/cifs/fs_context.h | 3
-rw-r--r-- fs/cifs/fscache.c | 4
-rw-r--r-- fs/cifs/inode.c | 19
-rw-r--r-- fs/cifs/link.c | 1
-rw-r--r-- fs/cifs/misc.c | 87
-rw-r--r-- fs/cifs/sess.c | 5
-rw-r--r-- fs/cifs/smb1ops.c | 63
-rw-r--r-- fs/cifs/smb2file.c | 4
-rw-r--r-- fs/cifs/smb2inode.c | 112
-rw-r--r-- fs/cifs/smb2ops.c | 215
-rw-r--r-- fs/cifs/smb2pdu.c | 17
-rw-r--r-- fs/cifs/smb2proto.h | 5
-rw-r--r-- fs/cifs/transport.c | 6
-rw-r--r-- fs/cifs/xattr.c | 68
-rw-r--r-- fs/configfs/dir.c | 2
-rw-r--r-- fs/coredump.c | 19
-rw-r--r-- fs/crypto/fscrypt_private.h | 13
-rw-r--r-- fs/crypto/inline_crypt.c | 14
-rw-r--r-- fs/crypto/keyring.c | 14
-rw-r--r-- fs/crypto/keysetup.c | 17
-rw-r--r-- fs/crypto/policy.c | 12
-rw-r--r-- fs/dax.c | 221
-rw-r--r-- fs/debugfs/file.c | 28
-rw-r--r-- fs/dlm/ast.c | 322
-rw-r--r-- fs/dlm/ast.h | 17
-rw-r--r-- fs/dlm/config.c | 4
-rw-r--r-- fs/dlm/debug_fs.c | 2
-rw-r--r-- fs/dlm/dlm_internal.h | 25
-rw-r--r-- fs/dlm/lock.c | 190
-rw-r--r-- fs/dlm/lockspace.c | 14
-rw-r--r-- fs/dlm/lowcomms.c | 1540
-rw-r--r-- fs/dlm/lowcomms.h | 6
-rw-r--r-- fs/dlm/main.c | 7
-rw-r--r-- fs/dlm/member.c | 5
-rw-r--r-- fs/dlm/memory.c | 30
-rw-r--r-- fs/dlm/memory.h | 4
-rw-r--r-- fs/dlm/midcomms.c | 141
-rw-r--r-- fs/dlm/midcomms.h | 7
-rw-r--r-- fs/dlm/rcom.c | 4
-rw-r--r-- fs/dlm/requestqueue.c | 3
-rw-r--r-- fs/dlm/user.c | 74
-rw-r--r-- fs/dlm/user.h | 2
-rw-r--r-- fs/ecryptfs/inode.c | 32
-rw-r--r-- fs/efivarfs/inode.c | 4
-rw-r--r-- fs/efivarfs/super.c | 3
-rw-r--r-- fs/erofs/data.c | 10
-rw-r--r-- fs/erofs/fscache.c | 412
-rw-r--r-- fs/erofs/inode.c | 8
-rw-r--r-- fs/erofs/internal.h | 13
-rw-r--r-- fs/erofs/namei.c | 2
-rw-r--r-- fs/erofs/super.c | 15
-rw-r--r-- fs/erofs/xattr.c | 8
-rw-r--r-- fs/erofs/zdata.c | 92
-rw-r--r-- fs/erofs/zmap.c | 25
-rw-r--r-- fs/eventfd.c | 37
-rw-r--r-- fs/eventpoll.c | 18
-rw-r--r-- fs/exec.c | 50
-rw-r--r-- fs/exfat/dir.c | 184
-rw-r--r-- fs/exfat/exfat_fs.h | 56
-rw-r--r-- fs/exfat/file.c | 12
-rw-r--r-- fs/exfat/inode.c | 26
-rw-r--r-- fs/exfat/namei.c | 63
-rw-r--r-- fs/exportfs/expfs.c | 8
-rw-r--r-- fs/ext2/acl.c | 3
-rw-r--r-- fs/ext2/acl.h | 2
-rw-r--r-- fs/ext2/balloc.c | 12
-rw-r--r-- fs/ext2/dir.c | 41
-rw-r--r-- fs/ext2/file.c | 2
-rw-r--r-- fs/ext2/ialloc.c | 2
-rw-r--r-- fs/ext2/inode.c | 8
-rw-r--r-- fs/ext2/namei.c | 4
-rw-r--r-- fs/ext2/super.c | 2
-rw-r--r-- fs/ext4/acl.c | 3
-rw-r--r-- fs/ext4/acl.h | 2
-rw-r--r-- fs/ext4/ext4.h | 13
-rw-r--r-- fs/ext4/ext4_jbd2.c | 14
-rw-r--r-- fs/ext4/ext4_jbd2.h | 10
-rw-r--r-- fs/ext4/extents.c | 16
-rw-r--r-- fs/ext4/extents_status.c | 11
-rw-r--r-- fs/ext4/fast_commit.c | 205
-rw-r--r-- fs/ext4/fast_commit.h | 3
-rw-r--r-- fs/ext4/file.c | 2
-rw-r--r-- fs/ext4/ialloc.c | 8
-rw-r--r-- fs/ext4/indirect.c | 9
-rw-r--r-- fs/ext4/inline.c | 3
-rw-r--r-- fs/ext4/inode.c | 254
-rw-r--r-- fs/ext4/ioctl.c | 24
-rw-r--r-- fs/ext4/mballoc.c | 10
-rw-r--r-- fs/ext4/mmp.c | 8
-rw-r--r-- fs/ext4/move_extent.c | 52
-rw-r--r-- fs/ext4/namei.c | 51
-rw-r--r-- fs/ext4/orphan.c | 2
-rw-r--r-- fs/ext4/page-io.c | 44
-rw-r--r-- fs/ext4/readpage.c | 13
-rw-r--r-- fs/ext4/resize.c | 36
-rw-r--r-- fs/ext4/super.c | 67
-rw-r--r-- fs/ext4/verity.c | 2
-rw-r--r-- fs/ext4/xattr.c | 22
-rw-r--r-- fs/f2fs/acl.c | 4
-rw-r--r-- fs/f2fs/acl.h | 2
-rw-r--r-- fs/f2fs/checkpoint.c | 9
-rw-r--r-- fs/f2fs/compress.c | 112
-rw-r--r-- fs/f2fs/data.c | 109
-rw-r--r-- fs/f2fs/debug.c | 131
-rw-r--r-- fs/f2fs/dir.c | 36
-rw-r--r-- fs/f2fs/extent_cache.c | 697
-rw-r--r-- fs/f2fs/f2fs.h | 278
-rw-r--r-- fs/f2fs/file.c | 50
-rw-r--r-- fs/f2fs/gc.c | 81
-rw-r--r-- fs/f2fs/inode.c | 20
-rw-r--r-- fs/f2fs/namei.c | 395
-rw-r--r-- fs/f2fs/node.c | 19
-rw-r--r-- fs/f2fs/node.h | 3
-rw-r--r-- fs/f2fs/recovery.c | 4
-rw-r--r-- fs/f2fs/segment.c | 220
-rw-r--r-- fs/f2fs/segment.h | 6
-rw-r--r-- fs/f2fs/shrinker.c | 25
-rw-r--r-- fs/f2fs/super.c | 126
-rw-r--r-- fs/f2fs/sysfs.c | 164
-rw-r--r-- fs/fat/inode.c | 9
-rw-r--r-- fs/fat/nfs.c | 4
-rw-r--r-- fs/fs-writeback.c | 17
-rw-r--r-- fs/fs_parser.c | 3
-rw-r--r-- fs/fscache/cookie.c | 8
-rw-r--r-- fs/fscache/io.c | 2
-rw-r--r-- fs/fuse/acl.c | 5
-rw-r--r-- fs/fuse/cuse.c | 5
-rw-r--r-- fs/fuse/dev.c | 62
-rw-r--r-- fs/fuse/dir.c | 47
-rw-r--r-- fs/fuse/file.c | 45
-rw-r--r-- fs/fuse/fuse_i.h | 6
-rw-r--r-- fs/fuse/ioctl.c | 4
-rw-r--r-- fs/fuse/readdir.c | 4
-rw-r--r-- fs/gfs2/acl.c | 3
-rw-r--r-- fs/gfs2/acl.h | 2
-rw-r--r-- fs/gfs2/aops.c | 2
-rw-r--r-- fs/gfs2/bmap.c | 3
-rw-r--r-- fs/gfs2/file.c | 3
-rw-r--r-- fs/gfs2/glock.c | 269
-rw-r--r-- fs/gfs2/glock.h | 65
-rw-r--r-- fs/gfs2/glops.c | 44
-rw-r--r-- fs/gfs2/incore.h | 1
-rw-r--r-- fs/gfs2/inode.c | 70
-rw-r--r-- fs/gfs2/meta_io.c | 6
-rw-r--r-- fs/gfs2/super.c | 84
-rw-r--r-- fs/gfs2/xattr.c | 26
-rw-r--r-- fs/hfs/inode.c | 17
-rw-r--r-- fs/hfs/trans.c | 2
-rw-r--r-- fs/hfsplus/hfsplus_fs.h | 2
-rw-r--r-- fs/hfsplus/inode.c | 6
-rw-r--r-- fs/hfsplus/options.c | 4
-rw-r--r-- fs/hpfs/file.c | 9
-rw-r--r-- fs/hugetlbfs/inode.c | 28
-rw-r--r-- fs/inode.c | 75
-rw-r--r-- fs/internal.h | 35
-rw-r--r-- fs/iomap/buffered-io.c | 254
-rw-r--r-- fs/iomap/direct-io.c | 3
-rw-r--r-- fs/iomap/iter.c | 19
-rw-r--r-- fs/jbd2/commit.c | 5
-rw-r--r-- fs/jffs2/acl.c | 3
-rw-r--r-- fs/jffs2/acl.h | 2
-rw-r--r-- fs/jffs2/dir.c | 2
-rw-r--r-- fs/jffs2/file.c | 2
-rw-r--r-- fs/jffs2/fs.c | 2
-rw-r--r-- fs/jfs/acl.c | 3
-rw-r--r-- fs/jfs/file.c | 4
-rw-r--r-- fs/jfs/inode.c | 7
-rw-r--r-- fs/jfs/jfs_acl.h | 2
-rw-r--r-- fs/jfs/jfs_dmap.c | 27
-rw-r--r-- fs/jfs/jfs_extent.h | 2
-rw-r--r-- fs/jfs/jfs_imap.c | 2
-rw-r--r-- fs/jfs/jfs_mount.c | 4
-rw-r--r-- fs/jfs/jfs_umount.c | 4
-rw-r--r-- fs/jfs/jfs_xattr.h | 2
-rw-r--r-- fs/jfs/jfs_xtree.h | 4
-rw-r--r-- fs/jfs/namei.c | 4
-rw-r--r-- fs/jfs/super.c | 6
-rw-r--r-- fs/kernfs/dir.c | 106
-rw-r--r-- fs/kernfs/file.c | 18
-rw-r--r-- fs/kernfs/inode.c | 12
-rw-r--r-- fs/kernfs/kernfs-internal.h | 2
-rw-r--r-- fs/kernfs/mount.c | 10
-rw-r--r-- fs/kernfs/symlink.c | 2
-rw-r--r-- fs/ksmbd/auth.c | 3
-rw-r--r-- fs/ksmbd/connection.c | 7
-rw-r--r-- fs/ksmbd/ksmbd_netlink.h | 1
-rw-r--r-- fs/ksmbd/mgmt/user_session.c | 8
-rw-r--r-- fs/ksmbd/server.c | 20
-rw-r--r-- fs/ksmbd/smb2ops.c | 10
-rw-r--r-- fs/ksmbd/smb2pdu.c | 35
-rw-r--r-- fs/ksmbd/smb2pdu.h | 2
-rw-r--r-- fs/ksmbd/smb_common.c | 2
-rw-r--r-- fs/ksmbd/smb_common.h | 12
-rw-r--r-- fs/ksmbd/smbacl.c | 6
-rw-r--r-- fs/ksmbd/transport_tcp.c | 5
-rw-r--r-- fs/ksmbd/vfs.c | 23
-rw-r--r-- fs/ksmbd/vfs.h | 4
-rw-r--r-- fs/libfs.c | 22
-rw-r--r-- fs/lockd/svc4proc.c | 1
-rw-r--r-- fs/lockd/svclock.c | 17
-rw-r--r-- fs/lockd/svcproc.c | 1
-rw-r--r-- fs/lockd/svcsubs.c | 21
-rw-r--r-- fs/locks.c | 50
-rw-r--r-- fs/mbcache.c | 14
-rw-r--r-- fs/namei.c | 46
-rw-r--r-- fs/namespace.c | 179
-rw-r--r-- fs/netfs/io.c | 6
-rw-r--r-- fs/nfs/Kconfig | 8
-rw-r--r-- fs/nfs/delegation.c | 2
-rw-r--r-- fs/nfs/dir.c | 37
-rw-r--r-- fs/nfs/filelayout/filelayout.c | 8
-rw-r--r-- fs/nfs/flexfilelayout/flexfilelayout.c | 4
-rw-r--r-- fs/nfs/fs_context.c | 6
-rw-r--r-- fs/nfs/fscache.c | 4
-rw-r--r-- fs/nfs/inode.c | 3
-rw-r--r-- fs/nfs/internal.h | 6
-rw-r--r-- fs/nfs/namespace.c | 4
-rw-r--r-- fs/nfs/nfs3_fs.h | 2
-rw-r--r-- fs/nfs/nfs3acl.c | 9
-rw-r--r-- fs/nfs/nfs3proc.c | 4
-rw-r--r-- fs/nfs/nfs42xdr.c | 9
-rw-r--r-- fs/nfs/nfs4_fs.h | 1
-rw-r--r-- fs/nfs/nfs4file.c | 12
-rw-r--r-- fs/nfs/nfs4idmap.c | 2
-rw-r--r-- fs/nfs/nfs4proc.c | 57
-rw-r--r-- fs/nfs/nfs4state.c | 7
-rw-r--r-- fs/nfs/nfs4trace.h | 10
-rw-r--r-- fs/nfs/nfs4xdr.c | 22
-rw-r--r-- fs/nfs/nfstrace.h | 6
-rw-r--r-- fs/nfs/pagelist.c | 2
-rw-r--r-- fs/nfs/sysfs.c | 6
-rw-r--r-- fs/nfs/unlink.c | 1
-rw-r--r-- fs/nfs/write.c | 4
-rw-r--r-- fs/nfsd/Kconfig | 19
-rw-r--r-- fs/nfsd/Makefile | 5
-rw-r--r-- fs/nfsd/blocklayout.c | 1
-rw-r--r-- fs/nfsd/blocklayoutxdr.c | 1
-rw-r--r-- fs/nfsd/export.h | 1
-rw-r--r-- fs/nfsd/filecache.c | 523
-rw-r--r-- fs/nfsd/filecache.h | 9
-rw-r--r-- fs/nfsd/flexfilelayout.c | 1
-rw-r--r-- fs/nfsd/netns.h | 2
-rw-r--r-- fs/nfsd/nfs2acl.c | 18
-rw-r--r-- fs/nfsd/nfs3acl.c | 38
-rw-r--r-- fs/nfsd/nfs3proc.c | 10
-rw-r--r-- fs/nfsd/nfs4acl.c | 4
-rw-r--r-- fs/nfsd/nfs4callback.c | 78
-rw-r--r-- fs/nfsd/nfs4idmap.c | 1
-rw-r--r-- fs/nfsd/nfs4proc.c | 77
-rw-r--r-- fs/nfsd/nfs4state.c | 383
-rw-r--r-- fs/nfsd/nfs4xdr.c | 784
-rw-r--r-- fs/nfsd/nfsctl.c | 13
-rw-r--r-- fs/nfsd/nfsd.h | 9
-rw-r--r-- fs/nfsd/nfsfh.h | 10
-rw-r--r-- fs/nfsd/nfsproc.c | 66
-rw-r--r-- fs/nfsd/nfssvc.c | 10
-rw-r--r-- fs/nfsd/state.h | 11
-rw-r--r-- fs/nfsd/trace.h | 235
-rw-r--r-- fs/nfsd/vfs.c | 91
-rw-r--r-- fs/nfsd/vfs.h | 4
-rw-r--r-- fs/nfsd/xdr4.h | 5
-rw-r--r-- fs/nfsd/xdr4cb.h | 6
-rw-r--r-- fs/nilfs2/btree.c | 15
-rw-r--r-- fs/nilfs2/segment.c | 2
-rw-r--r-- fs/nilfs2/the_nilfs.c | 73
-rw-r--r-- fs/ntfs3/attrib.c | 392
-rw-r--r-- fs/ntfs3/attrlist.c | 5
-rw-r--r-- fs/ntfs3/bitfunc.c | 4
-rw-r--r-- fs/ntfs3/bitmap.c | 168
-rw-r--r-- fs/ntfs3/dir.c | 4
-rw-r--r-- fs/ntfs3/file.c | 211
-rw-r--r-- fs/ntfs3/frecord.c | 40
-rw-r--r-- fs/ntfs3/fslog.c | 62
-rw-r--r-- fs/ntfs3/fsntfs.c | 190
-rw-r--r-- fs/ntfs3/index.c | 127
-rw-r--r-- fs/ntfs3/inode.c | 203
-rw-r--r-- fs/ntfs3/namei.c | 242
-rw-r--r-- fs/ntfs3/ntfs.h | 6
-rw-r--r-- fs/ntfs3/ntfs_fs.h | 45
-rw-r--r-- fs/ntfs3/record.c | 13
-rw-r--r-- fs/ntfs3/run.c | 28
-rw-r--r-- fs/ntfs3/super.c | 143
-rw-r--r-- fs/ntfs3/upcase.c | 12
-rw-r--r-- fs/ntfs3/xattr.c | 167
-rw-r--r-- fs/ocfs2/acl.c | 3
-rw-r--r-- fs/ocfs2/acl.h | 2
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 38
-rw-r--r-- fs/ocfs2/cluster/heartbeat.h | 2
-rw-r--r-- fs/ocfs2/cluster/netdebug.c | 2
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c | 2
-rw-r--r-- fs/ocfs2/cluster/tcp.c | 9
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 2
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 19
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 30
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r-- fs/ocfs2/file.c | 8
-rw-r--r-- fs/ocfs2/journal.c | 2
-rw-r--r-- fs/ocfs2/journal.h | 1
-rw-r--r-- fs/ocfs2/namei.c | 2
-rw-r--r-- fs/ocfs2/ocfs2.h | 3
-rw-r--r-- fs/ocfs2/stack_o2cb.c | 6
-rw-r--r-- fs/ocfs2/stackglue.c | 8
-rw-r--r-- fs/ocfs2/super.c | 5
-rw-r--r-- fs/omfs/file.c | 7
-rw-r--r-- fs/open.c | 10
-rw-r--r-- fs/orangefs/acl.c | 47
-rw-r--r-- fs/orangefs/file.c | 1
-rw-r--r-- fs/orangefs/inode.c | 64
-rw-r--r-- fs/orangefs/namei.c | 2
-rw-r--r-- fs/orangefs/orangefs-debugfs.c | 29
-rw-r--r-- fs/orangefs/orangefs-kernel.h | 7
-rw-r--r-- fs/orangefs/orangefs-mod.c | 8
-rw-r--r-- fs/orangefs/orangefs-sysfs.c | 71
-rw-r--r-- fs/overlayfs/Kconfig | 2
-rw-r--r-- fs/overlayfs/copy_up.c | 38
-rw-r--r-- fs/overlayfs/dir.c | 68
-rw-r--r-- fs/overlayfs/export.c | 8
-rw-r--r-- fs/overlayfs/file.c | 31
-rw-r--r-- fs/overlayfs/inode.c | 187
-rw-r--r-- fs/overlayfs/namei.c | 12
-rw-r--r-- fs/overlayfs/overlayfs.h | 53
-rw-r--r-- fs/overlayfs/readdir.c | 58
-rw-r--r-- fs/overlayfs/super.c | 114
-rw-r--r-- fs/overlayfs/util.c | 15
-rw-r--r-- fs/pnode.c | 2
-rw-r--r-- fs/posix_acl.c | 727
-rw-r--r-- fs/proc/cmdline.c | 6
-rw-r--r-- fs/proc/consoles.c | 21
-rw-r--r-- fs/proc/fd.c | 45
-rw-r--r-- fs/proc/kcore.c | 33
-rw-r--r-- fs/proc/page.c | 3
-rw-r--r-- fs/proc/task_mmu.c | 16
-rw-r--r-- fs/proc/vmcore.c | 7
-rw-r--r-- fs/pstore/Kconfig | 1
-rw-r--r-- fs/pstore/platform.c | 25
-rw-r--r-- fs/pstore/pmsg.c | 9
-rw-r--r-- fs/pstore/ram.c | 46
-rw-r--r-- fs/pstore/ram_core.c | 20
-rw-r--r-- fs/pstore/ram_internal.h | 98
-rw-r--r-- fs/pstore/zone.c | 2
-rw-r--r-- fs/quota/dquot.c | 2
-rw-r--r-- fs/read_write.c | 12
-rw-r--r-- fs/reiserfs/acl.h | 6
-rw-r--r-- fs/reiserfs/file.c | 2
-rw-r--r-- fs/reiserfs/inode.c | 2
-rw-r--r-- fs/reiserfs/namei.c | 8
-rw-r--r-- fs/reiserfs/xattr_acl.c | 11
-rw-r--r-- fs/reiserfs/xattr_security.c | 2
-rw-r--r-- fs/remap_range.c | 9
-rw-r--r-- fs/seq_file.c | 2
-rw-r--r-- fs/splice.c | 10
-rw-r--r-- fs/squashfs/Kconfig | 51
-rw-r--r-- fs/squashfs/block.c | 2
-rw-r--r-- fs/squashfs/decompressor.c | 2
-rw-r--r-- fs/squashfs/decompressor_multi.c | 20
-rw-r--r-- fs/squashfs/decompressor_multi_percpu.c | 23
-rw-r--r-- fs/squashfs/decompressor_single.c | 15
-rw-r--r-- fs/squashfs/squashfs.h | 23
-rw-r--r-- fs/squashfs/squashfs_fs_sb.h | 4
-rw-r--r-- fs/squashfs/super.c | 102
-rw-r--r-- fs/stat.c | 7
-rw-r--r-- fs/super.c | 60
-rw-r--r-- fs/sysv/itree.c | 2
-rw-r--r-- fs/ubifs/debug.c | 8
-rw-r--r-- fs/ubifs/lpt_commit.c | 14
-rw-r--r-- fs/ubifs/tnc_commit.c | 2
-rw-r--r-- fs/udf/inode.c | 87
-rw-r--r-- fs/udf/namei.c | 8
-rw-r--r-- fs/udf/super.c | 4
-rw-r--r-- fs/udf/truncate.c | 48
-rw-r--r-- fs/udf/udf_sb.h | 6
-rw-r--r-- fs/userfaultfd.c | 28
-rw-r--r-- fs/verity/fsverity_private.h | 5
-rw-r--r-- fs/verity/hash_algs.c | 6
-rw-r--r-- fs/verity/measure.c | 19
-rw-r--r-- fs/verity/verify.c | 12
-rw-r--r-- fs/xattr.c | 440
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 8
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 7
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h | 1
-rw-r--r-- fs/xfs/libxfs/xfs_errortag.h | 18
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c | 146
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 4
-rw-r--r-- fs/xfs/scrub/agheader.c | 47
-rw-r--r-- fs/xfs/scrub/agheader_repair.c | 81
-rw-r--r-- fs/xfs/scrub/attr.c | 11
-rw-r--r-- fs/xfs/scrub/bitmap.c | 11
-rw-r--r-- fs/xfs/scrub/bmap.c | 147
-rw-r--r-- fs/xfs/scrub/btree.c | 14
-rw-r--r-- fs/xfs/scrub/common.c | 48
-rw-r--r-- fs/xfs/scrub/common.h | 2
-rw-r--r-- fs/xfs/scrub/dabtree.c | 4
-rw-r--r-- fs/xfs/scrub/dir.c | 10
-rw-r--r-- fs/xfs/scrub/fscounters.c | 109
-rw-r--r-- fs/xfs/scrub/inode.c | 2
-rw-r--r-- fs/xfs/scrub/quota.c | 8
-rw-r--r-- fs/xfs/scrub/refcount.c | 12
-rw-r--r-- fs/xfs/scrub/repair.c | 51
-rw-r--r-- fs/xfs/scrub/scrub.c | 6
-rw-r--r-- fs/xfs/scrub/scrub.h | 18
-rw-r--r-- fs/xfs/scrub/symlink.c | 2
-rw-r--r-- fs/xfs/xfs_acl.c | 3
-rw-r--r-- fs/xfs/xfs_acl.h | 2
-rw-r--r-- fs/xfs/xfs_aops.c | 32
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 10
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 2
-rw-r--r-- fs/xfs/xfs_buf.c | 1
-rw-r--r-- fs/xfs/xfs_buf_item.c | 2
-rw-r--r-- fs/xfs/xfs_error.c | 48
-rw-r--r-- fs/xfs/xfs_error.h | 13
-rw-r--r-- fs/xfs/xfs_extent_busy.c | 1
-rw-r--r-- fs/xfs/xfs_file.c | 2
-rw-r--r-- fs/xfs/xfs_fsmap.c | 4
-rw-r--r-- fs/xfs/xfs_icache.c | 16
-rw-r--r-- fs/xfs/xfs_inode.c | 2
-rw-r--r-- fs/xfs/xfs_ioctl.c | 8
-rw-r--r-- fs/xfs/xfs_iomap.c | 191
-rw-r--r-- fs/xfs/xfs_iomap.h | 6
-rw-r--r-- fs/xfs/xfs_iops.c | 20
-rw-r--r-- fs/xfs/xfs_log.c | 46
-rw-r--r-- fs/xfs/xfs_mount.c | 15
-rw-r--r-- fs/xfs/xfs_pnfs.c | 6
-rw-r--r-- fs/xfs/xfs_qm.c | 18
-rw-r--r-- fs/xfs/xfs_reflink.c | 10
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 60
-rw-r--r-- fs/xfs/xfs_super.c | 2
-rw-r--r-- fs/xfs/xfs_trace.c | 2
-rw-r--r-- fs/xfs/xfs_trace.h | 86
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 4
-rw-r--r-- fs/xfs/xfs_xattr.c | 2
-rw-r--r-- fs/zonefs/super.c | 22
502 files changed, 13532 insertions, 9540 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 4dac4a0dc5f4..c397c51f80d9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -17,34 +17,64 @@
#include "v9fs_vfs.h"
#include "fid.h"
-static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
+static struct posix_acl *v9fs_fid_get_acl(struct p9_fid *fid, const char *name)
{
ssize_t size;
void *value = NULL;
struct posix_acl *acl = NULL;
size = v9fs_fid_xattr_get(fid, name, NULL, 0);
- if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
- if (!value)
- return ERR_PTR(-ENOMEM);
- size = v9fs_fid_xattr_get(fid, name, value, size);
- if (size > 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- goto err_out;
- }
- } else if (size == -ENODATA || size == 0 ||
- size == -ENOSYS || size == -EOPNOTSUPP) {
- acl = NULL;
- } else
- acl = ERR_PTR(-EIO);
-
-err_out:
+ if (size < 0)
+ return ERR_PTR(size);
+ if (size == 0)
+ return ERR_PTR(-ENODATA);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+
+ size = v9fs_fid_xattr_get(fid, name, value, size);
+ if (size < 0)
+ acl = ERR_PTR(size);
+ else if (size == 0)
+ acl = ERR_PTR(-ENODATA);
+ else
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
kfree(value);
return acl;
}
+static struct posix_acl *v9fs_acl_get(struct dentry *dentry, const char *name)
+{
+ struct p9_fid *fid;
+ struct posix_acl *acl = NULL;
+
+ fid = v9fs_fid_lookup(dentry);
+ if (IS_ERR(fid))
+ return ERR_CAST(fid);
+
+ acl = v9fs_fid_get_acl(fid, name);
+ p9_fid_put(fid);
+ return acl;
+}
+
+static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, const char *name)
+{
+ int retval;
+ struct posix_acl *acl = NULL;
+
+ acl = v9fs_fid_get_acl(fid, name);
+ if (!IS_ERR(acl))
+ return acl;
+
+ retval = PTR_ERR(acl);
+ if (retval == -ENODATA || retval == -ENOSYS || retval == -EOPNOTSUPP)
+ return NULL;
+
+ /* map everything else to -EIO */
+ return ERR_PTR(-EIO);
+}
+
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
int retval = 0;
@@ -89,7 +119,7 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type)
return acl;
}
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type, bool rcu)
{
struct v9fs_session_info *v9ses;
@@ -109,6 +139,112 @@ struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu)
}
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ struct v9fs_session_info *v9ses;
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ /* We allow set/get/list of acl when access=client is not specified. */
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
+ return v9fs_acl_get(dentry, posix_acl_xattr_name(type));
+ return v9fs_get_cached_acl(d_inode(dentry), type);
+}
+
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int retval;
+ size_t size = 0;
+ void *value = NULL;
+ const char *acl_name;
+ struct v9fs_session_info *v9ses;
+ struct inode *inode = d_inode(dentry);
+
+ if (acl) {
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
+ if (retval)
+ goto err_out;
+
+ size = posix_acl_xattr_size(acl->a_count);
+
+ value = kzalloc(size, GFP_NOFS);
+ if (!value) {
+ retval = -ENOMEM;
+ goto err_out;
+ }
+
+ retval = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (retval < 0)
+ goto err_out;
+ }
+
+ /*
+ * set the attribute on the remote. Without even looking at the
+ * xattr value. We leave it to the server to validate
+ */
+ acl_name = posix_acl_xattr_name(type);
+ v9ses = v9fs_dentry2v9ses(dentry);
+ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ goto err_out;
+ }
+
+ if (S_ISLNK(inode->i_mode)) {
+ retval = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ if (!inode_owner_or_capable(&init_user_ns, inode)) {
+ retval = -EPERM;
+ goto err_out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (acl) {
+ struct iattr iattr = {};
+ struct posix_acl *acl_mode = acl;
+
+ retval = posix_acl_update_mode(&init_user_ns, inode,
+ &iattr.ia_mode,
+ &acl_mode);
+ if (retval)
+ goto err_out;
+ if (!acl_mode) {
+ /*
+ * ACL can be represented by the mode bits.
+ * So don't update ACL below.
+ */
+ kfree(value);
+ value = NULL;
+ size = 0;
+ }
+ iattr.ia_valid = ATTR_MODE;
+ /*
+ * FIXME should we update ctime ?
+ * What is the following setxattr update the mode ?
+ */
+ v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ retval = acl ? -EINVAL : 0;
+ goto err_out;
+ }
+ break;
+ }
+
+ retval = v9fs_xattr_set(dentry, acl_name, value, size, 0);
+ if (!retval)
+ set_cached_acl(inode, type, acl);
+
+err_out:
+ kfree(value);
+ return retval;
+}
+
static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
{
int retval;
@@ -207,124 +343,3 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
*modep = mode;
return 0;
}
-
-static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- struct v9fs_session_info *v9ses;
- struct posix_acl *acl;
- int error;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * We allow set/get/list of acl when access=client is not specified
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_get(dentry, handler->name, buffer, size);
-
- acl = v9fs_get_cached_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- int retval;
- struct posix_acl *acl;
- struct v9fs_session_info *v9ses;
-
- v9ses = v9fs_dentry2v9ses(dentry);
- /*
- * set the attribute on the remote. Without even looking at the
- * xattr value. We leave it to the server to validate
- */
- if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_set(dentry, handler->name, value, size,
- flags);
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
- if (value) {
- /* update the cached acl value */
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
- if (retval)
- goto err_out;
- }
- } else
- acl = NULL;
-
- switch (handler->flags) {
- case ACL_TYPE_ACCESS:
- if (acl) {
- struct iattr iattr = { 0 };
- struct posix_acl *old_acl = acl;
-
- retval = posix_acl_update_mode(&init_user_ns, inode,
- &iattr.ia_mode, &acl);
- if (retval)
- goto err_out;
- if (!acl) {
- /*
- * ACL can be represented
- * by the mode bits. So don't
- * update ACL.
- */
- posix_acl_release(old_acl);
- value = NULL;
- size = 0;
- }
- iattr.ia_valid = ATTR_MODE;
- /* FIXME should we update ctime ?
- * What is the following setxattr update the
- * mode ?
- */
- v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr);
- }
- break;
- case ACL_TYPE_DEFAULT:
- if (!S_ISDIR(inode->i_mode)) {
- retval = acl ? -EINVAL : 0;
- goto err_out;
- }
- break;
- default:
- BUG();
- }
- retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
- if (!retval)
- set_cached_acl(inode, handler->flags, acl);
-err_out:
- posix_acl_release(acl);
- return retval;
-}
-
-const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
-
-const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = v9fs_xattr_get_acl,
- .set = v9fs_xattr_set_acl,
-};
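[Note: the fs/9p/acl.c rework above tracks the VFS-wide POSIX ACL API split: the old rcu-capable ->get_acl() becomes ->get_inode_acl(), and new dentry-based ->get_acl()/->set_acl() hooks replace the filesystem-private xattr handlers. A minimal sketch of the three hook shapes as this patch uses them — signatures copied from the 9p prototypes (see fs/9p/acl.h below), not a definitive VFS reference:]

/* RCU-walk capable getter, used on the permission-check path */
struct posix_acl *(*get_inode_acl)(struct inode *inode, int type, bool rcu);

/* dentry-based getter, used on the getxattr()-style lookup path */
struct posix_acl *(*get_acl)(struct user_namespace *mnt_userns,
			     struct dentry *dentry, int type);

/* dentry-based setter, used on the setxattr()-style update path */
int (*set_acl)(struct user_namespace *mnt_userns, struct dentry *dentry,
	       struct posix_acl *acl, int type);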
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ce5175d463dd..4c60a2bce5de 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -8,8 +8,12 @@
#ifdef CONFIG_9P_FS_POSIX_ACL
int v9fs_get_acl(struct inode *inode, struct p9_fid *fid);
-struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type,
+struct posix_acl *v9fs_iop_get_inode_acl(struct inode *inode, int type,
bool rcu);
+struct posix_acl *v9fs_iop_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+int v9fs_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid);
int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid,
struct posix_acl *dacl, struct posix_acl *acl);
@@ -17,7 +21,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
struct posix_acl **dpacl, struct posix_acl **pacl);
void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl);
#else
+#define v9fs_iop_get_inode_acl NULL
#define v9fs_iop_get_acl NULL
+#define v9fs_iop_set_acl NULL
static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
{
return 0;
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 23cf9b2fbfe4..805151114e96 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -11,7 +11,6 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/idr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 0129de2ea31a..3a9c4517265f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -14,7 +14,6 @@
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/parser.h>
-#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 47b9a1122f34..97599edbc300 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -14,7 +14,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/pagemap.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/uio.h>
@@ -40,7 +39,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len);
+ iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
total = p9_client_read(fid, pos, &to, &err);
@@ -172,7 +171,7 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio)
len = min_t(loff_t, i_size - start, len);
- iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len);
+ iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
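[Note: the READ/WRITE -> ITER_DEST/ITER_SOURCE substitutions that recur throughout this series rename the iov_iter direction argument after the role the buffer plays in the copy rather than after the syscall. A before/after sketch, assuming a hypothetical caller with a local buffer `buf` of length `len`:]

struct kvec kvec = { .iov_base = buf, .iov_len = len };
struct iov_iter to;

/* old spelling: direction named after the syscall */
/* iov_iter_kvec(&to, READ, &kvec, 1, len); */

/* new spelling: the kvec is the destination of the transfer */
iov_iter_kvec(&to, ITER_DEST, &kvec, 1, len);

/* symmetrically, a buffer being written out is ITER_SOURCE */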
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f89f01734587..65fa2df5e49b 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 000fbaae9b18..59b0e8948f78 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -14,7 +14,6 @@
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/inet.h>
-#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/fscache.h>
@@ -109,7 +108,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
struct iov_iter to;
int n;
- iov_iter_kvec(&to, READ, &kvec, 1, buflen);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen);
n = p9_client_read(file->private_data, ctx->pos, &to,
&err);
if (err)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index aec43ba83799..b740017634ef 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -18,7 +18,6 @@
#include <linux/pagemap.h>
#include <linux/utsname.h>
#include <linux/uaccess.h>
-#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4d1a4a8d9277..27a04a226d97 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -17,7 +17,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/xattr.h>
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5cfa4b4f070f..f806b3f11649 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -983,14 +982,18 @@ const struct inode_operations v9fs_dir_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_file_inode_operations_dotl = {
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.listxattr = v9fs_listxattr,
+ .get_inode_acl = v9fs_iop_get_inode_acl,
.get_acl = v9fs_iop_get_acl,
+ .set_acl = v9fs_iop_set_acl,
};
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2d9ee073d12c..266c4693e20c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -15,7 +15,6 @@
#include <linux/inet.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/statfs.h>
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 1f9298a4bd42..b6984311e00a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/uio.h>
+#include <linux/posix_acl_xattr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -24,7 +25,7 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
struct iov_iter to;
int err;
- iov_iter_kvec(&to, READ, &kvec, 1, buffer_size);
+ iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
@@ -109,7 +110,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
struct iov_iter from;
int retval, err;
- iov_iter_kvec(&from, WRITE, &kvec, 1, value_len);
+ iov_iter_kvec(&from, ITER_SOURCE, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
@@ -182,9 +183,9 @@ static struct xattr_handler v9fs_xattr_security_handler = {
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
-#ifdef CONFIG_9P_FS_POSIX_ACL
- &v9fs_xattr_acl_access_handler,
- &v9fs_xattr_acl_default_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_9P_FS_SECURITY
&v9fs_xattr_security_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index 3e11fc3331eb..b5636e544c8a 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -11,8 +11,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern const struct xattr_handler v9fs_xattr_acl_access_handler;
-extern const struct xattr_handler v9fs_xattr_acl_default_handler;
ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size);
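[Note: with the private handlers removed, 9p's handler table in fs/9p/xattr.c above points at the generic posix_acl_access_xattr_handler/posix_acl_default_xattr_handler from fs/posix_acl.c, keyed on CONFIG_FS_POSIX_ACL rather than the 9p-specific option. A hypothetical filesystem following the same pattern would register — `example_user_handler` is illustrative only:]

static const struct xattr_handler *example_xattr_handlers[] = {
	&example_user_handler,		/* fs-specific handlers stay */
#ifdef CONFIG_FS_POSIX_ACL
	&posix_acl_access_xattr_handler,	/* generic ACL handlers */
	&posix_acl_default_xattr_handler,
#endif
	NULL,
};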
diff --git a/fs/affs/file.c b/fs/affs/file.c
index cefa222f7881..8daeed31e1af 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -880,7 +880,7 @@ affs_truncate(struct inode *inode)
if (inode->i_size > AFFS_I(inode)->mmu_private) {
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
loff_t isize = inode->i_size;
int res;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 0a090d614e76..d4ddb20d6732 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -13,6 +13,8 @@
#include "internal.h"
#include "afs_cm.h"
#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
static int afs_deliver_cb_init_call_back_state(struct afs_call *);
static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -191,7 +193,7 @@ static void afs_cm_destructor(struct afs_call *call)
* Abort a service call from within an action function.
*/
static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
- const char *why)
+ enum rxrpc_abort_reason why)
{
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, error, why);
@@ -298,7 +300,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (call->count2 != call->count && call->count2 != 0)
return afs_protocol_error(call, afs_eproto_cb_count);
call->iter = &call->def_iter;
- iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
+ iov_iter_discard(&call->def_iter, ITER_DEST, call->count2 * 3 * 4);
call->unmarshall++;
fallthrough;
@@ -469,7 +471,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
afs_send_empty_reply(call);
else
- afs_abort_service_call(call, 1, 1, "K-1");
+ afs_abort_service_call(call, 1, 1, afs_abort_probeuuid_negative);
afs_put_call(call);
_leave("");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 230c2d19116d..b7c1f8c84b38 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -77,6 +77,7 @@ const struct address_space_operations afs_dir_aops = {
.dirty_folio = afs_dir_dirty_folio,
.release_folio = afs_dir_release_folio,
.invalidate_folio = afs_dir_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -305,7 +306,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, ITER_DEST, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d1cfb235c4b9..68d6d5dc608d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -58,14 +58,15 @@ const struct address_space_operations afs_file_aops = {
.invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
- .writepage = afs_writepage,
.writepages = afs_writepages,
+ .migrate_folio = filemap_migrate_folio,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
.release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
static const struct vm_operations_struct afs_vm_ops = {
@@ -324,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
&fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
@@ -346,7 +347,7 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
- iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages,
+ iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 3ac5fcf98d0d..daaf3810cc92 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -366,12 +366,15 @@ void afs_fs_probe_dispatcher(struct work_struct *work)
unsigned long nowj, timer_at, poll_at;
bool first_pass = true, set_timer = false;
- if (!net->live)
+ if (!net->live) {
+ afs_dec_servers_outstanding(net);
return;
+ }
_enter("");
if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) {
+ afs_dec_servers_outstanding(net);
_leave(" [none]");
return;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 723d162078a3..fd8567b98e2b 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -972,13 +972,6 @@ extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
/*
- * cache.c
- */
-#ifdef CONFIG_AFS_FSCACHE
-extern struct fscache_netfs afs_cache_netfs;
-#endif
-
-/*
* callback.c
*/
extern void afs_invalidate_mmap_work(struct work_struct *);
@@ -1301,7 +1294,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
call->iov_len = size;
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
- iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, call->kvec, 1, size);
}
static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1319,7 +1312,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
call->iov_len = size;
- iov_iter_discard(&call->def_iter, READ, size);
+ iov_iter_discard(&call->def_iter, ITER_DEST, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
@@ -1391,7 +1384,6 @@ extern void afs_put_permits(struct afs_permits *);
extern void afs_clear_permits(struct afs_vnode *);
extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int,
struct afs_status_cb *);
-extern void afs_zap_permits(struct rcu_head *);
extern struct key *afs_request_key(struct afs_cell *);
extern struct key *afs_request_key_rcu(struct afs_cell *);
extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index eccc3cd0cb70..7817e2b860e5 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -13,6 +13,8 @@
#include "internal.h"
#include "afs_cm.h"
#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
struct workqueue_struct *afs_async_calls;
@@ -359,7 +361,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);
@@ -397,10 +399,11 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
error_do_abort:
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(call->net->socket, rxcall,
- RX_USER_ABORT, ret, "KSD");
+ RX_USER_ABORT, ret,
+ afs_abort_send_data_error);
} else {
len = 0;
- iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
@@ -485,7 +488,7 @@ static void afs_deliver_to_call(struct afs_call *call)
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
len = 0;
- iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
call->rxcall, &call->def_iter,
&len, false, &remote_abort,
@@ -527,7 +530,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KIV");
+ abort_code, ret,
+ afs_abort_op_not_supported);
goto local_abort;
case -EIO:
pr_err("kAFS: Call %u in bad state %u\n",
@@ -542,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call)
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KUM");
+ abort_code, ret,
+ afs_abort_unmarshal_error);
goto local_abort;
default:
abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KER");
+ abort_code, ret,
+ afs_abort_general_error);
goto local_abort;
}
}
@@ -619,7 +625,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
/* Kill off the call if it's still live. */
_debug("call interrupted");
if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- RX_USER_ABORT, -EINTR, "KWI"))
+ RX_USER_ABORT, -EINTR,
+ afs_abort_interrupted))
afs_set_call_complete(call, -EINTR, 0);
}
}
@@ -822,7 +829,7 @@ void afs_send_empty_reply(struct afs_call *call)
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, NULL, 0, 0);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -836,7 +843,8 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM,
+ afs_abort_oom);
fallthrough;
default:
_leave(" [error]");
@@ -862,7 +870,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -878,7 +886,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM,
+ afs_abort_oom);
}
_leave(" [error]");
}
@@ -900,6 +909,7 @@ int afs_extract_data(struct afs_call *call, bool want_more)
ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
&call->iov_len, want_more, &remote_abort,
&call->service_id);
+ trace_afs_receive_data(call, call->iter, want_more, ret);
if (ret == 0 || ret == -EAGAIN)
return ret;
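[Note: the fs/afs changes above replace the opaque three-letter string cookies ("KSD", "KWI", "KOO", ...) passed to rxrpc_kernel_abort_call() with a typed enum rxrpc_abort_reason, whose values are pulled in via the RXRPC_TRACE_ONLY_DEFINE_ENUMS define from the rxrpc tracepoint header. A sketch of the call-site change, using values that appear in this patch; `sock` and `rxcall` stand in for the real call state:]

/* before: the "why" argument was a bare string, opaque to tracing */
/* rxrpc_kernel_abort_call(sock, rxcall, RX_USER_ABORT, -EINTR, "KWI"); */

/* after: the reason is an enum shared with the rxrpc tracepoints */
rxrpc_kernel_abort_call(sock, rxcall, RX_USER_ABORT, -EINTR,
			afs_abort_interrupted);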
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index f4937029dcd7..29d483c80281 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -70,11 +70,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
{
struct afs_server_list *slist;
struct afs_volume *volume;
- int ret = -ENOMEM, nr_servers = 0, i;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ int ret = -ENOMEM;
volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
if (!volume)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9ebdd36eaf2f..19df10d63323 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,11 @@
#include <linux/netfs.h>
#include "internal.h"
+static int afs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ loff_t start, loff_t end, loff_t *_next,
+ bool max_one_loop);
+
static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
loff_t i_size, bool caching);
@@ -39,6 +44,25 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio)
#endif
/*
+ * Flush out a conflicting write. This may extend the write to the surrounding
+ * pages if also dirty and contiguous to the conflicting region..
+ */
+static int afs_flush_conflicting_write(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = folio_pos(folio),
+ .range_end = LLONG_MAX,
+ };
+ loff_t next;
+
+ return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
+ &next, true);
+}
+
+/*
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
@@ -80,7 +104,8 @@ try_again:
if (folio_test_writeback(folio)) {
trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- goto flush_conflicting_write;
+ folio_unlock(folio);
+ goto wait_for_writeback;
}
/* If the file is being filled locally, allow inter-write
* spaces to be merged into writes. If it's not, only write
@@ -99,8 +124,15 @@ try_again:
* flush the page out.
*/
flush_conflicting_write:
- _debug("flush conflict");
- ret = folio_write_one(folio);
+ trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
+ folio_unlock(folio);
+
+ ret = afs_flush_conflicting_write(mapping, folio);
+ if (ret < 0)
+ goto error;
+
+wait_for_writeback:
+ ret = folio_wait_writeback_killable(folio);
if (ret < 0)
goto error;
@@ -609,7 +641,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
*/
afs_write_to_cache(vnode, start, len, i_size, caching);
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
ret = afs_store_data(vnode, &iter, start, false);
} else {
_debug("write discard %x @%llx [%llx]", len, start, i_size);
@@ -664,39 +696,12 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
}
/*
- * write a page back to the server
- * - the caller locked the page for us
- */
-int afs_writepage(struct page *subpage, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(subpage);
- ssize_t ret;
- loff_t start;
-
- _enter("{%lx},", folio_index(folio));
-
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
-
- start = folio_index(folio) * PAGE_SIZE;
- ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc,
- folio, start, LLONG_MAX - start);
- if (ret < 0) {
- _leave(" = %zd", ret);
- return ret;
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
* write a region of pages back to the server
*/
static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next)
+ loff_t start, loff_t end, loff_t *_next,
+ bool max_one_loop)
{
struct folio *folio;
struct page *head_page;
@@ -775,6 +780,9 @@ static int afs_writepages_region(struct address_space *mapping,
start += ret;
+ if (max_one_loop)
+ break;
+
cond_resched();
} while (wbc->nr_to_write > 0);
@@ -806,24 +814,27 @@ int afs_writepages(struct address_space *mapping,
if (wbc->range_cyclic) {
start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
+ ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
+ &next, false);
if (ret == 0) {
mapping->writeback_index = next / PAGE_SIZE;
if (start > 0 && wbc->nr_to_write > 0) {
ret = afs_writepages_region(mapping, wbc, 0,
- start, &next);
+ start, &next, false);
if (ret == 0)
mapping->writeback_index =
next / PAGE_SIZE;
}
}
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
+ ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
+ &next, false);
if (wbc->nr_to_write > 0 && ret == 0)
mapping->writeback_index = next / PAGE_SIZE;
} else {
ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end, &next);
+ wbc->range_start, wbc->range_end,
+ &next, false);
}
up_read(&vnode->validate_lock);
@@ -1000,7 +1011,7 @@ int afs_launder_folio(struct folio *folio)
bv[0].bv_page = &folio->page;
bv[0].bv_offset = f;
bv[0].bv_len = t - f;
- iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len);
trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
diff --git a/fs/aio.c b/fs/aio.c
index 5b2ff20ad322..562916d85cba 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1552,7 +1552,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->read_iter))
return -EINVAL;
- ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_DEST, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
@@ -1580,7 +1580,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
if (unlikely(!file->f_op->write_iter))
return -EINVAL;
- ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
+ ret = aio_setup_rw(ITER_SOURCE, iocb, &iovec, vectored, compat, &iter);
if (ret < 0)
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
diff --git a/fs/attr.c b/fs/attr.c
index 1552a5f23d6b..b45f30e516fa 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -18,6 +18,70 @@
#include <linux/evm.h>
#include <linux/ima.h>
+#include "internal.h"
+
+/**
+ * setattr_should_drop_sgid - determine whether the setgid bit needs to be
+ * removed
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the setgid bit needs to be removed.
+ * We retain backwards compatibility and require setgid bit to be removed
+ * unconditionally if S_IXGRP is set. Otherwise we have the exact same
+ * requirements as setattr_prepare() and setattr_copy().
+ *
+ * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+
+ if (!(mode & S_ISGID))
+ return 0;
+ if (mode & S_IXGRP)
+ return ATTR_KILL_SGID;
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
+ return ATTR_KILL_SGID;
+ return 0;
+}
+
+/**
+ * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
+ * be dropped
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ *
+ * This function determines whether the set{g,u}id bits need to be removed.
+ * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
+ * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
+ * set{g,u}id bits need to be removed the corresponding mask of both flags is
+ * returned.
+ *
+ * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
+ * to remove, 0 otherwise.
+ */
+int setattr_should_drop_suidgid(struct user_namespace *mnt_userns,
+ struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ kill |= setattr_should_drop_sgid(mnt_userns, inode);
+
+ if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
+EXPORT_SYMBOL(setattr_should_drop_suidgid);
+
/**
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
@@ -140,8 +204,7 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode, vfsgid))
attr->ia_mode &= ~S_ISGID;
}
@@ -251,9 +314,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
- if (!vfsgid_in_group_p(vfsgid) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_or_capable(mnt_userns, inode,
+ i_gid_into_vfsgid(mnt_userns, inode)))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
@@ -375,7 +437,7 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
}
}
if (ia_valid & ATTR_KILL_SGID) {
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ if (mode & S_ISGID) {
if (!(ia_valid & ATTR_MODE)) {
ia_valid = attr->ia_valid |= ATTR_MODE;
attr->ia_mode = inode->i_mode;
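[Note: the new fs/attr.c helpers above centralize the setid-stripping policy: setattr_should_drop_sgid() handles the setgid special cases and setattr_should_drop_suidgid() folds in setuid, returning a mask of ATTR_KILL_S{U,G}ID flags. A minimal sketch of a write-path caller, assuming the usual notify_change() pattern used by the VFS privilege-stripping helpers; `mnt_userns`, `inode`, and `dentry` come from the surrounding context:]

int kill = setattr_should_drop_suidgid(mnt_userns, inode);

if (kill) {
	struct iattr newattrs = {
		/* ATTR_FORCE: strip the bits even without ownership */
		.ia_valid = ATTR_FORCE | kill,
	};

	error = notify_change(mnt_userns, dentry, &newattrs, NULL);
}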
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 9d1cde8066cf..92737166203f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -154,7 +154,7 @@ static int bad_inode_tmpfile(struct user_namespace *mnt_userns,
}
static int bad_inode_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type)
{
return -EIO;
@@ -177,7 +177,7 @@ static const struct inode_operations bad_inode_ops =
.setattr = bad_inode_setattr,
.listxattr = bad_inode_listxattr,
.get_link = bad_inode_get_link,
- .get_acl = bad_inode_get_acl,
+ .get_inode_acl = bad_inode_get_acl,
.fiemap = bad_inode_fiemap,
.update_time = bad_inode_update_time,
.atomic_open = bad_inode_atomic_open,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6a11025e5850..9a780fafc539 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
} while (0)
#ifdef ARCH_DLINFO
- /*
+ /*
* ARCH_DLINFO must come first so PPC can do its special alignment of
* AUXV.
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in
@@ -456,13 +456,13 @@ static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
*
* Loads ELF program headers from the binary file elf_file, which has the ELF
* header pointed to by elf_ex, into a newly allocated array. The caller is
- * responsible for freeing the allocated data. Returns an ERR_PTR upon failure.
+ * responsible for freeing the allocated data. Returns NULL upon failure.
*/
static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
struct file *elf_file)
{
struct elf_phdr *elf_phdata = NULL;
- int retval, err = -1;
+ int retval = -1;
unsigned int size;
/*
@@ -484,15 +484,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Read in the program headers */
retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
- if (retval < 0) {
- err = retval;
- goto out;
- }
- /* Success! */
- err = 0;
out:
- if (err) {
+ if (retval) {
kfree(elf_phdata);
elf_phdata = NULL;
}
@@ -1020,7 +1014,7 @@ out_free_interp:
executable_stack);
if (retval < 0)
goto out_free_dentry;
-
+
elf_bss = 0;
elf_brk = 0;
@@ -1043,7 +1037,7 @@ out_free_interp:
if (unlikely (elf_brk > elf_bss)) {
unsigned long nbyte;
-
+
/* There was a PT_LOAD segment with p_memsz > p_filesz
before this one. Map anonymous pages, if needed,
and clear the area. */
@@ -1166,7 +1160,7 @@ out_free_interp:
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
- retval = IS_ERR((void *)error) ?
+ retval = IS_ERR_VALUE(error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
}
@@ -1251,7 +1245,7 @@ out_free_interp:
interpreter,
load_bias, interp_elf_phdata,
&arch_state);
- if (!IS_ERR((void *)elf_entry)) {
+ if (!IS_ERR_VALUE(elf_entry)) {
/*
* load_elf_interp() returns relocation
* adjustment
@@ -1260,7 +1254,7 @@ out_free_interp:
elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
- retval = IS_ERR((void *)elf_entry) ?
+ retval = IS_ERR_VALUE(elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
}
@@ -1521,7 +1515,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 0;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
+static void fill_note(struct memelfnote *note, const char *name, int type,
unsigned int sz, void *data)
{
note->name = name;
@@ -1724,7 +1718,6 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
return 0;
}
-#ifdef CORE_DUMP_USE_REGSET
#include <linux/regset.h>
struct elf_thread_core_info {
@@ -1745,6 +1738,7 @@ struct elf_note_info {
int thread_notes;
};
+#ifdef CORE_DUMP_USE_REGSET
/*
* When a regset has a writeback hook, we call it on each thread before
* dumping user memory. On register window machines, this makes sure the
@@ -1824,34 +1818,58 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
return 1;
}
+#else
+static int fill_thread_core_info(struct elf_thread_core_info *t,
+ const struct user_regset_view *view,
+ long signr, struct elf_note_info *info)
+{
+ struct task_struct *p = t->task;
+ elf_fpregset_t *fpu;
+
+ fill_prstatus(&t->prstatus.common, p, signr);
+ elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
+
+ fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+ &(t->prstatus));
+ info->size += notesize(&t->notes[0]);
+
+ fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
+ if (!fpu || !elf_core_copy_task_fpregs(p, fpu)) {
+ kfree(fpu);
+ return 1;
+ }
+
+ t->prstatus.pr_fpvalid = 1;
+ fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ info->size += notesize(&t->notes[1]);
+
+ return 1;
+}
+#endif
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
- const struct user_regset_view *view = task_user_regset_view(dump_task);
+ const struct user_regset_view *view;
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
struct core_thread *ct;
- unsigned int i;
-
- info->size = 0;
- info->thread = NULL;
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
- if (psinfo == NULL) {
- info->psinfo.data = NULL; /* So we don't free this wrongly */
+ if (!psinfo)
return 0;
- }
-
fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+#ifdef CORE_DUMP_USE_REGSET
+ view = task_user_regset_view(dump_task);
+
/*
* Figure out how many notes we're going to need for each thread.
*/
info->thread_notes = 0;
- for (i = 0; i < view->n; ++i)
+ for (int i = 0; i < view->n; ++i)
if (view->regsets[i].core_note_type != 0)
++info->thread_notes;
@@ -1870,11 +1888,23 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
*/
fill_elf_header(elf, phdrs,
view->e_machine, view->e_flags);
+#else
+ view = NULL;
+ info->thread_notes = 2;
+ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
+#endif
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
+ info->thread = kzalloc(offsetof(struct elf_thread_core_info,
+ notes[info->thread_notes]),
+ GFP_KERNEL);
+ if (unlikely(!info->thread))
+ return 0;
+
+ info->thread->task = dump_task;
+ for (ct = dump_task->signal->core_state->dumper.next; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -1882,17 +1912,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 0;
t->task = ct->task;
- if (ct->task == dump_task || !info->thread) {
- t->next = info->thread;
- info->thread = t;
- } else {
- /*
- * Make sure to keep the original task at
- * the head of the list.
- */
- t->next = info->thread->next;
- info->thread->next = t;
- }
+ t->next = info->thread->next;
+ info->thread->next = t;
}
/*
@@ -1920,11 +1941,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
return 1;
}
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- return info->size;
-}
-
/*
* Write all the notes for each thread. When writing the first thread, the
* process-wide notes are interleaved after the first thread-specific note.
@@ -1979,197 +1995,6 @@ static void free_note_info(struct elf_note_info *info)
kvfree(info->files.data);
}
-#else
-
-/* Here is the structure in which status of each thread is captured. */
-struct elf_thread_status
-{
- struct list_head list;
- struct elf_prstatus prstatus; /* NT_PRSTATUS */
- elf_fpregset_t fpu; /* NT_PRFPREG */
- struct task_struct *thread;
- struct memelfnote notes[3];
- int num_notes;
-};
-
-/*
- * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then create
- * a single section for them in the final core file.
- */
-static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
-{
- int sz = 0;
- struct task_struct *p = t->thread;
- t->num_notes = 0;
-
- fill_prstatus(&t->prstatus.common, p, signr);
- elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
-
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
- t->num_notes++;
- sz += notesize(&t->notes[0]);
-
- if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
- &t->fpu))) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
- &(t->fpu));
- t->num_notes++;
- sz += notesize(&t->notes[1]);
- }
- return sz;
-}
-
-struct elf_note_info {
- struct memelfnote *notes;
- struct memelfnote *notes_files;
- struct elf_prstatus *prstatus; /* NT_PRSTATUS */
- struct elf_prpsinfo *psinfo; /* NT_PRPSINFO */
- struct list_head thread_list;
- elf_fpregset_t *fpu;
- user_siginfo_t csigdata;
- int thread_status_size;
- int numnote;
-};
-
-static int elf_note_info_init(struct elf_note_info *info)
-{
- memset(info, 0, sizeof(*info));
- INIT_LIST_HEAD(&info->thread_list);
-
- /* Allocate space for ELF notes */
- info->notes = kmalloc_array(8, sizeof(struct memelfnote), GFP_KERNEL);
- if (!info->notes)
- return 0;
- info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
- if (!info->psinfo)
- return 0;
- info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
- if (!info->prstatus)
- return 0;
- info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
- if (!info->fpu)
- return 0;
- return 1;
-}
-
-static int fill_note_info(struct elfhdr *elf, int phdrs,
- struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct core_thread *ct;
- struct elf_thread_status *ets;
-
- if (!elf_note_info_init(info))
- return 0;
-
- for (ct = current->signal->core_state->dumper.next;
- ct; ct = ct->next) {
- ets = kzalloc(sizeof(*ets), GFP_KERNEL);
- if (!ets)
- return 0;
-
- ets->thread = ct->task;
- list_add(&ets->list, &info->thread_list);
- }
-
- list_for_each_entry(ets, &info->thread_list, list) {
- int sz;
-
- sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
- info->thread_status_size += sz;
- }
- /* now collect the dump for the current */
- memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
-
- /* Set up header */
- fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
-
- /*
- * Set up the notes in similar form to SVR4 core dumps made
- * with info from their /proc.
- */
-
- fill_note(info->notes + 0, "CORE", NT_PRSTATUS,
- sizeof(*info->prstatus), info->prstatus);
- fill_psinfo(info->psinfo, current->group_leader, current->mm);
- fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
- sizeof(*info->psinfo), info->psinfo);
-
- fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
- fill_auxv_note(info->notes + 3, current->mm);
- info->numnote = 4;
-
- if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
- info->notes_files = info->notes + info->numnote;
- info->numnote++;
- }
-
- /* Try to dump the FPU. */
- info->prstatus->pr_fpvalid =
- elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
- if (info->prstatus->pr_fpvalid)
- fill_note(info->notes + info->numnote++,
- "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
- return 1;
-}
-
-static size_t get_note_info_size(struct elf_note_info *info)
-{
- int sz = 0;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- sz += notesize(info->notes + i);
-
- sz += info->thread_status_size;
-
- return sz;
-}
-
-static int write_note_info(struct elf_note_info *info,
- struct coredump_params *cprm)
-{
- struct elf_thread_status *ets;
- int i;
-
- for (i = 0; i < info->numnote; i++)
- if (!writenote(info->notes + i, cprm))
- return 0;
-
- /* write out the thread status notes section */
- list_for_each_entry(ets, &info->thread_list, list) {
- for (i = 0; i < ets->num_notes; i++)
- if (!writenote(&ets->notes[i], cprm))
- return 0;
- }
-
- return 1;
-}
-
-static void free_note_info(struct elf_note_info *info)
-{
- while (!list_empty(&info->thread_list)) {
- struct list_head *tmp = info->thread_list.next;
- list_del(tmp);
- kfree(list_entry(tmp, struct elf_thread_status, list));
- }
-
- /* Free data possibly allocated by fill_files_note(): */
- if (info->notes_files)
- kvfree(info->notes_files->data);
-
- kfree(info->prstatus);
- kfree(info->psinfo);
- kfree(info->notes);
- kfree(info->fpu);
-}
-
-#endif
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2209,7 +2034,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* The number of segs is recorded in the ELF header as a 16-bit value.
* Please check the DEFAULT_MAX_MAP_COUNT definition when modifying this.
*/
- segs = cprm->vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs(cprm);
/* for notes section */
segs++;
@@ -2233,7 +2058,7 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Write notes phdr entry */
{
- size_t sz = get_note_info_size(&info);
+ size_t sz = info.size;
/* For cell spufs */
sz += elf_coredump_extra_notes_size();
@@ -2249,7 +2074,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
offset += cprm->vma_data_size;
- offset += elf_core_extra_data_size();
+ offset += elf_core_extra_data_size(cprm);
e_shoff = offset;
if (e_phnum == PN_XNUM) {
@@ -2295,7 +2120,7 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!write_note_info(&info, cprm))
goto end_coredump;
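
Both coredump hooks picked up the dump parameters in this series: elf_core_extra_phdrs() and elf_core_extra_data_size() now take the coredump_params rather than relying on ambient state, so architectures can size their extra segments per dump. A sketch of the updated declarations (the exact header, assumed here to be linux/elfcore.h, may differ):

struct coredump_params;

/* Assumed declarations after this change: both hooks receive the dump
 * parameters instead of consulting global/current state.
 */
extern int elf_core_extra_phdrs(struct coredump_params *cprm);
extern size_t elf_core_extra_data_size(struct coredump_params *cprm);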
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 08d0c8797828..a05eafcacfb2 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -434,8 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
- if (create_elf_fdpic_tables(bprm, current->mm,
- &exec_params, &interp_params) < 0)
+ retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params,
+ &interp_params);
+ if (retval < 0)
goto error;
kdebug("- start_code %lx", current->mm->start_code);
@@ -1508,7 +1509,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = cprm->vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs(cprm);
/* for notes section */
segs++;
@@ -1554,7 +1555,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
offset += cprm->vma_data_size;
- offset += elf_core_extra_data_size();
+ offset += elf_core_extra_data_size(cprm);
e_shoff = offset;
if (e_phnum == PN_XNUM) {
@@ -1603,7 +1604,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
- /* write out the notes section */
+ /* write out the notes section */
if (!writenote(thread_list->notes, cprm))
goto end_coredump;
if (!writenote(&psinfo_note, cprm))
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e1eae7ea823a..bb202ad369d5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -44,10 +44,10 @@ static LIST_HEAD(entries);
static int enabled = 1;
enum {Enabled, Magic};
-#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
-#define MISC_FMT_OPEN_BINARY (1 << 30)
-#define MISC_FMT_CREDENTIALS (1 << 29)
-#define MISC_FMT_OPEN_FILE (1 << 28)
+#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
+#define MISC_FMT_OPEN_BINARY (1UL << 30)
+#define MISC_FMT_CREDENTIALS (1UL << 29)
+#define MISC_FMT_OPEN_FILE (1UL << 28)
typedef struct {
struct list_head list;
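
The flag widening above avoids shifting into the sign bit of a 32-bit int: 1 << 31 is undefined behaviour in C, while 1UL << 31 is well-defined. A standalone illustration:

#include <stdio.h>

#define FLAG_BIT30	(1 << 30)	/* fine: still fits in a signed int */
#define FLAG_BIT31	(1UL << 31)	/* bit 31 needs an unsigned type */

int main(void)
{
	unsigned long flags = 0;

	flags |= FLAG_BIT31;
	printf("flags = %#lx\n", flags);	/* prints 0x80000000 */
	return 0;
}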
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 100bae33c677..3da1779e8b79 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -110,10 +110,11 @@ out:
return ret;
}
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret;
+ struct inode *inode = d_inode(dentry);
umode_t old_mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 45197b4f73bf..39bd36e6eeb7 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -6,7 +6,7 @@
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
-int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 940b404c8f28..98a800b8bd43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5307,7 +5307,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
err = btrfs_dirty_inode(BTRFS_I(inode));
if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ err = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
return err;
@@ -11362,7 +11362,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.mknod = btrfs_mknod,
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
@@ -11415,7 +11415,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.listxattr = btrfs_listxattr,
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.fileattr_get = btrfs_fileattr_get,
@@ -11426,7 +11426,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.listxattr = btrfs_listxattr,
- .get_acl = btrfs_get_acl,
+ .get_inode_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4fd6b61b06a4..7e348bd2ccde 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4393,7 +4393,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
goto out_acct;
}
- ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
@@ -4492,7 +4492,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (args.len > args.unencoded_len - args.unencoded_offset)
goto out_acct;
- ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
&iov, &iter);
if (ret < 0)
goto out_acct;
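
The READ/WRITE arguments to the iov_iter constructors were replaced tree-wide by ITER_DEST (the iterator is the destination of the transfer) and ITER_SOURCE (the iterator supplies the data); the new names cannot be confused with the file I/O direction flags. The call shape, sketched for kernel context:

#include <linux/uio.h>

/* Sketch: the direction names where data moves relative to the
 * iterator, not which syscall is in progress.
 */
static void iter_direction_example(struct kvec *kv, size_t len)
{
	struct iov_iter to, from;

	iov_iter_kvec(&to, ITER_DEST, kv, 1, len);	/* data will be copied into kv */
	iov_iter_kvec(&from, ITER_SOURCE, kv, 1, len);	/* data will be read from kv */
}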
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4bed0839b640..57d8c72737e1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -764,11 +764,11 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
struct btrfs_ordered_extent *ordered;
if (start + len < start) {
- orig_end = INT_LIMIT(loff_t);
+ orig_end = OFFSET_MAX;
} else {
orig_end = start + len - 1;
- if (orig_end > INT_LIMIT(loff_t))
- orig_end = INT_LIMIT(loff_t);
+ if (orig_end > OFFSET_MAX)
+ orig_end = OFFSET_MAX;
}
/* start IO across the range first to instantiate any delalloc
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 67f7c698ade3..e65e6b6600a7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -486,6 +486,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
old_buf_len = p->buf_len;
/*
+ * Allocate to the next largest kmalloc bucket size, to let
+ * the fast path happen most of the time.
+ */
+ len = kmalloc_size_roundup(len);
+ /*
* First time the inline_buf does not suffice
*/
if (p->buf == p->inline_buf) {
@@ -498,11 +503,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (!tmp_buf)
return -ENOMEM;
p->buf = tmp_buf;
- /*
- * The real size of the buffer is bigger, this will let the fast path
- * happen most of the time
- */
- p->buf_len = ksize(p->buf);
+ p->buf_len = len;
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
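
Rounding the requested length with kmalloc_size_roundup() before allocating replaces the old allocate-then-ksize() probe: the recorded capacity and the allocator's bucket now agree by construction. The idiom, sketched for kernel context:

#include <linux/slab.h>

/* Sketch: grow a buffer to at least @len while recording the true
 * capacity. kmalloc_size_roundup() returns the bucket size kmalloc()
 * would use, so *buf_len never overstates what was granted.
 */
static char *grow_buf(char *old, size_t len, size_t *buf_len)
{
	char *nbuf;

	len = kmalloc_size_roundup(len);
	nbuf = krealloc(old, len, GFP_KERNEL);
	if (!nbuf)
		return NULL;
	*buf_len = len;
	return nbuf;
}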
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 000a28f46e59..175a25fcade8 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
term_func, term_func_priv);
}
-/*
- * Prepare a read operation, shortening it to a cached/uncached
- * boundary as appropriate.
- */
-static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
- loff_t i_size)
+static inline enum netfs_io_source
+cachefiles_do_prepare_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t netfs_ino)
{
enum cachefiles_prepare_read_trace why;
- struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct cachefiles_object *object;
+ struct cachefiles_object *object = NULL;
struct cachefiles_cache *cache;
struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
struct file *file = cachefiles_cres_file(cres);
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
+ size_t len = *_len;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
int rc;
- _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
+ _enter("%zx @%llx/%llx", len, start, i_size);
- if (subreq->start >= i_size) {
+ if (start >= i_size) {
ret = NETFS_FILL_WITH_ZEROES;
why = cachefiles_trace_read_after_eof;
goto out_no_object;
}
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
why = cachefiles_trace_read_no_data;
- if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
+ if (!test_bit(NETFS_SREQ_ONDEMAND, _flags))
goto out_no_object;
}
@@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
retry:
off = cachefiles_inject_read_error();
if (off == 0)
- off = vfs_llseek(file, subreq->start, SEEK_DATA);
+ off = vfs_llseek(file, start, SEEK_DATA);
if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
if (off == (loff_t)-ENXIO) {
why = cachefiles_trace_read_seek_nxio;
@@ -449,21 +446,22 @@ retry:
goto out;
}
- if (off >= subreq->start + subreq->len) {
+ if (off >= start + len) {
why = cachefiles_trace_read_found_hole;
goto download_and_store;
}
- if (off > subreq->start) {
+ if (off > start) {
off = round_up(off, cache->bsize);
- subreq->len = off - subreq->start;
+ len = off - start;
+ *_len = len;
why = cachefiles_trace_read_found_part;
goto download_and_store;
}
to = cachefiles_inject_read_error();
if (to == 0)
- to = vfs_llseek(file, subreq->start, SEEK_HOLE);
+ to = vfs_llseek(file, start, SEEK_HOLE);
if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), to,
cachefiles_trace_seek_error);
@@ -471,12 +469,13 @@ retry:
goto out;
}
- if (to < subreq->start + subreq->len) {
- if (subreq->start + subreq->len >= i_size)
+ if (to < start + len) {
+ if (start + len >= i_size)
to = round_up(to, cache->bsize);
else
to = round_down(to, cache->bsize);
- subreq->len = to - subreq->start;
+ len = to - start;
+ *_len = len;
}
why = cachefiles_trace_read_have_data;
@@ -484,12 +483,11 @@ retry:
goto out;
download_and_store:
- __set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
- if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
- rc = cachefiles_ondemand_read(object, subreq->start,
- subreq->len);
+ __set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
+ if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) {
+ rc = cachefiles_ondemand_read(object, start, len);
if (!rc) {
- __clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
+ __clear_bit(NETFS_SREQ_ONDEMAND, _flags);
goto retry;
}
ret = NETFS_INVALID_READ;
@@ -497,11 +495,35 @@ download_and_store:
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
- trace_cachefiles_prep_read(subreq, ret, why, ino);
+ trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino);
return ret;
}
/*
+ * Prepare a read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
+ loff_t i_size)
+{
+ return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
+ subreq->start, &subreq->len, i_size,
+ &subreq->flags, subreq->rreq->inode->i_ino);
+}
+
+/*
+ * Prepare an on-demand read operation, shortening it to a cached/uncached
+ * boundary as appropriate.
+ */
+static enum netfs_io_source
+cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
+ loff_t start, size_t *_len, loff_t i_size,
+ unsigned long *_flags, ino_t ino)
+{
+ return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino);
+}
+
+/*
* Prepare for a write to occur.
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
@@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index f4fc8e0b847c..c7e8dd5b58d4 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -85,13 +85,14 @@ retry:
return acl;
}
-int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ceph_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int ret = 0, size = 0;
const char *name = NULL;
char *value = NULL;
struct iattr newattrs;
+ struct inode *inode = d_inode(dentry);
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index dcf701b05cc1..8c74871e37c9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -288,7 +288,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
}
len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
if (err == 0)
err = -EFAULT;
@@ -327,7 +327,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
if (err < 0) {
dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
@@ -1367,7 +1367,7 @@ out:
folio_put(folio);
if (check_cap)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);
return copied;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e54814d0c2f7..f75ad432f375 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1898,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
*/
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
+void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
@@ -1913,15 +1912,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
bool queue_invalidate = false;
bool tried_invalidate = false;
bool queue_writeback = false;
-
- if (session)
- ceph_get_mds_session(session);
+ struct ceph_mds_session *session = NULL;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
+
/* Don't send messages until we get async create reply */
spin_unlock(&ci->i_ceph_lock);
- ceph_put_mds_session(session);
return;
}
@@ -2851,7 +2849,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
check = 1;
spin_unlock(&ci->i_ceph_lock);
if (check)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
static inline int get_used_fmode(int caps)
@@ -2915,7 +2913,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
while (true) {
flags &= CEPH_FILE_MODE_MASK;
- if (atomic_read(&fi->num_locks))
+ if (vfs_inode_has_locks(inode))
flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
@@ -3140,7 +3138,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
switch (mode) {
case PUT_CAP_REFS_SYNC:
if (last)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
break;
@@ -3255,7 +3253,7 @@ unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3604,10 +3602,9 @@ static void handle_cap_grant(struct inode *inode,
mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
- session);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
}
/*
@@ -4333,7 +4330,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
@@ -4362,7 +4359,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e7e2ebac330d..6c7026cc8988 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2033,7 +2033,7 @@ const struct inode_operations ceph_dir_iops = {
.getattr = ceph_getattr,
.setattr = ceph_setattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 04fd34557de8..764598e1efd9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode)
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
inode, ceph_cap_string(wanted), ceph_cap_string(issued));
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return 0;
}
spin_unlock(&ci->i_ceph_lock);
@@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file)
if ((issued & wanted) != wanted &&
(mds_wanted & wanted) != wanted &&
ceph_snap(inode) != CEPH_SNAPDIR)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
@@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ bool check_cap = false;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
+ check_cap = true;
+ }
}
ceph_kick_flushing_inode_caps(session, ci);
spin_unlock(&ci->i_ceph_lock);
+
+ if (check_cap)
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
@@ -1092,7 +1101,7 @@ static void ceph_aio_complete(struct inode *inode,
loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
if (endoff > i_size_read(inode)) {
if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
spin_lock(&ci->i_ceph_lock);
@@ -1161,7 +1170,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
aio_req->total_len = rc + zlen;
}
- iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs,
+ iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs,
osd_data->num_bvecs, len);
iov_iter_advance(&i, rc);
iov_iter_zero(zlen, &i);
@@ -1400,7 +1409,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int zlen = min_t(size_t, len - ret,
size - pos - ret);
- iov_iter_bvec(&i, READ, bvecs, num_pages, len);
+ iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len);
iov_iter_advance(&i, ret);
iov_iter_zero(zlen, &i);
ret += zlen;
@@ -1421,8 +1430,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && pos > size) {
if (ceph_inode_set_size(inode, pos))
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1577,8 +1585,7 @@ out:
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1906,7 +1913,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -2521,8 +2528,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
- NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index bad9eeb6a1a5..23d05ec87fcc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -126,7 +126,7 @@ const struct inode_operations ceph_file_iops = {
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.listxattr = ceph_listxattr,
- .get_acl = ceph_get_acl,
+ .get_inode_acl = ceph_get_acl,
.set_acl = ceph_set_acl,
};
@@ -362,7 +362,7 @@ static int ceph_fill_fragtree(struct inode *inode,
if (nsplits != ci->i_fragtree_nsplits) {
update = true;
} else if (nsplits) {
- i = prandom_u32_max(nsplits);
+ i = get_random_u32_below(nsplits);
id = le32_to_cpu(fragtree->splits[i].frag);
if (!__ceph_find_frag(ci, id))
update = true;
@@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_unlock(&ci->i_truncate_mutex);
out:
if (check)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
}
/*
@@ -1969,7 +1969,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
wake_up_all(&ci->i_cap_wq);
}
@@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work)
__ceph_do_pending_vmtruncate(inode);
if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
ceph_flush_snaps(ci, NULL);
@@ -2255,7 +2255,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
- err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
+ err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode);
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 6e061bf62ad4..deac817647eb 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file)
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else {
dout("ioctl_layzio: file %p already lazy\n", file);
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 3e2843e86e27..9c8dc8a55e7e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,24 +32,36 @@ void __init ceph_flock_init(void)
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
- struct ceph_file_info *fi = dst->fl_file->private_data;
struct inode *inode = file_inode(dst->fl_file);
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
- atomic_inc(&fi->num_locks);
+ dst->fl_u.ceph.inode = igrab(inode);
}
+/*
+ * Do not use 'fl->fl_file' in the release function; it may
+ * already have been released by another thread.
+ */
static void ceph_fl_release_lock(struct file_lock *fl)
{
- struct ceph_file_info *fi = fl->fl_file->private_data;
- struct inode *inode = file_inode(fl->fl_file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- atomic_dec(&fi->num_locks);
+ struct inode *inode = fl->fl_u.ceph.inode;
+ struct ceph_inode_info *ci;
+
+ /*
+ * If inode is NULL, this should be a request file_lock;
+ * there is nothing we can do.
+ */
+ if (!inode)
+ return;
+
+ ci = ceph_inode(inode);
if (atomic_dec_and_test(&ci->i_filelock_ref)) {
/* clear error when all locks are released */
spin_lock(&ci->i_ceph_lock);
ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
spin_unlock(&ci->i_ceph_lock);
}
+ fl->fl_u.ceph.inode = NULL;
+ iput(inode);
}
static const struct file_lock_operations ceph_fl_lock_ops = {
@@ -364,7 +376,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
*fcntl_count = 0;
*flock_count = 0;
- ctx = inode->i_flctx;
+ ctx = locks_inode_context(inode);
if (ctx) {
spin_lock(&ctx->flc_lock);
list_for_each_entry(lock, &ctx->flc_posix, fl_list)
@@ -418,7 +430,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
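
The locking fix above pins the inode itself: ceph_fl_copy_lock() stores an igrab() reference in fl_u.ceph.inode, and ceph_fl_release_lock() drops it with iput(), never touching fl->fl_file. The general pairing, sketched (kernel context):

#include <linux/fs.h>

/* Sketch of the ownership pattern: take an inode reference when the
 * lock is copied, drop it on release, and keep the release path away
 * from the file pointer entirely.
 */
static void lock_copy(struct inode *inode, struct inode **slot)
{
	*slot = igrab(inode);	/* NULL if the inode is already dying */
}

static void lock_release(struct inode **slot)
{
	struct inode *inode = *slot;

	if (!inode)
		return;		/* request lock: nothing was pinned */
	*slot = NULL;
	iput(inode);
}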
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 3fbabc98e1f7..7dac21ee6ce7 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -29,7 +29,7 @@ static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
return -1;
/* pick */
- n = prandom_u32_max(n);
+ n = get_random_u32_below(n);
for (j = 0, i = 0; i < m->possible_max_rank; i++) {
if (CEPH_MDS_IS_READY(i, ignore_laggy))
j++;
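
prandom_u32_max(n) was renamed tree-wide to get_random_u32_below(n); both yield a uniform value in [0, n), the new name simply states the bound. Sketched usage (kernel context):

#include <linux/random.h>

/* Sketch: pick a uniformly distributed index below @n. */
static u32 pick_index(u32 n)
{
	return get_random_u32_below(n);	/* result in [0, n) */
}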
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 40630e6f691c..0ed3be75bb9a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
+#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async
+ create finishes */
/*
* Masks of ceph inode work.
@@ -788,7 +790,6 @@ struct ceph_file_info {
struct list_head rw_contexts;
u32 filp_gen;
- atomic_t num_locks;
};
struct ceph_dir_file_info {
@@ -1117,7 +1118,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx);
struct posix_acl *ceph_get_acl(struct inode *, int, bool);
int ceph_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type);
+ struct dentry *dentry, struct posix_acl *acl, int type);
int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
struct ceph_acl_sec_ctx *as_ctx);
void ceph_init_inode_acls(struct inode *inode,
@@ -1200,8 +1201,7 @@ extern void ceph_remove_capsnap(struct inode *inode,
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ba0ded7842a7..13deb45f1ec6 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -483,17 +483,24 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
p->dev = dev;
p->count = count;
- if (WARN_ON(dev == WHITEOUT_DEV))
- return -EBUSY;
+ if (WARN_ON(dev == WHITEOUT_DEV)) {
+ error = -EBUSY;
+ goto err;
+ }
error = kobj_map(cdev_map, dev, count, NULL,
exact_match, exact_lock, p);
if (error)
- return error;
+ goto err;
kobject_get(p->kobj.parent);
return 0;
+
+err:
+ kfree_const(p->kobj.name);
+ p->kobj.name = NULL;
+ return error;
}
/**
@@ -547,7 +554,7 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
}
rc = device_add(dev);
- if (rc)
+ if (rc && dev->devt)
cdev_del(cdev);
return rc;
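
With these fixes, cdev_add() releases the kobject name it duplicated on every failure path, and cdev_device_add() only unwinds the cdev when a device number was actually mapped (dev->devt set). Callers keep the usual shape; a sketch for an already-allocated dev_t:

#include <linux/cdev.h>
#include <linux/fs.h>

/* Sketch: register a character device for a pre-allocated dev_t. On
 * failure, cdev_add() now cleans up after itself; the caller just
 * propagates the error.
 */
static int register_chardev(struct cdev *cdev,
			    const struct file_operations *fops,
			    dev_t devt)
{
	cdev_init(cdev, fops);
	return cdev_add(cdev, devt, 1);
}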
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7c9785973f49..304a7f6cc13a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -21,7 +21,7 @@ cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o dfs.o
cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 90850da390ae..56b23def4c95 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -372,6 +372,14 @@ skip_rdma:
seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d",
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
+ if (IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)) {
+ if (server->origin_fullpath)
+ seq_printf(m, "\nDFS origin full path: %s",
+ server->origin_fullpath);
+ if (server->leaf_fullpath)
+ seq_printf(m, "\nDFS leaf full path: %s",
+ server->leaf_fullpath);
+ }
seq_printf(m, "\n\n\tSessions: ");
i = 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b0864da9ef43..2b1a8d55b4ec 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -21,8 +21,7 @@
#include "cifsfs.h"
#include "dns_resolve.h"
#include "cifs_debug.h"
-#include "cifs_unicode.h"
-#include "dfs_cache.h"
+#include "dfs.h"
#include "fs_context.h"
static LIST_HEAD(cifs_dfs_automount_list);
@@ -60,7 +59,7 @@ void cifs_dfs_release_automount_timer(void)
* Returns pointer to the built string, or an ERR_PTR. Caller is responsible
* for freeing the returned string.
*/
-static char *
+char *
cifs_build_devname(char *nodename, const char *prepath)
{
size_t pplen;
@@ -119,200 +118,34 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
-
-/**
- * cifs_compose_mount_options - creates mount options for referral
- * @sb_mountdata: parent/root DFS mount options (template)
- * @fullpath: full path in UNC format
- * @ref: optional server's referral
- * @devname: return the built cifs device name if passed pointer not NULL
- * creates mount options for submount based on template options sb_mountdata
- * and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
- *
- * Returns: pointer to new mount options or ERR_PTR.
- * Caller is responsible for freeing returned value if it is not error.
- */
-char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath,
- const struct dfs_info3_param *ref,
- char **devname)
+static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path)
{
+ struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
int rc;
- char *name;
- char *mountdata = NULL;
- const char *prepath = NULL;
- int md_len;
- char *tkn_e;
- char *srvIP = NULL;
- char sep = ',';
- int off, noff;
-
- if (sb_mountdata == NULL)
- return ERR_PTR(-EINVAL);
-
- if (ref) {
- if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
- return ERR_PTR(-EINVAL);
-
- if (strlen(fullpath) - ref->path_consumed) {
- prepath = fullpath + ref->path_consumed;
- /* skip initial delimiter */
- if (*prepath == '/' || *prepath == '\\')
- prepath++;
- }
-
- name = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
- }
- } else {
- name = cifs_build_devname((char *)fullpath, NULL);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
- }
- }
-
- rc = dns_resolve_server_name_to_ip(name, &srvIP, NULL);
- if (rc < 0) {
- cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
- __func__, name, rc);
- goto compose_mount_options_err;
- }
-
- /*
- * In most cases, we'll be building a shorter string than the original,
- * but we do have to assume that the address in the ip= option may be
- * much longer than the original. Add the max length of an address
- * string to the length of the original string to allow for worst case.
- */
- md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
- mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
- if (mountdata == NULL) {
- rc = -ENOMEM;
- goto compose_mount_options_err;
- }
-
- /* copy all options except of unc,ip,prefixpath */
- off = 0;
- if (strncmp(sb_mountdata, "sep=", 4) == 0) {
- sep = sb_mountdata[4];
- strncpy(mountdata, sb_mountdata, 5);
- off += 5;
- }
-
- do {
- tkn_e = strchr(sb_mountdata + off, sep);
- if (tkn_e == NULL)
- noff = strlen(sb_mountdata + off);
- else
- noff = tkn_e - (sb_mountdata + off) + 1;
-
- if (strncasecmp(sb_mountdata + off, "cruid=", 6) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
- off += noff;
- continue;
- }
- strncat(mountdata, sb_mountdata + off, noff);
- off += noff;
- } while (tkn_e);
- strcat(mountdata, sb_mountdata + off);
- mountdata[md_len] = '\0';
-
- /* copy new IP and ref share name */
- if (mountdata[strlen(mountdata) - 1] != sep)
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "ip=");
- strcat(mountdata, srvIP);
-
- if (devname)
- *devname = name;
- else
- kfree(name);
-
- /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
- /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
-compose_mount_options_out:
- kfree(srvIP);
- return mountdata;
-
-compose_mount_options_err:
- kfree(mountdata);
- mountdata = ERR_PTR(rc);
- kfree(name);
- goto compose_mount_options_out;
-}
-
-/**
- * cifs_dfs_do_mount - mounts specified path using DFS full path
- *
- * Always pass down @fullpath to smb3_do_mount() so we can use the root server
- * to perform failover in case we failed to connect to the first target in the
- * referral.
- *
- * @mntpt: directory entry for the path we are trying to automount
- * @cifs_sb: parent/root superblock
- * @fullpath: full path in UNC format
- */
-static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
- struct cifs_sb_info *cifs_sb,
- const char *fullpath)
-{
- struct vfsmount *mnt;
- char *mountdata;
- char *devname;
-
- devname = kstrdup(fullpath, GFP_KERNEL);
- if (!devname)
- return ERR_PTR(-ENOMEM);
-
- convert_delimiter(devname, '/');
-
- /* TODO: change to call fs_context_for_mount(), fill in context directly, call fc_mount */
-
- /* See afs_mntpt_do_automount in fs/afs/mntpt.c for an example */
-
- /* strip first '\' from fullpath */
- mountdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- fullpath + 1, NULL, NULL);
- if (IS_ERR(mountdata)) {
- kfree(devname);
- return (struct vfsmount *)mountdata;
- }
-
- mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
- kfree(mountdata);
- kfree(devname);
- return mnt;
+ rc = dns_resolve_server_name_to_ip(full_path, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
+ return rc;
}
/*
* Create a vfsmount that we can automount
*/
-static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
+static struct vfsmount *cifs_dfs_do_automount(struct path *path)
{
+ int rc;
+ struct dentry *mntpt = path->dentry;
+ struct fs_context *fc;
struct cifs_sb_info *cifs_sb;
- void *page;
+ void *page = NULL;
+ struct smb3_fs_context *ctx, *cur_ctx;
+ struct smb3_fs_context tmp;
char *full_path;
struct vfsmount *mnt;
- cifs_dbg(FYI, "in %s\n", __func__);
- BUG_ON(IS_ROOT(mntpt));
+ if (IS_ROOT(mntpt))
+ return ERR_PTR(-ESTALE);
/*
* The MSDFS spec states that paths in DFS referral requests and
@@ -321,29 +154,53 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
* gives us the latter, so we must adjust the result.
*/
cifs_sb = CIFS_SB(mntpt->d_sb);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
- mnt = ERR_PTR(-EREMOTE);
- goto cdda_exit;
- }
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return ERR_PTR(-EREMOTE);
+
+ cur_ctx = cifs_sb->ctx;
+
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ctx = smb3_fc2context(fc);
page = alloc_dentry_path();
- /* always use tree name prefix */
- full_path = build_path_from_dentry_optional_prefix(mntpt, page, true);
+ full_path = dfs_get_automount_devname(mntpt, page);
if (IS_ERR(full_path)) {
mnt = ERR_CAST(full_path);
- goto free_full_path;
+ goto out;
}
- convert_delimiter(full_path, '\\');
+ convert_delimiter(full_path, '/');
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
- mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path);
- cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__, full_path + 1, mnt);
+ tmp = *cur_ctx;
+ tmp.source = full_path;
+ tmp.leaf_fullpath = NULL;
+ tmp.UNC = tmp.prepath = NULL;
+
+ rc = smb3_fs_context_dup(ctx, &tmp);
+ if (rc) {
+ mnt = ERR_PTR(rc);
+ goto out;
+ }
+
+ rc = set_dest_addr(ctx, full_path);
+ if (rc) {
+ mnt = ERR_PTR(rc);
+ goto out;
+ }
+
+ rc = smb3_parse_devname(full_path, ctx);
+ if (!rc)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(rc);
-free_full_path:
+out:
+ put_fs_context(fc);
free_dentry_path(page);
-cdda_exit:
- cifs_dbg(FYI, "leaving %s\n" , __func__);
return mnt;
}
@@ -354,9 +211,9 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- cifs_dbg(FYI, "in %s\n", __func__);
+ cifs_dbg(FYI, "%s: %pd\n", __func__, path->dentry);
- newmnt = cifs_dfs_do_automount(path->dentry);
+ newmnt = cifs_dfs_do_automount(path);
if (IS_ERR(newmnt)) {
cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
return newmnt;
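
The rewritten automount path drops the old option-string splicing in favour of the generic fs_context API: build a submount context, duplicate and adjust the parent's parsed options, then fc_mount(). The generic skeleton, with the cifs-specific context handling elided:

#include <linux/err.h>
#include <linux/fs_context.h>
#include <linux/mount.h>

/* Sketch of the fs_context submount flow (kernel context); only the
 * generic calls are shown, filesystem-specific setup is elided.
 */
static struct vfsmount *automount_sketch(struct path *path)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_submount(path->mnt->mnt_sb->s_type,
				     path->dentry);
	if (IS_ERR(fc))
		return ERR_CAST(fc);

	/* ... copy/adjust options from the parent superblock here ... */

	mnt = fc_mount(fc);		/* ERR_PTR() on failure */
	put_fs_context(fc);
	return mnt;
}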
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d86d78d5bfdc..332588e77c31 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -108,7 +108,7 @@ struct smb3_notify_info {
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
-#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
* Flags for going down operation
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 342717bf1dc2..6f3285f1dfee 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -189,7 +189,7 @@ init_cifs_spnego(void)
* spnego upcalls.
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
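
prepare_kernel_cred(NULL) historically meant "a copy of init's credentials"; this series spells that out by passing &init_task, ahead of the NULL special case being removed. Sketched usage (kernel context):

#include <linux/cred.h>
#include <linux/sched/task.h>

/* Sketch: build kernel credentials modelled on init_task explicitly
 * rather than via the NULL shorthand. Returns NULL on allocation
 * failure, like prepare_kernel_cred() itself.
 */
static struct cred *make_kernel_cred(void)
{
	return prepare_kernel_cred(&init_task);
}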
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fa480d62f313..bbf58c2439da 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -13,6 +13,9 @@
#include <linux/string.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
+#include <uapi/linux/posix_acl.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <keys/user-type.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -20,6 +23,8 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "fs_context.h"
+#include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
@@ -465,7 +470,7 @@ init_cifs_idmap(void)
* this is used to prevent malicious redirections from being installed
* with add_key().
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
@@ -1668,3 +1673,137 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
kfree(pntsd);
return rc;
}
+
+struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ struct posix_acl *acl = NULL;
+ ssize_t rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return ERR_CAST(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ acl = ERR_CAST(full_path);
+ goto out;
+ }
+
+ /* return alt name if available as pseudo attr */
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_get_acl(xid, pTcon, full_path, &acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+ if (rc < 0) {
+ if (rc == -EINVAL)
+ acl = ERR_PTR(-EOPNOTSUPP);
+ else
+ acl = ERR_PTR(rc);
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return acl;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+int cifs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+#if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX)
+ int rc = -EOPNOTSUPP;
+ unsigned int xid;
+ struct super_block *sb = dentry->d_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct tcon_link *tlink;
+ struct cifs_tcon *pTcon;
+ const char *full_path;
+ void *page;
+
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ pTcon = tlink_tcon(tlink);
+
+ xid = get_xid();
+ page = alloc_dentry_path();
+
+ full_path = build_path_from_dentry(dentry, page);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ goto out;
+ }
+
+ if (!acl)
+ goto out;
+
+ /* return dos attributes as pseudo xattr */
+ /* return alt name if available as pseudo attr */
+
+ /* if proc/fs/cifs/streamstoxattr is set then
+ search the server for EAs or streams to
+ return as xattrs */
+ if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) {
+ cifs_dbg(FYI, "size of EA value too large\n");
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_ACCESS,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (sb->s_flags & SB_POSIXACL)
+ rc = cifs_do_set_acl(xid, pTcon, full_path, acl,
+ ACL_TYPE_DEFAULT,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ break;
+ }
+
+out:
+ free_dentry_path(page);
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return rc;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5db73c0f792a..cbc18b4a9cb2 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -278,6 +278,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
* ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
* unicode length of a netbios domain name
*/
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.len = size + 2 * dlen;
ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 712a43161448..10e00c624922 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
- /* Only display max_credits if it was overridden on mount */
+ /* Only display the following if overridden on mount */
if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+ if (tcon->ses->server->tcp_nodelay)
+ seq_puts(s, ",tcpnodelay");
+ if (tcon->ses->server->noautotune)
+ seq_puts(s, ",noautotune");
+ if (tcon->ses->server->noblocksnd)
+ seq_puts(s, ",noblocksend");
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
@@ -890,12 +896,6 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
goto out;
}
- rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL);
- if (rc) {
- root = ERR_PTR(rc);
- goto out;
- }
-
rc = cifs_setup_cifs_sb(cifs_sb);
if (rc) {
root = ERR_PTR(rc);
@@ -1133,6 +1133,8 @@ const struct inode_operations cifs_dir_inode_ops = {
.symlink = cifs_symlink,
.mknod = cifs_mknod,
.listxattr = cifs_listxattr,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
const struct inode_operations cifs_file_inode_ops = {
@@ -1141,6 +1143,8 @@ const struct inode_operations cifs_file_inode_ops = {
.permission = cifs_permission,
.listxattr = cifs_listxattr,
.fiemap = cifs_fiemap,
+ .get_acl = cifs_get_acl,
+ .set_acl = cifs_set_acl,
};
const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 388b745a978e..63a0ac2b9355 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
-extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
-extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
+extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 40
-#define CIFS_VERSION "2.40"
+#define SMB3_PRODUCT_BUILD 41
+#define CIFS_VERSION "2.41"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1420acf987f0..cfdd5bf701a1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -13,6 +13,8 @@
#include <linux/in6.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
@@ -21,7 +23,6 @@
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
-#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
@@ -106,6 +107,8 @@
#define CIFS_MAX_WORKSTATION_LEN (__NEW_UTS_LEN + 1) /* reasonable max for client */
+#define CIFS_DFS_ROOT_SES(ses) ((ses)->dfs_root_ses ?: (ses))
+
/*
* CIFS vfs client Status information (based on what we know.)
*/
@@ -737,8 +740,6 @@ struct TCP_Server_Info {
bool use_swn_dstaddr;
struct sockaddr_storage swn_dstaddr;
#endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
- bool is_dfs_conn; /* if a dfs connection */
struct mutex refpath_lock; /* protects leaf_fullpath */
/*
* Canonical DFS full paths that were used to chase referrals in mount and reconnect.
@@ -752,7 +753,6 @@ struct TCP_Server_Info {
* format: \\HOST\SHARE\[OPTIONAL PATH]
*/
char *origin_fullpath, *leaf_fullpath, *current_fullpath;
-#endif
};
static inline bool is_smb1(struct TCP_Server_Info *server)
@@ -785,6 +785,7 @@ static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
unsigned int num;
+
spin_lock(&server->req_lock);
num = server->in_flight;
spin_unlock(&server->req_lock);
@@ -795,6 +796,7 @@ static inline bool
has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
+
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
@@ -1025,7 +1027,7 @@ struct cifs_ses {
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */
- unsigned overrideSecFlg; /* if non-zero override global sec flags */
+ unsigned int overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
@@ -1099,6 +1101,7 @@ struct cifs_ses {
*/
unsigned long chans_need_reconnect;
/* ========= end: protected by chan_lock ======== */
+ struct cifs_ses *dfs_root_ses;
};
static inline bool
@@ -1381,7 +1384,7 @@ struct cifsFileInfo {
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
struct list_head rlist; /* reconnect list */
- /* BB add lock scope info here if needed */ ;
+ /* BB add lock scope info here if needed */
/* lock scope id (0 if none) */
struct dentry *dentry;
struct tcon_link *tlink;
@@ -1757,6 +1760,18 @@ struct file_list {
struct cifsFileInfo *cfile;
};
+struct cifs_mount_ctx {
+ struct cifs_sb_info *cifs_sb;
+ struct smb3_fs_context *fs_ctx;
+ unsigned int xid;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cifs_ses *root_ses;
+ uuid_t mount_id;
+ char *origin_fullpath, *leaf_fullpath;
+};
+
static inline void free_dfs_info_param(struct dfs_info3_param *param)
{
if (param) {
@@ -1769,6 +1784,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
int number_of_items)
{
int i;
+
if ((number_of_items == 0) || (param == NULL))
return;
for (i = 0; i < number_of_items; i++) {
@@ -2137,4 +2153,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const
dst->FileNameLength = src->FileNameLength;
}
+static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
+ int num_rqst,
+ const u8 *sig)
+{
+ unsigned int len, skip;
+ unsigned int nents = 0;
+ unsigned long addr;
+ int i, j;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
+ for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
+ for (j = 0; j < rqst[i].rq_nvec; j++) {
+ struct kvec *iov = &rqst[i].rq_iov[j];
+
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ len = iov->iov_len - skip;
+ nents += DIV_ROUND_UP(offset_in_page(addr) + len,
+ PAGE_SIZE);
+ } else {
+ nents++;
+ }
+ }
+ nents += rqst[i].rq_npages;
+ }
+ nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
+ return nents;
+}
+
+/* We can not use the normal sg_set_buf() as we will sometimes pass a
+ * stack object as buf.
+ */
+static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
+ const void *buf,
+ unsigned int buflen)
+{
+ unsigned long addr = (unsigned long)buf;
+ unsigned int off = offset_in_page(addr);
+
+ addr &= PAGE_MASK;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ do {
+ unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);
+
+ sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
+
+ off = 0;
+ addr += PAGE_SIZE;
+ buflen -= len;
+ } while (buflen);
+ } else {
+ sg_set_page(sg++, virt_to_page(addr), buflen, off);
+ }
+ return sg;
+}
+
#endif /* _CIFS_GLOB_H */
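
cifs_get_num_sgs() and cifs_sg_set_buf() are meant to pair up: the former counts how many scatterlist entries a set of requests needs (vmalloc'ed buffers may span several pages, so they get one entry per page), while the latter populates entries for stack and vmalloc addresses that plain sg_set_buf() cannot handle. A hedged sketch of a caller (hypothetical function; the real user lives in the SMB2 encryption path and also handles the 20-byte transform header and request pages):

	#include <linux/scatterlist.h>
	#include <linux/slab.h>

	/* Hypothetical sketch: size the table, then chain one buffer and the
	 * signature. Error handling abbreviated.
	 */
	static struct scatterlist *example_init_sg(struct smb_rqst *rqst,
						   int num_rqst, u8 *sig)
	{
		unsigned int num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
		struct scatterlist *sg, *pos;

		sg = kmalloc_array(num_sgs, sizeof(*sg), GFP_KERNEL);
		if (!sg)
			return NULL;
		sg_init_table(sg, num_sgs);

		/* Works for slab, stack and vmalloc memory alike. */
		pos = cifs_sg_set_buf(sg, rqst[0].rq_iov[0].iov_base,
				      rqst[0].rq_iov[0].iov_len);
		/* ... remaining iovs and pages would be chained here ... */
		cifs_sg_set_buf(pos, sig, SMB2_SIGNATURE_SIZE);
		return sg;
	}
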
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index d1abaeea974a..623caece2b10 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1429,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req {
__u8 WatchTree; /* 1 = Monitor subdirectories */
__u8 Reserved2;
__le16 ByteCount;
-/* __u8 Pad[3];*/
+/* __u8 Pad[3];*/
/* __u8 Data[1];*/
} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
@@ -1752,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
struct smb_t2_qfi_req {
@@ -1768,8 +1767,7 @@ struct smb_t2_qfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
/*
@@ -2146,13 +2144,11 @@ typedef struct {
#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based
calls including posix open
and posix unlink */
-#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up
- to 0xFFFF00 */
+#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00) */
#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */
-#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
- QFS PROXY call */
+#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
LockingX instead of posix locking call on unix sess (and we do not expect
@@ -2368,8 +2364,7 @@ typedef struct {
struct file_allocation_info {
__le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
-} __attribute__((packed)); /* size used on disk, for level 0x103 for set,
- 0x105 for query */
+} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */
struct file_end_of_file_info {
__le64 FileSize; /* offset to end of file */
@@ -2409,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
__le16 access_entry_count; /* access ACL - count of entries */
__le16 default_entry_count; /* default ACL - count of entries */
struct cifs_posix_ace ace_array[];
- /* followed by
- struct cifs_posix_ace default_ace_arraay[] */
+ /* followed by struct cifs_posix_ace default_ace_array[] */
} __attribute__((packed)); /* level 0x204 */
/* types of access control entries already defined in posix_acl.h */
@@ -2429,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
/* end of POSIX ACL definitions */
/* POSIX Open Flags */
-#define SMB_O_RDONLY 0x1
-#define SMB_O_WRONLY 0x2
-#define SMB_O_RDWR 0x4
-#define SMB_O_CREAT 0x10
-#define SMB_O_EXCL 0x20
-#define SMB_O_TRUNC 0x40
-#define SMB_O_APPEND 0x80
-#define SMB_O_SYNC 0x100
-#define SMB_O_DIRECTORY 0x200
-#define SMB_O_NOFOLLOW 0x400
-#define SMB_O_DIRECT 0x800
+#define SMB_O_RDONLY 0x1
+#define SMB_O_WRONLY 0x2
+#define SMB_O_RDWR 0x4
+#define SMB_O_CREAT 0x10
+#define SMB_O_EXCL 0x20
+#define SMB_O_TRUNC 0x40
+#define SMB_O_APPEND 0x80
+#define SMB_O_SYNC 0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 0x400
+#define SMB_O_DIRECT 0x800
typedef struct {
__le32 OpenFlags; /* same as NT CreateX */
@@ -2716,15 +2710,13 @@ typedef struct file_xattr_info {
__u32 xattr_value_len;
char xattr_name[];
/* followed by xattr_value[xattr_value_len], no pad */
-} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
- level 0x205 */
+} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */
/* flags for lsattr and chflags commands removed are in uapi/linux/fs.h */
typedef struct file_chattr_info {
__le64 mask; /* list of all possible attribute bits */
__le64 mode; /* list of actual attribute bits on this inode */
-} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes
- (chattr, chflags) level 0x206 */
-#endif /* POSIX */
+} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */
+#endif /* POSIX */
#endif /* _CIFSPDU_H */
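
The realigned SMB_O_* constants are wire-format open flags and deliberately do not share values with the local O_* flags, so callers must translate before sending a POSIX open. A hypothetical sketch of that mapping (the client's real conversion helper covers more flags):

	#include <linux/fcntl.h>

	/* Hypothetical helper: translate a few local open flags to wire flags. */
	static __u32 example_posix_flags_to_smb(int flags)
	{
		__u32 wire = 0;

		if ((flags & O_ACCMODE) == O_RDONLY)
			wire |= SMB_O_RDONLY;
		else if ((flags & O_ACCMODE) == O_WRONLY)
			wire |= SMB_O_WRONLY;
		else
			wire |= SMB_O_RDWR;

		if (flags & O_CREAT)
			wire |= SMB_O_CREAT;
		if (flags & O_EXCL)
			wire |= SMB_O_EXCL;
		if (flags & O_TRUNC)
			wire |= SMB_O_TRUNC;
		return wire;
	}
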
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 83e83d8beabb..1207b39686fb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -57,6 +57,9 @@ extern void exit_cifs_idmap(void);
extern int init_cifs_spnego(void);
extern void exit_cifs_spnego(void);
extern const char *build_path_from_dentry(struct dentry *, void *);
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix);
extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry,
void *page, bool prefix);
static inline void *alloc_dentry_path(void)
@@ -75,9 +78,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
struct cifs_tcon *tcon,
int add_treename);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
-extern char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath, const struct dfs_info3_param *ref,
- char **devname);
+char *cifs_build_devname(char *nodename, const char *prepath);
extern void delete_mid(struct mid_q_entry *mid);
extern void release_mid(struct mid_q_entry *mid);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
@@ -124,7 +125,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec * /* resp vec */);
extern int SendReceiveBlockingLock(const unsigned int xid,
struct cifs_tcon *ptcon,
- struct smb_hdr *in_buf ,
+ struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
void
@@ -224,6 +225,10 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *, u32);
extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
const struct cifs_fid *, u32 *, u32);
+extern struct posix_acl *cifs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
+extern int cifs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl, int type);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
extern unsigned int setup_authusers_ACE(struct cifs_ace *pace);
@@ -240,6 +245,10 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
unsigned int page_offset,
unsigned int to_read);
extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb);
+void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
+int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
+int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
+int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
extern int cifs_match_super(struct super_block *, void *);
extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
extern void cifs_umount(struct cifs_sb_info *);
@@ -537,14 +546,14 @@ extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
struct cifs_ntsd *, __u32, int);
-extern int CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
-extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap_special_chars);
+extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName,
+ struct posix_acl **acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
+extern int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName,
+ const struct posix_acl *acl, const int acl_type,
+ const struct nls_table *nls_codepage, int remap);
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
#endif /* CIFS_ALLOW_INSECURE_LEGACY */
@@ -557,9 +566,6 @@ extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
-extern int
-cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname);
-
extern struct TCP_Server_Info *
cifs_find_tcp_session(struct smb3_fs_context *ctx);
@@ -600,8 +606,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
void cifs_free_hash(struct shash_desc **sdesc);
-extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset);
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1724066c1536..23f10e0d6e7e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2914,32 +2914,57 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
#ifdef CONFIG_CIFS_POSIX
-/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
-static void cifs_convert_ace(struct posix_acl_xattr_entry *ace,
- struct cifs_posix_ace *cifs_ace)
+#ifdef CONFIG_FS_POSIX_ACL
+/**
+ * cifs_init_posix_acl - convert ACL from cifs to POSIX ACL format
+ * @ace: POSIX ACL entry to store converted ACL into
+ * @cifs_ace: ACL in cifs format
+ *
+ * Convert an Access Control Entry from wire format to local POSIX xattr
+ * format.
+ *
+ * Note that the @cifs_uid member is used to store both {g,u}id_t.
+ */
+static void cifs_init_posix_acl(struct posix_acl_entry *ace,
+ struct cifs_posix_ace *cifs_ace)
{
/* u8 cifs fields do not need le conversion */
- ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
- ace->e_tag = cpu_to_le16(cifs_ace->cifs_e_tag);
- ace->e_id = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ ace->e_perm = cifs_ace->cifs_e_perm;
+ ace->e_tag = cifs_ace->cifs_e_tag;
+ switch (ace->e_tag) {
+ case ACL_USER:
+ ace->e_uid = make_kuid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ case ACL_GROUP:
+ ace->e_gid = make_kgid(&init_user_ns,
+ le64_to_cpu(cifs_ace->cifs_uid));
+ break;
+ }
return;
}
-/* Convert ACL from CIFS POSIX wire format to local Linux POSIX ACL xattr */
-static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
- const int acl_type, const int size_of_data_area)
+/**
+ * cifs_to_posix_acl - copy cifs ACL format to POSIX ACL format
+ * @acl: ACLs returned in POSIX ACL format
+ * @src: ACLs in cifs format
+ * @acl_type: type of POSIX ACL requested
+ * @size_of_data_area: size of SMB we got
+ *
+ * This function converts ACLs from cifs format to POSIX ACL format.
+ * If @acl is NULL then the size of the buffer required to store POSIX ACLs in
+ * their uapi format is returned.
+ */
+static int cifs_to_posix_acl(struct posix_acl **acl, char *src,
+ const int acl_type, const int size_of_data_area)
{
int size = 0;
- int i;
__u16 count;
struct cifs_posix_ace *pACE;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
- struct posix_acl_xattr_header *local_acl = (void *)trgt;
+ struct posix_acl *kacl = NULL;
+ struct posix_acl_entry *pa, *pe;
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
@@ -2959,7 +2984,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
count = le16_to_cpu(cifs_acl->access_entry_count);
size = sizeof(struct cifs_posix_acl);
size += sizeof(struct cifs_posix_ace) * count;
-/* skip past access ACEs to get to default ACEs */
+ /* skip past access ACEs to get to default ACEs */
pACE = &cifs_acl->ace_array[count];
count = le16_to_cpu(cifs_acl->default_entry_count);
size += sizeof(struct cifs_posix_ace) * count;
@@ -2971,62 +2996,75 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
return -EINVAL;
}
- size = posix_acl_xattr_size(count);
- if ((buflen == 0) || (local_acl == NULL)) {
- /* used to query ACL EA size */
- } else if (size > buflen) {
- return -ERANGE;
- } else /* buffer big enough */ {
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
-
- local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- for (i = 0; i < count ; i++) {
- cifs_convert_ace(&ace[i], pACE);
- pACE++;
- }
+ /* Allocate number of POSIX ACLs to store in VFS format. */
+ kacl = posix_acl_alloc(count, GFP_NOFS);
+ if (!kacl)
+ return -ENOMEM;
+
+ FOREACH_ACL_ENTRY(pa, kacl, pe) {
+ cifs_init_posix_acl(pa, pACE);
+ pACE++;
}
- return size;
+
+ *acl = kacl;
+ return 0;
}
-static void convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
- const struct posix_acl_xattr_entry *local_ace)
+/**
+ * cifs_init_ace - convert ACL entry from POSIX ACL to cifs format
+ * @cifs_ace: the cifs ACL entry to store into
+ * @local_ace: the POSIX ACL entry to convert
+ */
+static void cifs_init_ace(struct cifs_posix_ace *cifs_ace,
+ const struct posix_acl_entry *local_ace)
{
- cifs_ace->cifs_e_perm = le16_to_cpu(local_ace->e_perm);
- cifs_ace->cifs_e_tag = le16_to_cpu(local_ace->e_tag);
- /* BB is there a better way to handle the large uid? */
- if (local_ace->e_id == cpu_to_le32(-1)) {
- /* Probably no need to le convert -1 on any arch but can not hurt */
+ cifs_ace->cifs_e_perm = local_ace->e_perm;
+ cifs_ace->cifs_e_tag = local_ace->e_tag;
+
+ switch (local_ace->e_tag) {
+ case ACL_USER:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kuid(&init_user_ns, local_ace->e_uid));
+ break;
+ case ACL_GROUP:
+ cifs_ace->cifs_uid =
+ cpu_to_le64(from_kgid(&init_user_ns, local_ace->e_gid));
+ break;
+ default:
cifs_ace->cifs_uid = cpu_to_le64(-1);
- } else
- cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-/*
- cifs_dbg(FYI, "perm %d tag %d id %d\n",
- ace->e_perm, ace->e_tag, ace->e_id);
-*/
+ }
}
-/* Convert ACL from local Linux POSIX xattr to CIFS POSIX ACL wire format */
-static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
- const int buflen, const int acl_type)
+/**
+ * posix_acl_to_cifs - convert ACLs from POSIX ACL to cifs format
+ * @parm_data: ACLs in cifs format to convert to
+ * @acl: ACLs in POSIX ACL format to convert from
+ * @acl_type: the type of POSIX ACLs stored in @acl
+ *
+ * Return: the number of cifs ACL entries after conversion
+ */
+static __u16 posix_acl_to_cifs(char *parm_data, const struct posix_acl *acl,
+ const int acl_type)
{
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
- struct posix_acl_xattr_header *local_acl = (void *)pACL;
- struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
+ const struct posix_acl_entry *pa, *pe;
int count;
- int i;
+ int i = 0;
- if ((buflen == 0) || (pACL == NULL) || (cifs_acl == NULL))
+ if ((acl == NULL) || (cifs_acl == NULL))
return 0;
- count = posix_acl_xattr_count((size_t)buflen);
- cifs_dbg(FYI, "setting acl with %d entries from buf of length %d and version of %d\n",
- count, buflen, le32_to_cpu(local_acl->a_version));
- if (le32_to_cpu(local_acl->a_version) != 2) {
- cifs_dbg(FYI, "unknown POSIX ACL version %d\n",
- le32_to_cpu(local_acl->a_version));
- return 0;
- }
+ count = acl->a_count;
+ cifs_dbg(FYI, "setting acl with %d entries\n", count);
+
+ /*
+ * Note that the uapi POSIX ACL version is verified by the VFS and is
+ * independent of the cifs ACL version. Changing the POSIX ACL version
+ * is a uapi change and if it's changed we will pass down the POSIX ACL
+ * version in struct posix_acl from the VFS. For now there's really
+ * only one that all filesystems know how to deal with.
+ */
cifs_acl->version = cpu_to_le16(1);
if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
@@ -3038,8 +3076,9 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
- for (i = 0; i < count; i++)
- convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]);
+ FOREACH_ACL_ENTRY(pa, acl, pe) {
+ cifs_init_ace(&cifs_acl->ace_array[i++], pa);
+ }
if (rc == 0) {
rc = (__u16)(count * sizeof(struct cifs_posix_ace));
rc += sizeof(struct cifs_posix_acl);
@@ -3048,11 +3087,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return rc;
}
-int
-CIFSSMBGetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName,
- char *acl_inf, const int buflen, const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
/* SMB_QUERY_POSIX_ACL */
TRANSACTION2_QPI_REQ *pSMB = NULL;
@@ -3124,23 +3162,26 @@ queryAclRetry:
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
__u16 count = le16_to_cpu(pSMBr->t2.DataCount);
- rc = cifs_copy_posix_acl(acl_inf,
+ rc = cifs_to_posix_acl(acl,
(char *)&pSMBr->hdr.Protocol+data_offset,
- buflen, acl_type, count);
+ acl_type, count);
}
}
cifs_buf_release(pSMB);
+ /*
+	 * The else branch after SendReceive() doesn't return -EAGAIN, so if we
+	 * allocated @acl in cifs_to_posix_acl() we are guaranteed to return
+	 * here and not leak POSIX ACLs.
+ */
if (rc == -EAGAIN)
goto queryAclRetry;
return rc;
}
-int
-CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *fileName,
- const char *local_acl, const int buflen,
- const int acl_type,
- const struct nls_table *nls_codepage, int remap)
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
{
struct smb_com_transaction2_spi_req *pSMB = NULL;
struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -3181,7 +3222,7 @@ setAclRetry:
pSMB->ParameterOffset = cpu_to_le16(param_offset);
/* convert to on the wire format for POSIX ACL */
- data_count = ACL_to_cifs_posix(parm_data, local_acl, buflen, acl_type);
+ data_count = posix_acl_to_cifs(parm_data, acl, acl_type);
if (data_count == 0) {
rc = -EOPNOTSUPP;
@@ -3211,6 +3252,23 @@ setACLerrorExit:
goto setAclRetry;
return rc;
}
+#else
+int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, struct posix_acl **acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+
+int cifs_do_set_acl(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *fileName, const struct posix_acl *acl,
+ const int acl_type, const struct nls_table *nls_codepage,
+ int remap)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_FS_POSIX_ACL */
int
CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
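
Since cifs_do_get_acl() now hands back a refcounted struct posix_acl (allocated with posix_acl_alloc() in cifs_to_posix_acl()), callers own a reference and must drop it with posix_acl_release(). A hedged sketch of the calling convention (hypothetical caller; the real entry points are the new cifs_get_acl()/cifs_set_acl() declared in cifsproto.h):

	#include <linux/posix_acl.h>

	/* Hypothetical sketch: fetch, inspect and release a POSIX ACL. */
	static int example_query_acl(unsigned int xid, struct cifs_tcon *tcon,
				     const unsigned char *path,
				     const struct nls_table *nls, int remap)
	{
		struct posix_acl *acl = NULL;
		int rc;

		rc = cifs_do_get_acl(xid, tcon, path, &acl, ACL_TYPE_ACCESS,
				     nls, remap);
		if (rc)
			return rc; /* -EOPNOTSUPP without CONFIG_FS_POSIX_ACL */

		cifs_dbg(FYI, "got %u ACL entries\n", acl->a_count);
		posix_acl_release(acl); /* drop the posix_acl_alloc() reference */
		return 0;
	}
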
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9db9527c61cf..b2a04b4e89a5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -46,6 +46,7 @@
#include "smbdirect.h"
#include "dns_resolve.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs.h"
#include "dfs_cache.h"
#endif
#include "fs_context.h"
@@ -61,20 +62,6 @@ extern bool disable_legacy_dialects;
/* Drop the connection to not overload the server */
#define NUM_STATUS_IO_TIMEOUT 5
-struct mount_ctx {
- struct cifs_sb_info *cifs_sb;
- struct smb3_fs_context *fs_ctx;
- unsigned int xid;
- struct TCP_Server_Info *server;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- struct cifs_ses *root_ses;
- uuid_t mount_id;
- char *origin_fullpath, *leaf_fullpath;
-#endif
-};
-
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -90,7 +77,8 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
{
int rc;
int len;
- char *unc, *ipaddr = NULL;
+ char *unc;
+ struct sockaddr_storage ss;
time64_t expiry, now;
unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT;
@@ -110,7 +98,11 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
}
scnprintf(unc, len, "\\\\%s", server->hostname);
- rc = dns_resolve_server_name_to_ip(unc, &ipaddr, &expiry);
+ spin_lock(&server->srv_lock);
+ ss = server->dstaddr;
+ spin_unlock(&server->srv_lock);
+
+ rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry);
kfree(unc);
if (rc < 0) {
@@ -120,22 +112,13 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
}
spin_lock(&server->srv_lock);
- rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
- strlen(ipaddr));
+ memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr));
spin_unlock(&server->srv_lock);
- kfree(ipaddr);
- /* rc == 1 means success here */
- if (rc) {
- now = ktime_get_real_seconds();
- if (expiry && expiry > now)
- /*
- * To make sure we don't use the cached entry, retry 1s
- * after expiry.
- */
- ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
- }
- rc = !rc ? -1 : 0;
+ now = ktime_get_real_seconds();
+ if (expiry && expiry > now)
+		/* To make sure we don't use the cached entry, retry 1s after expiry */
+ ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
requeue_resolve:
cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n",
@@ -279,8 +262,10 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
tcon->need_reconnect = true;
tcon->status = TID_NEED_RECON;
}
- if (ses->tcon_ipc)
+ if (ses->tcon_ipc) {
ses->tcon_ipc->need_reconnect = true;
+ ses->tcon_ipc->status = TID_NEED_RECON;
+ }
next_session:
spin_unlock(&ses->chan_lock);
@@ -546,9 +531,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
} while (server->tcpStatus == CifsNeedReconnect);
- if (target_hint)
- dfs_cache_noreq_update_tgthint(refpath, target_hint);
-
+ dfs_cache_noreq_update_tgthint(refpath, target_hint);
dfs_cache_free_tgts(&tl);
/* Need to set up echo worker again once connection has been established */
@@ -563,16 +546,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
- /* If tcp session is not an dfs connection, then reconnect to last target server */
- spin_lock(&server->srv_lock);
- if (!server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
- return __cifs_reconnect(server, mark_smb_session);
- }
- spin_unlock(&server->srv_lock);
-
mutex_lock(&server->refpath_lock);
- if (!server->origin_fullpath || !server->leaf_fullpath) {
+ if (!server->leaf_fullpath) {
mutex_unlock(&server->refpath_lock);
return __cifs_reconnect(server, mark_smb_session);
}
@@ -759,7 +734,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
- iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -774,7 +749,7 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
* and cifs_readv_from_socket sets msg_control and msg_controllen
* so little to initialize in struct msghdr
*/
- iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
+ iov_iter_discard(&smb_msg.msg_iter, ITER_DEST, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -786,7 +761,7 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
- iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
@@ -1384,9 +1359,7 @@ match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
return port == *sport;
}
-static bool
-match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
- struct sockaddr *srcaddr)
+static bool match_server_address(struct TCP_Server_Info *server, struct sockaddr *addr)
{
switch (addr->sa_family) {
case AF_INET: {
@@ -1415,9 +1388,6 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
return false; /* don't expect to be here */
}
- if (!cifs_match_ipaddr(srcaddr, (struct sockaddr *)&server->srcaddr))
- return false;
-
return true;
}
@@ -1444,8 +1414,23 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return true;
}
+static bool dfs_src_pathname_equal(const char *s1, const char *s2)
+{
+ if (strlen(s1) != strlen(s2))
+ return false;
+ for (; *s1; s1++, s2++) {
+ if (*s1 == '/' || *s1 == '\\') {
+ if (*s2 != '/' && *s2 != '\\')
+ return false;
+ } else if (tolower(*s1) != tolower(*s2))
+ return false;
+ }
+ return true;
+}
+
/* this function must be called with srv_lock held */
-static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
+static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx,
+ bool dfs_super_cmp)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
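
dfs_src_pathname_equal() compares two DFS source paths case-insensitively and treats '/' and '\' as interchangeable separators, so the same share matches regardless of spelling convention. Expected results for illustration (not part of the patch):

	/* Both true: case and separator style are ignored. */
	dfs_src_pathname_equal("\\\\srv\\share\\dir", "//SRV/share/DIR");
	dfs_src_pathname_equal("//srv/share", "\\\\srv\\share");

	/* False: the lengths differ, so the paths cannot match. */
	dfs_src_pathname_equal("//srv/share", "//srv/share2");
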
@@ -1470,15 +1455,30 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
- if (strcasecmp(server->hostname, ctx->server_hostname))
- return 0;
-
- if (!match_address(server, addr,
- (struct sockaddr *)&ctx->srcaddr))
- return 0;
-
- if (!match_port(server, addr))
+ if (!cifs_match_ipaddr((struct sockaddr *)&ctx->srcaddr,
+ (struct sockaddr *)&server->srcaddr))
return 0;
+ /*
+	 * When matching DFS superblocks, we only check the original source pathname, as the
+	 * currently connected target might differ from the one parsed earlier by e.g.
+	 * mount.cifs(8).
+ */
+ if (dfs_super_cmp) {
+ if (!ctx->source || !server->origin_fullpath ||
+ !dfs_src_pathname_equal(server->origin_fullpath, ctx->source))
+ return 0;
+ } else {
+ /* Skip addr, hostname and port matching for DFS connections */
+ if (server->leaf_fullpath) {
+ if (!ctx->leaf_fullpath ||
+ strcasecmp(server->leaf_fullpath, ctx->leaf_fullpath))
+ return 0;
+ } else if (strcasecmp(server->hostname, ctx->server_hostname) ||
+ !match_server_address(server, addr) ||
+ !match_port(server, addr)) {
+ return 0;
+ }
+ }
if (!match_security(server, ctx))
return 0;
@@ -1506,23 +1506,11 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
spin_lock(&server->srv_lock);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- /*
- * DFS failover implementation in cifs_reconnect() requires unique tcp sessions for
- * DFS connections to do failover properly, so avoid sharing them with regular
- * shares or even links that may connect to same server but having completely
- * different failover targets.
- */
- if (server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
- continue;
- }
-#endif
/*
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) {
+ if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx, false)) {
spin_unlock(&server->srv_lock);
continue;
}
@@ -1617,6 +1605,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
goto out_err;
}
+ if (ctx->leaf_fullpath) {
+ tcp_ses->leaf_fullpath = kstrdup(ctx->leaf_fullpath, GFP_KERNEL);
+ if (!tcp_ses->leaf_fullpath) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ tcp_ses->current_fullpath = tcp_ses->leaf_fullpath;
+ }
+
if (ctx->nosharesock)
tcp_ses->nosharesock = true;
@@ -1765,6 +1762,7 @@ out_err:
if (CIFS_SERVER_IS_CHAN(tcp_ses))
cifs_put_tcp_session(tcp_ses->primary_server, false);
kfree(tcp_ses->hostname);
+ kfree(tcp_ses->leaf_fullpath);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -1871,6 +1869,9 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
+ spin_lock(&tcon->tc_lock);
+ tcon->status = TID_GOOD;
+ spin_unlock(&tcon->tc_lock);
ses->tcon_ipc = tcon;
out:
return rc;
@@ -2157,7 +2158,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)),
struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
- int rc = -ENOMEM;
+ int rc = 0;
unsigned int xid;
struct cifs_ses *ses;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
@@ -2206,6 +2207,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return ses;
}
+ rc = -ENOMEM;
+
cifs_dbg(FYI, "Existing smb sess not found\n");
ses = sesInfoAlloc();
if (ses == NULL)
@@ -2278,10 +2281,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- free_xid(xid);
-
cifs_setup_ipc(ses, ctx);
+ free_xid(xid);
+
return ses;
get_ses_fail:
@@ -2291,11 +2294,12 @@ get_ses_fail:
}
/* this function must be called with tc_lock held */
-static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
+static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx, bool dfs_super_cmp)
{
if (tcon->status == TID_EXITING)
return 0;
- if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
+ /* Skip UNC validation when matching DFS superblocks */
+ if (!dfs_super_cmp && strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
return 0;
if (tcon->seal != ctx->seal)
return 0;
@@ -2318,7 +2322,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->tc_lock);
- if (!match_tcon(tcon, ctx)) {
+ if (!match_tcon(tcon, ctx, false)) {
spin_unlock(&tcon->tc_lock);
continue;
}
@@ -2600,12 +2604,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ tcon->status = TID_GOOD;
- /* schedule query interfaces poll */
INIT_DELAYED_WORK(&tcon->query_interfaces,
smb2_query_server_interfaces);
- queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
- (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ if (ses->server->dialect >= SMB30_PROT_ID &&
+ (ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ /* schedule query interfaces poll */
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ }
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
@@ -2712,6 +2720,7 @@ cifs_match_super(struct super_block *sb, void *data)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct tcon_link *tlink;
+ bool dfs_super_cmp;
int rc = 0;
spin_lock(&cifs_tcp_ses_lock);
@@ -2726,14 +2735,16 @@ cifs_match_super(struct super_block *sb, void *data)
ses = tcon->ses;
tcp_srv = ses->server;
+ dfs_super_cmp = IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && tcp_srv->origin_fullpath;
+
ctx = mnt_data->ctx;
spin_lock(&tcp_srv->srv_lock);
spin_lock(&ses->ses_lock);
spin_lock(&tcon->tc_lock);
- if (!match_server(tcp_srv, ctx) ||
+ if (!match_server(tcp_srv, ctx, dfs_super_cmp) ||
!match_session(ses, ctx) ||
- !match_tcon(tcon, ctx) ||
+ !match_tcon(tcon, ctx, dfs_super_cmp) ||
!match_prepath(sb, mnt_data)) {
rc = 0;
goto out;
@@ -2944,6 +2955,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket);
else
@@ -3190,7 +3202,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
}
/* Release all succeeded connections */
-static inline void mount_put_conns(struct mount_ctx *mnt_ctx)
+void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx)
{
int rc = 0;
@@ -3204,19 +3216,22 @@ static inline void mount_put_conns(struct mount_ctx *mnt_ctx)
free_xid(mnt_ctx->xid);
}
-/* Get connections for tcp, ses and tcon */
-static int mount_get_conns(struct mount_ctx *mnt_ctx)
+int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx)
{
- int rc = 0;
struct TCP_Server_Info *server = NULL;
+ struct smb3_fs_context *ctx;
struct cifs_ses *ses = NULL;
- struct cifs_tcon *tcon = NULL;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
unsigned int xid;
+ int rc = 0;
xid = get_xid();
+ if (WARN_ON_ONCE(!mnt_ctx || !mnt_ctx->fs_ctx)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ ctx = mnt_ctx->fs_ctx;
+
/* get a reference to a tcp session */
server = cifs_get_tcp_session(ctx, NULL);
if (IS_ERR(server)) {
@@ -3237,11 +3252,36 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
cifs_server_dbg(VFS, "persistent handles not supported by server\n");
rc = -EOPNOTSUPP;
+ }
+
+out:
+ mnt_ctx->xid = xid;
+ mnt_ctx->server = server;
+ mnt_ctx->ses = ses;
+ mnt_ctx->tcon = NULL;
+
+ return rc;
+}
+
+int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
+{
+ struct TCP_Server_Info *server;
+ struct cifs_sb_info *cifs_sb;
+ struct smb3_fs_context *ctx;
+ struct cifs_tcon *tcon = NULL;
+ int rc = 0;
+
+ if (WARN_ON_ONCE(!mnt_ctx || !mnt_ctx->server || !mnt_ctx->ses || !mnt_ctx->fs_ctx ||
+ !mnt_ctx->cifs_sb)) {
+ rc = -EINVAL;
goto out;
}
+ server = mnt_ctx->server;
+ ctx = mnt_ctx->fs_ctx;
+ cifs_sb = mnt_ctx->cifs_sb;
/* search for existing tcon to this server share */
- tcon = cifs_get_tcon(ses, ctx);
+ tcon = cifs_get_tcon(mnt_ctx->ses, ctx);
if (IS_ERR(tcon)) {
rc = PTR_ERR(tcon);
tcon = NULL;
@@ -3259,7 +3299,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
* reset of caps checks mount to see if unix extensions disabled
* for just this mount.
*/
- reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx);
+ reset_cifs_unix_caps(mnt_ctx->xid, tcon, cifs_sb, ctx);
spin_lock(&tcon->ses->server->srv_lock);
if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
(le64_to_cpu(tcon->fsUnixInfo.Capability) &
@@ -3275,7 +3315,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
/* do not care if a following call succeed - informational */
if (!tcon->pipe && server->ops->qfs_tcon) {
- server->ops->qfs_tcon(xid, tcon, cifs_sb);
+ server->ops->qfs_tcon(mnt_ctx->xid, tcon, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
cpu_to_le32(FILE_READ_ONLY_DEVICE))
@@ -3308,11 +3348,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
cifs_fscache_get_super_cookie(tcon);
out:
- mnt_ctx->server = server;
- mnt_ctx->ses = ses;
mnt_ctx->tcon = tcon;
- mnt_ctx->xid = xid;
-
return rc;
}
@@ -3342,146 +3378,6 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
return 0;
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
-/* Get unique dfs connections */
-static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx)
-{
- int rc;
-
- mnt_ctx->fs_ctx->nosharesock = true;
- rc = mount_get_conns(mnt_ctx);
- if (mnt_ctx->server) {
- cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
- spin_lock(&mnt_ctx->server->srv_lock);
- mnt_ctx->server->is_dfs_conn = true;
- spin_unlock(&mnt_ctx->server->srv_lock);
- }
- return rc;
-}
-
-/*
- * cifs_build_path_to_root returns full path to root when we do not have an
- * existing connection (tcon)
- */
-static char *
-build_unc_path_to_root(const struct smb3_fs_context *ctx,
- const struct cifs_sb_info *cifs_sb, bool useppath)
-{
- char *full_path, *pos;
- unsigned int pplen = useppath && ctx->prepath ?
- strlen(ctx->prepath) + 1 : 0;
- unsigned int unc_len = strnlen(ctx->UNC, MAX_TREE_SIZE + 1);
-
- if (unc_len > MAX_TREE_SIZE)
- return ERR_PTR(-EINVAL);
-
- full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
- if (full_path == NULL)
- return ERR_PTR(-ENOMEM);
-
- memcpy(full_path, ctx->UNC, unc_len);
- pos = full_path + unc_len;
-
- if (pplen) {
- *pos = CIFS_DIR_SEP(cifs_sb);
- memcpy(pos + 1, ctx->prepath, pplen);
- pos += pplen;
- }
-
- *pos = '\0'; /* add trailing null */
- convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
- cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
- return full_path;
-}
-
-/*
- * expand_dfs_referral - Update cifs_sb from dfs referral path
- *
- * cifs_sb->ctx->mount_options will be (re-)allocated to a string containing updated options for the
- * submount. Otherwise it will be left untouched.
- */
-static int expand_dfs_referral(struct mount_ctx *mnt_ctx, const char *full_path,
- struct dfs_info3_param *referral)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *fake_devname = NULL, *mdata = NULL;
-
- mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, referral,
- &fake_devname);
- if (IS_ERR(mdata)) {
- rc = PTR_ERR(mdata);
- mdata = NULL;
- } else {
- /*
- * We can not clear out the whole structure since we no longer have an explicit
- * function to parse a mount-string. Instead we need to clear out the individual
- * fields that are no longer valid.
- */
- kfree(ctx->prepath);
- ctx->prepath = NULL;
- rc = cifs_setup_volume_info(ctx, mdata, fake_devname);
- }
- kfree(fake_devname);
- kfree(cifs_sb->ctx->mount_options);
- cifs_sb->ctx->mount_options = mdata;
-
- return rc;
-}
-#endif
-
-/* TODO: all callers to this are broken. We are not parsing mount_options here
- * we should pass a clone of the original context?
- */
-int
-cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname)
-{
- int rc;
-
- if (devname) {
- cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname);
- rc = smb3_parse_devname(devname, ctx);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc);
- return rc;
- }
- }
-
- if (mntopts) {
- char *ip;
-
- rc = smb3_parse_opt(mntopts, "ip", &ip);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc);
- return rc;
- }
-
- rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip));
- kfree(ip);
- if (!rc) {
- cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__);
- return -EINVAL;
- }
- }
-
- if (ctx->nullauth) {
- cifs_dbg(FYI, "Anonymous login\n");
- kfree(ctx->username);
- ctx->username = NULL;
- } else if (ctx->username) {
- /* BB fixme parse for domain name here */
- cifs_dbg(FYI, "Username: %s\n", ctx->username);
- } else {
- cifs_dbg(VFS, "No username specified\n");
- /* In userspace mount helper we can get user name from alternate
- locations such as env variables and files on disk */
- return -EINVAL;
- }
-
- return 0;
-}
-
static int
cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
unsigned int xid,
@@ -3534,7 +3430,7 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
*
* Return -EREMOTE if it is, otherwise 0 or -errno.
*/
-static int is_path_remote(struct mount_ctx *mnt_ctx)
+int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx)
{
int rc;
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
@@ -3543,9 +3439,6 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
struct cifs_tcon *tcon = mnt_ctx->tcon;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
char *full_path;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS;
-#endif
if (!server->ops->is_path_accessible)
return -EOPNOTSUPP;
@@ -3562,19 +3455,6 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
full_path);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (nodfs) {
- if (rc == -EREMOTE)
- rc = -EOPNOTSUPP;
- goto out;
- }
-
- /* path *might* exist with non-ASCII characters in DFS root
- * try again with full path (only if nodfs is not set) */
- if (rc == -ENOENT && is_tcon_dfs(tcon))
- rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb,
- full_path);
-#endif
if (rc != 0 && rc != -EREMOTE)
goto out;
@@ -3594,251 +3474,19 @@ out:
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static void set_root_ses(struct mount_ctx *mnt_ctx)
-{
- if (mnt_ctx->ses) {
- spin_lock(&cifs_tcp_ses_lock);
- mnt_ctx->ses->ses_count++;
- spin_unlock(&cifs_tcp_ses_lock);
- dfs_cache_add_refsrv_session(&mnt_ctx->mount_id, mnt_ctx->ses);
- }
- mnt_ctx->root_ses = mnt_ctx->ses;
-}
-
-static int is_dfs_mount(struct mount_ctx *mnt_ctx, bool *isdfs, struct dfs_cache_tgt_list *root_tl)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
-
- *isdfs = true;
-
- rc = mount_get_conns(mnt_ctx);
- /*
- * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
- * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
- *
- * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
- * to respond with PATH_NOT_COVERED to requests that include the prefix.
- */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
- dfs_cache_find(mnt_ctx->xid, mnt_ctx->ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
- ctx->UNC + 1, NULL, root_tl)) {
- if (rc)
- return rc;
- /* Check if it is fully accessible and then mount it */
- rc = is_path_remote(mnt_ctx);
- if (!rc)
- *isdfs = false;
- else if (rc != -EREMOTE)
- return rc;
- }
- return 0;
-}
-
-static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path,
- const char *ref_path, struct dfs_cache_tgt_iterator *tit)
-{
- int rc;
- struct dfs_info3_param ref = {};
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- char *oldmnt = cifs_sb->ctx->mount_options;
-
- cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path,
- dfs_cache_get_tgt_name(tit));
-
- rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref);
- if (rc)
- goto out;
-
- rc = expand_dfs_referral(mnt_ctx, full_path, &ref);
- if (rc)
- goto out;
-
- /* Connect to new target only if we were redirected (e.g. mount options changed) */
- if (oldmnt != cifs_sb->ctx->mount_options) {
- mount_put_conns(mnt_ctx);
- rc = mount_get_dfs_conns(mnt_ctx);
- }
- if (!rc) {
- if (cifs_is_referral_server(mnt_ctx->tcon, &ref))
- set_root_ses(mnt_ctx);
- rc = dfs_cache_update_tgthint(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), ref_path, tit);
- }
-
-out:
- free_dfs_info_param(&ref);
- return rc;
-}
-
-static int connect_dfs_root(struct mount_ctx *mnt_ctx, struct dfs_cache_tgt_list *root_tl)
-{
- int rc;
- char *full_path;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct dfs_cache_tgt_iterator *tit;
-
- /* Put initial connections as they might be shared with other mounts. We need unique dfs
- * connections per mount to properly failover, so mount_get_dfs_conns() must be used from
- * now on.
- */
- mount_put_conns(mnt_ctx);
- mount_get_dfs_conns(mnt_ctx);
- set_root_ses(mnt_ctx);
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- mnt_ctx->origin_fullpath = dfs_cache_canonical_path(ctx->UNC, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (IS_ERR(mnt_ctx->origin_fullpath)) {
- rc = PTR_ERR(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = NULL;
- goto out;
- }
-
- /* Try all dfs root targets */
- for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(root_tl);
- tit; tit = dfs_cache_get_next_tgt(root_tl, tit)) {
- rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->origin_fullpath + 1, tit);
- if (!rc) {
- mnt_ctx->leaf_fullpath = kstrdup(mnt_ctx->origin_fullpath, GFP_KERNEL);
- if (!mnt_ctx->leaf_fullpath)
- rc = -ENOMEM;
- break;
- }
- }
-
-out:
- kfree(full_path);
- return rc;
-}
-
-static int __follow_dfs_link(struct mount_ctx *mnt_ctx)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *full_path;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- struct dfs_cache_tgt_iterator *tit;
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- kfree(mnt_ctx->leaf_fullpath);
- mnt_ctx->leaf_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (IS_ERR(mnt_ctx->leaf_fullpath)) {
- rc = PTR_ERR(mnt_ctx->leaf_fullpath);
- mnt_ctx->leaf_fullpath = NULL;
- goto out;
- }
-
- /* Get referral from dfs link */
- rc = dfs_cache_find(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), mnt_ctx->leaf_fullpath + 1, NULL, &tl);
- if (rc)
- goto out;
-
- /* Try all dfs link targets. If an I/O fails from currently connected DFS target with an
- * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as
- * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than
- * STATUS_PATH_NOT_COVERED."
- */
- for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl);
- tit; tit = dfs_cache_get_next_tgt(&tl, tit)) {
- rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit);
- if (!rc) {
- rc = is_path_remote(mnt_ctx);
- if (!rc || rc == -EREMOTE)
- break;
- }
- }
-
-out:
- kfree(full_path);
- dfs_cache_free_tgts(&tl);
- return rc;
-}
-
-static int follow_dfs_link(struct mount_ctx *mnt_ctx)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *full_path;
- int num_links = 0;
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- kfree(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- kfree(full_path);
-
- if (IS_ERR(mnt_ctx->origin_fullpath)) {
- rc = PTR_ERR(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = NULL;
- return rc;
- }
-
- do {
- rc = __follow_dfs_link(mnt_ctx);
- if (!rc || rc != -EREMOTE)
- break;
- } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
-
- return rc;
-}
-
-/* Set up DFS referral paths for failover */
-static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
-{
- struct TCP_Server_Info *server = mnt_ctx->server;
-
- mutex_lock(&server->refpath_lock);
- server->origin_fullpath = mnt_ctx->origin_fullpath;
- server->leaf_fullpath = mnt_ctx->leaf_fullpath;
- server->current_fullpath = mnt_ctx->leaf_fullpath;
- mutex_unlock(&server->refpath_lock);
- mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
-}
-
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
- int rc;
- struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
bool isdfs;
+ int rc;
- rc = is_dfs_mount(&mnt_ctx, &isdfs, &tl);
+ uuid_gen(&mnt_ctx.mount_id);
+ rc = dfs_mount_share(&mnt_ctx, &isdfs);
if (rc)
goto error;
if (!isdfs)
goto out;
- /* proceed as DFS mount */
- uuid_gen(&mnt_ctx.mount_id);
- rc = connect_dfs_root(&mnt_ctx, &tl);
- dfs_cache_free_tgts(&tl);
-
- if (rc)
- goto error;
-
- rc = is_path_remote(&mnt_ctx);
- if (rc)
- rc = follow_dfs_link(&mnt_ctx);
- if (rc)
- goto error;
-
- setup_server_referral_paths(&mnt_ctx);
/*
* After reconnecting to a different server, unique ids won't match anymore, so we disable
* serverino. This prevents dentry revalidation from thinking the dentries are stale (ESTALE).
@@ -3867,26 +3515,28 @@ error:
dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id);
kfree(mnt_ctx.origin_fullpath);
kfree(mnt_ctx.leaf_fullpath);
- mount_put_conns(&mnt_ctx);
+ cifs_mount_put_conns(&mnt_ctx);
return rc;
}
#else
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
int rc = 0;
- struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
+ struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- rc = mount_get_conns(&mnt_ctx);
+ rc = cifs_mount_get_session(&mnt_ctx);
if (rc)
goto error;
- if (mnt_ctx.tcon) {
- rc = is_path_remote(&mnt_ctx);
- if (rc == -EREMOTE)
- rc = -EOPNOTSUPP;
- if (rc)
- goto error;
- }
+ rc = cifs_mount_get_tcon(&mnt_ctx);
+ if (rc)
+ goto error;
+
+ rc = cifs_is_path_remote(&mnt_ctx);
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ if (rc)
+ goto error;
rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
if (rc)
@@ -3896,7 +3546,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
return rc;
error:
- mount_put_conns(&mnt_ctx);
+ cifs_mount_put_conns(&mnt_ctx);
return rc;
}
#endif
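
The non-DFS mount path now runs as a fixed pipeline of the helpers exported earlier in this patch, with a single cleanup point. Condensed from the code above (error labels elided):

	struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
	int rc;

	rc = cifs_mount_get_session(&mnt_ctx);		/* TCP + SMB session */
	if (!rc)
		rc = cifs_mount_get_tcon(&mnt_ctx);	/* tree connection */
	if (!rc)
		rc = cifs_is_path_remote(&mnt_ctx);	/* -EREMOTE => DFS */
	if (rc)
		cifs_mount_put_conns(&mnt_ctx);	/* releases whatever was acquired */
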
@@ -4449,264 +4099,7 @@ cifs_prune_tlinks(struct work_struct *work)
TLINK_IDLE_EXPIRE);
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
-/* Update dfs referral path of superblock */
-static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
- const char *target)
-{
- int rc = 0;
- size_t len = strlen(target);
- char *refpath, *npath;
-
- if (unlikely(len < 2 || *target != '\\'))
- return -EINVAL;
-
- if (target[1] == '\\') {
- len += 1;
- refpath = kmalloc(len, GFP_KERNEL);
- if (!refpath)
- return -ENOMEM;
-
- scnprintf(refpath, len, "%s", target);
- } else {
- len += sizeof("\\");
- refpath = kmalloc(len, GFP_KERNEL);
- if (!refpath)
- return -ENOMEM;
-
- scnprintf(refpath, len, "\\%s", target);
- }
-
- npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb));
- kfree(refpath);
-
- if (IS_ERR(npath)) {
- rc = PTR_ERR(npath);
- } else {
- mutex_lock(&server->refpath_lock);
- kfree(server->leaf_fullpath);
- server->leaf_fullpath = npath;
- mutex_unlock(&server->refpath_lock);
- server->current_fullpath = server->leaf_fullpath;
- }
- return rc;
-}
-
-static int target_share_matches_server(struct TCP_Server_Info *server, const char *tcp_host,
- size_t tcp_host_len, char *share, bool *target_match)
-{
- int rc = 0;
- const char *dfs_host;
- size_t dfs_host_len;
-
- *target_match = true;
- extract_unc_hostname(share, &dfs_host, &dfs_host_len);
-
- /* Check if hostnames or addresses match */
- if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
- cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len,
- dfs_host, (int)tcp_host_len, tcp_host);
- rc = match_target_ip(server, dfs_host, dfs_host_len, target_match);
- if (rc)
- cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
- }
- return rc;
-}
-
-static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, char *tree, bool islink,
- struct dfs_cache_tgt_list *tl)
-{
- int rc;
- struct TCP_Server_Info *server = tcon->ses->server;
- const struct smb_version_operations *ops = server->ops;
- struct cifs_tcon *ipc = tcon->ses->tcon_ipc;
- char *share = NULL, *prefix = NULL;
- const char *tcp_host;
- size_t tcp_host_len;
- struct dfs_cache_tgt_iterator *tit;
- bool target_match;
-
- extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len);
-
- tit = dfs_cache_get_tgt_iterator(tl);
- if (!tit) {
- rc = -ENOENT;
- goto out;
- }
-
- /* Try to tree connect to all dfs targets */
- for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
- const char *target = dfs_cache_get_tgt_name(tit);
- struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
-
- kfree(share);
- kfree(prefix);
- share = prefix = NULL;
-
- /* Check if share matches with tcp ses */
- rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc);
- break;
- }
-
- rc = target_share_matches_server(server, tcp_host, tcp_host_len, share,
- &target_match);
- if (rc)
- break;
- if (!target_match) {
- rc = -EHOSTUNREACH;
- continue;
- }
-
- if (ipc->need_reconnect) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
- rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls);
- if (rc)
- break;
- }
-
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
- if (!islink) {
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
- break;
- }
- /*
- * If no dfs referrals were returned from link target, then just do a TREE_CONNECT
- * to it. Otherwise, cache the dfs referral and then mark current tcp ses for
- * reconnect so either the demultiplex thread or the echo worker will reconnect to
- * newly resolved target.
- */
- if (dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target,
- NULL, &ntl)) {
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
- if (rc)
- continue;
- rc = dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit);
- if (!rc)
- rc = cifs_update_super_prepath(cifs_sb, prefix);
- } else {
- /* Target is another dfs share */
- rc = update_server_fullpath(server, cifs_sb, target);
- dfs_cache_free_tgts(tl);
-
- if (!rc) {
- rc = -EREMOTE;
- list_replace_init(&ntl.tl_list, &tl->tl_list);
- } else
- dfs_cache_free_tgts(&ntl);
- }
- break;
- }
-
-out:
- kfree(share);
- kfree(prefix);
-
- return rc;
-}
-
-static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, char *tree, bool islink,
- struct dfs_cache_tgt_list *tl)
-{
- int rc;
- int num_links = 0;
- struct TCP_Server_Info *server = tcon->ses->server;
-
- do {
- rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl);
- if (!rc || rc != -EREMOTE)
- break;
- } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
- /*
- * If we couldn't tree connect to any targets from last referral path, then retry from
- * original referral path.
- */
- if (rc && server->current_fullpath != server->origin_fullpath) {
- server->current_fullpath = server->origin_fullpath;
- cifs_signal_cifsd_for_reconnect(server, true);
- }
-
- dfs_cache_free_tgts(tl);
- return rc;
-}
-
-int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
-{
- int rc;
- struct TCP_Server_Info *server = tcon->ses->server;
- const struct smb_version_operations *ops = server->ops;
- struct super_block *sb = NULL;
- struct cifs_sb_info *cifs_sb;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- char *tree;
- struct dfs_info3_param ref = {0};
-
- /* only send once per connect */
- spin_lock(&tcon->tc_lock);
- if (tcon->ses->ses_status != SES_GOOD ||
- (tcon->status != TID_NEW &&
- tcon->status != TID_NEED_TCON)) {
- spin_unlock(&tcon->tc_lock);
- return 0;
- }
- tcon->status = TID_IN_TCON;
- spin_unlock(&tcon->tc_lock);
-
- tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
- if (!tree) {
- rc = -ENOMEM;
- goto out;
- }
-
- if (tcon->ipc) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
- goto out;
- }
-
- sb = cifs_get_tcp_super(server);
- if (IS_ERR(sb)) {
- rc = PTR_ERR(sb);
- cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc);
- goto out;
- }
-
- cifs_sb = CIFS_SB(sb);
-
- /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
- if (!server->current_fullpath ||
- dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
- rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
- goto out;
- }
-
- rc = tree_connect_dfs_target(xid, tcon, cifs_sb, tree, ref.server_type == DFS_TYPE_LINK,
- &tl);
- free_dfs_info_param(&ref);
-
-out:
- kfree(tree);
- cifs_put_tcp_super(sb);
-
- if (rc) {
- spin_lock(&tcon->tc_lock);
- if (tcon->status == TID_IN_TCON)
- tcon->status = TID_NEED_TCON;
- spin_unlock(&tcon->tc_lock);
- } else {
- spin_lock(&tcon->tc_lock);
- if (tcon->status == TID_IN_TCON)
- tcon->status = TID_GOOD;
- spin_unlock(&tcon->tc_lock);
- tcon->need_reconnect = false;
- }
-
- return rc;
-}
-#else
+#ifndef CONFIG_CIFS_DFS_UPCALL
int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
{
int rc;
diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c
new file mode 100644
index 000000000000..b64d20374b9c
--- /dev/null
+++ b/fs/cifs/dfs.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#include <linux/namei.h>
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "dns_resolve.h"
+#include "fs_context.h"
+#include "dfs.h"
+
+/**
+ * dfs_parse_target_referral - set fs context for dfs target referral
+ *
+ * @full_path: full path in UNC format.
+ * @ref: dfs referral pointer.
+ * @ctx: smb3 fs context pointer.
+ *
+ * Return zero if dfs referral was parsed correctly, otherwise non-zero.
+ */
+int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
+ struct smb3_fs_context *ctx)
+{
+ int rc;
+ const char *prepath = NULL;
+ char *path;
+
+ if (!full_path || !*full_path || !ref || !ctx)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
+ return -EINVAL;
+
+ if (strlen(full_path) - ref->path_consumed) {
+ prepath = full_path + ref->path_consumed;
+ /* skip initial delimiter */
+ if (*prepath == '/' || *prepath == '\\')
+ prepath++;
+ }
+
+ path = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ rc = smb3_parse_devname(path, ctx);
+ if (rc)
+ goto out;
+
+ rc = dns_resolve_server_name_to_ip(path, (struct sockaddr *)&ctx->dstaddr, NULL);
+
+out:
+ kfree(path);
+ return rc;
+}
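
As a rough illustration of the split above, a minimal userspace sketch of how
ref->path_consumed divides a full path into node name and prefix path (the
helper name and sample values are illustrative, not kernel API):

#include <stdio.h>
#include <string.h>

/* Illustrative only: mirror how dfs_parse_target_referral() derives the
 * prefix path left over once the referral consumed part of the full path. */
static const char *prepath_of(const char *full_path, size_t path_consumed)
{
	const char *prepath = NULL;

	if (strlen(full_path) - path_consumed) {
		prepath = full_path + path_consumed;
		/* skip initial delimiter, as the kernel code does */
		if (*prepath == '/' || *prepath == '\\')
			prepath++;
	}
	return prepath;	/* NULL means no prefix path remained */
}

int main(void)
{
	/* assume the referral consumed "\dom\dfs" (8 chars) of the path */
	const char *p = prepath_of("\\dom\\dfs\\dir\\file", 8);

	printf("%s\n", p ? p : "(none)");	/* dir\file */
	return 0;
}
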
+
+/*
+ * build_unc_path_to_root returns the full path to root when we do not have
+ * an existing connection (tcon)
+ */
+static char *build_unc_path_to_root(const struct smb3_fs_context *ctx,
+ const struct cifs_sb_info *cifs_sb, bool useppath)
+{
+ char *full_path, *pos;
+ unsigned int pplen = useppath && ctx->prepath ? strlen(ctx->prepath) + 1 : 0;
+ unsigned int unc_len = strnlen(ctx->UNC, MAX_TREE_SIZE + 1);
+
+ if (unc_len > MAX_TREE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
+ if (full_path == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(full_path, ctx->UNC, unc_len);
+ pos = full_path + unc_len;
+
+ if (pplen) {
+ *pos = CIFS_DIR_SEP(cifs_sb);
+ memcpy(pos + 1, ctx->prepath, pplen);
+ pos += pplen;
+ }
+
+ *pos = '\0'; /* add trailing null */
+ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+ cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
+ return full_path;
+}
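
A standalone sketch of the same join-and-normalize logic, using hypothetical
userspace stand-ins (malloc/snprintf for the kernel allocators, a plain char
for the separator):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative analog of build_unc_path_to_root(): append the optional
 * prefix path to the UNC and rewrite every separator to one delimiter. */
static char *build_unc_path(const char *unc, const char *prepath, char sep)
{
	size_t unc_len = strlen(unc);
	size_t pplen = prepath ? strlen(prepath) + 1 : 0;
	char *full_path, *pos;

	full_path = malloc(unc_len + pplen + 1);
	if (!full_path)
		return NULL;

	memcpy(full_path, unc, unc_len);
	pos = full_path + unc_len;
	if (pplen) {
		*pos = sep;
		memcpy(pos + 1, prepath, pplen);	/* pplen covers the NUL */
		pos += pplen;
	}
	*pos = '\0';
	for (pos = full_path; *pos; pos++)	/* convert_delimiter() analog */
		if (*pos == '/' || *pos == '\\')
			*pos = sep;
	return full_path;
}

int main(void)
{
	char *s = build_unc_path("//srv/dfsroot", "sub/dir", '\\');

	printf("%s\n", s ? s : "(oom)");	/* \\srv\dfsroot\sub\dir */
	free(s);
	return 0;
}
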
+
+static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
+{
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ int rc;
+
+ ctx->leaf_fullpath = (char *)full_path;
+ rc = cifs_mount_get_session(mnt_ctx);
+ ctx->leaf_fullpath = NULL;
+ if (!rc) {
+ struct cifs_ses *ses = mnt_ctx->ses;
+
+ mutex_lock(&ses->session_mutex);
+ ses->dfs_root_ses = mnt_ctx->root_ses;
+ mutex_unlock(&ses->session_mutex);
+ }
+ return rc;
+}
+
+static void set_root_ses(struct cifs_mount_ctx *mnt_ctx)
+{
+ if (mnt_ctx->ses) {
+ spin_lock(&cifs_tcp_ses_lock);
+ mnt_ctx->ses->ses_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ dfs_cache_add_refsrv_session(&mnt_ctx->mount_id, mnt_ctx->ses);
+ }
+ mnt_ctx->root_ses = mnt_ctx->ses;
+}
+
+static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, const char *full_path,
+ const struct dfs_cache_tgt_iterator *tit)
+{
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct dfs_info3_param ref = {};
+ int rc;
+
+ rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref);
+ if (rc)
+ return rc;
+
+ rc = dfs_parse_target_referral(full_path + 1, &ref, ctx);
+ if (rc)
+ goto out;
+
+ cifs_mount_put_conns(mnt_ctx);
+ rc = get_session(mnt_ctx, ref_path);
+ if (rc)
+ goto out;
+
+ if (ref.flags & DFSREF_REFERRAL_SERVER)
+ set_root_ses(mnt_ctx);
+
+ rc = -EREMOTE;
+ if (ref.flags & DFSREF_STORAGE_SERVER) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc)
+ goto out;
+
+ /* some servers may not advertise referral capability under ref.flags */
+ if (!(ref.flags & DFSREF_REFERRAL_SERVER) &&
+ is_tcon_dfs(mnt_ctx->tcon))
+ set_root_ses(mnt_ctx);
+
+ rc = cifs_is_path_remote(mnt_ctx);
+ }
+
+out:
+ free_dfs_info_param(&ref);
+ return rc;
+}
+
+static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ char *ref_path = NULL, *full_path = NULL;
+ struct dfs_cache_tgt_iterator *tit;
+ struct TCP_Server_Info *server;
+ char *origin_fullpath = NULL;
+ int num_links = 0;
+ int rc;
+
+ ref_path = dfs_get_path(cifs_sb, ctx->UNC);
+ if (IS_ERR(ref_path))
+ return PTR_ERR(ref_path);
+
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ goto out;
+ }
+
+ origin_fullpath = kstrdup(full_path, GFP_KERNEL);
+ if (!origin_fullpath) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+
+ rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl);
+ if (rc)
+ break;
+
+ tit = dfs_cache_get_tgt_iterator(&tl);
+ if (!tit) {
+ cifs_dbg(VFS, "%s: dfs referral (%s) with no targets\n", __func__,
+ ref_path + 1);
+ rc = -ENOENT;
+ dfs_cache_free_tgts(&tl);
+ break;
+ }
+
+ do {
+ rc = get_dfs_conn(mnt_ctx, ref_path, full_path, tit);
+ if (!rc)
+ break;
+ if (rc == -EREMOTE) {
+ if (++num_links > MAX_NESTED_LINKS) {
+ rc = -ELOOP;
+ break;
+ }
+ kfree(ref_path);
+ kfree(full_path);
+ ref_path = full_path = NULL;
+
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ } else {
+ ref_path = dfs_get_path(cifs_sb, full_path);
+ if (IS_ERR(ref_path)) {
+ rc = PTR_ERR(ref_path);
+ ref_path = NULL;
+ }
+ }
+ break;
+ }
+ } while ((tit = dfs_cache_get_next_tgt(&tl, tit)));
+ dfs_cache_free_tgts(&tl);
+ } while (rc == -EREMOTE);
+
+ if (!rc) {
+ server = mnt_ctx->server;
+
+ mutex_lock(&server->refpath_lock);
+ server->origin_fullpath = origin_fullpath;
+ server->current_fullpath = server->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
+ origin_fullpath = NULL;
+ }
+
+out:
+ kfree(origin_fullpath);
+ kfree(ref_path);
+ kfree(full_path);
+ return rc;
+}
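
The loops above chase nested referral chains under a fixed link budget; a
compact sketch of just that control flow (resolve_step() is a made-up stand-in
for get_dfs_conn(), returning -EREMOTE while a target is still a link):

#include <errno.h>
#include <stdio.h>

#define MAX_NESTED_LINKS 8

/* Illustrative: pretend the first three targets are again DFS links. */
static int resolve_step(int depth)
{
	return depth < 3 ? -EREMOTE : 0;
}

int main(void)
{
	int rc, num_links = 0;

	do {
		rc = resolve_step(num_links);
		if (rc != -EREMOTE)
			break;			/* final target or hard error */
		if (++num_links > MAX_NESTED_LINKS) {
			rc = -ELOOP;		/* referral chain too deep */
			break;
		}
	} while (1);

	printf("rc=%d after %d link(s)\n", rc, num_links);
	return 0;
}
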
+
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ int rc;
+
+ *isdfs = false;
+
+ rc = get_session(mnt_ctx, NULL);
+ if (rc)
+ return rc;
+ mnt_ctx->root_ses = mnt_ctx->ses;
+ /*
+	 * If called with the 'nodfs' mount option, then skip DFS resolving. Otherwise
+	 * unconditionally try to get a DFS referral (even cached) to determine whether it is
+	 * a DFS mount.
+ *
+ * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
+ * to respond with PATH_NOT_COVERED to requests that include the prefix.
+ */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
+ dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL)) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc)
+ return rc;
+
+ rc = cifs_is_path_remote(mnt_ctx);
+ if (!rc || rc != -EREMOTE)
+ return rc;
+ }
+
+ *isdfs = true;
+ set_root_ses(mnt_ctx);
+
+ return __dfs_mount_share(mnt_ctx);
+}
+
+/* Update dfs referral path of superblock */
+static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
+ const char *target)
+{
+ int rc = 0;
+ size_t len = strlen(target);
+ char *refpath, *npath;
+
+ if (unlikely(len < 2 || *target != '\\'))
+ return -EINVAL;
+
+ if (target[1] == '\\') {
+ len += 1;
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "%s", target);
+ } else {
+ len += sizeof("\\");
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "\\%s", target);
+ }
+
+ npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ kfree(refpath);
+
+ if (IS_ERR(npath)) {
+ rc = PTR_ERR(npath);
+ } else {
+ mutex_lock(&server->refpath_lock);
+ kfree(server->leaf_fullpath);
+ server->leaf_fullpath = npath;
+ mutex_unlock(&server->refpath_lock);
+ server->current_fullpath = server->leaf_fullpath;
+ }
+ return rc;
+}
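
A small sketch of the target normalization above, again with userspace
stand-ins; the dfs_cache_canonical_path() step is omitted:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative: give a DFS target a single leading backslash unless it
 * already carries the doubled "\\server\share..." form. */
static char *normalize_target(const char *target)
{
	size_t len = strlen(target);
	char *refpath;

	if (len < 2 || *target != '\\')
		return NULL;		/* the kernel returns -EINVAL here */

	refpath = malloc(len + 2);
	if (!refpath)
		return NULL;
	if (target[1] == '\\')
		snprintf(refpath, len + 2, "%s", target);
	else
		snprintf(refpath, len + 2, "\\%s", target);
	return refpath;
}

int main(void)
{
	char *p = normalize_target("\\srv\\share");

	printf("%s\n", p ? p : "(invalid)");	/* \\srv\share */
	free(p);
	return 0;
}
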
+
+static int target_share_matches_server(struct TCP_Server_Info *server, char *share,
+ bool *target_match)
+{
+ int rc = 0;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ *target_match = true;
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
+
+ /* Check if hostnames or addresses match */
+ cifs_server_lock(server);
+ if (dfs_host_len != strlen(server->hostname) ||
+ strncasecmp(dfs_host, server->hostname, dfs_host_len)) {
+ cifs_dbg(FYI, "%s: %.*s doesn't match %s\n", __func__,
+ (int)dfs_host_len, dfs_host, server->hostname);
+ rc = match_target_ip(server, dfs_host, dfs_host_len, target_match);
+ if (rc)
+ cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
+ }
+ cifs_server_unlock(server);
+ return rc;
+}
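
The hostname comparison relies on pulling the host component out of a UNC
share; a self-contained sketch with an illustrative extract_unc_hostname()
analog (the real helper lives elsewhere in cifs):

#include <stdio.h>
#include <string.h>
#include <strings.h>

/* Illustrative: point at the host part of "\\host\share" in place. */
static void unc_hostname(const char *unc, const char **host, size_t *len)
{
	while (*unc == '\\' || *unc == '/')
		unc++;
	*host = unc;
	*len = strcspn(unc, "\\/");
}

int main(void)
{
	const char *host;
	size_t len;

	unc_hostname("\\\\FileServer\\share\\dir", &host, &len);
	/* case-insensitive hostname match, as done above */
	printf("host=%.*s match=%d\n", (int)len, host,
	       len == strlen("fileserver") &&
	       !strncasecmp(host, "fileserver", len));
	return 0;
}
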
+
+static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ const struct smb_version_operations *ops = server->ops;
+ struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses);
+ struct cifs_tcon *ipc = root_ses->tcon_ipc;
+ char *share = NULL, *prefix = NULL;
+ struct dfs_cache_tgt_iterator *tit;
+ bool target_match;
+
+ tit = dfs_cache_get_tgt_iterator(tl);
+ if (!tit) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ /* Try to tree connect to all dfs targets */
+ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
+ const char *target = dfs_cache_get_tgt_name(tit);
+ struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
+
+ kfree(share);
+ kfree(prefix);
+ share = prefix = NULL;
+
+ /* Check if share matches with tcp ses */
+ rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc);
+ break;
+ }
+
+ rc = target_share_matches_server(server, share, &target_match);
+ if (rc)
+ break;
+ if (!target_match) {
+ rc = -EHOSTUNREACH;
+ continue;
+ }
+
+ dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit);
+
+ if (ipc->need_reconnect) {
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls);
+ cifs_dbg(FYI, "%s: reconnect ipc: %d\n", __func__, rc);
+ }
+
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
+ if (!islink) {
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ break;
+ }
+ /*
+		 * If no DFS referrals were returned from the link target, then just do a
+		 * TREE_CONNECT to it. Otherwise, cache the DFS referral and then mark the
+		 * current tcp ses for reconnect so that either the demultiplex thread or the
+		 * echo worker will reconnect to the newly resolved target.
+ */
+ if (dfs_cache_find(xid, root_ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target,
+ NULL, &ntl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ if (rc)
+ continue;
+
+ rc = cifs_update_super_prepath(cifs_sb, prefix);
+ } else {
+ /* Target is another dfs share */
+ rc = update_server_fullpath(server, cifs_sb, target);
+ dfs_cache_free_tgts(tl);
+
+ if (!rc) {
+ rc = -EREMOTE;
+ list_replace_init(&ntl.tl_list, &tl->tl_list);
+ } else
+ dfs_cache_free_tgts(&ntl);
+ }
+ break;
+ }
+
+out:
+ kfree(share);
+ kfree(prefix);
+
+ return rc;
+}
+
+static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ int num_links = 0;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ char *old_fullpath = server->leaf_fullpath;
+
+ do {
+ rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl);
+ if (!rc || rc != -EREMOTE)
+ break;
+ } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
+ /*
+	 * If we couldn't tree connect to any targets from the last referral path,
+	 * then retry it from the newly resolved DFS referral.
+ */
+ if (rc && server->leaf_fullpath != old_fullpath)
+ cifs_signal_cifsd_for_reconnect(server, true);
+
+ dfs_cache_free_tgts(tl);
+ return rc;
+}
+
+int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
+{
+ int rc;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ const struct smb_version_operations *ops = server->ops;
+ struct super_block *sb = NULL;
+ struct cifs_sb_info *cifs_sb;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ char *tree;
+ struct dfs_info3_param ref = {0};
+
+ /* only send once per connect */
+ spin_lock(&tcon->tc_lock);
+ if (tcon->ses->ses_status != SES_GOOD ||
+ (tcon->status != TID_NEW &&
+ tcon->status != TID_NEED_TCON)) {
+ spin_unlock(&tcon->tc_lock);
+ return 0;
+ }
+ tcon->status = TID_IN_TCON;
+ spin_unlock(&tcon->tc_lock);
+
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (tcon->ipc) {
+ cifs_server_lock(server);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ cifs_server_unlock(server);
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
+ goto out;
+ }
+
+ sb = cifs_get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
+ cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc);
+ goto out;
+ }
+
+ cifs_sb = CIFS_SB(sb);
+
+ /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
+ if (!server->current_fullpath ||
+ dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
+ goto out;
+ }
+
+ rc = tree_connect_dfs_target(xid, tcon, cifs_sb, tree, ref.server_type == DFS_TYPE_LINK,
+ &tl);
+ free_dfs_info_param(&ref);
+
+out:
+ kfree(tree);
+ cifs_put_tcp_super(sb);
+
+ if (rc) {
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_NEED_TCON;
+ spin_unlock(&tcon->tc_lock);
+ } else {
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_GOOD;
+ spin_unlock(&tcon->tc_lock);
+ tcon->need_reconnect = false;
+ }
+
+ return rc;
+}
diff --git a/fs/cifs/dfs.h b/fs/cifs/dfs.h
new file mode 100644
index 000000000000..344bea6d8bab
--- /dev/null
+++ b/fs/cifs/dfs.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#ifndef _CIFS_DFS_H
+#define _CIFS_DFS_H
+
+#include "cifsglob.h"
+#include "fs_context.h"
+#include "cifs_unicode.h"
+
+int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
+ struct smb3_fs_context *ctx);
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs);
+
+static inline char *dfs_get_path(struct cifs_sb_info *cifs_sb, const char *path)
+{
+ return dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb));
+}
+
+static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *path,
+ struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tl)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+
+ return dfs_cache_find(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), path, ref, tl);
+}
+
+static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ if (unlikely(!server->origin_fullpath))
+ return ERR_PTR(-EREMOTE);
+
+ return __build_path_from_dentry_optional_prefix(dentry, page,
+ server->origin_fullpath,
+ strlen(server->origin_fullpath),
+ true);
+}
+
+#endif /* _CIFS_DFS_H */
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index e70915ad7541..e20f8880363f 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -83,27 +83,6 @@ static void refresh_cache_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static void get_ipc_unc(const char *ref_path, char *ipc, size_t ipclen)
-{
- const char *host;
- size_t len;
-
- extract_unc_hostname(ref_path, &host, &len);
- scnprintf(ipc, ipclen, "\\\\%.*s\\IPC$", (int)len, host);
-}
-
-static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const char *path)
-{
- char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
-
- get_ipc_unc(path, unc, sizeof(unc));
- for (; *ses; ses++) {
- if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name))
- return *ses;
- }
- return ERR_PTR(-ENOENT);
-}
-
static void __mount_group_release(struct mount_group *mg)
{
int i;
@@ -760,8 +739,6 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const
int rc;
int i;
- cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path);
-
*refs = NULL;
*numrefs = 0;
@@ -770,6 +747,7 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const
if (unlikely(!cache_cp))
return -EINVAL;
+ cifs_dbg(FYI, "%s: ipc=%s referral=%s\n", __func__, ses->tcon_ipc->tree_name, path);
rc = ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, cache_cp,
NO_MAP_UNI_RSVD);
if (!rc) {
@@ -1104,26 +1082,23 @@ out_free_path:
*
* Return zero if the target hint was updated successfully, otherwise non-zero.
*/
-int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it)
+void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it)
{
- int rc;
- struct cache_entry *ce;
struct cache_dfs_tgt *t;
+ struct cache_entry *ce;
- if (!it)
- return -EINVAL;
+ if (!path || !it)
+ return;
cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
- down_write(&htable_rw_lock);
+ if (!down_write_trylock(&htable_rw_lock))
+ return;
ce = lookup_cache_entry(path);
- if (IS_ERR(ce)) {
- rc = PTR_ERR(ce);
+ if (IS_ERR(ce))
goto out_unlock;
- }
- rc = 0;
t = ce->tgthint;
if (unlikely(!strcasecmp(it->it_name, t->name)))
@@ -1140,7 +1115,6 @@ int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_
out_unlock:
up_write(&htable_rw_lock);
- return rc;
}
/**
@@ -1314,8 +1288,7 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
const char *host;
size_t hostlen;
- char *ip = NULL;
- struct sockaddr sa;
+ struct sockaddr_storage ss;
bool match;
int rc;
@@ -1326,27 +1299,20 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
	 * Resolve the share's hostname and check if the server address matches. Otherwise
	 * just ignore it, as we might not have an upcall to resolve the hostname, or the
	 * ip address conversion might have failed.
*/
- match = true;
extract_unc_hostname(s1, &host, &hostlen);
scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host);
- rc = dns_resolve_server_name_to_ip(unc, &ip, NULL);
+ rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL);
if (rc < 0) {
cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n",
__func__, (int)hostlen, host);
return true;
}
- if (!cifs_convert_address(&sa, ip, strlen(ip))) {
- cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
- __func__, ip);
- } else {
- cifs_server_lock(server);
- match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
- cifs_server_unlock(server);
- }
+ cifs_server_lock(server);
+ match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss);
+ cifs_server_unlock(server);
- kfree(ip);
return match;
}
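
With dns resolution now filling a struct sockaddr directly, the match reduces
to a family-aware address compare; a sketch of a cifs_match_ipaddr()-style
helper (ports deliberately ignored, names illustrative):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

/* Illustrative: compare two socket addresses by family and address only. */
static int match_ipaddr(const struct sockaddr *a, const struct sockaddr *b)
{
	if (a->sa_family != b->sa_family)
		return 0;
	if (a->sa_family == AF_INET) {
		const struct sockaddr_in *x = (const void *)a;
		const struct sockaddr_in *y = (const void *)b;

		return x->sin_addr.s_addr == y->sin_addr.s_addr;
	}
	if (a->sa_family == AF_INET6) {
		const struct sockaddr_in6 *x = (const void *)a;
		const struct sockaddr_in6 *y = (const void *)b;

		return !memcmp(&x->sin6_addr, &y->sin6_addr,
			       sizeof(x->sin6_addr));
	}
	return 0;
}

int main(void)
{
	struct sockaddr_in a = { .sin_family = AF_INET }, b = a;

	inet_pton(AF_INET, "192.0.2.10", &a.sin_addr);
	inet_pton(AF_INET, "192.0.2.10", &b.sin_addr);
	printf("match=%d\n",
	       match_ipaddr((struct sockaddr *)&a, (struct sockaddr *)&b));
	return 0;
}
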
@@ -1373,23 +1339,19 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
-static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct cifs_tcon *tcon,
- bool force_refresh)
+static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_refresh)
{
- struct cifs_ses *ses;
- struct cache_entry *ce;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct cifs_ses *ses = CIFS_DFS_ROOT_SES(tcon->ses);
+ struct cifs_tcon *ipc = ses->tcon_ipc;
struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
bool needs_refresh = false;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- int rc = 0;
+ struct cache_entry *ce;
unsigned int xid;
+ int numrefs = 0;
+ int rc = 0;
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses)) {
- cifs_dbg(FYI, "%s: could not find ipc session\n", __func__);
- return PTR_ERR(ses);
- }
+ xid = get_xid();
down_read(&htable_rw_lock);
ce = lookup_cache_entry(path);
@@ -1406,12 +1368,17 @@ static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct c
goto out;
}
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
+ spin_lock(&ipc->tc_lock);
+ if (ses->ses_status != SES_GOOD || ipc->status != TID_GOOD) {
+ spin_unlock(&ipc->tc_lock);
+ cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__);
+ goto out;
+ }
+ spin_unlock(&ipc->tc_lock);
- /* Create or update a cache entry with the new referral */
+ rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
if (!rc) {
+ /* Create or update a cache entry with the new referral */
dump_refs(refs, numrefs);
down_write(&htable_rw_lock);
@@ -1426,24 +1393,20 @@ static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct c
}
out:
+ free_xid(xid);
dfs_cache_free_tgts(&tl);
free_dfs_info_array(refs, numrefs);
return rc;
}
-static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh)
{
struct TCP_Server_Info *server = tcon->ses->server;
mutex_lock(&server->refpath_lock);
- if (server->origin_fullpath) {
- if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
- server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
- }
+ if (server->leaf_fullpath)
+ __refresh_tcon(server->leaf_fullpath + 1, tcon, force_refresh);
mutex_unlock(&server->refpath_lock);
-
return 0;
}
@@ -1461,9 +1424,6 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
{
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- struct mount_group *mg;
- struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
- int rc;
if (!cifs_sb || !cifs_sb->master_tlink)
return -EINVAL;
@@ -1480,21 +1440,6 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
cifs_dbg(FYI, "%s: no dfs mount group id\n", __func__);
return -EINVAL;
}
-
- mutex_lock(&mount_group_list_lock);
- mg = find_mount_group_locked(&cifs_sb->dfs_mount_id);
- if (IS_ERR(mg)) {
- mutex_unlock(&mount_group_list_lock);
- cifs_dbg(FYI, "%s: no ipc session for refreshing referral\n", __func__);
- return PTR_ERR(mg);
- }
- kref_get(&mg->refcount);
- mutex_unlock(&mount_group_list_lock);
-
- spin_lock(&mg->lock);
- memcpy(&sessions, mg->sessions, mg->num_sessions * sizeof(mg->sessions[0]));
- spin_unlock(&mg->lock);
-
/*
* After reconnecting to a different server, unique ids won't match anymore, so we disable
 * serverino. This prevents dentry revalidation from treating the dentries as stale (ESTALE).
@@ -1505,42 +1450,38 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
* that have different prefix paths.
*/
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- rc = refresh_tcon(sessions, tcon, true);
- kref_put(&mg->refcount, mount_group_release);
- return rc;
+ return refresh_tcon(tcon, true);
}
/*
- * Refresh all active dfs mounts regardless of whether they are in cache or not.
- * (cache can be cleared)
+ * Worker that will refresh the DFS cache for all active mounts, based on the
+ * lowest TTL value from a DFS referral.
*/
-static void refresh_mounts(struct cifs_ses **sessions)
+static void refresh_cache_worker(struct work_struct *work)
{
struct TCP_Server_Info *server;
- struct cifs_ses *ses;
struct cifs_tcon *tcon, *ntcon;
struct list_head tcons;
+ struct cifs_ses *ses;
INIT_LIST_HEAD(&tcons);
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- spin_lock(&server->srv_lock);
- if (!server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
+ if (!server->leaf_fullpath)
continue;
- }
- spin_unlock(&server->srv_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->tcon_ipc) {
+ ses->ses_count++;
+ list_add_tail(&ses->tcon_ipc->ulist, &tcons);
+ }
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- spin_lock(&tcon->tc_lock);
- if (!tcon->ipc && !tcon->need_reconnect) {
+ if (!tcon->ipc) {
tcon->tc_count++;
list_add_tail(&tcon->ulist, &tcons);
}
- spin_unlock(&tcon->tc_lock);
}
}
}
@@ -1552,132 +1493,14 @@ static void refresh_mounts(struct cifs_ses **sessions)
list_del_init(&tcon->ulist);
mutex_lock(&server->refpath_lock);
- if (server->origin_fullpath) {
- if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
- server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
- }
+ if (server->leaf_fullpath)
+ __refresh_tcon(server->leaf_fullpath + 1, tcon, false);
mutex_unlock(&server->refpath_lock);
- cifs_put_tcon(tcon);
- }
-}
-
-static void refresh_cache(struct cifs_ses **sessions)
-{
- int i;
- struct cifs_ses *ses;
- unsigned int xid;
- char *ref_paths[CACHE_MAX_ENTRIES];
- int count = 0;
- struct cache_entry *ce;
-
- /*
- * Refresh all cached entries. Get all new referrals outside critical section to avoid
- * starvation while performing SMB2 IOCTL on broken or slow connections.
-
- * The cache entries may cover more paths than the active mounts
- * (e.g. domain-based DFS referrals or multi tier DFS setups).
- */
- down_read(&htable_rw_lock);
- for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
- struct hlist_head *l = &cache_htable[i];
-
- hlist_for_each_entry(ce, l, hlist) {
- if (count == ARRAY_SIZE(ref_paths))
- goto out_unlock;
- if (hlist_unhashed(&ce->hlist) || !cache_entry_expired(ce) ||
- IS_ERR(find_ipc_from_server_path(sessions, ce->path)))
- continue;
- ref_paths[count++] = kstrdup(ce->path, GFP_ATOMIC);
- }
- }
-
-out_unlock:
- up_read(&htable_rw_lock);
-
- for (i = 0; i < count; i++) {
- char *path = ref_paths[i];
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
- int rc = 0;
-
- if (!path)
- continue;
-
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses))
- goto next_referral;
-
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
-
- if (!rc) {
- down_write(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- /*
- * We need to re-check it because other tasks might have it deleted or
- * updated.
- */
- if (!IS_ERR(ce) && cache_entry_expired(ce))
- update_cache_entry_locked(ce, refs, numrefs);
- up_write(&htable_rw_lock);
- }
-
-next_referral:
- kfree(path);
- free_dfs_info_array(refs, numrefs);
- }
-}
-
-/*
- * Worker that will refresh DFS cache and active mounts based on lowest TTL value from a DFS
- * referral.
- */
-static void refresh_cache_worker(struct work_struct *work)
-{
- struct list_head mglist;
- struct mount_group *mg, *tmp_mg;
- struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
- int max_sessions = ARRAY_SIZE(sessions) - 1;
- int i = 0, count;
-
- INIT_LIST_HEAD(&mglist);
-
- /* Get refereces of mount groups */
- mutex_lock(&mount_group_list_lock);
- list_for_each_entry(mg, &mount_group_list, list) {
- kref_get(&mg->refcount);
- list_add(&mg->refresh_list, &mglist);
- }
- mutex_unlock(&mount_group_list_lock);
-
- /* Fill in local array with an NULL-terminated list of all referral server sessions */
- list_for_each_entry(mg, &mglist, refresh_list) {
- if (i >= max_sessions)
- break;
-
- spin_lock(&mg->lock);
- if (i + mg->num_sessions > max_sessions)
- count = max_sessions - i;
+ if (tcon->ipc)
+ cifs_put_smb_ses(tcon->ses);
else
- count = mg->num_sessions;
- memcpy(&sessions[i], mg->sessions, count * sizeof(mg->sessions[0]));
- spin_unlock(&mg->lock);
- i += count;
- }
-
- if (sessions[0]) {
- /* Refresh all active mounts and cached entries */
- refresh_mounts(sessions);
- refresh_cache(sessions);
- }
-
- list_for_each_entry_safe(mg, tmp_mg, &mglist, refresh_list) {
- list_del_init(&mg->refresh_list);
- kref_put(&mg->refcount, mount_group_release);
+ cifs_put_tcon(tcon);
}
spin_lock(&cache_ttl_lock);
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 52070d1df189..f7cff0be9327 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -38,7 +38,7 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *cp, int remap, const char *path,
const struct dfs_cache_tgt_iterator *it);
-int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it);
+void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it);
int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it,
struct dfs_info3_param *ref);
int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8b1c37158556..ad4208bf1e32 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -78,14 +78,13 @@ build_path_from_dentry(struct dentry *direntry, void *page)
prefix);
}
-char *
-build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
- bool prefix)
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix)
{
int dfsplen;
int pplen = 0;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
char dirsep = CIFS_DIR_SEP(cifs_sb);
char *s;
@@ -93,7 +92,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return ERR_PTR(-ENOMEM);
if (prefix)
- dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tree, tree_len + 1);
else
dfsplen = 0;
@@ -123,7 +122,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
}
if (dfsplen) {
s -= dfsplen;
- memcpy(s, tcon->tree_name, dfsplen);
+ memcpy(s, tree, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
@@ -135,6 +134,16 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return s;
}
+char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ bool prefix)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ return __build_path_from_dentry_optional_prefix(direntry, page, tcon->tree_name,
+ MAX_TREE_SIZE, prefix);
+}
+
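The helper above assembles the path right to left: components are written from
the end of the scratch page toward the front, and the tree prefix lands last.
A standalone sketch of that pattern with hypothetical inputs:

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* illustrative stand-ins for the dentry chain and tree name */
	const char *tree = "\\\\srv\\share";
	const char *comps[] = { "dir", "file.txt" };
	char page[64];
	char *s = page + sizeof(page) - 1;
	size_t treelen = strlen(tree);
	int i;

	*s = '\0';
	for (i = 1; i >= 0; i--) {	/* walk from leaf toward root */
		size_t len = strlen(comps[i]);

		s -= len;
		memcpy(s, comps[i], len);
		*--s = '\\';
	}
	s -= treelen;
	memcpy(s, tree, treelen);
	printf("%s\n", s);	/* \\srv\share\dir\file.txt */
	return 0;
}
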
/*
* Don't allow path components longer than the server max.
* Don't allow the separator character in a path component.
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0458d28d71aa..8bf8978bc5d6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -12,6 +12,7 @@
*
*/
+#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/dns_resolver.h>
#include "dns_resolve.h"
@@ -25,17 +26,13 @@
* @ip_addr: Where to return the IP address.
* @expiry: Where to return the expiry time for the dns record.
*
- * The IP address will be returned in string form, and the caller is
- * responsible for freeing it.
- *
- * Returns length of result on success, -ve on error.
+ * Returns zero on success, -ve on error.
*/
int
-dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry)
+dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry)
{
- struct sockaddr_storage ss;
const char *hostname, *sep;
- char *name;
+ char *ip;
int len, rc;
if (!ip_addr || !unc)
@@ -60,30 +57,32 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry)
__func__, unc);
/* Try to interpret hostname as an IPv4 or IPv6 address */
- rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
- if (rc > 0)
- goto name_is_IP_address;
+ rc = cifs_convert_address(ip_addr, hostname, len);
+ if (rc > 0) {
+ cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %*.*s\n", __func__, len, len,
+ hostname);
+ return 0;
+ }
/* Perform the upcall */
rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len,
- NULL, ip_addr, expiry, false);
- if (rc < 0)
+ NULL, &ip, expiry, false);
+ if (rc < 0) {
cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
__func__, len, len, hostname);
- else
+ } else {
cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n",
- __func__, len, len, hostname, *ip_addr,
+ __func__, len, len, hostname, ip,
expiry ? (*expiry) : 0);
- return rc;
-name_is_IP_address:
- name = kmalloc(len + 1, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
- memcpy(name, hostname, len);
- name[len] = 0;
- cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
- __func__, name);
- *ip_addr = name;
- return 0;
+ rc = cifs_convert_address(ip_addr, ip, strlen(ip));
+ kfree(ip);
+
+ if (!rc) {
+ cifs_dbg(FYI, "%s: unable to determine ip address\n", __func__);
+ rc = -EHOSTUNREACH;
+ } else
+ rc = 0;
+ }
+ return rc;
}
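
A userspace analog of the reworked API shape, resolving the UNC host straight
into a caller-owned sockaddr_storage via getaddrinfo() (names and error codes
here are illustrative, not the kernel implementation):

#include <arpa/inet.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int resolve_unc_to_ip(const char *unc, struct sockaddr_storage *ss)
{
	char host[256];
	size_t len;
	struct addrinfo hints = { .ai_family = AF_UNSPEC };
	struct addrinfo *res;

	while (*unc == '\\' || *unc == '/')	/* skip leading delimiters */
		unc++;
	len = strcspn(unc, "\\/");
	if (!len || len >= sizeof(host))
		return -1;
	memcpy(host, unc, len);
	host[len] = '\0';

	if (getaddrinfo(host, NULL, &hints, &res))
		return -1;
	memcpy(ss, res->ai_addr, res->ai_addrlen);	/* first result wins */
	freeaddrinfo(res);
	return 0;
}

int main(void)
{
	struct sockaddr_storage ss;

	if (!resolve_unc_to_ip("\\\\localhost\\share", &ss))
		printf("resolved, family=%d\n", ss.ss_family);
	return 0;
}
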
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index afc0df381246..6eb0c15a2440 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -11,8 +11,10 @@
#ifndef _DNS_RESOLVE_H
#define _DNS_RESOLVE_H
+#include <linux/net.h>
+
#ifdef __KERNEL__
-extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry);
+int dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry);
#endif /* KERNEL */
#endif /* _DNS_RESOLVE_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cd9698209930..22dfc1f8b4f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1413,7 +1413,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
struct inode *inode = d_inode(cfile->dentry);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
unsigned int count = 0, i;
int rc = 0, xid, type;
struct list_head locks_to_send, *el;
@@ -2646,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
return rc;
}
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc);
+
+static int cifs_write_one_page(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret;
+
+ ret = cifs_writepage_locked(page, wbc);
+ unlock_page(page);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2662,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping,
/*
* If wsize is smaller than the page cache size, default to writing
- * one page at a time via cifs_writepage
+ * one page at a time.
*/
if (cifs_sb->ctx->wsize < PAGE_SIZE)
- return generic_writepages(mapping, wbc);
+ return write_cache_pages(mapping, wbc, cifs_write_one_page,
+ mapping);
xid = get_xid();
if (wbc->range_cyclic) {
@@ -2852,13 +2868,6 @@ retry_write:
return rc;
}
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int rc = cifs_writepage_locked(page, wbc);
- unlock_page(page);
- return rc;
-}
-
static int cifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -3532,7 +3541,7 @@ static ssize_t __cifs_writev(
ctx->iter = *from;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, from, WRITE);
+ rc = setup_aio_ctx_iter(ctx, from, ITER_SOURCE);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
@@ -4276,7 +4285,7 @@ static ssize_t __cifs_readv(
ctx->iter = *to;
ctx->len = len;
} else {
- rc = setup_aio_ctx_iter(ctx, to, READ);
+ rc = setup_aio_ctx_iter(ctx, to, ITER_DEST);
if (rc) {
kref_put(&ctx->refcount, cifs_aio_ctx_release);
return rc;
@@ -5231,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5240,10 +5248,10 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
/*
- * TODO: investigate and if useful we could add an cifs_migratePage
- * helper (under an CONFIG_MIGRATION) in the future, and also
- * investigate and add an is_dirty_writeback helper if needed
+	 * TODO: investigate whether an is_dirty_writeback helper would be
+	 * useful, and add one if so
*/
.swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
@@ -5256,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = {
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
.read_folio = cifs_read_folio,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5264,4 +5271,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 45119597c765..6d13f8207e96 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -308,7 +308,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
{
memcpy(new_ctx, ctx, sizeof(*ctx));
new_ctx->prepath = NULL;
- new_ctx->mount_options = NULL;
new_ctx->nodename = NULL;
new_ctx->username = NULL;
new_ctx->password = NULL;
@@ -317,11 +316,11 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
+ new_ctx->leaf_fullpath = NULL;
/*
* Make sure to stay in sync with smb3_cleanup_fs_context_contents()
*/
DUP_CTX_STR(prepath);
- DUP_CTX_STR(mount_options);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
DUP_CTX_STR(server_hostname);
@@ -330,6 +329,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(domainname);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
+ DUP_CTX_STR(leaf_fullpath);
return 0;
}
@@ -569,17 +569,12 @@ static const struct fs_context_operations smb3_fs_context_ops = {
static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
void *data)
{
- struct smb3_fs_context *ctx = smb3_fc2context(fc);
char *options = data, *key;
int ret = 0;
if (!options)
return 0;
- ctx->mount_options = kstrdup(data, GFP_KERNEL);
- if (ctx->mount_options == NULL)
- return -ENOMEM;
-
ret = security_sb_eat_lsm_opts(options, &fc->security);
if (ret)
return ret;
@@ -884,16 +879,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->nodfs = 1;
break;
case Opt_hard:
- if (result.negated)
+ if (result.negated) {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs. soft mount options\n");
ctx->retry = 0;
- else
+ } else
ctx->retry = 1;
break;
case Opt_soft:
if (result.negated)
ctx->retry = 1;
- else
+ else {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs soft mount options\n");
ctx->retry = 0;
+ }
break;
case Opt_mapposix:
if (result.negated)
@@ -1576,8 +1576,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
/*
* Make sure this stays in sync with smb3_fs_context_dup()
*/
- kfree(ctx->mount_options);
- ctx->mount_options = NULL;
kfree(ctx->username);
ctx->username = NULL;
kfree_sensitive(ctx->password);
@@ -1596,6 +1594,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->iocharset = NULL;
kfree(ctx->prepath);
ctx->prepath = NULL;
+ kfree(ctx->leaf_fullpath);
+ ctx->leaf_fullpath = NULL;
}
void
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index bbaee4c2281f..44cb5639ed3b 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -264,8 +264,7 @@ struct smb3_fs_context {
__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
bool rootfs:1; /* if it's a SMB root file system */
bool witness:1; /* use witness protocol */
-
- char *mount_options;
+ char *leaf_fullpath;
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a1751b956318..f6f3a6b75601 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -150,7 +150,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -180,7 +180,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4e2ca3c6e5c0..f145a59af89b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -632,6 +632,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
struct super_block *sb, bool adjust_tz, bool symlink)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -680,8 +682,8 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope
}
/* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
- fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */
- fattr->cf_gid = cifs_sb->ctx->linux_gid;
+ sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -991,12 +993,6 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
}
rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data,
&adjust_tz, &is_reparse_point);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (rc == -ENOENT && is_tcon_dfs(tcon))
- rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon,
- cifs_sb,
- full_path);
-#endif
data = &tmp_data;
}
@@ -1175,6 +1171,7 @@ smb311_posix_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {0};
bool symlink = false;
struct cifs_open_info_data data = {};
+ struct cifs_sid owner, group;
int rc = 0;
int tmprc = 0;
@@ -1192,7 +1189,8 @@ smb311_posix_get_inode_info(struct inode **inode,
goto out;
}
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz,
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data,
+ &owner, &group, &adjust_tz,
&symlink);
/*
@@ -1201,7 +1199,8 @@ smb311_posix_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink);
+ smb311_posix_info_to_fattr(&fattr, &data, &owner, &group,
+ sb, adjust_tz, symlink);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index bd374feeccaa..a5a097a69983 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -428,6 +428,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.disposition = FILE_CREATE;
oparms.fid = &fid;
oparms.reconnect = false;
+ oparms.mode = 0644;
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
NULL, NULL);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3e68d8208cf5..2a19c7987c5b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc)
 * @len: Where to store the length for this page
* @offset: Where to store the offset for this page
*/
-void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset)
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset)
{
*len = rqst->rq_pagesz;
*offset = (page == 0) ? rqst->rq_offset : 0;
@@ -1258,44 +1258,30 @@ int match_target_ip(struct TCP_Server_Info *server,
bool *result)
{
int rc;
- char *target, *tip = NULL;
- struct sockaddr tipaddr;
+ char *target;
+ struct sockaddr_storage ss;
*result = false;
target = kzalloc(share_len + 3, GFP_KERNEL);
- if (!target) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!target)
+ return -ENOMEM;
scnprintf(target, share_len + 3, "\\\\%.*s", (int)share_len, share);
cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2);
- rc = dns_resolve_server_name_to_ip(target, &tip, NULL);
- if (rc < 0)
- goto out;
-
- cifs_dbg(FYI, "%s: target ip: %s\n", __func__, tip);
+ rc = dns_resolve_server_name_to_ip(target, (struct sockaddr *)&ss, NULL);
+ kfree(target);
- if (!cifs_convert_address(&tipaddr, tip, strlen(tip))) {
- cifs_dbg(VFS, "%s: failed to convert target ip address\n",
- __func__);
- rc = -EINVAL;
- goto out;
- }
+ if (rc < 0)
+ return rc;
- *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr,
- &tipaddr);
+ spin_lock(&server->srv_lock);
+ *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss);
+ spin_unlock(&server->srv_lock);
cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result);
- rc = 0;
-
-out:
- kfree(target);
- kfree(tip);
-
- return rc;
+ return 0;
}
int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
@@ -1314,49 +1300,4 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
return 0;
}
-
-/** cifs_dfs_query_info_nonascii_quirk
- * Handle weird Windows SMB server behaviour. It responds with
- * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
- * for "\<server>\<dfsname>\<linkpath>" DFS reference,
- * where <dfsname> contains non-ASCII unicode symbols.
- *
- * Check such DFS reference.
- */
-int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *linkpath)
-{
- char *treename, *dfspath, sep;
- int treenamelen, linkpathlen, rc;
-
- treename = tcon->tree_name;
- /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL
- * messages MUST be encoded with exactly one leading backslash, not two
- * leading backslashes.
- */
- sep = CIFS_DIR_SEP(cifs_sb);
- if (treename[0] == sep && treename[1] == sep)
- treename++;
- linkpathlen = strlen(linkpath);
- treenamelen = strnlen(treename, MAX_TREE_SIZE + 1);
- dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL);
- if (!dfspath)
- return -ENOMEM;
- if (treenamelen)
- memcpy(dfspath, treename, treenamelen);
- memcpy(dfspath + treenamelen, linkpath, linkpathlen);
- rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), dfspath, NULL, NULL);
- if (rc == 0) {
- cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n",
- dfspath);
- rc = -EREMOTE;
- } else {
- cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc);
- }
- kfree(dfspath);
- return rc;
-}
#endif
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 9e7d9f0baa18..c47b254f0d1e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -292,9 +292,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
continue;
}
kref_get(&iface->refcount);
+ break;
}
- if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
rc = 1;
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
@@ -814,6 +815,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
return -EINVAL;
}
if (tilen) {
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -1427,6 +1429,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
goto out_put_spnego_key;
}
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
GFP_KERNEL);
if (!ses->auth_key.response) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 50480751e521..4cb364454e13 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -562,17 +562,20 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
- if (!rc)
- move_cifs_info_to_smb2(&data->fi, &fi);
*adjustTZ = true;
}
- if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) {
+ if (!rc) {
int tmprc;
int oplock = 0;
struct cifs_fid fid;
struct cifs_open_parms oparms;
+ move_cifs_info_to_smb2(&data->fi, &fi);
+
+ if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE))
+ return 0;
+
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -716,17 +719,25 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
void *buf)
{
- FILE_ALL_INFO *fi = buf;
+ struct cifs_open_info_data *data = buf;
+ FILE_ALL_INFO fi = {};
+ int rc;
if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
- return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
- oparms->disposition,
- oparms->desired_access,
- oparms->create_options,
- &oparms->fid->netfid, oplock, fi,
- oparms->cifs_sb->local_nls,
- cifs_remap(oparms->cifs_sb));
- return CIFS_open(xid, oparms, oplock, fi);
+ rc = SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition,
+ oparms->desired_access,
+ oparms->create_options,
+ &oparms->fid->netfid, oplock, &fi,
+ oparms->cifs_sb->local_nls,
+ cifs_remap(oparms->cifs_sb));
+ else
+ rc = CIFS_open(xid, oparms, oplock, &fi);
+
+ if (!rc && data)
+ move_cifs_info_to_smb2(&data->fi, &fi);
+
+ return rc;
}
static void
@@ -1050,7 +1061,7 @@ cifs_make_node(unsigned int xid, struct inode *inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
int rc = -EPERM;
- FILE_ALL_INFO *buf = NULL;
+ struct cifs_open_info_data buf = {};
struct cifs_io_parms io_parms;
__u32 oplock = 0;
struct cifs_fid fid;
@@ -1082,14 +1093,14 @@ cifs_make_node(unsigned int xid, struct inode *inode,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
if (rc)
- goto out;
+ return rc;
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
if (rc == 0)
d_instantiate(dentry, newinode);
- goto out;
+ return rc;
}
/*
@@ -1097,19 +1108,13 @@ cifs_make_node(unsigned int xid, struct inode *inode,
* support block and char device (no socket & fifo)
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto out;
+ return rc;
if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto out;
+ return rc;
cifs_dbg(FYI, "sfu compat create special file\n");
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
@@ -1124,21 +1129,21 @@ cifs_make_node(unsigned int xid, struct inode *inode,
oplock = REQ_OPLOCK;
else
oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
- goto out;
+ return rc;
/*
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
- pdev = (struct win_dev *)buf;
+ pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
+ iov[1].iov_base = &buf.fi;
iov[1].iov_len = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
@@ -1157,8 +1162,8 @@ cifs_make_node(unsigned int xid, struct inode *inode,
d_drop(dentry);
/* FIXME: add code here to set EAs */
-out:
- kfree(buf);
+
+ cifs_free_open_info(&buf);
return rc;
}
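
The SFU path above serializes the device numbers into a small tagged blob; a
sketch of that layout with a hypothetical mirror of struct win_dev (the actual
wire fields are little-endian; endianness handling is omitted here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative mirror of the 24-byte SFU special-file blob. */
struct win_dev_demo {
	unsigned char type[8];	/* "IntxCHR" or "IntxBLK" */
	uint64_t major;
	uint64_t minor;
} __attribute__((packed));

int main(void)
{
	struct win_dev_demo pdev;

	memcpy(pdev.type, "IntxCHR", 8);	/* character device tag */
	pdev.major = 4;				/* assumed demo values, */
	pdev.minor = 64;			/* e.g. a serial port    */
	printf("blob: %zu bytes, tag %.7s, dev %llu:%llu\n",
	       sizeof(pdev), (const char *)pdev.type,
	       (unsigned long long)pdev.major,
	       (unsigned long long)pdev.minor);
	return 0;
}
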
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index ffbd9a99fc12..ba6cc50af390 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,8 +122,8 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
struct smb2_hdr *hdr = err_iov.iov_base;
if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER))
- rc = -ENOMEM;
- else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ goto out;
+ if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov,
&data->symlink_target);
if (!rc) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 68e08c85fbb8..8521adf9ce79 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -59,6 +59,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition, __u32 create_options,
umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
+ __u8 **extbuf, size_t *extbuflen,
struct kvec *err_iov, int *err_buftype)
{
struct cop_vars *vars = NULL;
@@ -430,6 +431,21 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
&rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
(char *)&idata->posix_fi);
}
+ if (rc == 0) {
+ unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
+
+ if (length > sizeof(idata->posix_fi)) {
+ char *base = (char *)rsp_iov[1].iov_base +
+ le16_to_cpu(qi_rsp->OutputBufferOffset) +
+ sizeof(idata->posix_fi);
+ *extbuflen = length - sizeof(idata->posix_fi);
+ *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
+ if (!*extbuf)
+ rc = -ENOMEM;
+ } else {
+ rc = -EINVAL;
+ }
+ }
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
@@ -539,23 +555,43 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
- err_iov, err_buftype);
- if (rc == -EOPNOTSUPP) {
- if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
- rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ NULL, NULL, err_iov, err_buftype);
+ if (rc) {
+ struct smb2_hdr *hdr = err_iov[0].iov_base;
+
+ if (unlikely(!hdr || err_buftype[0] == CIFS_NO_BUFFER))
+ goto out;
+ if (rc == -EOPNOTSUPP && hdr->Command == SMB2_CREATE &&
+ hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov,
+ &data->symlink_target);
if (rc)
goto out;
- }
- *reparse = true;
- create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
- FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_QUERY_INFO, cfile, NULL, NULL);
+ *reparse = true;
+ create_options |= OPEN_REPARSE_POINT;
+
+ /* Failed on a symbolic link - query a reparse point info */
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data,
+ SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
+ NULL, NULL);
+ goto out;
+ } else if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
+ hdr->Status == STATUS_OBJECT_NAME_INVALID) {
+ /*
+ * Handle weird Windows SMB server behaviour. It responds with
+ * the STATUS_OBJECT_NAME_INVALID code to an SMB2 QUERY_INFO
+ * request for a "\<server>\<dfsname>\<linkpath>" DFS reference,
+ * where <dfsname> contains non-ASCII Unicode symbols.
+ */
+ rc = -EREMOTE;
+ }
+ if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
+ rc = -EOPNOTSUPP;
}
out:
@@ -568,13 +604,20 @@ out:
int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse)
{
int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct kvec err_iov[3] = {};
int err_buftype[3] = {};
+ __u8 *sidsbuf = NULL;
+ __u8 *sidsbuf_end = NULL;
+ size_t sidsbuflen = 0;
+ size_t owner_len, group_len;
*adjust_tz = false;
*reparse = false;
@@ -589,7 +632,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
- err_iov, err_buftype);
+ &sidsbuf, &sidsbuflen, err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
/* BB TODO: When support for special files added to Samba re-verify this path */
if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
@@ -606,10 +649,31 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL);
+ SMB2_OP_POSIX_QUERY_INFO, cfile,
+ &sidsbuf, &sidsbuflen, NULL, NULL);
+ }
+
+ if (rc == 0) {
+ sidsbuf_end = sidsbuf + sidsbuflen;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(owner, sidsbuf, owner_len);
+
+ group_len = posix_info_sid_size(
+ sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(group, sidsbuf + owner_len, group_len);
}
out:
+ kfree(sidsbuf);
free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
@@ -624,7 +688,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
}
void
@@ -646,7 +710,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile, NULL, NULL);
+ &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -658,7 +722,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL, NULL, NULL);
+ NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL);
}
int
@@ -667,7 +731,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL);
}
static int
@@ -686,7 +750,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile, NULL, NULL);
+ command, cfile, NULL, NULL, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -727,7 +791,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL);
+ &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL);
}
int
@@ -754,7 +818,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
- NULL, NULL);
+ NULL, NULL, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
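
The owner/group extraction added to smb311_posix_query_path_info() above walks two consecutive SIDs out of the extra bytes returned past the POSIX file info. A minimal standalone sketch of that walk, using int lengths to keep the -1 check obvious (the kernel code stores them in size_t, where the comparison still works because -1 converts to SIZE_MAX; parse_owner_group() is hypothetical):

/* Hypothetical helper: split a SID buffer into owner and group.
 * posix_info_sid_size() returns the encoded length, or -1 if the
 * SID would run past 'end'. */
static int parse_owner_group(const u8 *buf, const u8 *end,
			     struct cifs_sid *owner, struct cifs_sid *group)
{
	int owner_len, group_len;

	owner_len = posix_info_sid_size(buf, end);
	if (owner_len < 0)
		return -EINVAL;
	memcpy(owner, buf, owner_len);

	group_len = posix_info_sid_size(buf + owner_len, end);
	if (group_len < 0)
		return -EINVAL;
	memcpy(group, buf + owner_len, group_len);
	return 0;
}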
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index bfaafd02fb1f..e6bcd2baf446 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -530,7 +530,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
spin_lock(&ses->iface_lock);
- ses->iface_count = 0;
/*
* Go through iface_list and do kref_put to remove
* any unused ifaces. ifaces in use will be removed
@@ -540,6 +539,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
iface_head) {
iface->is_active = 0;
kref_put(&iface->refcount, release_iface);
+ ses->iface_count--;
}
spin_unlock(&ses->iface_lock);
@@ -618,6 +618,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
/* just get a ref so that it doesn't get picked/freed */
iface->is_active = 1;
kref_get(&iface->refcount);
+ ses->iface_count++;
spin_unlock(&ses->iface_lock);
goto next_iface;
} else if (ret < 0) {
@@ -796,7 +797,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ int err_buftype = CIFS_NO_BUFFER;
struct cifs_open_parms oparms;
+ struct kvec err_iov = {};
struct cifs_fid fid;
struct cached_fid *cfid;
@@ -820,14 +823,32 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
- NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
+ &err_iov, &err_buftype);
if (rc) {
- kfree(utf16_path);
- return rc;
+ struct smb2_hdr *hdr = err_iov.iov_base;
+
+ if (unlikely(!hdr || err_buftype == CIFS_NO_BUFFER))
+ goto out;
+ /*
+ * Handle weird Windows SMB server behaviour. It responds with
+ * the STATUS_OBJECT_NAME_INVALID code to an SMB2 QUERY_INFO
+ * request for a "\<server>\<dfsname>\<linkpath>" DFS reference,
+ * where <dfsname> contains non-ASCII Unicode symbols.
+ */
+ if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
+ hdr->Status == STATUS_OBJECT_NAME_INVALID)
+ rc = -EREMOTE;
+ if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
+ rc = -EOPNOTSUPP;
+ goto out;
}
rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+out:
+ free_rsp_buf(err_buftype, err_iov.iov_base);
kfree(utf16_path);
return rc;
}
@@ -4204,69 +4225,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
-/* We can not use the normal sg_set_buf() as we will sometimes pass a
- * stack object as buf.
- */
-static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
- unsigned int buflen)
+static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl,
+ unsigned int *num_sgs)
{
- void *addr;
- /*
- * VMAP_STACK (at least) puts stack into the vmalloc address space
- */
- if (is_vmalloc_addr(buf))
- addr = vmalloc_to_page(buf);
- else
- addr = virt_to_page(buf);
- sg_set_page(sg, addr, buflen, offset_in_page(buf));
+ unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
+ unsigned int iv_size = crypto_aead_ivsize(tfm);
+ unsigned int len;
+ u8 *p;
+
+ *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
+
+ len = iv_size;
+ len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ len += req_size;
+ len = ALIGN(len, __alignof__(struct scatterlist));
+ len += *num_sgs * sizeof(**sgl);
+
+ p = kmalloc(len, GFP_ATOMIC);
+ if (!p)
+ return NULL;
+
+ *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
+ *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
+ crypto_tfm_ctx_alignment());
+ *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
+ return p;
}
-/* Assumes the first rqst has a transform header as the first iov.
- * I.e.
- * rqst[0].rq_iov[0] is transform header
- * rqst[0].rq_iov[1+] data to be encrypted/decrypted
- * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
- */
-static struct scatterlist *
-init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
+static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl)
{
- unsigned int sg_len;
+ unsigned int off, len, skip;
struct scatterlist *sg;
- unsigned int i;
- unsigned int j;
- unsigned int idx = 0;
- int skip;
-
- sg_len = 1;
- for (i = 0; i < num_rqst; i++)
- sg_len += rqst[i].rq_nvec + rqst[i].rq_npages;
+ unsigned int num_sgs;
+ unsigned long addr;
+ int i, j;
+ void *p;
- sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs);
+ if (!p)
return NULL;
- sg_init_table(sg, sg_len);
+ sg_init_table(*sgl, num_sgs);
+ sg = *sgl;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
for (j = 0; j < rqst[i].rq_nvec; j++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob
- */
- skip = (i == 0) && (j == 0) ? 20 : 0;
- smb2_sg_set_buf(&sg[idx++],
- rqst[i].rq_iov[j].iov_base + skip,
- rqst[i].rq_iov[j].iov_len - skip);
- }
+ struct kvec *iov = &rqst[i].rq_iov[j];
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ len = iov->iov_len - skip;
+ sg = cifs_sg_set_buf(sg, (void *)addr, len);
+ }
for (j = 0; j < rqst[i].rq_npages; j++) {
- unsigned int len, offset;
-
- rqst_page_get_length(&rqst[i], j, &len, &offset);
- sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset);
+ rqst_page_get_length(&rqst[i], j, &len, &off);
+ sg_set_page(sg++, rqst[i].rq_pages[j], len, off);
}
}
- smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE);
- return sg;
+ cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE);
+
+ return p;
}
static int
@@ -4314,11 +4348,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 sign[SMB2_SIGNATURE_SIZE] = {};
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
- char *iv;
- unsigned int iv_len;
+ u8 *iv;
DECLARE_CRYPTO_WAIT(wait);
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ void *creq;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4352,32 +4386,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- req = aead_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
+ if (unlikely(!creq))
return -ENOMEM;
- }
if (!enc) {
memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
crypt_len += SMB2_SIGNATURE_SIZE;
}
- sg = init_sg(num_rqst, rqst, sign);
- if (!sg) {
- cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__);
- rc = -ENOMEM;
- goto free_req;
- }
-
- iv_len = crypto_aead_ivsize(tfm);
- iv = kzalloc(iv_len, GFP_KERNEL);
- if (!iv) {
- cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__);
- rc = -ENOMEM;
- goto free_sg;
- }
-
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
@@ -4386,6 +4403,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
}
+ aead_request_set_tfm(req, tfm);
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
@@ -4398,11 +4416,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree_sensitive(iv);
-free_sg:
- kfree_sensitive(sg);
-free_req:
- kfree_sensitive(req);
+ kfree_sensitive(creq);
return rc;
}
@@ -4445,21 +4459,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
int rc = -ENOMEM;
for (i = 1; i < num_rqst; i++) {
- npages = old_rq[i - 1].rq_npages;
+ struct smb_rqst *old = &old_rq[i - 1];
+ struct smb_rqst *new = &new_rq[i];
+
+ orig_len += smb_rqst_len(server, old);
+ new->rq_iov = old->rq_iov;
+ new->rq_nvec = old->rq_nvec;
+
+ npages = old->rq_npages;
+ if (!npages)
+ continue;
+
pages = kmalloc_array(npages, sizeof(struct page *),
GFP_KERNEL);
if (!pages)
goto err_free;
- new_rq[i].rq_pages = pages;
- new_rq[i].rq_npages = npages;
- new_rq[i].rq_offset = old_rq[i - 1].rq_offset;
- new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz;
- new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz;
- new_rq[i].rq_iov = old_rq[i - 1].rq_iov;
- new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec;
-
- orig_len += smb_rqst_len(server, &old_rq[i - 1]);
+ new->rq_pages = pages;
+ new->rq_npages = npages;
+ new->rq_offset = old->rq_offset;
+ new->rq_pagesz = old->rq_pagesz;
+ new->rq_tailsz = old->rq_tailsz;
for (j = 0; j < npages; j++) {
pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
@@ -4469,17 +4489,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
/* copy pages from the old */
for (j = 0; j < npages; j++) {
- char *dst, *src;
unsigned int offset, len;
- rqst_page_get_length(&new_rq[i], j, &len, &offset);
-
- dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset;
- src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset;
+ rqst_page_get_length(new, j, &len, &offset);
- memcpy(dst, src, len);
- kunmap(new_rq[i].rq_pages[j]);
- kunmap(old_rq[i - 1].rq_pages[j]);
+ memcpy_page(new->rq_pages[j], offset,
+ old->rq_pages[j], offset, len);
}
}
@@ -4723,13 +4738,13 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
return 0;
}
- iov_iter_bvec(&iter, WRITE, bvec, npages, data_len);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, npages, data_len);
} else if (buf_len >= data_offset + data_len) {
/* read response payload is in buf */
WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
iov.iov_base = buf + data_offset;
iov.iov_len = data_len;
- iov_iter_kvec(&iter, WRITE, &iov, 1, data_len);
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, data_len);
} else {
/* read response payload cannot be in both buf and pages */
WARN_ONCE(1, "buf can not contain only a part of read data");
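
The smb2_get_aead_req() rework above folds three allocations (AEAD request, IV, scatterlist array) into one kmalloc(), which also lets a single kfree_sensitive() wipe all of it. The carve-up arithmetic is the subtle part; a simplified sketch with the SMB-specific sizing dropped (aead_onealloc is hypothetical, the alignment logic mirrors the diff):

static void *aead_onealloc(struct crypto_aead *tfm, unsigned int num_sgs,
			   u8 **iv, struct aead_request **req,
			   struct scatterlist **sgl)
{
	unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
	unsigned int iv_size = crypto_aead_ivsize(tfm);
	unsigned int len;
	u8 *p;

	/* worst-case size: IV padding + aligned request + sg array */
	len = iv_size;
	len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
	len = ALIGN(len, crypto_tfm_ctx_alignment());
	len += req_size;
	len = ALIGN(len, __alignof__(struct scatterlist));
	len += num_sgs * sizeof(**sgl);

	p = kmalloc(len, GFP_KERNEL);
	if (!p)
		return NULL;

	*iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
	*req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
						crypto_tfm_ctx_alignment());
	*sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
					       __alignof__(struct scatterlist));
	return p;	/* one kfree_sensitive(p) releases all three */
}

Note the matching aead_request_set_tfm() call the diff adds in crypt_message(): a request carved out of raw memory, unlike one from aead_request_alloc(), has no tfm bound yet.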
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index a5695748a89b..4b71f4a92f76 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -541,9 +541,10 @@ static void
assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
- char *pneg_ctxt;
- char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
+ struct TCP_Server_Info *pserver;
+ char *pneg_ctxt;
+ char *hostname;
if (*total_len > 200) {
/* In case length corrupted don't want to overrun smb buffer */
@@ -574,8 +575,9 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
* secondary channels don't have the hostname field populated
* use the hostname field in the primary channel instead
*/
- hostname = CIFS_SERVER_IS_CHAN(server) ?
- server->primary_server->hostname : server->hostname;
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ cifs_server_lock(pserver);
+ hostname = pserver->hostname;
if (hostname && (hostname[0] != 0)) {
ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
hostname);
@@ -584,6 +586,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
neg_context_count = 3;
} else
neg_context_count = 2;
+ cifs_server_unlock(pserver);
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
@@ -1450,6 +1453,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
/* keep session key if binding */
if (!is_binding) {
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -1479,8 +1483,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
out_put_spnego_key:
key_invalidate(spnego_key);
key_put(spnego_key);
- if (rc)
+ if (rc) {
kfree_sensitive(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ ses->auth_key.len = 0;
+ }
out:
sess_data->result = rc;
sess_data->func = NULL;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index be21b5d26f67..d5d7ffb7711c 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -277,7 +277,10 @@ extern int smb2_query_info_compound(const unsigned int xid,
/* query path info from the server using SMB311 POSIX extensions*/
int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 575fa8f58342..3851d0aaa288 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -347,7 +347,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
.iov_base = &rfc1002_marker,
.iov_len = 4
};
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, &hiov, 1, 4);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, &hiov, 1, 4);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
goto unmask;
@@ -368,7 +368,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
size += iov[i].iov_len;
}
- iov_iter_kvec(&smb_msg.msg_iter, WRITE, iov, n_vec, size);
+ iov_iter_kvec(&smb_msg.msg_iter, ITER_SOURCE, iov, n_vec, size);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
@@ -384,7 +384,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rqst_page_get_length(&rqst[j], i, &bvec.bv_len,
&bvec.bv_offset);
- iov_iter_bvec(&smb_msg.msg_iter, WRITE,
+ iov_iter_bvec(&smb_msg.msg_iter, ITER_SOURCE,
&bvec, 1, bvec.bv_len);
rc = smb_send_kvec(server, &smb_msg, &sent);
if (rc < 0)
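
The WRITE to ITER_SOURCE conversions in this file (and in smb2ops.c and coredump.c) track the iov_iter direction-constant rename: an iterator is now tagged by whether data flows out of it (ITER_SOURCE, the old WRITE) or into it (ITER_DEST, the old READ), avoiding the overloaded file-I/O constants. A minimal usage sketch, with buf and len assumed to be in scope:

struct kvec kv = { .iov_base = buf, .iov_len = len };
struct iov_iter it;

/* data will be consumed *from* the iterator, e.g. for a send */
iov_iter_kvec(&it, ITER_SOURCE, &kv, 1, len);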
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 998fa51f9b68..5f2fb2fd2e37 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -200,32 +200,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
}
break;
}
-
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_ACCESS, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (!value)
- goto out;
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
- value, (const int)size,
- ACL_TYPE_DEFAULT, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
out:
@@ -366,27 +340,6 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
}
break;
}
-#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- case XATTR_ACL_ACCESS:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_ACCESS,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-
- case XATTR_ACL_DEFAULT:
-#ifdef CONFIG_CIFS_POSIX
- if (sb->s_flags & SB_POSIXACL)
- rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
- value, size, ACL_TYPE_DEFAULT,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
-#endif /* CONFIG_CIFS_POSIX */
- break;
-#endif /* ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
}
/* We could add an additional check for streams ie
@@ -525,21 +478,6 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = {
.set = cifs_xattr_set,
};
-
-static const struct xattr_handler cifs_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = XATTR_ACL_ACCESS,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
-static const struct xattr_handler cifs_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = XATTR_ACL_DEFAULT,
- .get = cifs_xattr_get,
- .set = cifs_xattr_set,
-};
-
const struct xattr_handler *cifs_xattr_handlers[] = {
&cifs_user_xattr_handler,
&cifs_os2_xattr_handler,
@@ -549,7 +487,9 @@ const struct xattr_handler *cifs_xattr_handlers[] = {
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
&smb3_ntsd_full_xattr_handler, /* alias for above since avoiding "cifs" */
- &cifs_posix_acl_access_xattr_handler,
- &cifs_posix_acl_default_xattr_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
NULL
};
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d1f9d2632202..ec6519e1ca3b 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -316,6 +316,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
@@ -382,6 +383,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 7bad7785e8e6..de78bde2991b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -68,7 +68,10 @@ struct core_name {
static int expand_corename(struct core_name *cn, int size)
{
- char *corename = krealloc(cn->corename, size, GFP_KERNEL);
+ char *corename;
+
+ size = kmalloc_size_roundup(size);
+ corename = krealloc(cn->corename, size, GFP_KERNEL);
if (!corename)
return -ENOMEM;
@@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size)
if (size > core_name_size) /* racy but harmless */
core_name_size = size;
- cn->size = ksize(corename);
+ cn->size = size;
cn->corename = corename;
return 0;
}
@@ -325,6 +328,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
err = cn_printf(cn, "%lu",
rlimit(RLIMIT_CORE));
break;
+ /* CPU the task ran on */
+ case 'C':
+ err = cn_printf(cn, "%d", cprm->cpu);
+ break;
default:
break;
}
@@ -525,7 +532,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
- .regs = signal_pt_regs(),
.limit = rlimit(RLIMIT_CORE),
/*
* We must use the same mm->flags while dumping core to avoid
@@ -534,6 +540,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
*/
.mm_flags = mm->flags,
.vma_meta = NULL,
+ .cpu = raw_smp_processor_id(),
};
audit_core_dumps(siginfo->si_signo);
@@ -716,8 +723,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* filesystem.
*/
mnt_userns = file_mnt_user_ns(cprm.file);
- if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
- current_fsuid())) {
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+ current_fsuid())) {
pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
cn.corename);
goto close_fail;
@@ -853,7 +860,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
if (dump_interrupted())
return 0;
pos = file->f_pos;
- iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
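
The expand_corename() change above is part of retiring ksize(): instead of allocating and then asking the slab how much usable space it really handed out, the caller rounds the request up front with kmalloc_size_roundup(), so the recorded capacity and the allocation agree by construction. The pattern, sketched (remember_capacity() is a hypothetical stand-in for whatever bookkeeping the caller does):

size_t want = 100;
size_t got = kmalloc_size_roundup(want);	/* e.g. 128 on typical configs */
char *buf = kmalloc(got, GFP_KERNEL);

if (buf)
	remember_capacity(buf, got);	/* never call ksize(buf) afterwards */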
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index d5f68a0c5d15..316a778cec0f 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -439,13 +439,7 @@ struct fscrypt_master_key_secret {
struct fscrypt_master_key {
/*
- * Back-pointer to the super_block of the filesystem to which this
- * master key has been added. Only valid if ->mk_active_refs > 0.
- */
- struct super_block *mk_sb;
-
- /*
- * Link in ->mk_sb->s_master_keys->key_hashtable.
+ * Link in ->s_master_keys->key_hashtable.
* Only valid if ->mk_active_refs > 0.
*/
struct hlist_node mk_node;
@@ -456,7 +450,7 @@ struct fscrypt_master_key {
/*
* Active and structural reference counts. An active ref guarantees
* that the struct continues to exist, continues to be in the keyring
- * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+ * ->s_master_keys, and that any embedded subkeys (e.g.
* ->mk_direct_keys) that have been prepared continue to exist.
* A structural ref only guarantees that the struct continues to exist.
*
@@ -569,7 +563,8 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
void fscrypt_put_master_key(struct fscrypt_master_key *mk);
-void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk);
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk);
struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index cea8b14007e6..8bfb3ce86476 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -12,7 +12,7 @@
* provides the key and IV to use.
*/
-#include <linux/blk-crypto-profile.h>
+#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
@@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
unsigned int i;
for (i = 0; i < num_devs; i++) {
- struct request_queue *q = bdev_get_queue(devs[i]);
-
if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- __blk_crypto_cfg_supported(q->crypto_profile, cfg)) {
+ blk_crypto_config_supported_natively(devs[i], cfg)) {
if (!xchg(&mode->logged_blk_crypto_native, 1))
pr_info("fscrypt: %s using blk-crypto (native)\n",
mode->friendly_name);
@@ -139,8 +137,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
return PTR_ERR(devs);
for (i = 0; i < num_devs; i++) {
- if (!blk_crypto_config_supported(bdev_get_queue(devs[i]),
- &crypto_cfg))
+ if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
goto out_free_devs;
}
@@ -184,8 +181,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
goto fail;
}
for (i = 0; i < num_devs; i++) {
- err = blk_crypto_start_using_key(blk_key,
- bdev_get_queue(devs[i]));
+ err = blk_crypto_start_using_key(devs[i], blk_key);
if (err)
break;
}
@@ -224,7 +220,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
devs = fscrypt_get_devices(sb, &num_devs);
if (!IS_ERR(devs)) {
for (i = 0; i < num_devs; i++)
- blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key);
+ blk_crypto_evict_key(devs[i], blk_key);
kfree(devs);
}
kfree_sensitive(blk_key);
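
All four call sites above move from request_queue-based to block_device-based blk-crypto entry points. The key lifecycle against the new signatures, sketched with illustrative sizes and an assumed bdev/raw_key in scope (error handling trimmed):

struct blk_crypto_key key;
int err;

err = blk_crypto_init_key(&key, raw_key, BLK_ENCRYPTION_MODE_AES_256_XTS,
			  8 /* dun_bytes */, 4096 /* data_unit_size */);
if (!err)
	err = blk_crypto_start_using_key(bdev, &key);

/* ... attach the key to bios and submit I/O ... */

blk_crypto_evict_key(bdev, &key);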
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 2a24b1f0ae68..78dd2ff306bd 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -79,10 +79,9 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk)
call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
}
-void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
+void fscrypt_put_master_key_activeref(struct super_block *sb,
+ struct fscrypt_master_key *mk)
{
- struct super_block *sb = mk->mk_sb;
- struct fscrypt_keyring *keyring = sb->s_master_keys;
size_t i;
if (!refcount_dec_and_test(&mk->mk_active_refs))
@@ -93,9 +92,9 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
* destroying any subkeys embedded in it.
*/
- spin_lock(&keyring->lock);
+ spin_lock(&sb->s_master_keys->lock);
hlist_del_rcu(&mk->mk_node);
- spin_unlock(&keyring->lock);
+ spin_unlock(&sb->s_master_keys->lock);
/*
* ->mk_active_refs == 0 implies that ->mk_secret is not present and
@@ -243,7 +242,7 @@ void fscrypt_destroy_keyring(struct super_block *sb)
WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(sb, mk);
}
}
kfree_sensitive(keyring);
@@ -424,7 +423,6 @@ static int add_new_master_key(struct super_block *sb,
if (!mk)
return -ENOMEM;
- mk->mk_sb = sb;
init_rwsem(&mk->mk_sem);
refcount_set(&mk->mk_struct_refs, 1);
mk->mk_spec = *mk_spec;
@@ -1068,7 +1066,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
err = -ENOKEY;
if (is_master_key_secret_present(&mk->mk_secret)) {
wipe_master_key_secret(&mk->mk_secret);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(sb, mk);
err = 0;
}
inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index f7407071a952..94757ccd3056 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -44,6 +44,21 @@ struct fscrypt_mode fscrypt_modes[] = {
.security_strength = 16,
.ivsize = 16,
},
+ [FSCRYPT_MODE_SM4_XTS] = {
+ .friendly_name = "SM4-XTS",
+ .cipher_str = "xts(sm4)",
+ .keysize = 32,
+ .security_strength = 16,
+ .ivsize = 16,
+ .blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
+ },
+ [FSCRYPT_MODE_SM4_CTS] = {
+ .friendly_name = "SM4-CTS-CBC",
+ .cipher_str = "cts(cbc(sm4))",
+ .keysize = 16,
+ .security_strength = 16,
+ .ivsize = 16,
+ },
[FSCRYPT_MODE_ADIANTUM] = {
.friendly_name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
@@ -509,7 +524,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
spin_unlock(&mk->mk_decrypted_inodes_lock);
- fscrypt_put_master_key_activeref(mk);
+ fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk);
}
memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 46757c3052ef..893661b52376 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -61,6 +61,13 @@ fscrypt_get_dummy_policy(struct super_block *sb)
return sb->s_cop->get_dummy_policy(sb);
}
+/*
+ * Return %true if the given combination of encryption modes is supported for v1
+ * (and later) encryption policies.
+ *
+ * Do *not* add anything new here, since v1 encryption policies are deprecated.
+ * New combinations of modes should go in fscrypt_valid_enc_modes_v2() only.
+ */
static bool fscrypt_valid_enc_modes_v1(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -83,6 +90,11 @@ static bool fscrypt_valid_enc_modes_v2(u32 contents_mode, u32 filenames_mode)
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
filenames_mode == FSCRYPT_MODE_AES_256_HCTR2)
return true;
+
+ if (contents_mode == FSCRYPT_MODE_SM4_XTS &&
+ filenames_mode == FSCRYPT_MODE_SM4_CTS)
+ return true;
+
return fscrypt_valid_enc_modes_v1(contents_mode, filenames_mode);
}
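
With the SM4 pair accepted for v2 policies above, userspace can request it through the usual policy ioctl. A hedged userspace sketch, assuming the FSCRYPT_MODE_SM4_* constants land in the UAPI header alongside this change and that the master key was already added with FS_IOC_ADD_ENCRYPTION_KEY:

#include <linux/fscrypt.h>
#include <string.h>
#include <sys/ioctl.h>

int set_sm4_policy(int dirfd,
		   const __u8 id[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
	struct fscrypt_policy_v2 pol;

	memset(&pol, 0, sizeof(pol));
	pol.version = FSCRYPT_POLICY_V2;
	pol.contents_encryption_mode = FSCRYPT_MODE_SM4_XTS;
	pol.filenames_encryption_mode = FSCRYPT_MODE_SM4_CTS;
	pol.flags = FSCRYPT_POLICY_FLAGS_PAD_32;
	memcpy(pol.master_key_identifier, id, FSCRYPT_KEY_IDENTIFIER_SIZE);

	return ioctl(dirfd, FS_IOC_SET_ENCRYPTION_POLICY, &pol);
}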
diff --git a/fs/dax.c b/fs/dax.c
index 1c6867810cbd..c48a3a93ab29 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
+static inline bool dax_page_is_shared(struct page *page)
{
- return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+ return page->mapping == PAGE_MAPPING_DAX_SHARED;
}
/*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
+ * refcount.
*/
-static inline void dax_mapping_set_cow(struct page *page)
+static inline void dax_page_share_get(struct page *page)
{
- if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+ if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
/*
* Reset the index if the page was already mapped
* regularly before.
*/
if (page->mapping)
- page->index = 1;
- page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+ page->share = 1;
+ page->mapping = PAGE_MAPPING_DAX_SHARED;
}
- page->index++;
+ page->share++;
+}
+
+static inline unsigned long dax_page_share_put(struct page *page)
+{
+ return --page->share;
}
/*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * When it is called in dax_insert_entry(), the shared flag will indicate
* whether this entry is shared by multiple files. If so, set the page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
+ * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool cow)
+ struct vm_area_struct *vma, unsigned long address, bool shared)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
- if (cow) {
- dax_mapping_set_cow(page);
+ if (shared) {
+ dax_page_share_get(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
@@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_mapping_is_cow(page->mapping)) {
- /* keep the CoW flag if this page is still shared */
- if (page->index-- > 0)
+ if (dax_page_is_shared(page)) {
+ /* keep the shared flag if this page is still shared */
+ if (dax_page_share_put(page) > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
@@ -840,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
(iter->iomap.flags & IOMAP_F_DIRTY);
}
-static bool dax_fault_is_cow(const struct iomap_iter *iter)
-{
- return (iter->flags & IOMAP_WRITE) &&
- (iter->iomap.flags & IOMAP_F_SHARED);
-}
-
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
@@ -859,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *new_entry = dax_make_entry(pfn, flags);
- bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
- bool cow = dax_fault_is_cow(iter);
+ bool write = iter->flags & IOMAP_WRITE;
+ bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
+ bool shared = iter->iomap.flags & IOMAP_F_SHARED;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
+ if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
unsigned long index = xas->xa_index;
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
@@ -877,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
xas_reset(xas);
xas_lock_irq(xas);
- if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
void *old;
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- cow);
+ shared);
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -902,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
- if (cow)
+ if (write && shared)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irq(xas);
@@ -1086,7 +1087,8 @@ out:
}
/**
- * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
+ * by copying the data before and after the range to be written.
* @pos: address to do copy from.
* @length: size of copy operation.
* @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
@@ -1095,35 +1097,50 @@ out:
*
* This can be called from two places. Either during DAX write fault (page
* aligned), to copy the length size data to daddr. Or, while doing normal DAX
- * write operation, dax_iomap_actor() might call this to do the copy of either
+ * write operation, dax_iomap_iter() might call this to do the copy of either
* start or end unaligned address. In the latter case the rest of the copy of
- * aligned ranges is taken care by dax_iomap_actor() itself.
+ * aligned ranges is taken care of by dax_iomap_iter() itself.
+ * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
+ * area to make sure no old data remains.
*/
-static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
const struct iomap *srcmap, void *daddr)
{
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
+ /* copy_all is usually set in the page-fault case */
bool copy_all = head_off == 0 && end == pg_end;
+ /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
+ bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
+ srcmap->type == IOMAP_UNWRITTEN;
void *saddr = 0;
int ret = 0;
- ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
- if (ret)
- return ret;
+ if (!zero_edge) {
+ ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
+ if (ret)
+ return ret;
+ }
if (copy_all) {
- ret = copy_mc_to_kernel(daddr, saddr, length);
- return ret ? -EIO : 0;
+ if (zero_edge)
+ memset(daddr, 0, size);
+ else
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ goto out;
}
/* Copy the head part of the range */
if (head_off) {
- ret = copy_mc_to_kernel(daddr, saddr, head_off);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr, 0, head_off);
+ else {
+ ret = copy_mc_to_kernel(daddr, saddr, head_off);
+ if (ret)
+ return -EIO;
+ }
}
/* Copy the tail part of the range */
@@ -1131,12 +1148,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;
- ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
- tail_len);
- if (ret)
- return -EIO;
+ if (zero_edge)
+ memset(daddr + tail_off, 0, tail_len);
+ else {
+ ret = copy_mc_to_kernel(daddr + tail_off,
+ saddr + tail_off, tail_len);
+ if (ret)
+ return -EIO;
+ }
}
- return 0;
+out:
+ if (zero_edge)
+ dax_flush(srcmap->dax_dev, daddr, size);
+ return ret ? -EIO : 0;
}
/*
@@ -1221,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
+static s64 dax_unshare_iter(struct iomap_iter *iter)
+{
+ struct iomap *iomap = &iter->iomap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
+ int id = 0;
+ s64 ret = 0;
+ void *daddr = NULL, *saddr = NULL;
+
+ /* don't bother with blocks that are not shared to start with */
+ if (!(iomap->flags & IOMAP_F_SHARED))
+ return length;
+ /* don't bother with holes or unwritten extents */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
+
+ id = dax_read_lock();
+ ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = copy_mc_to_kernel(daddr, saddr, length);
+ if (ret)
+ ret = -EIO;
+
+out_unlock:
+ dax_read_unlock(id);
+ return ret;
+}
+
+int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
+ const struct iomap_ops *ops)
+{
+ struct iomap_iter iter = {
+ .inode = inode,
+ .pos = pos,
+ .len = len,
+ .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
+ };
+ int ret;
+
+ while ((ret = iomap_iter(&iter, ops)) > 0)
+ iter.processed = dax_unshare_iter(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dax_file_unshare);
+
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
const struct iomap *iomap = &iter->iomap;
@@ -1235,13 +1311,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
- if (srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
- kaddr);
- if (ret < 0)
- return ret;
- dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
- } else
+ if (iomap->flags & IOMAP_F_SHARED)
+ ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
+ kaddr);
+ else
dax_flush(iomap->dax_dev, kaddr + offset, size);
return ret;
}
@@ -1258,6 +1331,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
+ /*
+ * invalidate the pages whose sharing state is to be changed
+ * because of CoW.
+ */
+ if (iomap->flags & IOMAP_F_SHARED)
+ invalidate_inode_pages2_range(iter->inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + length - 1) >> PAGE_SHIFT);
+
do {
unsigned offset = offset_in_page(pos);
unsigned size = min_t(u64, PAGE_SIZE - offset, length);
@@ -1318,12 +1400,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
- const struct iomap *srcmap = &iomi->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iomi);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
struct dax_device *dax_dev = iomap->dax_dev;
loff_t end = pos + length, done = 0;
bool write = iov_iter_rw(iter) == WRITE;
+ bool cow = write && iomap->flags & IOMAP_F_SHARED;
ssize_t ret = 0;
size_t xfer;
int id;
@@ -1350,7 +1433,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
- if (iomap->flags & IOMAP_F_NEW) {
+ if (iomap->flags & IOMAP_F_NEW || cow) {
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
@@ -1384,10 +1467,9 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
break;
}
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
- kaddr);
+ if (cow) {
+ ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
+ srcmap, kaddr);
if (ret)
break;
}
@@ -1532,7 +1614,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
- const struct iomap *srcmap = &iter->srcmap;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
@@ -1563,9 +1645,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
- if (write &&
- srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
- err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
+ if (write && iomap->flags & IOMAP_F_SHARED) {
+ err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
if (err)
return dax_fault_return(err);
}
@@ -1936,15 +2017,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret;
+ int ret, compared = 0;
- while ((ret = iomap_iter(&src_iter, ops)) > 0) {
- while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
- dst_iter.processed = dax_range_compare_iter(&src_iter,
- &dst_iter, len, same);
- }
- if (ret <= 0)
- src_iter.processed = ret;
+ while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
+ (ret = iomap_iter(&dst_iter, ops)) > 0) {
+ compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
+ same);
+ if (compared < 0)
+ return ret;
+ src_iter.processed = dst_iter.processed = compared;
}
return ret;
}
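
dax_file_unshare() above, and the rewritten dedupe-compare loop, both follow the standard iomap_iter() consumer contract: the iterator returns a positive value while there are mappings left, and the body reports progress through ->processed (bytes advanced, 0 to stop, negative to fail). Reduced to the skeleton (my_iter_fn is hypothetical):

static int my_iomap_walk(struct inode *inode, loff_t pos, loff_t len,
			 const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode = inode,
		.pos = pos,
		.len = len,
		.flags = IOMAP_WRITE,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = my_iter_fn(&iter);	/* drives the advance */
	return ret;
}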
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ddb3fc258df9..b54f470e0d03 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);
-ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct dentry *dentry = F_DENTRY(file);
ssize_t ret;
@@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
- ret = simple_attr_write(file, buf, len, ppos);
+ if (is_signed)
+ ret = simple_attr_write_signed(file, buf, len, ppos);
+ else
+ ret = simple_attr_write(file, buf, len, ppos);
debugfs_file_put(dentry);
return ret;
}
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(debugfs_attr_write);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);
+
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *value,
const struct file_operations *fops,
@@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
"%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
"%lld\n");
/**
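
The DEFINE_DEBUGFS_ATTRIBUTE_SIGNED conversions above exist because atomic_t is signed: with the unsigned variant, writing a value like -1 to the file was rejected even though the format string prints %lld. Usage for a driver-local counter, sketched with hypothetical names:

static atomic_t my_counter = ATOMIC_INIT(0);

static int my_counter_get(void *data, u64 *val)
{
	*val = atomic_read((atomic_t *)data);
	return 0;
}

static int my_counter_set(void *data, u64 val)
{
	atomic_set((atomic_t *)data, val);
	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(my_counter_fops, my_counter_get,
				my_counter_set, "%lld\n");

/* then, at init time:
 * debugfs_create_file_unsafe("my_counter", 0644, parent_dentry,
 *			      &my_counter, &my_counter_fops);
 */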
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index d60a8d8f109d..26fef9945cc9 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -12,55 +12,67 @@
#include <trace/events/dlm.h>
#include "dlm_internal.h"
+#include "memory.h"
#include "lock.h"
#include "user.h"
#include "ast.h"
-static uint64_t dlm_cb_seq;
-static DEFINE_SPINLOCK(dlm_cb_seq_spin);
+void dlm_release_callback(struct kref *ref)
+{
+ struct dlm_callback *cb = container_of(ref, struct dlm_callback, ref);
+
+ dlm_free_cb(cb);
+}
+
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to)
+{
+ if (*from)
+ kref_put(&(*from)->ref, dlm_release_callback);
+
+ if (to)
+ kref_get(&to->ref);
+
+ *from = to;
+}
-static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb)
{
- int i;
-
- log_print("last_bast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_bast.seq,
- lkb->lkb_last_bast.flags,
- lkb->lkb_last_bast.mode,
- lkb->lkb_last_bast.sb_status,
- lkb->lkb_last_bast.sb_flags);
-
- log_print("last_cast %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.flags,
- lkb->lkb_last_cast.mode,
- lkb->lkb_last_cast.sb_status,
- lkb->lkb_last_cast.sb_flags);
-
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- log_print("cb %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id,
- (unsigned long long)lkb->lkb_callbacks[i].seq,
- lkb->lkb_callbacks[i].flags,
- lkb->lkb_callbacks[i].mode,
- lkb->lkb_callbacks[i].sb_status,
- lkb->lkb_callbacks[i].sb_flags);
+ struct dlm_callback *cb, *safe;
+
+ list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) {
+ list_del(&cb->list);
+ kref_put(&cb->ref, dlm_release_callback);
}
+
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+
+ /* invalidate */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+ lkb->lkb_last_bast_mode = -1;
}
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t prev_seq;
+ int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
+ struct dlm_callback *cb;
int prev_mode;
- int i, rv;
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (lkb->lkb_callbacks[i].seq)
- continue;
+ if (flags & DLM_CB_BAST) {
+ /* if cb is a bast, it should be skipped if the blocking mode is
+ * compatible with the last granted mode
+ */
+ if (lkb->lkb_last_cast) {
+ if (dlm_modes_compat(mode, lkb->lkb_last_cast->mode)) {
+ log_debug(ls, "skip %x bast mode %d for cast mode %d",
+ lkb->lkb_id, mode,
+ lkb->lkb_last_cast->mode);
+ goto out;
+ }
+ }
/*
* Suppress some redundant basts here, do more on removal.
@@ -68,148 +80,95 @@ int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
* is a bast for the same mode or a more restrictive mode.
* (the additional > PR check is needed for PR/CW inversion)
*/
-
- if ((i > 0) && (flags & DLM_CB_BAST) &&
- (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
-
- prev_seq = lkb->lkb_callbacks[i-1].seq;
- prev_mode = lkb->lkb_callbacks[i-1].mode;
+ if (lkb->lkb_last_cb && lkb->lkb_last_cb->flags & DLM_CB_BAST) {
+ prev_mode = lkb->lkb_last_cb->mode;
if ((prev_mode == mode) ||
(prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
-
- log_debug(ls, "skip %x add bast %llu mode %d "
- "for bast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)seq,
- mode,
- (unsigned long long)prev_seq,
- prev_mode);
- rv = 0;
+ log_debug(ls, "skip %x add bast mode %d for bast mode %d",
+ lkb->lkb_id, mode, prev_mode);
goto out;
}
}
-
- lkb->lkb_callbacks[i].seq = seq;
- lkb->lkb_callbacks[i].flags = flags;
- lkb->lkb_callbacks[i].mode = mode;
- lkb->lkb_callbacks[i].sb_status = status;
- lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
- rv = 0;
- break;
}
- if (i == DLM_CALLBACKS_SIZE) {
- log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
- lkb->lkb_id, (unsigned long long)seq,
- flags, mode, status, sbflags);
- dlm_dump_lkb_callbacks(lkb);
- rv = -1;
+ cb = dlm_allocate_cb();
+ if (!cb) {
+ rv = DLM_ENQUEUE_CALLBACK_FAILURE;
goto out;
}
- out:
- return rv;
-}
-
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid)
-{
- int i, rv;
-
- *resid = 0;
-
- if (!lkb->lkb_callbacks[0].seq) {
- rv = -ENOENT;
- goto out;
- }
-
- /* oldest undelivered cb is callbacks[0] */
-
- memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
- /* shift others down */
-
- for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
- if (!lkb->lkb_callbacks[i].seq)
- break;
- memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
- sizeof(struct dlm_callback));
- memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
- (*resid)++;
+ cb->flags = flags;
+ cb->mode = mode;
+ cb->sb_status = status;
+ cb->sb_flags = (sbflags & 0x000000FF);
+ kref_init(&cb->ref);
+ if (!(lkb->lkb_flags & DLM_IFL_CB_PENDING)) {
+ lkb->lkb_flags |= DLM_IFL_CB_PENDING;
+ rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
}
+ list_add_tail(&cb->list, &lkb->lkb_callbacks);
- /* if cb is a bast, it should be skipped if the blocking mode is
- compatible with the last granted mode */
-
- if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
- if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
- cb->flags |= DLM_CB_SKIP;
-
- log_debug(ls, "skip %x bast %llu mode %d "
- "for cast %llu mode %d",
- lkb->lkb_id,
- (unsigned long long)cb->seq,
- cb->mode,
- (unsigned long long)lkb->lkb_last_cast.seq,
- lkb->lkb_last_cast.mode);
- rv = 0;
- goto out;
- }
- }
+ if (flags & DLM_CB_CAST)
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, cb);
- if (cb->flags & DLM_CB_CAST) {
- memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_cast_time = ktime_get();
- }
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, cb);
- if (cb->flags & DLM_CB_BAST) {
- memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
- lkb->lkb_last_bast_time = ktime_get();
- }
- rv = 0;
out:
return rv;
}
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb)
+{
+ /* the oldest undelivered cb is the first entry in the callbacks list */
+ *cb = list_first_entry_or_null(&lkb->lkb_callbacks,
+ struct dlm_callback, list);
+ if (!*cb)
+ return DLM_DEQUEUE_CALLBACK_EMPTY;
+
+ /* remove it from the callbacks list */
+ list_del(&(*cb)->list);
+ if (list_empty(&lkb->lkb_callbacks))
+ return DLM_DEQUEUE_CALLBACK_LAST;
+
+ return DLM_DEQUEUE_CALLBACK_SUCCESS;
+}
+
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
- uint64_t new_seq, prev_seq;
int rv;
- spin_lock(&dlm_cb_seq_spin);
- new_seq = ++dlm_cb_seq;
- if (!dlm_cb_seq)
- new_seq = ++dlm_cb_seq;
- spin_unlock(&dlm_cb_seq_spin);
-
if (lkb->lkb_flags & DLM_IFL_USER) {
- dlm_user_add_ast(lkb, flags, mode, status, sbflags, new_seq);
+ dlm_user_add_ast(lkb, flags, mode, status, sbflags);
return;
}
- mutex_lock(&lkb->lkb_cb_mutex);
- prev_seq = lkb->lkb_callbacks[0].seq;
-
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, new_seq);
- if (rv < 0)
- goto out;
-
- if (!prev_seq) {
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
- mutex_unlock(&ls->ls_cb_mutex);
+ spin_unlock(&ls->ls_cb_lock);
+ break;
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
+ WARN_ON_ONCE(1);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- out:
- mutex_unlock(&lkb->lkb_cb_mutex);
+ spin_unlock(&lkb->lkb_cb_lock);
}
void dlm_callback_work(struct work_struct *work)
@@ -218,53 +177,46 @@ void dlm_callback_work(struct work_struct *work)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
void (*castfn) (void *astparam);
void (*bastfn) (void *astparam, int mode);
- struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
- int i, rv, resid;
-
- memset(&callbacks, 0, sizeof(callbacks));
+ struct dlm_callback *cb;
+ int rv;
- mutex_lock(&lkb->lkb_cb_mutex);
- if (!lkb->lkb_callbacks[0].seq) {
- /* no callback work exists, shouldn't happen */
- log_error(ls, "dlm_callback_work %x no work", lkb->lkb_id);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ spin_unlock(&lkb->lkb_cb_lock);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
- if (rv < 0)
- break;
- }
+ if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY))
+ goto out;
- if (resid) {
- /* cbs remain, loop should have removed all, shouldn't happen */
- log_error(ls, "dlm_callback_work %x resid %d", lkb->lkb_id,
- resid);
- dlm_print_lkb(lkb);
- dlm_dump_lkb_callbacks(lkb);
- }
- mutex_unlock(&lkb->lkb_cb_mutex);
+ for (;;) {
+ castfn = lkb->lkb_astfn;
+ bastfn = lkb->lkb_bastfn;
+
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(ls, lkb, cb->mode);
+ lkb->lkb_last_bast_time = ktime_get();
+ lkb->lkb_last_bast_mode = cb->mode;
+ bastfn(lkb->lkb_astparam, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
+ trace_dlm_ast(ls, lkb);
+ lkb->lkb_last_cast_time = ktime_get();
+ castfn(lkb->lkb_astparam);
+ }
- castfn = lkb->lkb_astfn;
- bastfn = lkb->lkb_bastfn;
+ kref_put(&cb->ref, dlm_release_callback);
- for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
- if (!callbacks[i].seq)
+ spin_lock(&lkb->lkb_cb_lock);
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ if (rv == DLM_DEQUEUE_CALLBACK_EMPTY) {
+ lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+ spin_unlock(&lkb->lkb_cb_lock);
break;
- if (callbacks[i].flags & DLM_CB_SKIP) {
- continue;
- } else if (callbacks[i].flags & DLM_CB_BAST) {
- trace_dlm_bast(ls, lkb, callbacks[i].mode);
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
- } else if (callbacks[i].flags & DLM_CB_CAST) {
- lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
- lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
- trace_dlm_ast(ls, lkb);
- castfn(lkb->lkb_astparam);
}
+ spin_unlock(&lkb->lkb_cb_lock);
}
+out:
/* undo kref_get from dlm_add_callback, may cause lkb to be freed */
dlm_put_lkb(lkb);
}
@@ -289,9 +241,9 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
if (ls->ls_callback_wq) {
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
set_bit(LSFL_CB_DELAY, &ls->ls_flags);
- mutex_unlock(&ls->ls_cb_mutex);
+ spin_unlock(&ls->ls_cb_lock);
flush_workqueue(ls->ls_callback_wq);
}
@@ -308,10 +260,8 @@ void dlm_callback_resume(struct dlm_ls *ls)
if (!ls->ls_callback_wq)
return;
- clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
-
more:
- mutex_lock(&ls->ls_cb_mutex);
+ spin_lock(&ls->ls_cb_lock);
list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
list_del_init(&lkb->lkb_cb_list);
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
@@ -320,7 +270,9 @@ more:
break;
}
empty = list_empty(&ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
+ if (empty)
+ clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ spin_unlock(&ls->ls_cb_lock);
sum += count;
if (!empty) {
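
Taken together, the ast.c changes replace the fixed-size callback array with a kref-counted FIFO per lkb. A minimal sketch of the resulting producer/consumer contract, with the locking simplified and deliver() standing in for the cast/bast dispatch (illustrative only, not the literal kernel code):

    /* producer (dlm_add_cb), called under lkb->lkb_cb_lock */
    rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
    if (rv == DLM_ENQUEUE_CALLBACK_NEED_SCHED) {
            /* first pending cb: pin the lkb and schedule the worker once */
            kref_get(&lkb->lkb_ref);
            queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
    }

    /* consumer (dlm_callback_work): drain until empty, then allow
     * rescheduling; in the real code the DLM_IFL_CB_PENDING clear
     * happens under lkb_cb_lock so a concurrent enqueue is not lost
     */
    while (dlm_dequeue_lkb_callback(lkb, &cb) != DLM_DEQUEUE_CALLBACK_EMPTY) {
            deliver(cb);
            kref_put(&cb->ref, dlm_release_callback);
    }
    lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
    dlm_put_lkb(lkb);
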
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index e5e05fcc5813..880b11882495 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,13 +11,22 @@
#ifndef __ASTD_DOT_H__
#define __ASTD_DOT_H__
-int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
-int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_callback *cb, int *resid);
+#define DLM_ENQUEUE_CALLBACK_NEED_SCHED 1
+#define DLM_ENQUEUE_CALLBACK_SUCCESS 0
+#define DLM_ENQUEUE_CALLBACK_FAILURE -1
+int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+ int status, uint32_t sbflags);
+#define DLM_DEQUEUE_CALLBACK_EMPTY 2
+#define DLM_DEQUEUE_CALLBACK_LAST 1
+#define DLM_DEQUEUE_CALLBACK_SUCCESS 0
+int dlm_dequeue_lkb_callback(struct dlm_lkb *lkb, struct dlm_callback **cb);
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
uint32_t sbflags);
+void dlm_callback_set_last_ptr(struct dlm_callback **from,
+ struct dlm_callback *to);
+void dlm_release_callback(struct kref *ref);
+void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb);
void dlm_callback_work(struct work_struct *work);
int dlm_callback_start(struct dlm_ls *ls);
void dlm_callback_stop(struct dlm_ls *ls);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index ac8b62106ce0..20b60709eccf 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -183,7 +183,7 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
return -EINVAL;
}
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
@@ -194,7 +194,7 @@ static int dlm_check_zero_and_dlm_running(unsigned int x)
if (!x)
return -EINVAL;
- if (dlm_allow_conn)
+ if (dlm_lowcomms_is_running())
return -EBUSY;
return 0;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fb04ebbafb5..8a0e1b1f74ad 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -246,7 +246,7 @@ static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
lkb->lkb_status,
lkb->lkb_grmode,
lkb->lkb_rqmode,
- lkb->lkb_last_bast.mode,
+ lkb->lkb_last_bast_mode,
rsb_lookup,
lkb->lkb_wait_type,
lkb->lkb_lvbseq,
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e34c3d2639a5..ab1a55337a6e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -211,6 +211,7 @@ struct dlm_args {
#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+#define DLM_IFL_CB_PENDING 0x04000000
/* least significant 2 bytes are message changed, they are full transmitted
* but at receive side only the 2 bytes LSB will be set.
*
@@ -222,18 +223,17 @@ struct dlm_args {
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
-#define DLM_CALLBACKS_SIZE 6
-
#define DLM_CB_CAST 0x00000001
#define DLM_CB_BAST 0x00000002
-#define DLM_CB_SKIP 0x00000004
struct dlm_callback {
- uint64_t seq;
uint32_t flags; /* DLM_CBF_ */
int sb_status; /* copy to lksb status */
uint8_t sb_flags; /* copy to lksb flags */
int8_t mode; /* rq mode of bast, gr mode of cast */
+
+ struct list_head list;
+ struct kref ref;
};
struct dlm_lkb {
@@ -268,12 +268,13 @@ struct dlm_lkb {
unsigned long lkb_timeout_cs;
#endif
- struct mutex lkb_cb_mutex;
+ spinlock_t lkb_cb_lock;
struct work_struct lkb_cb_work;
struct list_head lkb_cb_list; /* for ls_cb_delay or proc->asts */
- struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
- struct dlm_callback lkb_last_cast;
- struct dlm_callback lkb_last_bast;
+ struct list_head lkb_callbacks;
+ struct dlm_callback *lkb_last_cast;
+ struct dlm_callback *lkb_last_cb;
+ int lkb_last_bast_mode;
ktime_t lkb_last_cast_time; /* for debugging */
ktime_t lkb_last_bast_time; /* for debugging */
@@ -591,11 +592,7 @@ struct dlm_ls {
int ls_new_rsb_count;
struct list_head ls_new_rsb; /* new rsb structs */
- spinlock_t ls_remove_spin;
- wait_queue_head_t ls_remove_wait;
- char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
- int ls_remove_len;
int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
struct list_head ls_nodes; /* current nodes in ls */
@@ -631,7 +628,7 @@ struct dlm_ls {
/* recovery related */
- struct mutex ls_cb_mutex;
+ spinlock_t ls_cb_lock;
struct list_head ls_cb_delay; /* save for queue_work later */
struct timer_list ls_timer;
struct task_struct *ls_recoverd_task;
@@ -670,7 +667,7 @@ struct dlm_ls {
void *ls_ops_arg;
int ls_namelen;
- char ls_name[1];
+ char ls_name[DLM_LOCKSPACE_LEN + 1];
};
/*
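
lkb_last_cast and lkb_last_cb now hold refcounted pointers into the callback list instead of copied structs. The body of the dlm_callback_set_last_ptr() helper declared in ast.h is not shown in this diff; a plausible sketch of such a pointer swap, assuming dlm_release_callback() frees the callback once its kref drops to zero:

    /* assumed shape: drop the old reference, take one on the new value */
    void dlm_callback_set_last_ptr(struct dlm_callback **from,
                                   struct dlm_callback *to)
    {
            if (*from)
                    kref_put(&(*from)->ref, dlm_release_callback);

            if (to)
                    kref_get(&to->ref);

            *from = to;
    }
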
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 94a72ede5764..e1adfa5aed05 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1209,6 +1209,7 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
if (!lkb)
return -ENOMEM;
+ lkb->lkb_last_bast_mode = -1;
lkb->lkb_nodeid = -1;
lkb->lkb_grmode = DLM_LOCK_IV;
kref_init(&lkb->lkb_ref);
@@ -1218,7 +1219,8 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
INIT_LIST_HEAD(&lkb->lkb_time_list);
#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
- mutex_init(&lkb->lkb_cb_mutex);
+ INIT_LIST_HEAD(&lkb->lkb_callbacks);
+ spin_lock_init(&lkb->lkb_cb_lock);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
idr_preload(GFP_NOFS);
@@ -1587,37 +1589,6 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
return error;
}
-/* If there's an rsb for the same resource being removed, ensure
- * that the remove message is sent before the new lookup message.
- */
-
-#define DLM_WAIT_PENDING_COND(ls, r) \
- (ls->ls_remove_len && \
- !rsb_cmp(r, ls->ls_remove_name, \
- ls->ls_remove_len))
-
-static void wait_pending_remove(struct dlm_rsb *r)
-{
- struct dlm_ls *ls = r->res_ls;
- restart:
- spin_lock(&ls->ls_remove_spin);
- if (DLM_WAIT_PENDING_COND(ls, r)) {
- log_debug(ls, "delay lookup for remove dir %d %s",
- r->res_dir_nodeid, r->res_name);
- spin_unlock(&ls->ls_remove_spin);
- wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
- goto restart;
- }
- spin_unlock(&ls->ls_remove_spin);
-}
-
-/*
- * ls_remove_spin protects ls_remove_name and ls_remove_len which are
- * read by other threads in wait_pending_remove. ls_remove_names
- * and ls_remove_lens are only used by the scan thread, so they do
- * not need protection.
- */
-
static void shrink_bucket(struct dlm_ls *ls, int b)
{
struct rb_node *n, *next;
@@ -1699,11 +1670,6 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
* list and sending the removal. Keeping this gap small is
* important to keep us (the master node) from being out of sync
* with the remote dir node for very long.
- *
- * From the time the rsb is removed from toss until just after
- * send_remove, the rsb name is saved in ls_remove_name. A new
- * lookup checks this to ensure that a new lookup message for the
- * same resource name is not sent just before the remove message.
*/
for (i = 0; i < remote_count; i++) {
@@ -1750,22 +1716,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b)
}
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
-
- /* block lookup of same name until we've sent remove */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
send_remove(r);
-
- /* allow lookup of name again */
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
+ spin_unlock(&ls->ls_rsbtbl[b].lock);
dlm_free_rsb(r);
}
@@ -2716,8 +2668,6 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
return 0;
}
- wait_pending_remove(r);
-
r->res_first_lkid = lkb->lkb_id;
send_lookup(r, lkb);
return 1;
@@ -3552,7 +3502,8 @@ int dlm_unlock(dlm_lockspace_t *lockspace,
static int _create_message(struct dlm_ls *ls, int mb_len,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
struct dlm_message *ms;
struct dlm_mhandle *mh;
@@ -3562,7 +3513,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
pass into midcomms_commit and a message buffer (mb) that we
write our data into */
- mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
+ mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, allocation, &mb);
if (!mh)
return -ENOBUFS;
@@ -3584,7 +3535,8 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
int to_nodeid, int mstype,
struct dlm_message **ms_ret,
- struct dlm_mhandle **mh_ret)
+ struct dlm_mhandle **mh_ret,
+ gfp_t allocation)
{
int mb_len = sizeof(struct dlm_message);
@@ -3605,15 +3557,16 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
}
return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
- ms_ret, mh_ret);
+ ms_ret, mh_ret, allocation);
}
/* further lowcomms enhancements or alternate implementations may make
the return value from this function useful at some point */
-static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms,
+ const void *name, int namelen)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, name, namelen);
return 0;
}
@@ -3673,13 +3626,13 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
if (error)
return error;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3734,7 +3687,8 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3742,7 +3696,7 @@ static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
ms->m_result = 0;
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3755,7 +3709,8 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3763,7 +3718,7 @@ static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
ms->m_bastmode = cpu_to_le32(mode);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3780,13 +3735,14 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (error)
return error;
- error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh,
+ GFP_NOFS);
if (error)
goto fail;
send_args(r, lkb, ms);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
if (error)
goto fail;
return 0;
@@ -3804,14 +3760,15 @@ static int send_remove(struct dlm_rsb *r)
to_nodeid = dlm_dir_nodeid(r);
- error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh,
+ GFP_ATOMIC);
if (error)
goto out;
memcpy(ms->m_extra, r->res_name, r->res_length);
ms->m_hash = cpu_to_le32(r->res_hash);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
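
send_remove() is now issued while the ls_rsbtbl bucket spinlock is still held, which is why its message allocation is the one caller passing GFP_ATOMIC; senders running in ordinary process context keep GFP_NOFS. The rule in miniature (illustrative):

    /* process context: a sleeping allocation is fine */
    create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);

    /* under a spinlock: must not sleep */
    spin_lock(&ls->ls_rsbtbl[b].lock);
    create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh, GFP_ATOMIC);
    ...
    spin_unlock(&ls->ls_rsbtbl[b].lock);
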
@@ -3825,7 +3782,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
to_nodeid = lkb->lkb_nodeid;
- error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh, GFP_NOFS);
if (error)
goto out;
@@ -3833,7 +3790,7 @@ static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
- error = send_message(mh, ms);
+ error = send_message(mh, ms, r->res_name, r->res_length);
out:
return error;
}
@@ -3866,7 +3823,8 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
struct dlm_mhandle *mh;
int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
- error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+ error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh,
+ GFP_NOFS);
if (error)
goto out;
@@ -3874,7 +3832,7 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
ms->m_result = cpu_to_le32(to_dlm_errno(rv));
ms->m_nodeid = cpu_to_le32(ret_nodeid);
- error = send_message(mh, ms);
+ error = send_message(mh, ms, ms_in->m_extra, receive_extralen(ms_in));
out:
return error;
}
@@ -4044,66 +4002,6 @@ out:
return error;
}
-static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
-{
- char name[DLM_RESNAME_MAXLEN + 1];
- struct dlm_message *ms;
- struct dlm_mhandle *mh;
- struct dlm_rsb *r;
- uint32_t hash, b;
- int rv, dir_nodeid;
-
- memset(name, 0, sizeof(name));
- memcpy(name, ms_name, len);
-
- hash = jhash(name, len, 0);
- b = hash & (ls->ls_rsbtbl_size - 1);
-
- dir_nodeid = dlm_hash2nodeid(ls, hash);
-
- log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
-
- spin_lock(&ls->ls_rsbtbl[b].lock);
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on keep %s", name);
- return;
- }
-
- rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
- if (!rv) {
- spin_unlock(&ls->ls_rsbtbl[b].lock);
- log_error(ls, "repeat_remove on toss %s", name);
- return;
- }
-
- /* use ls->remove_name2 to avoid conflict with shrink? */
-
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = len;
- memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- spin_unlock(&ls->ls_rsbtbl[b].lock);
-
- rv = _create_message(ls, sizeof(struct dlm_message) + len,
- dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
- if (rv)
- goto out;
-
- memcpy(ms->m_extra, name, len);
- ms->m_hash = cpu_to_le32(hash);
-
- send_message(mh, ms);
-
-out:
- spin_lock(&ls->ls_remove_spin);
- ls->ls_remove_len = 0;
- memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
- spin_unlock(&ls->ls_remove_spin);
- wake_up(&ls->ls_remove_wait);
-}
-
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
struct dlm_lkb *lkb;
@@ -4173,25 +4071,11 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
ENOTBLK request failures when the lookup reply designating us
as master is delayed. */
- /* We could repeatedly return -EBADR here if our send_remove() is
- delayed in being sent/arriving/being processed on the dir node.
- Another node would repeatedly lookup up the master, and the dir
- node would continue returning our nodeid until our send_remove
- took effect.
-
- We send another remove message in case our previous send_remove
- was lost/ignored/missed somehow. */
-
if (error != -ENOTBLK) {
log_limit(ls, "receive_request %x from %d %d",
le32_to_cpu(ms->m_lkid), from_nodeid, error);
}
- if (namelen && error == -EBADR) {
- send_repeat_remove(ls, ms->m_extra, namelen);
- msleep(1000);
- }
-
setup_stub_lkb(ls, ms);
send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
return error;
@@ -6294,8 +6178,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
}
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
@@ -6336,8 +6219,7 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
spin_lock(&proc->asts_spin);
list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
- memset(&lkb->lkb_callbacks, 0,
- sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+ dlm_purge_lkb_callbacks(lkb);
list_del_init(&lkb->lkb_cb_list);
dlm_put_lkb(lkb);
}
@@ -6368,13 +6250,13 @@ static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
int error;
error = _create_message(ls, sizeof(struct dlm_message), nodeid,
- DLM_MSG_PURGE, &ms, &mh);
+ DLM_MSG_PURGE, &ms, &mh, GFP_NOFS);
if (error)
return error;
ms->m_nodeid = cpu_to_le32(nodeid);
ms->m_pid = cpu_to_le32(pid);
- return send_message(mh, ms);
+ return send_message(mh, ms, NULL, 0);
}
int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index bae050df7abf..d0b4e2181a5f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -17,7 +17,6 @@
#include "recoverd.h"
#include "dir.h"
#include "midcomms.h"
-#include "lowcomms.h"
#include "config.h"
#include "memory.h"
#include "lock.h"
@@ -391,7 +390,7 @@ static int threads_start(void)
/* Thread for sending/receiving messages for all lockspace's */
error = dlm_midcomms_start();
if (error) {
- log_print("cannot start dlm lowcomms %d", error);
+ log_print("cannot start dlm midcomms %d", error);
goto scand_fail;
}
@@ -473,7 +472,7 @@ static int new_lockspace(const char *name, const char *cluster,
error = -ENOMEM;
- ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
+ ls = kzalloc(sizeof(*ls), GFP_NOFS);
if (!ls)
goto out;
memcpy(ls->ls_name, name, namelen);
@@ -524,9 +523,6 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_init(&ls->ls_rsbtbl[i].lock);
}
- spin_lock_init(&ls->ls_remove_spin);
- init_waitqueue_head(&ls->ls_remove_wait);
-
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
GFP_KERNEL);
@@ -567,7 +563,7 @@ static int new_lockspace(const char *name, const char *cluster,
init_completion(&ls->ls_recovery_done);
ls->ls_recovery_result = -1;
- mutex_init(&ls->ls_cb_mutex);
+ spin_lock_init(&ls->ls_cb_lock);
INIT_LIST_HEAD(&ls->ls_cb_delay);
ls->ls_recoverd_task = NULL;
@@ -726,7 +722,7 @@ static int __dlm_new_lockspace(const char *name, const char *cluster,
if (!ls_count) {
dlm_scand_stop();
dlm_midcomms_shutdown();
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
}
out:
mutex_unlock(&ls_lock);
@@ -929,7 +925,7 @@ int dlm_release_lockspace(void *lockspace, int force)
if (!error)
ls_count--;
if (!ls_count)
- dlm_lowcomms_stop();
+ dlm_midcomms_stop();
mutex_unlock(&ls_lock);
return error;
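
After this change the lockspace code starts and stops the communication stack only through the midcomms layer; lowcomms becomes an implementation detail below it. A sketch of the assumed layering (dlm_midcomms_stop() is taken here to tear down the lowcomms transport internally):

    /* assumed teardown path once the last lockspace is released */
    dlm_scand_stop();
    dlm_midcomms_shutdown();   /* flush per-node midcomms state */
    dlm_midcomms_stop();       /* also stops the underlying lowcomms transport */
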
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 59f64c596233..4450721ec83c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -63,41 +63,49 @@
#define NEEDED_RMEM (4*1024*1024)
-/* Number of messages to send before rescheduling */
-#define MAX_SEND_MSG_COUNT 25
-#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
- struct mutex sock_mutex;
+ /* this semaphore is used to allow parallel recv/send in read
+ * lock mode. When we release a sock we need to hold the write lock.
+ *
+ * However this locking is not nice. When we remove the
+ * othercon handling we can look into other mechanisms to synchronize
+ * io handling and call sock_release() at the right time.
+ */
+ struct rw_semaphore sock_lock;
unsigned long flags;
-#define CF_READ_PENDING 1
-#define CF_WRITE_PENDING 2
-#define CF_INIT_PENDING 4
+#define CF_APP_LIMITED 0
+#define CF_RECV_PENDING 1
+#define CF_SEND_PENDING 2
+#define CF_RECV_INTR 3
+#define CF_IO_STOP 4
#define CF_IS_OTHERCON 5
-#define CF_CLOSE 6
-#define CF_APP_LIMITED 7
-#define CF_CLOSING 8
-#define CF_SHUTDOWN 9
-#define CF_CONNECTED 10
-#define CF_RECONNECT 11
-#define CF_DELAY_CONNECT 12
-#define CF_EOF 13
struct list_head writequeue; /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
- atomic_t writequeue_cnt;
int retries;
-#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
+ /* due to connect()/accept() races we can currently end up with a
+ * crossed-over second connection for one node.
+ *
+ * There is a way to avoid the race by introducing a connect
+ * rule, e.g. only the side with our_nodeid > nodeid_to_connect is
+ * allowed to connect. The other side can still connect, but that is
+ * then only taken to mean the other side wants a reconnect.
+ *
+ * However changing to this behaviour would break backwards
+ * compatibility. In a DLM protocol major version upgrade we should
+ * remove this!
+ */
struct connection *othercon;
- struct connection *sendcon;
- struct work_struct rwork; /* Receive workqueue */
- struct work_struct swork; /* Send workqueue */
- wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
- unsigned char *rx_buf;
- int rx_buflen;
+ struct work_struct rwork; /* receive worker */
+ struct work_struct swork; /* send worker */
+ unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
int rx_leftover;
+ int mark;
+ int addr_count;
+ int curr_addr_index;
+ struct sockaddr_storage addr[DLM_MAX_ADDR_COUNT];
+ spinlock_t addrs_lock;
struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -136,13 +144,12 @@ struct dlm_msg {
struct kref ref;
};
-struct dlm_node_addr {
- struct list_head list;
+struct processqueue_entry {
+ unsigned char *buf;
int nodeid;
- int mark;
- int addr_count;
- int curr_addr_index;
- struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+ int buflen;
+
+ struct list_head list;
};
struct dlm_proto_ops {
@@ -157,10 +164,6 @@ struct dlm_proto_ops {
int (*listen_validate)(void);
void (*listen_sockopts)(struct socket *sock);
int (*listen_bind)(struct socket *sock);
- /* What to do to shutdown */
- void (*shutdown_action)(struct connection *con);
- /* What to do to eof check */
- bool (*eof_condition)(struct connection *con);
};
static struct listen_sock_callbacks {
@@ -170,17 +173,13 @@ static struct listen_sock_callbacks {
void (*sk_write_space)(struct sock *);
} listen_sock;
-static LIST_HEAD(dlm_node_addrs);
-static DEFINE_SPINLOCK(dlm_node_addrs_spin);
-
static struct listen_connection listen_con;
-static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static struct sockaddr_storage dlm_local_addr[DLM_MAX_ADDR_COUNT];
static int dlm_local_count;
-int dlm_allow_conn;
/* Work queues */
-static struct workqueue_struct *recv_workqueue;
-static struct workqueue_struct *send_workqueue;
+static struct workqueue_struct *io_workqueue;
+static struct workqueue_struct *process_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
static DEFINE_SPINLOCK(connections_lock);
@@ -188,8 +187,45 @@ DEFINE_STATIC_SRCU(connections_srcu);
static const struct dlm_proto_ops *dlm_proto_ops;
+#define DLM_IO_SUCCESS 0
+#define DLM_IO_END 1
+#define DLM_IO_EOF 2
+#define DLM_IO_RESCHED 3
+
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
+static void process_dlm_messages(struct work_struct *work);
+
+static DECLARE_WORK(process_work, process_dlm_messages);
+static DEFINE_SPINLOCK(processqueue_lock);
+static bool process_dlm_messages_pending;
+static LIST_HEAD(processqueue);
+
+bool dlm_lowcomms_is_running(void)
+{
+ return !!listen_con.sock;
+}
+
+static void lowcomms_queue_swork(struct connection *con)
+{
+ assert_spin_locked(&con->writequeue_lock);
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_bit(CF_APP_LIMITED, &con->flags) &&
+ !test_and_set_bit(CF_SEND_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->swork);
+}
+
+static void lowcomms_queue_rwork(struct connection *con)
+{
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(con->sock->sk));
+#endif
+
+ if (!test_bit(CF_IO_STOP, &con->flags) &&
+ !test_and_set_bit(CF_RECV_PENDING, &con->flags))
+ queue_work(io_workqueue, &con->rwork);
+}
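
Both helpers implement the same gate: an atomic test_and_set_bit() ensures at most one work item per direction is queued, and the matching worker clears the bit only once its queue is drained, under the same lock the schedulers take. The generic shape of the pattern (not the literal DLM code; queue_is_empty() is a placeholder):

    /* scheduling side: only the first caller after a drain queues work */
    if (!test_and_set_bit(PENDING_BIT, &flags))
            queue_work(wq, &work);

    /* worker side: clear PENDING only when nothing is left, while
     * holding the lock the schedulers hold, so no wakeup is lost
     */
    spin_lock_bh(&lock);
    if (queue_is_empty(&queue))
            clear_bit(PENDING_BIT, &flags);
    spin_unlock_bh(&lock);
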
static void writequeue_entry_ctor(void *data)
{
@@ -214,15 +250,12 @@ static struct writequeue_entry *con_next_wq(struct connection *con)
{
struct writequeue_entry *e;
- if (list_empty(&con->writequeue))
- return NULL;
-
- e = list_first_entry(&con->writequeue, struct writequeue_entry,
- list);
+ e = list_first_entry_or_null(&con->writequeue, struct writequeue_entry,
+ list);
/* if len is zero nothing is to send, if there are users filling
* buffers we wait until the users are done so we can send more.
*/
- if (e->users || e->len == 0)
+ if (!e || e->users || e->len == 0)
return NULL;
return e;
@@ -240,28 +273,15 @@ static struct connection *__find_con(int nodeid, int r)
return NULL;
}
-static bool tcp_eof_condition(struct connection *con)
-{
- return atomic_read(&con->writequeue_cnt);
-}
-
-static int dlm_con_init(struct connection *con, int nodeid)
+static void dlm_con_init(struct connection *con, int nodeid)
{
- con->rx_buflen = dlm_config.ci_buffer_size;
- con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
- if (!con->rx_buf)
- return -ENOMEM;
-
con->nodeid = nodeid;
- mutex_init(&con->sock_mutex);
+ init_rwsem(&con->sock_lock);
INIT_LIST_HEAD(&con->writequeue);
spin_lock_init(&con->writequeue_lock);
- atomic_set(&con->writequeue_cnt, 0);
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
- init_waitqueue_head(&con->shutdown_wait);
-
- return 0;
+ spin_lock_init(&con->addrs_lock);
}
/*
@@ -271,7 +291,7 @@ static int dlm_con_init(struct connection *con, int nodeid)
static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
struct connection *con, *tmp;
- int r, ret;
+ int r;
r = nodeid_hash(nodeid);
con = __find_con(nodeid, r);
@@ -282,11 +302,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
if (!con)
return NULL;
- ret = dlm_con_init(con, nodeid);
- if (ret) {
- kfree(con);
- return NULL;
- }
+ dlm_con_init(con, nodeid);
spin_lock(&connections_lock);
/* Because multiple workqueues/threads calls this function it can
@@ -298,7 +314,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
tmp = __find_con(nodeid, r);
if (tmp) {
spin_unlock(&connections_lock);
- kfree(con->rx_buf);
kfree(con);
return tmp;
}
@@ -309,29 +324,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc)
return con;
}
-/* Loop round all connections */
-static void foreach_conn(void (*conn_func)(struct connection *c))
-{
- int i;
- struct connection *con;
-
- for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i], list)
- conn_func(con);
- }
-}
-
-static struct dlm_node_addr *find_node_addr(int nodeid)
-{
- struct dlm_node_addr *na;
-
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (na->nodeid == nodeid)
- return na;
- }
- return NULL;
-}
-
static int addr_compare(const struct sockaddr_storage *x,
const struct sockaddr_storage *y)
{
@@ -365,40 +357,47 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
unsigned int *mark)
{
struct sockaddr_storage sas;
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
if (!dlm_local_count)
return -1;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na && na->addr_count) {
- memcpy(&sas, na->addr[na->curr_addr_index],
- sizeof(struct sockaddr_storage));
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
- if (try_new_addr) {
- na->curr_addr_index++;
- if (na->curr_addr_index == na->addr_count)
- na->curr_addr_index = 0;
- }
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
}
- spin_unlock(&dlm_node_addrs_spin);
- if (!na)
- return -EEXIST;
+ memcpy(&sas, &con->addr[con->curr_addr_index],
+ sizeof(struct sockaddr_storage));
- if (!na->addr_count)
- return -ENOENT;
+ if (try_new_addr) {
+ con->curr_addr_index++;
+ if (con->curr_addr_index == con->addr_count)
+ con->curr_addr_index = 0;
+ }
- *mark = na->mark;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
if (sas_out)
memcpy(sas_out, &sas, sizeof(struct sockaddr_storage));
- if (!sa_out)
+ if (!sa_out) {
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
+ }
- if (dlm_local_addr[0]->ss_family == AF_INET) {
+ if (dlm_local_addr[0].ss_family == AF_INET) {
struct sockaddr_in *in4 = (struct sockaddr_in *) &sas;
struct sockaddr_in *ret4 = (struct sockaddr_in *) sa_out;
ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
@@ -408,43 +407,46 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
ret6->sin6_addr = in6->sin6_addr;
}
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid,
unsigned int *mark)
{
- struct dlm_node_addr *na;
- int rv = -EEXIST;
- int addr_i;
-
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry(na, &dlm_node_addrs, list) {
- if (!na->addr_count)
- continue;
-
- for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
- if (addr_compare(na->addr[addr_i], addr)) {
- *nodeid = na->nodeid;
- *mark = na->mark;
- rv = 0;
- goto unlock;
+ struct connection *con;
+ int i, idx, addr_i;
+
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ WARN_ON_ONCE(!con->addr_count);
+
+ spin_lock(&con->addrs_lock);
+ for (addr_i = 0; addr_i < con->addr_count; addr_i++) {
+ if (addr_compare(&con->addr[addr_i], addr)) {
+ *nodeid = con->nodeid;
+ *mark = con->mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
+ return 0;
+ }
}
+ spin_unlock(&con->addrs_lock);
}
}
-unlock:
- spin_unlock(&dlm_node_addrs_spin);
- return rv;
+ srcu_read_unlock(&connections_srcu, idx);
+
+ return -ENOENT;
}
-/* caller need to held dlm_node_addrs_spin lock */
-static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
- const struct sockaddr_storage *addr)
+static bool dlm_lowcomms_con_has_addr(const struct connection *con,
+ const struct sockaddr_storage *addr)
{
int i;
- for (i = 0; i < na->addr_count; i++) {
- if (addr_compare(na->addr[i], addr))
+ for (i = 0; i < con->addr_count; i++) {
+ if (addr_compare(&con->addr[i], addr))
return true;
}
@@ -453,118 +455,82 @@ static bool dlm_lowcomms_na_has_addr(const struct dlm_node_addr *na,
int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
{
- struct sockaddr_storage *new_addr;
- struct dlm_node_addr *new_node, *na;
- bool ret;
-
- new_node = kzalloc(sizeof(struct dlm_node_addr), GFP_NOFS);
- if (!new_node)
- return -ENOMEM;
+ struct connection *con;
+ bool ret;
+ int idx;
- new_addr = kzalloc(sizeof(struct sockaddr_storage), GFP_NOFS);
- if (!new_addr) {
- kfree(new_node);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, GFP_NOFS);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOMEM;
}
- memcpy(new_addr, addr, len);
-
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- new_node->nodeid = nodeid;
- new_node->addr[0] = new_addr;
- new_node->addr_count = 1;
- new_node->mark = dlm_config.ci_mark;
- list_add(&new_node->list, &dlm_node_addrs);
- spin_unlock(&dlm_node_addrs_spin);
+ spin_lock(&con->addrs_lock);
+ if (!con->addr_count) {
+ memcpy(&con->addr[0], addr, sizeof(*addr));
+ con->addr_count = 1;
+ con->mark = dlm_config.ci_mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
- ret = dlm_lowcomms_na_has_addr(na, addr);
+ ret = dlm_lowcomms_con_has_addr(con, addr);
if (ret) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -EEXIST;
}
- if (na->addr_count >= DLM_MAX_ADDR_COUNT) {
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_addr);
- kfree(new_node);
+ if (con->addr_count >= DLM_MAX_ADDR_COUNT) {
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOSPC;
}
- na->addr[na->addr_count++] = new_addr;
- spin_unlock(&dlm_node_addrs_spin);
- kfree(new_node);
+ memcpy(&con->addr[con->addr_count++], addr, sizeof(*addr));
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
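
With the per-node address list folded into struct connection, every lookup now follows one SRCU read-side pattern: the SRCU read lock keeps the connection object alive across the RCU hash walk, while the short-lived addrs_lock spinlock protects the address array and mark. Roughly (illustrative):

    idx = srcu_read_lock(&connections_srcu);
    con = nodeid2con(nodeid, 0);            /* RCU-safe hash lookup */
    if (con) {
            spin_lock(&con->addrs_lock);    /* guards addr[], addr_count, mark */
            /* ... read or update the per-node addresses ... */
            spin_unlock(&con->addrs_lock);
    }
    srcu_read_unlock(&connections_srcu, idx);
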
/* Data available on socket or listen socket received a connect */
static void lowcomms_data_ready(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
-}
-
-static void lowcomms_listen_data_ready(struct sock *sk)
-{
- if (!dlm_allow_conn)
- return;
+ struct connection *con = sock2con(sk);
- queue_work(recv_workqueue, &listen_con.rwork);
+ set_bit(CF_RECV_INTR, &con->flags);
+ lowcomms_queue_rwork(con);
}
static void lowcomms_write_space(struct sock *sk)
{
- struct connection *con;
-
- con = sock2con(sk);
- if (!con)
- return;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("connected to node %d", con->nodeid);
- queue_work(send_workqueue, &con->swork);
- return;
- }
+ struct connection *con = sock2con(sk);
clear_bit(SOCK_NOSPACE, &con->sock->flags);
+ spin_lock_bh(&con->writequeue_lock);
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
- queue_work(send_workqueue, &con->swork);
-}
-
-static inline void lowcomms_connect_sock(struct connection *con)
-{
- if (test_bit(CF_CLOSE, &con->flags))
- return;
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
}
static void lowcomms_state_change(struct sock *sk)
{
/* SCTP layer is not calling sk_data_ready when the connection
- * is done, so we catch the signal through here. Also, it
- * doesn't switch socket state when entering shutdown, so we
- * skip the write in that case.
+ * is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown) {
- if (sk->sk_shutdown == RCV_SHUTDOWN)
- lowcomms_data_ready(sk);
- } else if (sk->sk_state == TCP_ESTABLISHED) {
- lowcomms_write_space(sk);
- }
+ if (sk->sk_shutdown == RCV_SHUTDOWN)
+ lowcomms_data_ready(sk);
+}
+
+static void lowcomms_listen_data_ready(struct sock *sk)
+{
+ queue_work(io_workqueue, &listen_con.rwork);
}
int dlm_lowcomms_connect_node(int nodeid)
@@ -576,47 +542,49 @@ int dlm_lowcomms_connect_node(int nodeid)
return 0;
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, GFP_NOFS);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
- return -ENOMEM;
+ return -ENOENT;
}
- lowcomms_connect_sock(con);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+ up_read(&con->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
+ cond_resched();
return 0;
}
int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark)
{
- struct dlm_node_addr *na;
+ struct connection *con;
+ int idx;
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (!na) {
- spin_unlock(&dlm_node_addrs_spin);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (!con) {
+ srcu_read_unlock(&connections_srcu, idx);
return -ENOENT;
}
- na->mark = mark;
- spin_unlock(&dlm_node_addrs_spin);
-
+ spin_lock(&con->addrs_lock);
+ con->mark = mark;
+ spin_unlock(&con->addrs_lock);
+ srcu_read_unlock(&connections_srcu, idx);
return 0;
}
static void lowcomms_error_report(struct sock *sk)
{
- struct connection *con;
- void (*orig_report)(struct sock *) = NULL;
+ struct connection *con = sock2con(sk);
struct inet_sock *inet;
- con = sock2con(sk);
- if (con == NULL)
- goto out;
-
- orig_report = listen_sock.sk_error_report;
-
inet = inet_sk(sk);
switch (sk->sk_family) {
case AF_INET:
@@ -642,66 +610,25 @@ static void lowcomms_error_report(struct sock *sk)
"invalid socket family %d set, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
sk->sk_family, sk->sk_err, sk->sk_err_soft);
- goto out;
- }
-
- /* below sendcon only handling */
- if (test_bit(CF_IS_OTHERCON, &con->flags))
- con = con->sendcon;
-
- switch (sk->sk_err) {
- case ECONNREFUSED:
- set_bit(CF_DELAY_CONNECT, &con->flags);
- break;
- default:
break;
}
- if (!test_and_set_bit(CF_RECONNECT, &con->flags))
- queue_work(send_workqueue, &con->swork);
+ dlm_midcomms_unack_msg_resend(con->nodeid);
-out:
- if (orig_report)
- orig_report(sk);
+ listen_sock.sk_error_report(sk);
}
-/* Note: sk_callback_lock must be locked before calling this function. */
-static void save_listen_callbacks(struct socket *sock)
+static void restore_callbacks(struct sock *sk)
{
- struct sock *sk = sock->sk;
-
- listen_sock.sk_data_ready = sk->sk_data_ready;
- listen_sock.sk_state_change = sk->sk_state_change;
- listen_sock.sk_write_space = sk->sk_write_space;
- listen_sock.sk_error_report = sk->sk_error_report;
-}
-
-static void restore_callbacks(struct socket *sock)
-{
- struct sock *sk = sock->sk;
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(!lockdep_sock_is_held(sk));
+#endif
- lock_sock(sk);
sk->sk_user_data = NULL;
sk->sk_data_ready = listen_sock.sk_data_ready;
sk->sk_state_change = listen_sock.sk_state_change;
sk->sk_write_space = listen_sock.sk_write_space;
sk->sk_error_report = listen_sock.sk_error_report;
- release_sock(sk);
-}
-
-static void add_listen_sock(struct socket *sock, struct listen_connection *con)
-{
- struct sock *sk = sock->sk;
-
- lock_sock(sk);
- save_listen_callbacks(sock);
- con->sock = sock;
-
- sk->sk_user_data = con;
- sk->sk_allocation = GFP_NOFS;
- /* Install a data_ready callback */
- sk->sk_data_ready = lowcomms_listen_data_ready;
- release_sock(sk);
}
/* Make a socket active */
@@ -713,11 +640,12 @@ static void add_sock(struct socket *sock, struct connection *con)
con->sock = sock;
sk->sk_user_data = con;
- /* Install a data_ready callback */
sk->sk_data_ready = lowcomms_data_ready;
sk->sk_write_space = lowcomms_write_space;
- sk->sk_state_change = lowcomms_state_change;
+ if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
+ sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
+ sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
}
@@ -727,7 +655,7 @@ static void add_sock(struct socket *sock, struct connection *con)
static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
int *addr_len)
{
- saddr->ss_family = dlm_local_addr[0]->ss_family;
+ saddr->ss_family = dlm_local_addr[0].ss_family;
if (saddr->ss_family == AF_INET) {
struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
in4_addr->sin_port = cpu_to_be16(port);
@@ -773,43 +701,67 @@ static void free_entry(struct writequeue_entry *e)
}
list_del(&e->list);
- atomic_dec(&e->con->writequeue_cnt);
kref_put(&e->ref, dlm_page_release);
}
static void dlm_close_sock(struct socket **sock)
{
- if (*sock) {
- restore_callbacks(*sock);
- sock_release(*sock);
- *sock = NULL;
+ lock_sock((*sock)->sk);
+ restore_callbacks((*sock)->sk);
+ release_sock((*sock)->sk);
+
+ sock_release(*sock);
+ *sock = NULL;
+}
+
+static void allow_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ clear_bit(CF_IO_STOP, &con->othercon->flags);
+ clear_bit(CF_IO_STOP, &con->flags);
+}
+
+static void stop_connection_io(struct connection *con)
+{
+ if (con->othercon)
+ stop_connection_io(con->othercon);
+
+ down_write(&con->sock_lock);
+ if (con->sock) {
+ lock_sock(con->sock->sk);
+ restore_callbacks(con->sock->sk);
+
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ set_bit(CF_IO_STOP, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
}
+ up_write(&con->sock_lock);
+
+ cancel_work_sync(&con->swork);
+ cancel_work_sync(&con->rwork);
}
/* Close a remote connection and tidy up */
-static void close_connection(struct connection *con, bool and_other,
- bool tx, bool rx)
+static void close_connection(struct connection *con, bool and_other)
{
- bool closing = test_and_set_bit(CF_CLOSING, &con->flags);
struct writequeue_entry *e;
- if (tx && !closing && cancel_work_sync(&con->swork)) {
- log_print("canceled swork for node %d", con->nodeid);
- clear_bit(CF_WRITE_PENDING, &con->flags);
- }
- if (rx && !closing && cancel_work_sync(&con->rwork)) {
- log_print("canceled rwork for node %d", con->nodeid);
- clear_bit(CF_READ_PENDING, &con->flags);
+ if (con->othercon && and_other)
+ close_connection(con->othercon, false);
+
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ up_write(&con->sock_lock);
+ return;
}
- mutex_lock(&con->sock_mutex);
dlm_close_sock(&con->sock);
- if (con->othercon && and_other) {
- /* Will only re-enter once. */
- close_connection(con->othercon, false, tx, rx);
- }
-
/* if we send a writequeue entry only a half way, we drop the
* whole entry because reconnection and that we not start of the
* middle of a msg which will confuse the other end.
@@ -821,200 +773,209 @@ static void close_connection(struct connection *con, bool and_other,
* our policy is to start on a clean state when disconnects, we don't
* know what's send/received on transport layer in this case.
*/
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_first_entry(&con->writequeue, struct writequeue_entry,
list);
if (e->dirty)
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
con->rx_leftover = 0;
con->retries = 0;
clear_bit(CF_APP_LIMITED, &con->flags);
- clear_bit(CF_CONNECTED, &con->flags);
- clear_bit(CF_DELAY_CONNECT, &con->flags);
- clear_bit(CF_RECONNECT, &con->flags);
- clear_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- clear_bit(CF_CLOSING, &con->flags);
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ up_write(&con->sock_lock);
}
-static void shutdown_connection(struct connection *con)
+static struct processqueue_entry *new_processqueue_entry(int nodeid,
+ int buflen)
{
- int ret;
-
- flush_work(&con->swork);
+ struct processqueue_entry *pentry;
- mutex_lock(&con->sock_mutex);
- /* nothing to shutdown */
- if (!con->sock) {
- mutex_unlock(&con->sock_mutex);
- return;
- }
+ pentry = kmalloc(sizeof(*pentry), GFP_NOFS);
+ if (!pentry)
+ return NULL;
- set_bit(CF_SHUTDOWN, &con->flags);
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
- mutex_unlock(&con->sock_mutex);
- if (ret) {
- log_print("Connection %p failed to shutdown: %d will force close",
- con, ret);
- goto force_close;
- } else {
- ret = wait_event_timeout(con->shutdown_wait,
- !test_bit(CF_SHUTDOWN, &con->flags),
- DLM_SHUTDOWN_WAIT_TIMEOUT);
- if (ret == 0) {
- log_print("Connection %p shutdown timed out, will force close",
- con);
- goto force_close;
- }
+ pentry->buf = kmalloc(buflen, GFP_NOFS);
+ if (!pentry->buf) {
+ kfree(pentry);
+ return NULL;
}
- return;
+ pentry->nodeid = nodeid;
+ return pentry;
+}
-force_close:
- clear_bit(CF_SHUTDOWN, &con->flags);
- close_connection(con, false, true, true);
+static void free_processqueue_entry(struct processqueue_entry *pentry)
+{
+ kfree(pentry->buf);
+ kfree(pentry);
}
-static void dlm_tcp_shutdown(struct connection *con)
+struct dlm_processed_nodes {
+ int nodeid;
+
+ struct list_head list;
+};
+
+static void add_processed_node(int nodeid, struct list_head *processed_nodes)
{
- if (con->othercon)
- shutdown_connection(con->othercon);
- shutdown_connection(con);
+ struct dlm_processed_nodes *n;
+
+ list_for_each_entry(n, processed_nodes, list) {
+ /* we already remembered this node */
+ if (n->nodeid == nodeid)
+ return;
+ }
+
+ /* if the allocation fails, in the worst case we simply don't send
+ * an ack back; we will try again next time.
+ */
+ n = kmalloc(sizeof(*n), GFP_NOFS);
+ if (!n)
+ return;
+
+ n->nodeid = nodeid;
+ list_add(&n->list, processed_nodes);
}
-static int con_realloc_receive_buf(struct connection *con, int newlen)
+static void process_dlm_messages(struct work_struct *work)
{
- unsigned char *newbuf;
+ struct dlm_processed_nodes *n, *n_tmp;
+ struct processqueue_entry *pentry;
+ LIST_HEAD(processed_nodes);
- newbuf = kmalloc(newlen, GFP_NOFS);
- if (!newbuf)
- return -ENOMEM;
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (WARN_ON_ONCE(!pentry)) {
+ spin_unlock(&processqueue_lock);
+ return;
+ }
- /* copy any leftover from last receive */
- if (con->rx_leftover)
- memmove(newbuf, con->rx_buf, con->rx_leftover);
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
- /* swap to new buffer space */
- kfree(con->rx_buf);
- con->rx_buflen = newlen;
- con->rx_buf = newbuf;
+ for (;;) {
+ dlm_process_incoming_buffer(pentry->nodeid, pentry->buf,
+ pentry->buflen);
+ add_processed_node(pentry->nodeid, &processed_nodes);
+ free_processqueue_entry(pentry);
+
+ spin_lock(&processqueue_lock);
+ pentry = list_first_entry_or_null(&processqueue,
+ struct processqueue_entry, list);
+ if (!pentry) {
+ process_dlm_messages_pending = false;
+ spin_unlock(&processqueue_lock);
+ break;
+ }
- return 0;
+ list_del(&pentry->list);
+ spin_unlock(&processqueue_lock);
+ }
+
+ /* send acks back after we processed a couple of messages */
+ list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) {
+ list_del(&n->list);
+ dlm_midcomms_receive_done(n->nodeid);
+ kfree(n);
+ }
}
/* Data received from remote end */
-static int receive_from_sock(struct connection *con)
+static int receive_from_sock(struct connection *con, int buflen)
{
+ struct processqueue_entry *pentry;
+ int ret, buflen_real;
struct msghdr msg;
struct kvec iov;
- int ret, buflen;
- mutex_lock(&con->sock_mutex);
+ pentry = new_processqueue_entry(con->nodeid, buflen);
+ if (!pentry)
+ return DLM_IO_RESCHED;
- if (con->sock == NULL) {
- ret = -EAGAIN;
- goto out_close;
- }
-
- /* realloc if we get new buffer size to read out */
- buflen = dlm_config.ci_buffer_size;
- if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
- ret = con_realloc_receive_buf(con, buflen);
- if (ret < 0)
- goto out_resched;
- }
+ memcpy(pentry->buf, con->rx_leftover_buf, con->rx_leftover);
- for (;;) {
- /* calculate new buffer parameter regarding last receive and
- * possible leftover bytes
- */
- iov.iov_base = con->rx_buf + con->rx_leftover;
- iov.iov_len = con->rx_buflen - con->rx_leftover;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
- ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
- msg.msg_flags);
- trace_dlm_recv(con->nodeid, ret);
- if (ret == -EAGAIN)
- break;
- else if (ret <= 0)
- goto out_close;
-
- /* new buflen according readed bytes and leftover from last receive */
- buflen = ret + con->rx_leftover;
- ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
- if (ret < 0)
- goto out_close;
-
- /* calculate leftover bytes from process and put it into begin of
- * the receive buffer, so next receive we have the full message
- * at the start address of the receive buffer.
- */
- con->rx_leftover = buflen - ret;
- if (con->rx_leftover) {
- memmove(con->rx_buf, con->rx_buf + ret,
- con->rx_leftover);
+ /* set up the buffer parameters, accounting for possible leftover
+ * bytes from the last receive
+ */
+ iov.iov_base = pentry->buf + con->rx_leftover;
+ iov.iov_len = buflen - con->rx_leftover;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ clear_bit(CF_RECV_INTR, &con->flags);
+again:
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
+ trace_dlm_recv(con->nodeid, ret);
+ if (ret == -EAGAIN) {
+ lock_sock(con->sock->sk);
+ if (test_and_clear_bit(CF_RECV_INTR, &con->flags)) {
+ release_sock(con->sock->sk);
+ goto again;
}
+
+ clear_bit(CF_RECV_PENDING, &con->flags);
+ release_sock(con->sock->sk);
+ free_processqueue_entry(pentry);
+ return DLM_IO_END;
+ } else if (ret == 0) {
+ /* close will clear CF_RECV_PENDING */
+ free_processqueue_entry(pentry);
+ return DLM_IO_EOF;
+ } else if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
}
- dlm_midcomms_receive_done(con->nodeid);
- mutex_unlock(&con->sock_mutex);
- return 0;
+ /* new buflen according to the bytes read plus leftover from the last receive */
+ buflen_real = ret + con->rx_leftover;
+ ret = dlm_validate_incoming_buffer(con->nodeid, pentry->buf,
+ buflen_real);
+ if (ret < 0) {
+ free_processqueue_entry(pentry);
+ return ret;
+ }
-out_resched:
- if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
- queue_work(recv_workqueue, &con->rwork);
- mutex_unlock(&con->sock_mutex);
- return -EAGAIN;
-
-out_close:
- if (ret == 0) {
- log_print("connection %p got EOF from %d",
- con, con->nodeid);
-
- if (dlm_proto_ops->eof_condition &&
- dlm_proto_ops->eof_condition(con)) {
- set_bit(CF_EOF, &con->flags);
- mutex_unlock(&con->sock_mutex);
- } else {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, true, false);
+ pentry->buflen = ret;
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- }
+ /* calculate the leftover bytes from processing and save them in
+ * rx_leftover_buf, so the next receive starts with the full
+ * message at the start of its buffer.
+ */
+ con->rx_leftover = buflen_real - ret;
+ memmove(con->rx_leftover_buf, pentry->buf + ret,
+ con->rx_leftover);
- /* signal to breaking receive worker */
- ret = -1;
- } else {
- mutex_unlock(&con->sock_mutex);
+ spin_lock(&processqueue_lock);
+ list_add_tail(&pentry->list, &processqueue);
+ if (!process_dlm_messages_pending) {
+ process_dlm_messages_pending = true;
+ queue_work(process_workqueue, &process_work);
}
- return ret;
+ spin_unlock(&processqueue_lock);
+
+ return DLM_IO_SUCCESS;
}
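
The fixed rx_leftover_buf replaces the old reallocating receive buffer: bytes that do not yet form a complete message are saved and prepended to the next receive. The condensed flow, with recv() standing in for kernel_recvmsg() (illustrative):

    /* 1. start the buffer with last round's partial message */
    memcpy(buf, con->rx_leftover_buf, con->rx_leftover);

    /* 2. append freshly received bytes */
    n = recv(sock, buf + con->rx_leftover, buflen - con->rx_leftover);

    /* 3. how many leading bytes form complete messages? */
    valid = dlm_validate_incoming_buffer(con->nodeid, buf,
                                         n + con->rx_leftover);

    /* 4. stash the incomplete tail for the next round */
    con->rx_leftover = n + con->rx_leftover - valid;
    memmove(con->rx_leftover_buf, buf + valid, con->rx_leftover);
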
/* Listening socket is busy, accept a connection */
-static int accept_from_sock(struct listen_connection *con)
+static int accept_from_sock(void)
{
- int result;
struct sockaddr_storage peeraddr;
- struct socket *newsock;
- int len, idx;
- int nodeid;
+ int len, idx, result, nodeid;
struct connection *newcon;
- struct connection *addcon;
+ struct socket *newsock;
unsigned int mark;
- if (!con->sock)
- return -ENOTCONN;
-
- result = kernel_accept(con->sock, &newsock, O_NONBLOCK);
- if (result < 0)
+ result = kernel_accept(listen_con.sock, &newsock, O_NONBLOCK);
+ if (result == -EAGAIN)
+ return DLM_IO_END;
+ else if (result < 0)
goto accept_err;
/* Get the connected socket's peer */
@@ -1062,16 +1023,16 @@ static int accept_from_sock(struct listen_connection *con)
* In this case we store the incoming one in "othercon"
*/
idx = srcu_read_lock(&connections_srcu);
- newcon = nodeid2con(nodeid, GFP_NOFS);
- if (!newcon) {
+ newcon = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!newcon)) {
srcu_read_unlock(&connections_srcu, idx);
- result = -ENOMEM;
+ result = -ENOENT;
goto accept_err;
}
sock_set_mark(newsock->sk, mark);
- mutex_lock(&newcon->sock_mutex);
+ down_write(&newcon->sock_lock);
if (newcon->sock) {
struct connection *othercon = newcon->othercon;
@@ -1079,63 +1040,50 @@ static int accept_from_sock(struct listen_connection *con)
othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
- mutex_unlock(&newcon->sock_mutex);
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
result = -ENOMEM;
goto accept_err;
}
- result = dlm_con_init(othercon, nodeid);
- if (result < 0) {
- kfree(othercon);
- mutex_unlock(&newcon->sock_mutex);
- srcu_read_unlock(&connections_srcu, idx);
- goto accept_err;
- }
-
- lockdep_set_subclass(&othercon->sock_mutex, 1);
- set_bit(CF_IS_OTHERCON, &othercon->flags);
+ dlm_con_init(othercon, nodeid);
+ lockdep_set_subclass(&othercon->sock_lock, 1);
newcon->othercon = othercon;
- othercon->sendcon = newcon;
+ set_bit(CF_IS_OTHERCON, &othercon->flags);
} else {
/* close other sock con if we have something new */
- close_connection(othercon, false, true, false);
+ close_connection(othercon, false);
}
- mutex_lock(&othercon->sock_mutex);
+ down_write(&othercon->sock_lock);
add_sock(newsock, othercon);
- addcon = othercon;
- mutex_unlock(&othercon->sock_mutex);
+
+ /* check if we received something while adding */
+ lock_sock(othercon->sock->sk);
+ lowcomms_queue_rwork(othercon);
+ release_sock(othercon->sock->sk);
+ up_write(&othercon->sock_lock);
}
else {
/* accept copies the sk after we've saved the callbacks, so we
don't want to save them a second time or comm errors will
result in calling sk_error_report recursively. */
add_sock(newsock, newcon);
- addcon = newcon;
- }
-
- set_bit(CF_CONNECTED, &addcon->flags);
- mutex_unlock(&newcon->sock_mutex);
-
- /*
- * Add it to the active queue in case we got data
- * between processing the accept adding the socket
- * to the read_sockets list
- */
- if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
- queue_work(recv_workqueue, &addcon->rwork);
+ /* check if we received something while adding */
+ lock_sock(newcon->sock->sk);
+ lowcomms_queue_rwork(newcon);
+ release_sock(newcon->sock->sk);
+ }
+ up_write(&newcon->sock_lock);
srcu_read_unlock(&connections_srcu, idx);
- return 0;
+ return DLM_IO_SUCCESS;
accept_err:
if (newsock)
sock_release(newsock);
- if (result != -EAGAIN)
- log_print("error accepting connection from node: %d", result);
return result;
}
@@ -1167,7 +1115,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
int i, addr_len, result = 0;
for (i = 0; i < dlm_local_count; i++) {
- memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ memcpy(&localaddr, &dlm_local_addr[i], sizeof(localaddr));
make_sockaddr(&localaddr, port, &addr_len);
if (!i)
@@ -1187,7 +1135,7 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
/* Get local addresses */
static void init_local(void)
{
- struct sockaddr_storage sas, *addr;
+ struct sockaddr_storage sas;
int i;
dlm_local_count = 0;
@@ -1195,21 +1143,10 @@ static void init_local(void)
if (dlm_our_addr(&sas, i))
break;
- addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS);
- if (!addr)
- break;
- dlm_local_addr[dlm_local_count++] = addr;
+ memcpy(&dlm_local_addr[dlm_local_count++], &sas, sizeof(sas));
}
}
-static void deinit_local(void)
-{
- int i;
-
- for (i = 0; i < dlm_local_count; i++)
- kfree(dlm_local_addr[i]);
-}
-
static struct writequeue_entry *new_writequeue_entry(struct connection *con)
{
struct writequeue_entry *entry;
@@ -1240,7 +1177,7 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
{
struct writequeue_entry *e;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
if (!list_empty(&con->writequeue)) {
e = list_last_entry(&con->writequeue, struct writequeue_entry, list);
if (DLM_WQ_REMAIN_BYTES(e) >= len) {
@@ -1263,14 +1200,13 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len,
kref_get(&e->ref);
*ppc = page_address(e->page);
e->end += len;
- atomic_inc(&con->writequeue_cnt);
if (cb)
cb(data);
list_add_tail(&e->list, &con->writequeue);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return e;
};
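
The writequeue lock moves to the _bh variants because the queue is now also touched from socket callbacks running in softirq context; taking the plain lock in process context could deadlock if a softirq preempts the holder on the same CPU. A loose userspace analogy, offered only as illustration: mask the asynchronous context (a signal here, a softirq in the kernel) around the critical section.

/* Illustrative analogy for spin_lock_bh(): block the asynchronous
 * context while touching state it also uses, so it cannot interrupt
 * the critical section on this thread. Hypothetical example.
 */
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t counter;

static void handler(int sig) { (void)sig; counter++; }

int main(void)
{
        sigset_t set, old;
        struct sigaction sa = { .sa_handler = handler };

        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);

        /* "spin_lock_bh": mask the async context, then work safely */
        sigprocmask(SIG_BLOCK, &set, &old);
        raise(SIGUSR1);         /* "softirq" raised: stays pending */
        counter += 2;           /* critical section runs undisturbed */
        sigprocmask(SIG_SETMASK, &old, NULL); /* pending signal fires now */

        printf("counter=%d\n", (int)counter);
        return 0;
}
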
@@ -1319,13 +1255,13 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
len < sizeof(struct dlm_header)) {
BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE);
log_print("failed to allocate a buffer of size %d", len);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return NULL;
}
idx = srcu_read_lock(&connections_srcu);
- con = nodeid2con(nodeid, allocation);
- if (!con) {
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
return NULL;
}
@@ -1350,7 +1286,7 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
struct connection *con = e->con;
int users;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
kref_get(&msg->ref);
list_add(&msg->list, &e->msgs);
@@ -1359,13 +1295,11 @@ static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg)
goto out;
e->len = DLM_WQ_LENGTH_BYTES(e);
- spin_unlock(&con->writequeue_lock);
- queue_work(send_workqueue, &con->swork);
- return;
+ lowcomms_queue_swork(con);
out:
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
return;
}
@@ -1387,7 +1321,7 @@ void dlm_lowcomms_put_msg(struct dlm_msg *msg)
kref_put(&msg->ref, dlm_msg_release);
}
-/* does not held connections_srcu, usage workqueue only */
+/* does not held connections_srcu, usage lowcomms_error_report only */
int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
{
struct dlm_msg *msg_resend;
@@ -1413,90 +1347,79 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg)
}
/* Send a message */
-static void send_to_sock(struct connection *con)
+static int send_to_sock(struct connection *con)
{
const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
struct writequeue_entry *e;
int len, offset, ret;
- int count = 0;
- mutex_lock(&con->sock_mutex);
- if (con->sock == NULL)
- goto out_connect;
-
- spin_lock(&con->writequeue_lock);
- for (;;) {
- e = con_next_wq(con);
- if (!e)
- break;
+ spin_lock_bh(&con->writequeue_lock);
+ e = con_next_wq(con);
+ if (!e) {
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ return DLM_IO_END;
+ }
- len = e->len;
- offset = e->offset;
- BUG_ON(len == 0 && e->users == 0);
- spin_unlock(&con->writequeue_lock);
-
- ret = kernel_sendpage(con->sock, e->page, offset, len,
- msg_flags);
- trace_dlm_send(con->nodeid, ret);
- if (ret == -EAGAIN || ret == 0) {
- if (ret == -EAGAIN &&
- test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
- !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
- /* Notify TCP that we're limited by the
- * application window size.
- */
- set_bit(SOCK_NOSPACE, &con->sock->flags);
- con->sock->sk->sk_write_pending++;
- }
- cond_resched();
- goto out;
- } else if (ret < 0)
- goto out;
+ len = e->len;
+ offset = e->offset;
+ WARN_ON_ONCE(len == 0 && e->users == 0);
+ spin_unlock_bh(&con->writequeue_lock);
- /* Don't starve people filling buffers */
- if (++count >= MAX_SEND_MSG_COUNT) {
- cond_resched();
- count = 0;
+ ret = kernel_sendpage(con->sock, e->page, offset, len,
+ msg_flags);
+ trace_dlm_send(con->nodeid, ret);
+ if (ret == -EAGAIN || ret == 0) {
+ lock_sock(con->sock->sk);
+ spin_lock_bh(&con->writequeue_lock);
+ if (test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
+ !test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
+ /* Notify TCP that we're limited by the
+ * application window size.
+ */
+ set_bit(SOCK_NOSPACE, &con->sock->sk->sk_socket->flags);
+ con->sock->sk->sk_write_pending++;
+
+ clear_bit(CF_SEND_PENDING, &con->flags);
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
+
+ /* wait for write_space() event */
+ return DLM_IO_END;
}
+ spin_unlock_bh(&con->writequeue_lock);
+ release_sock(con->sock->sk);
- spin_lock(&con->writequeue_lock);
- writequeue_entry_complete(e, ret);
- }
- spin_unlock(&con->writequeue_lock);
-
- /* close if we got EOF */
- if (test_and_clear_bit(CF_EOF, &con->flags)) {
- mutex_unlock(&con->sock_mutex);
- close_connection(con, false, false, true);
-
- /* handling for tcp shutdown */
- clear_bit(CF_SHUTDOWN, &con->flags);
- wake_up(&con->shutdown_wait);
- } else {
- mutex_unlock(&con->sock_mutex);
+ return DLM_IO_RESCHED;
+ } else if (ret < 0) {
+ return ret;
}
- return;
-
-out:
- mutex_unlock(&con->sock_mutex);
- return;
+ spin_lock_bh(&con->writequeue_lock);
+ writequeue_entry_complete(e, ret);
+ spin_unlock_bh(&con->writequeue_lock);
-out_connect:
- mutex_unlock(&con->sock_mutex);
- queue_work(send_workqueue, &con->swork);
- cond_resched();
+ return DLM_IO_SUCCESS;
}
static void clean_one_writequeue(struct connection *con)
{
struct writequeue_entry *e, *safe;
- spin_lock(&con->writequeue_lock);
+ spin_lock_bh(&con->writequeue_lock);
list_for_each_entry_safe(e, safe, &con->writequeue, list) {
free_entry(e);
}
- spin_unlock(&con->writequeue_lock);
+ spin_unlock_bh(&con->writequeue_lock);
+}
+
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ WARN_ON_ONCE(!list_empty(&con->writequeue));
+ WARN_ON_ONCE(con->sock);
+ kfree(con);
}
/* Called from recovery when it knows that a node has
@@ -1504,286 +1427,311 @@ static void clean_one_writequeue(struct connection *con)
int dlm_lowcomms_close(int nodeid)
{
struct connection *con;
- struct dlm_node_addr *na;
int idx;
log_print("closing connection to node %d", nodeid);
+
idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
- if (con) {
- set_bit(CF_CLOSE, &con->flags);
- close_connection(con, true, true, true);
- clean_one_writequeue(con);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return -ENOENT;
+ }
+
+ stop_connection_io(con);
+ log_print("io handling for node: %d stopped", nodeid);
+ close_connection(con, true);
+
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ clean_one_writequeue(con);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
if (con->othercon)
- clean_one_writequeue(con->othercon);
+ call_srcu(&connections_srcu, &con->othercon->rcu, connection_release);
}
srcu_read_unlock(&connections_srcu, idx);
- spin_lock(&dlm_node_addrs_spin);
- na = find_node_addr(nodeid);
- if (na) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
- }
- spin_unlock(&dlm_node_addrs_spin);
+	/* for debugging we print when we are done to compare with other
+	 * messages in between. This function needs to be correctly
+	 * synchronized with io handling.
+	 */
+ log_print("closing connection to node %d done", nodeid);
return 0;
}
-/* Receive workqueue function */
+/* Receive worker function */
static void process_recv_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, rwork);
+ int ret, buflen;
+
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ return;
+ }
+
+ buflen = READ_ONCE(dlm_config.ci_buffer_size);
+ do {
+ ret = receive_from_sock(con, buflen);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- clear_bit(CF_READ_PENDING, &con->flags);
- receive_from_sock(con);
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_EOF:
+ close_connection(con, false);
+ /* CF_RECV_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ cond_resched();
+ queue_work(io_workqueue, &con->rwork);
+ /* CF_RECV_PENDING not cleared */
+ break;
+ default:
+ if (ret < 0) {
+ if (test_bit(CF_IS_OTHERCON, &con->flags)) {
+ close_connection(con, false);
+ } else {
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ }
+
+ /* CF_RECV_PENDING cleared for othercon
+ * we trigger send queue if not already done
+ * and process_send_sockets will handle it
+ */
+ break;
+ }
+
+ WARN_ON_ONCE(1);
+ break;
+ }
}
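
Receive handling is now driven by DLM_IO_* return codes instead of connection flag bits: the worker loops while slices succeed and dispatches once on the final code. A compact, compilable model of that loop follows; the enum values and do_io() are invented stand-ins, not the kernel definitions.

/* Sketch of a return-code driven worker loop, modeled on the new
 * DLM_IO_SUCCESS/DLM_IO_END/DLM_IO_RESCHED handling.
 */
#include <stdio.h>

enum { IO_SUCCESS, IO_END, IO_RESCHED, IO_EOF };

static int budget = 3;

static int do_io(void)
{
        return budget-- > 0 ? IO_SUCCESS : IO_END;
}

static void worker(void)
{
        int ret;

        do {
                ret = do_io();          /* one bounded I/O slice */
        } while (ret == IO_SUCCESS);    /* keep going while progressing */

        switch (ret) {
        case IO_END:            /* nothing left; pending flag cleared */
                puts("done, wait for next wakeup");
                break;
        case IO_EOF:            /* peer closed; tear down */
                puts("close connection");
                break;
        case IO_RESCHED:        /* yield, then requeue ourselves */
                puts("requeue work item");
                break;
        default:
                if (ret < 0)
                        puts("error: trigger reconnect path");
                break;
        }
}

int main(void) { worker(); return 0; }
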
static void process_listen_recv_socket(struct work_struct *work)
{
- accept_from_sock(&listen_con);
+ int ret;
+
+ if (WARN_ON_ONCE(!listen_con.sock))
+ return;
+
+ do {
+ ret = accept_from_sock();
+ } while (ret == DLM_IO_SUCCESS);
+
+ if (ret < 0)
+ log_print("critical error accepting connection: %d", ret);
}
-static void dlm_connect(struct connection *con)
+static int dlm_connect(struct connection *con)
{
struct sockaddr_storage addr;
int result, addr_len;
struct socket *sock;
unsigned int mark;
- /* Some odd races can cause double-connects, ignore them */
- if (con->retries++ > MAX_CONNECT_RETRIES)
- return;
-
- if (con->sock) {
- log_print("node %d already connected.", con->nodeid);
- return;
- }
-
memset(&addr, 0, sizeof(addr));
result = nodeid_to_addr(con->nodeid, &addr, NULL,
dlm_proto_ops->try_new_addr, &mark);
if (result < 0) {
log_print("no address for nodeid %d", con->nodeid);
- return;
+ return result;
}
/* Create a socket to communicate with */
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0)
- goto socket_err;
+ return result;
sock_set_mark(sock->sk, mark);
dlm_proto_ops->sockopts(sock);
- add_sock(sock, con);
-
result = dlm_proto_ops->bind(sock);
- if (result < 0)
- goto add_sock_err;
+ if (result < 0) {
+ sock_release(sock);
+ return result;
+ }
+
+ add_sock(sock, con);
log_print_ratelimited("connecting to %d", con->nodeid);
make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len);
result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr,
addr_len);
- if (result < 0)
- goto add_sock_err;
-
- return;
-
-add_sock_err:
- dlm_close_sock(&con->sock);
+ switch (result) {
+ case -EINPROGRESS:
+ /* not an error */
+ fallthrough;
+ case 0:
+ break;
+ default:
+ if (result < 0)
+ dlm_close_sock(&con->sock);
-socket_err:
- /*
- * Some errors are fatal and this list might need adjusting. For other
- * errors we try again until the max number of retries is reached.
- */
- if (result != -EHOSTUNREACH &&
- result != -ENETUNREACH &&
- result != -ENETDOWN &&
- result != -EINVAL &&
- result != -EPROTONOSUPPORT) {
- log_print("connect %d try %d error %d", con->nodeid,
- con->retries, result);
- msleep(1000);
- lowcomms_connect_sock(con);
+ break;
}
+
+ return result;
}
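
dlm_connect() now treats -EINPROGRESS from the nonblocking TCP connect as in-progress rather than failure. For reference, the classic userspace counterpart is connect() returning EINPROGRESS, then polling for writability and reading SO_ERROR; the address and port below are just example values.

/* Classic nonblocking connect handling, the userspace counterpart of
 * dlm_tcp_connect() returning -EINPROGRESS: start the connect, wait
 * for writability, then read SO_ERROR for the real result.
 */
#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_in addr = { .sin_family = AF_INET,
                                    .sin_port = htons(21064) };
        int fd, rc, err = 0;
        socklen_t len = sizeof(err);

        inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
        fd = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, 0);

        rc = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
        if (rc < 0 && errno == EINPROGRESS) {
                struct pollfd pfd = { .fd = fd, .events = POLLOUT };

                poll(&pfd, 1, 5000);    /* wait for the handshake */
                getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
        } else if (rc < 0) {
                err = errno;
        }

        printf("connect result: %s\n", err ? strerror(err) : "ok");
        close(fd);
        return 0;
}
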
-/* Send workqueue function */
+/* Send worker function */
static void process_send_sockets(struct work_struct *work)
{
struct connection *con = container_of(work, struct connection, swork);
+ int ret;
- WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags));
-
- clear_bit(CF_WRITE_PENDING, &con->flags);
+ WARN_ON_ONCE(test_bit(CF_IS_OTHERCON, &con->flags));
- if (test_and_clear_bit(CF_RECONNECT, &con->flags)) {
- close_connection(con, false, false, true);
- dlm_midcomms_unack_msg_resend(con->nodeid);
+ down_read(&con->sock_lock);
+ if (!con->sock) {
+ up_read(&con->sock_lock);
+ down_write(&con->sock_lock);
+ if (!con->sock) {
+ ret = dlm_connect(con);
+ switch (ret) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+			/* avoid spamming resched on connection;
+			 * we might switch to a state_change
+			 * event based mechanism once established
+			 */
+ msleep(100);
+ break;
+ default:
+ /* CF_SEND_PENDING not cleared */
+ up_write(&con->sock_lock);
+ log_print("connect to node %d try %d error %d",
+ con->nodeid, con->retries++, ret);
+ msleep(1000);
+			/* For now we try forever to reconnect. In the
+			 * future we should send an event to the cluster
+			 * manager to fence itself after a certain number
+			 * of retries.
+			 */
+ queue_work(io_workqueue, &con->swork);
+ return;
+ }
+ }
+ downgrade_write(&con->sock_lock);
}
- if (con->sock == NULL) {
- if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags))
- msleep(1000);
+ do {
+ ret = send_to_sock(con);
+ } while (ret == DLM_IO_SUCCESS);
+ up_read(&con->sock_lock);
- mutex_lock(&con->sock_mutex);
- dlm_connect(con);
- mutex_unlock(&con->sock_mutex);
- }
+ switch (ret) {
+ case DLM_IO_END:
+ /* CF_SEND_PENDING cleared */
+ break;
+ case DLM_IO_RESCHED:
+ /* CF_SEND_PENDING not cleared */
+ cond_resched();
+ queue_work(io_workqueue, &con->swork);
+ break;
+ default:
+ if (ret < 0) {
+ close_connection(con, false);
+
+ /* CF_SEND_PENDING cleared */
+ spin_lock_bh(&con->writequeue_lock);
+ lowcomms_queue_swork(con);
+ spin_unlock_bh(&con->writequeue_lock);
+ break;
+ }
- if (!list_empty(&con->writequeue))
- send_to_sock(con);
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void work_stop(void)
{
- if (recv_workqueue) {
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+ if (io_workqueue) {
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
}
- if (send_workqueue) {
- destroy_workqueue(send_workqueue);
- send_workqueue = NULL;
+ if (process_workqueue) {
+ destroy_workqueue(process_workqueue);
+ process_workqueue = NULL;
}
}
static int work_start(void)
{
- recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM);
- if (!recv_workqueue) {
- log_print("can't start dlm_recv");
+ io_workqueue = alloc_workqueue("dlm_io", WQ_HIGHPRI | WQ_MEM_RECLAIM,
+ 0);
+ if (!io_workqueue) {
+ log_print("can't start dlm_io");
return -ENOMEM;
}
- send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM);
- if (!send_workqueue) {
- log_print("can't start dlm_send");
- destroy_workqueue(recv_workqueue);
- recv_workqueue = NULL;
+	/* ordered dlm message processing queue;
+	 * should eventually be converted to a tasklet
+	 */
+ process_workqueue = alloc_ordered_workqueue("dlm_process",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM);
+ if (!process_workqueue) {
+ log_print("can't start dlm_process");
+ destroy_workqueue(io_workqueue);
+ io_workqueue = NULL;
return -ENOMEM;
}
return 0;
}
-static void shutdown_conn(struct connection *con)
-{
- if (dlm_proto_ops->shutdown_action)
- dlm_proto_ops->shutdown_action(con);
-}
-
void dlm_lowcomms_shutdown(void)
{
- int idx;
-
- /* Set all the flags to prevent any
- * socket activity.
- */
- dlm_allow_conn = 0;
-
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
+ /* stop lowcomms_listen_data_ready calls */
+ lock_sock(listen_con.sock->sk);
+ listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
+ release_sock(listen_con.sock->sk);
+ cancel_work_sync(&listen_con.rwork);
dlm_close_sock(&listen_con.sock);
- idx = srcu_read_lock(&connections_srcu);
- foreach_conn(shutdown_conn);
- srcu_read_unlock(&connections_srcu, idx);
-}
-
-static void _stop_conn(struct connection *con, bool and_other)
-{
- mutex_lock(&con->sock_mutex);
- set_bit(CF_CLOSE, &con->flags);
- set_bit(CF_READ_PENDING, &con->flags);
- set_bit(CF_WRITE_PENDING, &con->flags);
- if (con->sock && con->sock->sk) {
- lock_sock(con->sock->sk);
- con->sock->sk->sk_user_data = NULL;
- release_sock(con->sock->sk);
- }
- if (con->othercon && and_other)
- _stop_conn(con->othercon, false);
- mutex_unlock(&con->sock_mutex);
-}
-
-static void stop_conn(struct connection *con)
-{
- _stop_conn(con, true);
+ flush_workqueue(process_workqueue);
}
-static void connection_release(struct rcu_head *rcu)
+void dlm_lowcomms_shutdown_node(int nodeid, bool force)
{
- struct connection *con = container_of(rcu, struct connection, rcu);
-
- kfree(con->rx_buf);
- kfree(con);
-}
+ struct connection *con;
+ int idx;
-static void free_conn(struct connection *con)
-{
- close_connection(con, true, true, true);
- spin_lock(&connections_lock);
- hlist_del_rcu(&con->list);
- spin_unlock(&connections_lock);
- if (con->othercon) {
- clean_one_writequeue(con->othercon);
- call_srcu(&connections_srcu, &con->othercon->rcu,
- connection_release);
+ idx = srcu_read_lock(&connections_srcu);
+ con = nodeid2con(nodeid, 0);
+ if (WARN_ON_ONCE(!con)) {
+ srcu_read_unlock(&connections_srcu, idx);
+ return;
}
- clean_one_writequeue(con);
- call_srcu(&connections_srcu, &con->rcu, connection_release);
-}
-static void work_flush(void)
-{
- int ok;
- int i;
- struct connection *con;
-
- do {
- ok = 1;
- foreach_conn(stop_conn);
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
- for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_rcu(con, &connection_hash[i],
- list) {
- ok &= test_bit(CF_READ_PENDING, &con->flags);
- ok &= test_bit(CF_WRITE_PENDING, &con->flags);
- if (con->othercon) {
- ok &= test_bit(CF_READ_PENDING,
- &con->othercon->flags);
- ok &= test_bit(CF_WRITE_PENDING,
- &con->othercon->flags);
- }
- }
- }
- } while (!ok);
+ flush_work(&con->swork);
+ stop_connection_io(con);
+ WARN_ON_ONCE(!force && !list_empty(&con->writequeue));
+ close_connection(con, true);
+ clean_one_writequeue(con);
+ if (con->othercon)
+ clean_one_writequeue(con->othercon);
+ allow_connection_io(con);
+ srcu_read_unlock(&connections_srcu, idx);
}
void dlm_lowcomms_stop(void)
{
- int idx;
-
- idx = srcu_read_lock(&connections_srcu);
- work_flush();
- foreach_conn(free_conn);
- srcu_read_unlock(&connections_srcu, idx);
work_stop();
- deinit_local();
-
dlm_proto_ops = NULL;
}
@@ -1799,7 +1747,7 @@ static int dlm_listen_for_all(void)
if (result < 0)
return result;
- result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ result = sock_create_kern(&init_net, dlm_local_addr[0].ss_family,
SOCK_STREAM, dlm_proto_ops->proto, &sock);
if (result < 0) {
log_print("Can't create comms socket: %d", result);
@@ -1813,14 +1761,23 @@ static int dlm_listen_for_all(void)
if (result < 0)
goto out;
- save_listen_callbacks(sock);
- add_listen_sock(sock, &listen_con);
+ lock_sock(sock->sk);
+ listen_sock.sk_data_ready = sock->sk->sk_data_ready;
+ listen_sock.sk_write_space = sock->sk->sk_write_space;
+ listen_sock.sk_error_report = sock->sk->sk_error_report;
+ listen_sock.sk_state_change = sock->sk->sk_state_change;
+
+ listen_con.sock = sock;
+
+ sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
+ sock->sk->sk_data_ready = lowcomms_listen_data_ready;
+ release_sock(sock->sk);
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
result = sock->ops->listen(sock, 5);
if (result < 0) {
dlm_close_sock(&listen_con.sock);
- goto out;
+ return result;
}
return 0;
@@ -1838,7 +1795,7 @@ static int dlm_tcp_bind(struct socket *sock)
/* Bind to our cluster-known address connecting to avoid
* routing problems.
*/
- memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr));
+ memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
@@ -1854,17 +1811,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- int ret;
-
- ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
- switch (ret) {
- case -EINPROGRESS:
- fallthrough;
- case 0:
- return 0;
- }
-
- return ret;
+ return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1895,8 +1842,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
int addr_len;
/* Bind to our port */
- make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0],
+ make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
+ return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
addr_len);
}
@@ -1909,8 +1856,6 @@ static const struct dlm_proto_ops dlm_tcp_ops = {
.listen_validate = dlm_tcp_listen_validate,
.listen_sockopts = dlm_tcp_listen_sockopts,
.listen_bind = dlm_tcp_listen_bind,
- .shutdown_action = dlm_tcp_shutdown,
- .eof_condition = tcp_eof_condition,
};
static int dlm_sctp_bind(struct socket *sock)
@@ -1931,13 +1876,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
sock_set_sndtimeo(sock->sk, 5);
ret = sock->ops->connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
- if (ret < 0)
- return ret;
-
- if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("connected to node %d", con->nodeid);
-
- return 0;
+ return ret;
}
static int dlm_sctp_listen_validate(void)
@@ -1977,11 +1916,7 @@ static const struct dlm_proto_ops dlm_sctp_ops = {
int dlm_lowcomms_start(void)
{
- int error = -EINVAL;
- int i;
-
- for (i = 0; i < CONN_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&connection_hash[i]);
+ int error;
init_local();
if (!dlm_local_count) {
@@ -1990,13 +1925,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
-
error = work_start();
if (error)
- goto fail_local;
-
- dlm_allow_conn = 1;
+ goto fail;
/* Start listening */
switch (dlm_config.ci_protocol) {
@@ -2022,25 +1953,38 @@ int dlm_lowcomms_start(void)
fail_listen:
dlm_proto_ops = NULL;
fail_proto_ops:
- dlm_allow_conn = 0;
- dlm_close_sock(&listen_con.sock);
work_stop();
-fail_local:
- deinit_local();
fail:
return error;
}
+void dlm_lowcomms_init(void)
+{
+ int i;
+
+ for (i = 0; i < CONN_HASH_SIZE; i++)
+ INIT_HLIST_HEAD(&connection_hash[i]);
+
+ INIT_WORK(&listen_con.rwork, process_listen_recv_socket);
+}
+
void dlm_lowcomms_exit(void)
{
- struct dlm_node_addr *na, *safe;
+ struct connection *con;
+ int i, idx;
- spin_lock(&dlm_node_addrs_spin);
- list_for_each_entry_safe(na, safe, &dlm_node_addrs, list) {
- list_del(&na->list);
- while (na->addr_count--)
- kfree(na->addr[na->addr_count]);
- kfree(na);
+ idx = srcu_read_lock(&connections_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+
+ if (con->othercon)
+ call_srcu(&connections_srcu, &con->othercon->rcu,
+ connection_release);
+ call_srcu(&connections_srcu, &con->rcu, connection_release);
+ }
}
- spin_unlock(&dlm_node_addrs_spin);
+ srcu_read_unlock(&connections_srcu, idx);
}
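
dlm_lowcomms_exit() unlinks each connection with hlist_del_rcu() and defers the kfree() through call_srcu(), so SRCU readers that still hold a pointer can finish first. A toy model of that unpublish, wait, free sequence; it uses a mutex and a reader count instead of real SRCU, which is lock-free on the read side.

/* Toy model of the call_srcu() pattern: unpublish an object, wait for
 * all in-flight readers, then free.
 */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int readers;
static int *shared;             /* the "published" object */

static void read_side(void)
{
        pthread_mutex_lock(&lock);
        readers++;              /* srcu_read_lock() */
        int *p = shared;
        pthread_mutex_unlock(&lock);

        if (p)
                printf("reader sees %d\n", *p);

        pthread_mutex_lock(&lock);
        if (--readers == 0)     /* srcu_read_unlock() */
                pthread_cond_signal(&cv);
        pthread_mutex_unlock(&lock);
}

static void unpublish_and_free(void)
{
        pthread_mutex_lock(&lock);
        int *p = shared;
        shared = NULL;          /* hlist_del_rcu(): no new readers find it */
        while (readers)         /* grace period: wait out old readers */
                pthread_cond_wait(&cv, &lock);
        pthread_mutex_unlock(&lock);
        free(p);                /* connection_release() */
}

int main(void)
{
        shared = malloc(sizeof(*shared));
        *shared = 42;
        read_side();
        unpublish_and_free();
        return 0;
}
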
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index 29369feea991..3e8dca66183b 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -29,12 +29,14 @@ static inline int nodeid_hash(int nodeid)
return nodeid & (CONN_HASH_SIZE-1);
}
-/* switch to check if dlm is running */
-extern int dlm_allow_conn;
+/* check if dlm is running */
+bool dlm_lowcomms_is_running(void);
int dlm_lowcomms_start(void);
void dlm_lowcomms_shutdown(void);
+void dlm_lowcomms_shutdown_node(int nodeid, bool force);
void dlm_lowcomms_stop(void);
+void dlm_lowcomms_init(void);
void dlm_lowcomms_exit(void);
int dlm_lowcomms_close(int nodeid);
struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 1c5be4b70ac1..a77338be3237 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -17,7 +17,7 @@
#include "user.h"
#include "memory.h"
#include "config.h"
-#include "lowcomms.h"
+#include "midcomms.h"
#define CREATE_TRACE_POINTS
#include <trace/events/dlm.h>
@@ -30,6 +30,8 @@ static int __init init_dlm(void)
if (error)
goto out;
+ dlm_midcomms_init();
+
error = dlm_lockspace_init();
if (error)
goto out_mem;
@@ -66,6 +68,7 @@ static int __init init_dlm(void)
out_lockspace:
dlm_lockspace_exit();
out_mem:
+ dlm_midcomms_exit();
dlm_memory_exit();
out:
return error;
@@ -79,7 +82,7 @@ static void __exit exit_dlm(void)
dlm_config_exit();
dlm_memory_exit();
dlm_lockspace_exit();
- dlm_lowcomms_exit();
+ dlm_midcomms_exit();
dlm_unregister_debugfs();
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 2af2ccfe43a9..923c01a8a0aa 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -573,7 +573,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
node = &rv->nodes[i];
if (dlm_is_member(ls, node->nodeid))
continue;
- dlm_add_member(ls, node);
+ error = dlm_add_member(ls, node);
+ if (error)
+ return error;
+
log_rinfo(ls, "add member %d", node->nodeid);
}
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index ce35c3c19aeb..eb7a08641fcf 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -14,12 +14,14 @@
#include "lowcomms.h"
#include "config.h"
#include "memory.h"
+#include "ast.h"
static struct kmem_cache *writequeue_cache;
static struct kmem_cache *mhandle_cache;
static struct kmem_cache *msg_cache;
static struct kmem_cache *lkb_cache;
static struct kmem_cache *rsb_cache;
+static struct kmem_cache *cb_cache;
int __init dlm_memory_init(void)
@@ -46,8 +48,16 @@ int __init dlm_memory_init(void)
if (!rsb_cache)
goto rsb;
+ cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
+ __alignof__(struct dlm_callback), 0,
+ NULL);
+	if (!cb_cache)
+ goto cb;
+
return 0;
+cb:
+ kmem_cache_destroy(rsb_cache);
rsb:
kmem_cache_destroy(msg_cache);
msg:
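
The new cb_cache slot keeps the usual goto-unwind error handling: each failure label releases exactly what the earlier steps set up. The idiom in standalone form, with malloc() standing in for kmem_cache_create():

/* The goto-unwind idiom used by dlm_memory_init(): each label frees
 * what was allocated before the failing step.
 */
#include <stdlib.h>

static void *a, *b, *c;

int init(void)
{
        a = malloc(16);
        if (!a)
                goto fail;
        b = malloc(16);
        if (!b)
                goto free_a;
        c = malloc(16);
        if (!c)
                goto free_b;
        return 0;

free_b:
        free(b);
free_a:
        free(a);
fail:
        return -1;
}

int main(void) { return init() ? 1 : 0; }
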
@@ -67,6 +77,7 @@ void dlm_memory_exit(void)
kmem_cache_destroy(msg_cache);
kmem_cache_destroy(lkb_cache);
kmem_cache_destroy(rsb_cache);
+ kmem_cache_destroy(cb_cache);
}
char *dlm_allocate_lvb(struct dlm_ls *ls)
@@ -115,12 +126,17 @@ void dlm_free_lkb(struct dlm_lkb *lkb)
kfree(ua);
}
}
+
+ /* drop references if they are set */
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL);
+ dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL);
+
kmem_cache_free(lkb_cache, lkb);
}
-struct dlm_mhandle *dlm_allocate_mhandle(void)
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation)
{
- return kmem_cache_alloc(mhandle_cache, GFP_NOFS);
+ return kmem_cache_alloc(mhandle_cache, allocation);
}
void dlm_free_mhandle(struct dlm_mhandle *mhandle)
@@ -147,3 +163,13 @@ void dlm_free_msg(struct dlm_msg *msg)
{
kmem_cache_free(msg_cache, msg);
}
+
+struct dlm_callback *dlm_allocate_cb(void)
+{
+ return kmem_cache_alloc(cb_cache, GFP_ATOMIC);
+}
+
+void dlm_free_cb(struct dlm_callback *cb)
+{
+ kmem_cache_free(cb_cache, cb);
+}
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
index 7bd3f1a391ca..6b29563d24f7 100644
--- a/fs/dlm/memory.h
+++ b/fs/dlm/memory.h
@@ -20,12 +20,14 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls);
void dlm_free_lkb(struct dlm_lkb *l);
char *dlm_allocate_lvb(struct dlm_ls *ls);
void dlm_free_lvb(char *l);
-struct dlm_mhandle *dlm_allocate_mhandle(void);
+struct dlm_mhandle *dlm_allocate_mhandle(gfp_t allocation);
void dlm_free_mhandle(struct dlm_mhandle *mhandle);
struct writequeue_entry *dlm_allocate_writequeue(void);
void dlm_free_writequeue(struct writequeue_entry *writequeue);
struct dlm_msg *dlm_allocate_msg(gfp_t allocation);
void dlm_free_msg(struct dlm_msg *msg);
+struct dlm_callback *dlm_allocate_cb(void);
+void dlm_free_cb(struct dlm_callback *cb);
#endif /* __MEMORY_DOT_H__ */
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 6489bc22ad61..fc015a6abe17 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -132,6 +132,7 @@
*/
#define DLM_DEBUG_FENCE_TERMINATION 0
+#include <trace/events/dlm.h>
#include <net/tcp.h>
#include "dlm_internal.h"
@@ -194,7 +195,7 @@ struct midcomms_node {
};
struct dlm_mhandle {
- const struct dlm_header *inner_hd;
+ const union dlm_packet *inner_p;
struct midcomms_node *node;
struct dlm_opts *opts;
struct dlm_msg *msg;
@@ -305,11 +306,11 @@ static void dlm_send_queue_flush(struct midcomms_node *node)
pr_debug("flush midcomms send queue of node %d\n", node->nodeid);
rcu_read_lock();
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
dlm_mhandle_delete(node, mh);
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -415,7 +416,7 @@ static int dlm_send_fin(struct midcomms_node *node,
m_header->h_cmd = DLM_FIN;
pr_debug("sending fin msg to node %d\n", node->nodeid);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags);
return 0;
@@ -436,7 +437,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
}
}
- spin_lock(&node->send_queue_lock);
+ spin_lock_bh(&node->send_queue_lock);
list_for_each_entry_rcu(mh, &node->send_queue, list) {
if (before(mh->seq, seq)) {
dlm_mhandle_delete(node, mh);
@@ -445,7 +446,7 @@ static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq)
break;
}
}
- spin_unlock(&node->send_queue_lock);
+ spin_unlock_bh(&node->send_queue_lock);
rcu_read_unlock();
}
@@ -468,12 +469,26 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
}
+static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p)
+{
+ switch (p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_recv_message(dlm_our_nodeid(), seq, &p->message);
+ break;
+ case DLM_RCOM:
+ trace_dlm_recv_rcom(dlm_our_nodeid(), seq, &p->rcom);
+ break;
+ default:
+ break;
+ }
+}
+
static void dlm_midcomms_receive_buffer(union dlm_packet *p,
struct midcomms_node *node,
uint32_t seq)
@@ -525,7 +540,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -533,7 +548,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags);
break;
default:
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ dlm_receive_buffer_3_2_trace(seq, p);
dlm_receive_buffer(p, node->nodeid);
set_bit(DLM_NODE_ULP_DELIVERED, &node->flags);
break;
@@ -754,7 +770,7 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
goto out;
}
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
dlm_receive_buffer(p, nodeid);
break;
case DLM_OPTS:
@@ -874,12 +890,7 @@ static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
dlm_receive_buffer(p, nodeid);
}
-/*
- * Called from the low-level comms layer to process a buffer of
- * commands.
- */
-
-int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
const unsigned char *ptr = buf;
const struct dlm_header *hd;
@@ -914,6 +925,32 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
if (msglen > len)
break;
+ ret += msglen;
+ len -= msglen;
+ ptr += msglen;
+ }
+
+ return ret;
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ */
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
+{
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
+ uint16_t msglen;
+ int ret = 0;
+
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ msglen = le16_to_cpu(hd->h_length);
+ if (msglen > len)
+ break;
+
switch (hd->h_version) {
case cpu_to_le32(DLM_VERSION_3_1):
dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
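
The split introduces dlm_validate_incoming_buffer(), which walks the stream header by header and reports how many bytes form complete messages, leaving a trailing partial message for the next read. The same walk in standalone form; the 2-byte little-endian length header below is an invented stand-in for struct dlm_header.

/* Standalone version of the validate-then-consume walk over a
 * length-prefixed byte stream, as in dlm_validate_incoming_buffer().
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static int validate(const unsigned char *buf, int len)
{
        int ret = 0;

        while (len >= (int)sizeof(uint16_t)) {
                uint16_t msglen;

                /* little-endian length field, like le16_to_cpu() */
                memcpy(&msglen, buf, sizeof(msglen));
                if (msglen < sizeof(msglen))
                        return -1;      /* malformed header */
                if (msglen > len)
                        break;          /* partial message: stop here */

                ret += msglen;
                len -= msglen;
                buf += msglen;
        }

        return ret;     /* bytes forming complete messages */
}

int main(void)
{
        unsigned char buf[] = { 4, 0, 'h', 'i', 6, 0, 'w' }; /* 2nd short */

        printf("complete bytes: %d\n", validate(buf, (int)sizeof(buf)));
        return 0;
}
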
@@ -1030,9 +1067,9 @@ static void midcomms_new_msg_cb(void *data)
atomic_inc(&mh->node->send_queue_cnt);
- spin_lock(&mh->node->send_queue_lock);
+ spin_lock_bh(&mh->node->send_queue_lock);
list_add_tail_rcu(&mh->list, &mh->node->send_queue);
- spin_unlock(&mh->node->send_queue_lock);
+ spin_unlock_bh(&mh->node->send_queue_lock);
mh->seq = mh->node->seq_send++;
}
@@ -1055,7 +1092,7 @@ static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int node
dlm_fill_opts_header(opts, len, mh->seq);
*ppc += sizeof(*opts);
- mh->inner_hd = (const struct dlm_header *)*ppc;
+ mh->inner_p = (const union dlm_packet *)*ppc;
return msg;
}
@@ -1079,9 +1116,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
}
	/* this is a bug, however we go on and hope it will be resolved */
- WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
+ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
- mh = dlm_allocate_mhandle();
+ mh = dlm_allocate_mhandle(allocation);
if (!mh)
goto err;
@@ -1111,7 +1148,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
break;
default:
dlm_free_mhandle(mh);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
goto err;
}
@@ -1130,11 +1167,32 @@ err:
}
#endif
-static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
+static void dlm_midcomms_commit_msg_3_2_trace(const struct dlm_mhandle *mh,
+ const void *name, int namelen)
+{
+ switch (mh->inner_p->header.h_cmd) {
+ case DLM_MSG:
+ trace_dlm_send_message(mh->node->nodeid, mh->seq,
+ &mh->inner_p->message,
+ name, namelen);
+ break;
+ case DLM_RCOM:
+ trace_dlm_send_rcom(mh->node->nodeid, mh->seq,
+ &mh->inner_p->rcom);
+ break;
+ default:
+ /* nothing to trace */
+ break;
+ }
+}
+
+static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
/* nexthdr chain for fast lookup */
- mh->opts->o_nextcmd = mh->inner_hd->h_cmd;
+ mh->opts->o_nextcmd = mh->inner_p->header.h_cmd;
mh->committed = true;
+ dlm_midcomms_commit_msg_3_2_trace(mh, name, namelen);
dlm_lowcomms_commit_msg(mh->msg);
}
@@ -1142,8 +1200,10 @@ static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh)
* dlm_midcomms_get_mhandle
*/
#ifndef __CHECKER__
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh,
+ const void *name, int namelen)
{
switch (mh->node->version) {
case DLM_VERSION_3_1:
srcu_read_unlock(&nodes_srcu, mh->idx);
@@ -1154,12 +1214,12 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
dlm_free_mhandle(mh);
break;
case DLM_VERSION_3_2:
- dlm_midcomms_commit_msg_3_2(mh);
+ dlm_midcomms_commit_msg_3_2(mh, name, namelen);
srcu_read_unlock(&nodes_srcu, mh->idx);
break;
default:
srcu_read_unlock(&nodes_srcu, mh->idx);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
break;
}
}
@@ -1167,12 +1227,27 @@ void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh)
int dlm_midcomms_start(void)
{
+ return dlm_lowcomms_start();
+}
+
+void dlm_midcomms_stop(void)
+{
+ dlm_lowcomms_stop();
+}
+
+void dlm_midcomms_init(void)
+{
int i;
for (i = 0; i < CONN_HASH_SIZE; i++)
INIT_HLIST_HEAD(&node_hash[i]);
- return dlm_lowcomms_start();
+ dlm_lowcomms_init();
+}
+
+void dlm_midcomms_exit(void)
+{
+ dlm_lowcomms_exit();
}
static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
@@ -1201,7 +1276,7 @@ static void dlm_act_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
log_print("%s: unexpected state: %d\n",
__func__, node->state);
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return;
}
spin_unlock(&node->state_lock);
@@ -1319,7 +1394,7 @@ static void midcomms_node_release(struct rcu_head *rcu)
{
struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
- WARN_ON(atomic_read(&node->send_queue_cnt));
+ WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
kfree(node);
}
@@ -1372,11 +1447,13 @@ static void midcomms_shutdown(struct midcomms_node *node)
pr_debug("active shutdown timed out for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
midcomms_node_reset(node);
+ dlm_lowcomms_shutdown_node(node->nodeid, true);
return;
}
pr_debug("active shutdown done for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
+ dlm_lowcomms_shutdown_node(node->nodeid, false);
}
void dlm_midcomms_shutdown(void)
@@ -1384,6 +1461,8 @@ void dlm_midcomms_shutdown(void)
struct midcomms_node *node;
int i, idx;
+ dlm_lowcomms_shutdown();
+
mutex_lock(&close_lock);
idx = srcu_read_lock(&nodes_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
@@ -1401,8 +1480,6 @@ void dlm_midcomms_shutdown(void)
}
srcu_read_unlock(&nodes_srcu, idx);
mutex_unlock(&close_lock);
-
- dlm_lowcomms_shutdown();
}
int dlm_midcomms_close(int nodeid)
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 82bcd9661922..bea1cee4279c 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -14,12 +14,17 @@
struct midcomms_node;
+int dlm_validate_incoming_buffer(int nodeid, unsigned char *buf, int len);
int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc);
-void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh);
+void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
+ int namelen);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
+void dlm_midcomms_stop(void);
+void dlm_midcomms_init(void);
+void dlm_midcomms_exit(void);
void dlm_midcomms_shutdown(void);
void dlm_midcomms_add_member(int nodeid);
void dlm_midcomms_remove_member(int nodeid);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f19860315043..b76d52e2f6bd 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -91,7 +91,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
{
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
}
static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
@@ -516,7 +516,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
rf = (struct rcom_config *) rc->rc_buf;
rf->rf_lvblen = cpu_to_le32(~0U);
- dlm_midcomms_commit_mhandle(mh);
+ dlm_midcomms_commit_mhandle(mh, NULL, 0);
return 0;
}
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 036a9a0078f6..8be2893ad15b 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -44,7 +44,8 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
e->recover_seq = ls->ls_recover_seq & 0xFFFFFFFF;
e->nodeid = nodeid;
- memcpy(&e->request, ms, le16_to_cpu(ms->m_header.h_length));
+ memcpy(&e->request, ms, sizeof(*ms));
+ memcpy(&e->request.m_extra, ms->m_extra, length);
atomic_inc(&ls->ls_requestqueue_cnt);
mutex_lock(&ls->ls_requestqueue_mutex);
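
The requestqueue change copies the fixed part of the message and the trailing m_extra payload separately, which is the standard way to duplicate a structure that ends in a flexible array member. A self-contained sketch with invented types:

/* Copying a struct with a flexible array member in two steps, as the
 * requestqueue change does for struct dlm_message plus m_extra.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct msg {
        int type;
        int extra_len;
        char extra[];           /* flexible array member */
};

static struct msg *dup_msg(const struct msg *src)
{
        struct msg *e = malloc(sizeof(*e) + src->extra_len);

        if (!e)
                return NULL;
        memcpy(e, src, sizeof(*e));                     /* fixed part */
        memcpy(e->extra, src->extra, src->extra_len);   /* payload */
        return e;
}

int main(void)
{
        struct msg *m = malloc(sizeof(*m) + 5);
        struct msg *copy;

        m->type = 1;
        m->extra_len = 5;
        memcpy(m->extra, "data", 5);

        copy = dup_msg(m);
        printf("%s\n", copy->extra);
        free(copy);
        free(m);
        return 0;
}
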
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index c5d27bccc3dc..35129505ddda 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -25,6 +25,7 @@
#include "user.h"
#include "ast.h"
#include "config.h"
+#include "memory.h"
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
@@ -175,7 +176,7 @@ static int lkb_is_endoflife(int mode, int status)
being removed and then remove that lkb from the orphans list and free it */
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq)
+ int status, uint32_t sbflags)
{
struct dlm_ls *ls;
struct dlm_user_args *ua;
@@ -209,16 +210,22 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
spin_lock(&proc->asts_spin);
- rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
- if (rv < 0) {
+ rv = dlm_enqueue_lkb_callback(lkb, flags, mode, status, sbflags);
+ switch (rv) {
+ case DLM_ENQUEUE_CALLBACK_FAILURE:
spin_unlock(&proc->asts_spin);
+ WARN_ON_ONCE(1);
goto out;
- }
-
- if (list_empty(&lkb->lkb_cb_list)) {
+ case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
kref_get(&lkb->lkb_ref);
list_add_tail(&lkb->lkb_cb_list, &proc->asts);
wake_up_interruptible(&proc->wait);
+ break;
+ case DLM_ENQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
spin_unlock(&proc->asts_spin);
@@ -800,8 +807,8 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
struct dlm_user_proc *proc = file->private_data;
struct dlm_lkb *lkb;
DECLARE_WAITQUEUE(wait, current);
- struct dlm_callback cb;
- int rv, resid, copy_lvb = 0;
+ struct dlm_callback *cb;
+	bool last = false;
+	int rv, copy_lvb = 0;
int old_mode, new_mode;
if (count == sizeof(struct dlm_device_version)) {
@@ -857,53 +864,58 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
without removing lkb_cb_list; so empty lkb_cb_list is always
consistent with empty lkb_callbacks */
- lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_cb_list);
+ lkb = list_first_entry(&proc->asts, struct dlm_lkb, lkb_cb_list);
/* rem_lkb_callback sets a new lkb_last_cast */
- old_mode = lkb->lkb_last_cast.mode;
+ old_mode = lkb->lkb_last_cast->mode;
- rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
- if (rv < 0) {
+ rv = dlm_dequeue_lkb_callback(lkb, &cb);
+ switch (rv) {
+ case DLM_DEQUEUE_CALLBACK_EMPTY:
/* this shouldn't happen; lkb should have been removed from
- list when resid was zero */
+ * list when last item was dequeued
+ */
log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
list_del_init(&lkb->lkb_cb_list);
spin_unlock(&proc->asts_spin);
/* removes ref for proc->asts, may cause lkb to be freed */
dlm_put_lkb(lkb);
+ WARN_ON_ONCE(1);
goto try_another;
- }
- if (!resid)
+ case DLM_DEQUEUE_CALLBACK_LAST:
list_del_init(&lkb->lkb_cb_list);
- spin_unlock(&proc->asts_spin);
-
- if (cb.flags & DLM_CB_SKIP) {
- /* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
- dlm_put_lkb(lkb);
- goto try_another;
+		lkb->lkb_flags &= ~DLM_IFL_CB_PENDING;
+		last = true;
+		break;
+ case DLM_DEQUEUE_CALLBACK_SUCCESS:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
+ spin_unlock(&proc->asts_spin);
- if (cb.flags & DLM_CB_BAST) {
- trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode);
- } else if (cb.flags & DLM_CB_CAST) {
- new_mode = cb.mode;
+ if (cb->flags & DLM_CB_BAST) {
+ trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode);
+ } else if (cb->flags & DLM_CB_CAST) {
+ new_mode = cb->mode;
- if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
+ if (!cb->sb_status && lkb->lkb_lksb->sb_lvbptr &&
dlm_lvb_operations[old_mode + 1][new_mode + 1])
copy_lvb = 1;
- lkb->lkb_lksb->sb_status = cb.sb_status;
- lkb->lkb_lksb->sb_flags = cb.sb_flags;
+ lkb->lkb_lksb->sb_status = cb->sb_status;
+ lkb->lkb_lksb->sb_flags = cb->sb_flags;
trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
}
rv = copy_result_to_user(lkb->lkb_ua,
test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
- cb.flags, cb.mode, copy_lvb, buf, count);
+ cb->flags, cb->mode, copy_lvb, buf, count);
+
+ kref_put(&cb->ref, dlm_release_callback);
/* removes ref for proc->asts, may cause lkb to be freed */
- if (!resid)
+	/* rv was reused for copy_result_to_user(); test the saved flag */
+	if (last)
dlm_put_lkb(lkb);
return rv;
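
Callbacks become individually allocated, reference-counted objects released via kref_put(&cb->ref, dlm_release_callback). A minimal userspace equivalent of that kref idiom, with illustrative names only:

/* Minimal userspace kref: get/put with a release callback fired when
 * the last reference drops, mirroring kref_put(&cb->ref,
 * dlm_release_callback).
 */
#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct cb {
        atomic_int ref;
        int mode;
};

static void cb_get(struct cb *cb)
{
        atomic_fetch_add(&cb->ref, 1);
}

static void cb_put(struct cb *cb, void (*release)(struct cb *))
{
        if (atomic_fetch_sub(&cb->ref, 1) == 1)
                release(cb);    /* we dropped the last reference */
}

static void cb_release(struct cb *cb)
{
        printf("freeing cb mode=%d\n", cb->mode);
        free(cb);
}

int main(void)
{
        struct cb *cb = malloc(sizeof(*cb));

        atomic_init(&cb->ref, 1);
        cb->mode = 5;

        cb_get(cb);                     /* queue holds a reference */
        cb_put(cb, cb_release);         /* consumer done */
        cb_put(cb, cb_release);         /* queue done: released here */
        return 0;
}
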
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 6b9bce6b96e0..33059452d79e 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -7,7 +7,7 @@
#define __USER_DOT_H__
void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
- int status, uint32_t sbflags, uint64_t seq);
+ int status, uint32_t sbflags);
int dlm_user_init(void);
void dlm_user_exit(void);
int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c214fe0981bd..f3cd00fac9c3 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -18,6 +18,8 @@
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/fileattr.h>
#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
@@ -1120,6 +1122,28 @@ static int ecryptfs_fileattr_set(struct user_namespace *mnt_userns,
return rc;
}
+static struct posix_acl *ecryptfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return vfs_get_acl(mnt_userns, ecryptfs_dentry_to_lower(dentry),
+ posix_acl_xattr_name(type));
+}
+
+static int ecryptfs_set_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct posix_acl *acl,
+ int type)
+{
+ int rc;
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct inode *lower_inode = d_inode(lower_dentry);
+
+ rc = vfs_set_acl(&init_user_ns, lower_dentry,
+ posix_acl_xattr_name(type), acl);
+ if (!rc)
+ fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ return rc;
+}
+
const struct inode_operations ecryptfs_symlink_iops = {
.get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
@@ -1143,6 +1167,8 @@ const struct inode_operations ecryptfs_dir_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
const struct inode_operations ecryptfs_main_iops = {
@@ -1152,6 +1178,8 @@ const struct inode_operations ecryptfs_main_iops = {
.listxattr = ecryptfs_listxattr,
.fileattr_get = ecryptfs_fileattr_get,
.fileattr_set = ecryptfs_fileattr_set,
+ .get_acl = ecryptfs_get_acl,
+ .set_acl = ecryptfs_set_acl,
};
static int ecryptfs_xattr_get(const struct xattr_handler *handler,
@@ -1182,6 +1210,10 @@ static const struct xattr_handler ecryptfs_xattr_handler = {
};
const struct xattr_handler *ecryptfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
&ecryptfs_xattr_handler,
NULL
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 939e5e242b98..617f3ad2485e 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -91,6 +91,10 @@ static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir,
err = guid_parse(dentry->d_name.name + namelen + 1, &var->var.VendorGuid);
if (err)
goto out;
+ if (guid_equal(&var->var.VendorGuid, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) {
+ err = -EPERM;
+ goto out;
+ }
if (efivar_variable_is_removable(var->var.VendorGuid,
dentry->d_name.name, namelen))
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6780fc81cc11..07e82e246666 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -116,6 +116,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
int err = -ENOMEM;
bool is_removable = false;
+ if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID))
+ return 0;
+
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return err;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fe8ac0e163f7..f57f921683d7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -13,9 +13,7 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
if (buf->kmap_type == EROFS_KMAP)
- kunmap(buf->page);
- else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
- kunmap_atomic(buf->base);
+ kunmap_local(buf->base);
buf->base = NULL;
buf->kmap_type = EROFS_NO_KMAP;
}
@@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
}
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
- buf->base = kmap(page);
- else if (type == EROFS_KMAP_ATOMIC)
- buf->base = kmap_atomic(page);
+ buf->base = kmap_local_page(page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
@@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = {
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
};
#ifdef CONFIG_FS_DAX
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index af5ed6b9c54d..014e20962376 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -11,265 +11,201 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock);
static LIST_HEAD(erofs_domain_list);
static struct vfsmount *erofs_pseudo_mnt;
-static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+struct erofs_fscache_request {
+ struct erofs_fscache_request *primary;
+ struct netfs_cache_resources cache_resources;
+ struct address_space *mapping; /* The mapping being accessed */
+ loff_t start; /* Start position */
+ size_t len; /* Length of the request */
+ size_t submitted; /* Length of submitted */
+ short error; /* 0 or error that occurred */
+ refcount_t ref;
+};
+
+static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
loff_t start, size_t len)
{
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
- if (!rreq)
+ req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
+ if (!req)
return ERR_PTR(-ENOMEM);
- rreq->start = start;
- rreq->len = len;
- rreq->mapping = mapping;
- rreq->inode = mapping->host;
- INIT_LIST_HEAD(&rreq->subrequests);
- refcount_set(&rreq->ref, 1);
- return rreq;
-}
+ req->mapping = mapping;
+ req->start = start;
+ req->len = len;
+ refcount_set(&req->ref, 1);
-static void erofs_fscache_put_request(struct netfs_io_request *rreq)
-{
- if (!refcount_dec_and_test(&rreq->ref))
- return;
- if (rreq->cache_resources.ops)
- rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ return req;
}
-static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
+ size_t len)
{
- if (!refcount_dec_and_test(&subreq->ref))
- return;
- erofs_fscache_put_request(subreq->rreq);
- kfree(subreq);
-}
+ struct erofs_fscache_request *req;
-static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
+ /* use primary request for the first submission */
+ if (!primary->submitted) {
+ refcount_inc(&primary->ref);
+ return primary;
+ }
- while (!list_empty(&rreq->subrequests)) {
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- list_del(&subreq->rreq_link);
- erofs_fscache_put_subrequest(subreq);
+ req = erofs_fscache_req_alloc(primary->mapping,
+ primary->start + primary->submitted, len);
+ if (!IS_ERR(req)) {
+ req->primary = primary;
+ refcount_inc(&primary->ref);
}
+ return req;
}
-static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
{
- struct netfs_io_subrequest *subreq;
struct folio *folio;
- unsigned int iopos = 0;
- pgoff_t start_page = rreq->start / PAGE_SIZE;
- pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
- bool subreq_failed = false;
+ bool failed = req->error;
+ pgoff_t start_page = req->start / PAGE_SIZE;
+ pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
- XA_STATE(xas, &rreq->mapping->i_pages, start_page);
-
- subreq = list_first_entry(&rreq->subrequests,
- struct netfs_io_subrequest, rreq_link);
- subreq_failed = (subreq->error < 0);
+ XA_STATE(xas, &req->mapping->i_pages, start_page);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
- unsigned int pgpos, pgend;
- bool pg_failed = false;
-
if (xas_retry(&xas, folio))
continue;
-
- pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
- pgend = pgpos + folio_size(folio);
-
- for (;;) {
- if (!subreq) {
- pg_failed = true;
- break;
- }
-
- pg_failed |= subreq_failed;
- if (pgend < iopos + subreq->len)
- break;
-
- iopos += subreq->len;
- if (!list_is_last(&subreq->rreq_link,
- &rreq->subrequests)) {
- subreq = list_next_entry(subreq, rreq_link);
- subreq_failed = (subreq->error < 0);
- } else {
- subreq = NULL;
- subreq_failed = false;
- }
- if (pgend == iopos)
- break;
- }
-
- if (!pg_failed)
+ if (!failed)
folio_mark_uptodate(folio);
-
folio_unlock(folio);
}
rcu_read_unlock();
}
-static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+static void erofs_fscache_req_put(struct erofs_fscache_request *req)
{
- erofs_fscache_rreq_unlock_folios(rreq);
- erofs_fscache_clear_subrequests(rreq);
- erofs_fscache_put_request(rreq);
+ if (refcount_dec_and_test(&req->ref)) {
+ if (req->cache_resources.ops)
+ req->cache_resources.ops->end_operation(&req->cache_resources);
+ if (!req->primary)
+ erofs_fscache_req_complete(req);
+ else
+ erofs_fscache_req_put(req->primary);
+ kfree(req);
+ }
}
-static void erofc_fscache_subreq_complete(void *priv,
+static void erofs_fscache_subreq_complete(void *priv,
ssize_t transferred_or_error, bool was_async)
{
- struct netfs_io_subrequest *subreq = priv;
- struct netfs_io_request *rreq = subreq->rreq;
-
- if (IS_ERR_VALUE(transferred_or_error))
- subreq->error = transferred_or_error;
+ struct erofs_fscache_request *req = priv;
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- erofs_fscache_put_subrequest(subreq);
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ if (req->primary)
+ req->primary->error = transferred_or_error;
+ else
+ req->error = transferred_or_error;
+ }
+ erofs_fscache_req_put(req);
}
/*
- * Read data from fscache and fill the read data into page cache described by
- * @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
- * the start physical address in the cache file.
+ * Read data from fscache (cookie, pstart, len), and fill the read data into
+ * page cache described by (req->mapping, lstart, len). @pstart describes the
+ * start physical address in the cache file.
*/
static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
- struct netfs_io_request *rreq, loff_t pstart)
+ struct erofs_fscache_request *req, loff_t pstart, size_t len)
{
enum netfs_io_source source;
- struct super_block *sb = rreq->mapping->host->i_sb;
- struct netfs_io_subrequest *subreq;
- struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct super_block *sb = req->mapping->host->i_sb;
+ struct netfs_cache_resources *cres = &req->cache_resources;
struct iov_iter iter;
- loff_t start = rreq->start;
- size_t len = rreq->len;
+ loff_t lstart = req->start + req->submitted;
size_t done = 0;
int ret;
- atomic_set(&rreq->nr_outstanding, 1);
+ DBG_BUGON(len > req->len - req->submitted);
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
- goto out;
+ return ret;
while (done < len) {
- subreq = kzalloc(sizeof(struct netfs_io_subrequest),
- GFP_KERNEL);
- if (subreq) {
- INIT_LIST_HEAD(&subreq->rreq_link);
- refcount_set(&subreq->ref, 2);
- subreq->rreq = rreq;
- refcount_inc(&rreq->ref);
- } else {
- ret = -ENOMEM;
- goto out;
- }
-
- subreq->start = pstart + done;
- subreq->len = len - done;
- subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+ loff_t sstart = pstart + done;
+ size_t slen = len - done;
+ unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
- list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-
- source = cres->ops->prepare_read(subreq, LLONG_MAX);
- if (WARN_ON(subreq->len == 0))
+ source = cres->ops->prepare_ondemand_read(cres,
+ sstart, &slen, LLONG_MAX, &flags, 0);
+ if (WARN_ON(slen == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
- erofs_err(sb, "failed to fscache prepare_read (source %d)",
- source);
- ret = -EIO;
- subreq->error = ret;
- erofs_fscache_put_subrequest(subreq);
- goto out;
+ erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+ return -EIO;
}
- atomic_inc(&rreq->nr_outstanding);
+ refcount_inc(&req->ref);
+ iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
+ lstart + done, slen);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
- start + done, subreq->len);
-
- ret = fscache_read(cres, subreq->start, &iter,
- NETFS_READ_HOLE_FAIL,
- erofc_fscache_subreq_complete, subreq);
+ ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
+ erofs_fscache_subreq_complete, req);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
erofs_err(sb, "failed to fscache_read (ret %d)", ret);
- goto out;
+ return ret;
}
- done += subreq->len;
+ done += slen;
}
-out:
- if (atomic_dec_and_test(&rreq->nr_outstanding))
- erofs_fscache_rreq_complete(rreq);
-
- return ret;
+ DBG_BUGON(done != len);
+ return 0;
}
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
struct super_block *sb = folio_mapping(folio)->host->i_sb;
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
struct erofs_map_dev mdev = {
.m_deviceid = 0,
.m_pa = folio_pos(folio),
};
ret = erofs_map_dev(sb, &mdev);
- if (ret)
- goto out;
+ if (ret) {
+ folio_unlock(folio);
+ return ret;
+ }
- rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
- if (IS_ERR(rreq)) {
- ret = PTR_ERR(rreq);
- goto out;
+ if (IS_ERR(req)) {
+ folio_unlock(folio);
+ return PTR_ERR(req);
}
- return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa);
-out:
- folio_unlock(folio);
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa, folio_size(folio));
+ if (ret)
+ req->error = ret;
+
+ erofs_fscache_req_put(req);
return ret;
}
-/*
- * Read into page cache in the range described by (@pos, @len).
- *
- * On return, the caller is responsible for page unlocking if the output @unlock
- * is true, or the callee will take this responsibility through netfs_io_request
- * interface.
- *
- * The return value is the number of bytes successfully handled, or negative
- * error code on failure. The only exception is that, the length of the range
- * instead of the error code is returned on failure after netfs_io_request is
- * allocated, so that .readahead() could advance rac accordingly.
- */
-static int erofs_fscache_data_read(struct address_space *mapping,
- loff_t pos, size_t len, bool *unlock)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
{
+ struct address_space *mapping = primary->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
- struct netfs_io_request *rreq;
+ struct erofs_fscache_request *req;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
struct iov_iter iter;
+ loff_t pos = primary->start + primary->submitted;
size_t count;
int ret;
- *unlock = true;
-
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret)
@@ -290,24 +226,26 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (IS_ERR(src))
return PTR_ERR(src);
- iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE);
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, PAGE_SIZE);
if (copy_to_iter(src + offset, size, &iter) != size) {
erofs_put_metabuf(&buf);
return -EFAULT;
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
- return PAGE_SIZE;
+ primary->submitted += PAGE_SIZE;
+ return 0;
}
+ count = primary->len - primary->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- count = len;
- iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
+ iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
- return count;
+ primary->submitted += count;
+ return 0;
}
- count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
DBG_BUGON(!count || count % PAGE_SIZE);
mdev = (struct erofs_map_dev) {
@@ -318,64 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (ret)
return ret;
- rreq = erofs_fscache_alloc_request(mapping, pos, count);
- if (IS_ERR(rreq))
- return PTR_ERR(rreq);
+ req = erofs_fscache_req_chain(primary, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
- *unlock = false;
- erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
- rreq, mdev.m_pa + (pos - map.m_la));
- return count;
+ ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ req, mdev.m_pa + (pos - map.m_la), count);
+ erofs_fscache_req_put(req);
+ primary->submitted += count;
+ return ret;
}
-static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+static int erofs_fscache_data_read(struct erofs_fscache_request *req)
{
- bool unlock;
int ret;
- DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+ do {
+ ret = erofs_fscache_data_read_slice(req);
+ if (ret)
+ req->error = ret;
+ } while (!ret && req->submitted < req->len);
- ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
- folio_size(folio), &unlock);
- if (unlock) {
- if (ret > 0)
- folio_mark_uptodate(folio);
+ return ret;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ struct erofs_fscache_request *req;
+ int ret;
+
+ req = erofs_fscache_req_alloc(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(req)) {
folio_unlock(folio);
+ return PTR_ERR(req);
}
- return ret < 0 ? ret : 0;
+
+ ret = erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
+ return ret;
}
static void erofs_fscache_readahead(struct readahead_control *rac)
{
- struct folio *folio;
- size_t len, done = 0;
- loff_t start, pos;
- bool unlock;
- int ret, size;
+ struct erofs_fscache_request *req;
if (!readahead_count(rac))
return;
- start = readahead_pos(rac);
- len = readahead_length(rac);
+ req = erofs_fscache_req_alloc(rac->mapping,
+ readahead_pos(rac), readahead_length(rac));
+ if (IS_ERR(req))
+ return;
- do {
- pos = start + done;
- ret = erofs_fscache_data_read(rac->mapping, pos,
- len - done, &unlock);
- if (ret <= 0)
- return;
+ /* The request completion will drop refs on the folios. */
+ while (readahead_folio(rac))
+ ;
- size = ret;
- while (size) {
- folio = readahead_folio(rac);
- size -= folio_size(folio);
- if (unlock) {
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
- }
- } while ((done += ret) < len);
+ erofs_fscache_data_read(req);
+ erofs_fscache_req_put(req);
}
static const struct address_space_operations erofs_fscache_meta_aops = {
@@ -494,7 +433,8 @@ static int erofs_fscache_register_domain(struct super_block *sb)
static
struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
struct fscache_volume *volume = EROFS_SB(sb)->volume;
struct erofs_fscache *ctx;
@@ -516,7 +456,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
fscache_use_cookie(cookie, false);
ctx->cookie = cookie;
- if (need_inode) {
+ if (flags & EROFS_REG_COOKIE_NEED_INODE) {
struct inode *const inode = new_inode(sb);
if (!inode) {
@@ -554,14 +494,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
static
struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
int err;
struct inode *inode;
struct erofs_fscache *ctx;
struct erofs_domain *domain = EROFS_SB(sb)->domain;
- ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
+ ctx = erofs_fscache_acquire_cookie(sb, name, flags);
if (IS_ERR(ctx))
return ctx;
@@ -589,7 +530,8 @@ out:
static
struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
struct inode *inode;
struct erofs_fscache *ctx;
@@ -602,23 +544,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
ctx = inode->i_private;
if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
continue;
- igrab(inode);
+ if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
+ igrab(inode);
+ } else {
+ erofs_err(sb, "%s already exists in domain %s", name,
+ domain->domain_id);
+ ctx = ERR_PTR(-EEXIST);
+ }
spin_unlock(&psb->s_inode_list_lock);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
spin_unlock(&psb->s_inode_list_lock);
- ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
+ ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
if (EROFS_SB(sb)->domain_id)
- return erofs_domain_register_cookie(sb, name, need_inode);
- return erofs_fscache_acquire_cookie(sb, name, need_inode);
+ return erofs_domain_register_cookie(sb, name, flags);
+ return erofs_fscache_acquire_cookie(sb, name, flags);
}
void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
@@ -647,6 +596,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
int ret;
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
+ unsigned int flags;
if (sbi->domain_id)
ret = erofs_fscache_register_domain(sb);
@@ -655,8 +605,20 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (ret)
return ret;
- /* acquired domain/volume will be relinquished in kill_sb() on error */
- fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true);
+ /*
+	 * When a shared domain is enabled, use NEED_NOEXIST to guarantee
+	 * that the primary data blob (aka fsid) is unique in the shared
+	 * domain.
+	 *
+	 * In the non-shared-domain case, fscache_acquire_volume() invoked by
+	 * erofs_fscache_register_volume() has already guaranteed the
+	 * uniqueness of the primary data blob.
+ *
+ * Acquired domain/volume will be relinquished in kill_sb() on error.
+ */
+ flags = EROFS_REG_COOKIE_NEED_INODE;
+ if (sbi->domain_id)
+ flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
+ fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index ad2a82f2eb4c..d3b8736fa124 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
inode->i_fop = &erofs_dir_fops;
+ inode_nohighmem(inode);
break;
case S_IFLNK:
err = erofs_fill_symlink(inode, kaddr, ofs);
@@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+ mapping_set_large_folios(inode->i_mapping);
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (erofs_is_fscache_mode(inode->i_sb))
inode->i_mapping->a_ops = &erofs_fscache_access_aops;
@@ -371,7 +373,7 @@ int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
@@ -379,12 +381,12 @@ const struct inode_operations erofs_symlink_iops = {
.get_link = page_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
const struct inode_operations erofs_fast_symlink_iops = {
.get_link = simple_get_link,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 05dc68627722..bb8501c0ff5b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap() to map the buffer */
- EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+ EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
@@ -604,13 +603,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
+/* flags for erofs_fscache_register_cookie() */
+#define EROFS_REG_COOKIE_NEED_INODE 1
+#define EROFS_REG_COOKIE_NEED_NOEXIST 2
+
/* fscache.c */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode);
+ char *name,
+ unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
extern const struct address_space_operations erofs_fscache_access_aops;
@@ -623,7 +627,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
static inline
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
- char *name, bool need_inode)
+ char *name,
+ unsigned int flags)
{
return ERR_PTR(-EOPNOTSUPP);
}
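
The bool-to-flags conversion above is the usual pattern once a second
independent requirement appears; a standalone sketch with stand-in
names (not the erofs symbols) showing why a flags word scales where a
bool cannot:

#include <stdio.h>

#define REG_NEED_INODE		1
#define REG_NEED_NOEXIST	2

static int register_cookie(const char *name, unsigned int flags)
{
	printf("%s: inode=%d noexist=%d\n", name,
	       !!(flags & REG_NEED_INODE), !!(flags & REG_NEED_NOEXIST));
	return 0;
}

int main(void)
{
	/* primary blob in a shared domain: both requirements at once */
	register_cookie("fsid", REG_NEED_INODE | REG_NEED_NOEXIST);
	/* an extra device blob: no inode, duplicates allowed */
	register_cookie("device0", 0);
	return 0;
}
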
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 0dc34721080c..b64a108fac92 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -228,6 +228,6 @@ const struct inode_operations erofs_dir_iops = {
.lookup = erofs_lookup,
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
- .get_acl = erofs_get_acl,
+ .get_inode_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 1c7dcca702b3..626a615dafc2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
}
if (erofs_is_fscache_mode(sb)) {
- fscache = erofs_fscache_register_cookie(sb, dif->path, false);
+ fscache = erofs_fscache_register_cookie(sb, dif->path, 0);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
dif->fscache = fscache;
@@ -577,26 +577,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
}
++ctx->devs->extra_devices;
break;
- case Opt_fsid:
#ifdef CONFIG_EROFS_FS_ONDEMAND
+ case Opt_fsid:
kfree(ctx->fsid);
ctx->fsid = kstrdup(param->string, GFP_KERNEL);
if (!ctx->fsid)
return -ENOMEM;
-#else
- errorfc(fc, "fsid option not supported");
-#endif
break;
case Opt_domain_id:
-#ifdef CONFIG_EROFS_FS_ONDEMAND
kfree(ctx->domain_id);
ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
if (!ctx->domain_id)
return -ENOMEM;
+ break;
#else
- errorfc(fc, "domain_id option not supported");
-#endif
+ case Opt_fsid:
+ case Opt_domain_id:
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
+#endif
default:
return -ENOPARAM;
}
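
A trimmed-down model of the #ifdef restructuring above: moving the case
labels themselves under the conditional lets both options share one
"not supported" branch instead of carrying one #else per option. Names
are stand-ins:

#include <stdio.h>

enum { Opt_fsid, Opt_domain_id };

static int parse_param(int opt, const char *val)
{
	switch (opt) {
#ifdef WITH_ONDEMAND
	case Opt_fsid:
		printf("fsid=%s\n", val);
		break;
	case Opt_domain_id:
		printf("domain_id=%s\n", val);
		break;
#else
	case Opt_fsid:
	case Opt_domain_id:
		fprintf(stderr, "option %d not supported\n", opt);
		break;
#endif
	default:
		return -1;
	}
	return 0;
}

int main(void)
{
	return parse_param(Opt_fsid, "mydata");
}
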
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 8106bcb5a38d..a62fb8a3318a 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
it->blkaddr += erofs_blknr(it->ofs);
it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
@@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
@@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
@@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
- EROFS_KMAP_ATOMIC);
+ EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index b792d424d774..5200bb86e264 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
-/* how to allocate cached pages for a pcluster */
-enum z_erofs_cache_alloctype {
- DONTALLOC, /* don't allocate any cached pages */
- /*
- * try to use cached I/O if page allocation succeeds or fallback
- * to in-place I/O instead to avoid any direct reclaim.
- */
- TRYALLOC,
-};
-
/*
* tagged pointer with 1-bit tag for all compressed pages
* tag 0 - the page is just found with an extra page reference
@@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend {
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
+{
+ unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
+
+ if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
+ return false;
+
+ if (fe->backmost)
+ return true;
+
+ if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
+ fe->map.m_la < fe->headoffset)
+ return true;
+
+ return false;
+}
+
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- enum z_erofs_cache_alloctype type,
struct page **pagepool)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
* optimistic allocation without direct reclaim since inplace I/O
@@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
} else {
/* I/O is needed, no possible to decompress directly */
standalone = false;
- switch (type) {
- case TRYALLOC:
- newpage = erofs_allocpage(pagepool, gfp);
- if (!newpage)
- continue;
- set_page_private(newpage,
- Z_EROFS_PREALLOCATED_PAGE);
- t = tag_compressed_page_justfound(newpage);
- break;
- default: /* DONTALLOC */
+ if (!shouldalloc)
continue;
- }
+
+ /*
+			 * Try to use cached I/O if the page allocation
+			 * succeeds, or fall back to in-place I/O instead
+			 * to avoid any direct reclaim.
+ */
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
}
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
@@ -488,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
struct erofs_workgroup *grp;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED)) {
+ if (!(map->m_flags & EROFS_MAP_ENCODED) ||
+ (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -637,20 +646,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
return true;
}
-static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
- unsigned int cachestrategy,
- erofs_off_t la)
-{
- if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
- return false;
-
- if (fe->backmost)
- return true;
-
- return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
- la < fe->headoffset;
-}
-
static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
struct page *page, unsigned int pageofs,
unsigned int len)
@@ -687,12 +682,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
- struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
-
- enum z_erofs_cache_alloctype cache_strategy;
unsigned int cur, end, spiltted;
int err = 0;
@@ -746,13 +738,7 @@ repeat:
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
- map->m_la))
- cache_strategy = TRYALLOC;
- else
- cache_strategy = DONTALLOC;
-
- z_erofs_bind_cache(fe, cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, pagepool);
}
hitted:
/*
@@ -1046,12 +1032,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
if (!be->decompressed_pages)
be->decompressed_pages =
- kvcalloc(be->nr_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!be->compressed_pages)
be->compressed_pages =
- kvcalloc(pclusterpages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
z_erofs_parse_out_bvecs(be);
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
@@ -1099,7 +1085,7 @@ out:
}
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
- kvfree(be->compressed_pages);
+ kfree(be->compressed_pages);
z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
@@ -1118,7 +1104,7 @@ out:
}
if (be->decompressed_pages != be->onstack_pages)
- kvfree(be->decompressed_pages);
+ kfree(be->decompressed_pages);
pcl->length = 0;
pcl->partial = true;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 0bb66927e3d0..98fb90b9af71 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int advise, type;
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
@@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
out:
pos += lcn * (1 << amortizedshift);
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
@@ -694,10 +694,15 @@ static int z_erofs_do_map_blocks(struct inode *inode,
map->m_pa = blknr_to_addr(m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
- goto out;
+ goto unmap_out;
}
if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (map->m_llen > map->m_plen) {
+ DBG_BUGON(1);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
map->m_algorithmformat =
Z_EROFS_COMPRESSION_INTERLACED;
@@ -718,14 +723,12 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
+
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
-
-out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
-
return err;
}
@@ -790,12 +793,16 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
/*
- * No strict rule how to describe extents for post EOF, yet
- * we need do like below. Otherwise, iomap itself will get
+ * No strict rule on how to describe extents for post EOF, yet
+	 * we need to handle it as below. Otherwise, iomap itself will get
* into an endless loop on post EOF.
+ *
+ * Calculate the effective offset by subtracting extent start
+ * (map.m_la) from the requested offset, and add it to length.
+ * (NB: offset >= map.m_la always)
*/
if (iomap->offset >= inode->i_size)
- iomap->length = length + map.m_la - offset;
+ iomap->length = length + offset - map.m_la;
}
iomap->flags = 0;
return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index c0ffee99ad23..249ca6c0b784 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,21 +43,7 @@ struct eventfd_ctx {
int id;
};
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
unsigned long flags;
@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
- wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
}
+
+/**
+ * eventfd_signal - Adds @n to the eventfd counter.
+ * @ctx: [in] Pointer to the eventfd context.
+ * @n: [in] Value of the counter to be added to the eventfd internal counter.
+ * The value cannot be negative.
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as an overflow condition by returning an
+ * EPOLLERR to poll(2).
+ *
+ * Returns the amount by which the counter was incremented. This will be less
+ * than @n if the counter has overflowed.
+ */
+__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
+{
+ return eventfd_signal_mask(ctx, n, 0);
+}
EXPORT_SYMBOL_GPL(eventfd_signal);
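
The move above is a classic widen-and-wrap refactor: the richer entry
point gains a mask parameter, and the original name survives as a thin
wrapper so existing callers are untouched. A userspace sketch of the
shape (stand-in names, not the kernel API):

#include <stdio.h>

#define EV_POLLIN 0x1u

static unsigned long long ev_signal_mask(unsigned long long n,
					 unsigned int mask)
{
	printf("wake with events %#x, count += %llu\n", EV_POLLIN | mask, n);
	return n;
}

static unsigned long long ev_signal(unsigned long long n)
{
	return ev_signal_mask(n, 0);	/* legacy callers: no extra bits */
}

int main(void)
{
	ev_signal(1);			/* old behavior, unchanged */
	ev_signal_mask(1, 0x4000u);	/* caller-supplied wake flag */
	return 0;
}
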
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 52954d4637b5..64659b110973 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -491,7 +491,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
struct eventpoll *ep_src;
unsigned long flags;
@@ -522,16 +523,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
- wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}
#else
-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
+static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
+ unsigned pollflags)
{
- wake_up_poll(&ep->poll_wait, EPOLLIN);
+ wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}
#endif
@@ -742,7 +744,7 @@ static void ep_free(struct eventpoll *ep)
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
/*
* We need to lock this because we could be hit by
@@ -1208,7 +1210,7 @@ out_unlock:
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, epi);
+ ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
@@ -1553,7 +1555,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
@@ -1629,7 +1631,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
/* We have to call this outside the lock */
if (pwake)
- ep_poll_safewake(ep, NULL);
+ ep_poll_safewake(ep, NULL, 0);
return 0;
}
diff --git a/fs/exec.c b/fs/exec.c
index a0b1f0337a62..ab913243a367 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -64,6 +64,7 @@
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
+#include <linux/time_namespace.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -171,7 +172,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
exit:
fput(file);
out:
- return error;
+ return error;
}
#endif /* #ifdef CONFIG_USELIB */
@@ -199,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
struct page *page;
int ret;
- unsigned int gup_flags = FOLL_FORCE;
+ unsigned int gup_flags = 0;
#ifdef CONFIG_STACK_GROWSUP
if (write) {
@@ -842,16 +843,13 @@ int setup_arg_pages(struct linux_binprm *bprm,
* will align it up.
*/
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
+
+ stack_expand = min(rlim_stack, stack_size + stack_expand);
+
#ifdef CONFIG_STACK_GROWSUP
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_start + rlim_stack;
- else
- stack_base = vma->vm_end + stack_expand;
+ stack_base = vma->vm_start + stack_expand;
#else
- if (stack_size + stack_expand > rlim_stack)
- stack_base = vma->vm_end - rlim_stack;
- else
- stack_base = vma->vm_start - stack_expand;
+ stack_base = vma->vm_end - stack_expand;
#endif
current->mm->start_stack = bprm->p;
ret = expand_stack(vma, stack_base);
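
The clamp rewrite above folds the old two-way branch into a single
min(); the two forms agree because stack_size is vm_end - vm_start in
this function. A small standalone check with made-up numbers for the
grows-down case:

#include <assert.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL;
	unsigned long stack_size = 64 * 1024;
	unsigned long vm_end = vm_start + stack_size;
	unsigned long rlim_stack = 8 * 1024 * 1024;
	unsigned long stack_expand = 128 * 1024;

	/* old form (CONFIG_STACK_GROWSUP unset) */
	unsigned long old_base = (stack_size + stack_expand > rlim_stack) ?
				 vm_end - rlim_stack :
				 vm_start - stack_expand;

	/* new form */
	unsigned long expand = MIN(rlim_stack, stack_size + stack_expand);
	unsigned long new_base = vm_end - expand;

	assert(old_base == new_base);
	printf("stack_base = %#lx\n", new_base);
	return 0;
}
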
@@ -1297,6 +1295,10 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
+ retval = exec_task_namespaces();
+ if (retval)
+ goto out_unlock;
+
#ifdef CONFIG_POSIX_TIMERS
spin_lock_irq(&me->sighand->siglock);
posix_cpu_timers_exit(me);
@@ -1568,6 +1570,12 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
if (task_no_new_privs(current))
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+ /*
+ * If another task is sharing our fs, we cannot safely
+ * suid exec because the differently privileged task
+ * will be able to manipulate the current directory, etc.
+ * It would be nice to force an unshare instead...
+ */
t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
@@ -1591,8 +1599,8 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
struct user_namespace *mnt_userns;
struct inode *inode = file_inode(file);
unsigned int mode;
- kuid_t uid;
- kgid_t gid;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
if (!mnt_may_suid(file->f_path.mnt))
return;
@@ -1611,23 +1619,23 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
- uid = i_uid_into_mnt(mnt_userns, inode);
- gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
- if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
- !kgid_has_mapping(bprm->cred->user_ns, gid))
+ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
+ !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid))
return;
if (mode & S_ISUID) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->euid = uid;
+ bprm->cred->euid = vfsuid_into_kuid(vfsuid);
}
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->egid = gid;
+ bprm->cred->egid = vfsgid_into_kgid(vfsgid);
}
}
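
A compiler-level illustration of why the hunk above trades kuid_t for a
distinct vfsuid_t: giving each ID domain its own struct type turns an
accidental cross-domain assignment into a build error, so every
conversion is forced through a named helper. The types below are
analogies, not the kernel's definitions:

#include <stdio.h>

typedef struct { unsigned int val; } kuid_t;
typedef struct { unsigned int val; } vfsuid_t;

static kuid_t vfsuid_into_kuid(vfsuid_t v)
{
	/* identity here; the kernel applies the mount's ID mapping */
	return (kuid_t){ v.val };
}

int main(void)
{
	vfsuid_t vfsuid = { 1000 };
	kuid_t euid;

	/* euid = vfsuid;  <- would not compile: distinct struct types */
	euid = vfsuid_into_kuid(vfsuid);
	printf("euid=%u\n", euid.val);
	return 0;
}
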
@@ -1748,6 +1756,7 @@ static int search_binary_handler(struct linux_binprm *bprm)
return retval;
}
+/* binfmt handlers will call back into begin_new_exec() on success. */
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
@@ -1806,6 +1815,11 @@ static int bprm_execve(struct linux_binprm *bprm,
if (retval)
return retval;
+ /*
+ * Check for unsafe execution states before exec_binprm(), which
+ * will call back into begin_new_exec(), into bprm_creds_from_file(),
+ * where setuid-ness is evaluated.
+ */
check_unsafe_exec(bprm);
current->in_execve = 1;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 0fc08fdcba73..1dfa67f307f1 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned short *uniname)
{
int i;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
- es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES))
return;
/*
@@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
* Third entry : first file-name entry
* So, the index of first file-name dentry should start from 2.
*/
- for (i = 2; i < es->num_entries; i++) {
- struct exfat_dentry *ep = exfat_get_dentry_cached(es, i);
+ for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) {
+ struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i);
/* end of name entry */
if (exfat_get_entry_type(ep) != TYPE_EXTEND)
@@ -56,13 +55,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
uniname += EXFAT_FILE_NAME_LEN;
}
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
}
/* read a directory entry from the opened directory */
static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
+ int i, dentries_per_clu, num_ext;
unsigned int type, clu_offset, max_dentries;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
@@ -84,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
dentries_per_clu = sbi->dentries_per_clu;
- dentries_per_clu_bits = ilog2(dentries_per_clu);
max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES,
- (u64)sbi->num_clusters << dentries_per_clu_bits);
+ (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi));
- clu_offset = dentry >> dentries_per_clu_bits;
+ clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi);
exfat_chain_dup(&clu, &dir);
if (clu.flags == ALLOC_NO_FAT_CHAIN) {
@@ -163,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
dir_entry->entry = dentry;
brelse(bh);
- ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
+ ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi);
ei->hint_bmap.clu = clu.dir;
*cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext);
@@ -337,7 +335,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname)
return -EINVAL;
/* 1 file entry + 1 stream entry + name entries */
- return ((len - 1) / EXFAT_FILE_NAME_LEN + 3);
+ return ES_ENTRY_NUM(len);
}
unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
@@ -592,18 +590,18 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
unsigned short chksum = 0;
struct exfat_dentry *ep;
- for (i = 0; i < es->num_entries; i++) {
+ for (i = ES_IDX_FILE; i < es->num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
chksum_type);
chksum_type = CS_DEFAULT;
}
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
ep->dentry.file.checksum = cpu_to_le16(chksum);
es->modified = true;
}
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync)
{
int i, err = 0;
@@ -615,7 +613,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
bforget(es->bh[i]);
else
brelse(es->bh[i]);
- kfree(es);
+
+ if (IS_DYNAMIC_ES(es))
+ kfree(es->bh);
+
return err;
}
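
The es->bh / es->__bh split introduced here is a small-buffer
optimization: a fixed inline array covers the common case and kmalloc
is used only for oversized sets, with the put path freeing the array
only when IS_DYNAMIC_ES() says it was heap-allocated. A reduced
userspace sketch of that ownership rule:

#include <stdio.h>
#include <stdlib.h>

#define INLINE_SLOTS 3

struct set_cache {
	void *inline_bh[INLINE_SLOTS];
	void **bh;
	int num;
};

static int set_get(struct set_cache *es, int num)
{
	es->bh = es->inline_bh;
	if (num > INLINE_SLOTS) {
		es->bh = malloc(num * sizeof(*es->bh));
		if (!es->bh)
			return -1;
	}
	es->num = num;
	return 0;
}

static void set_put(struct set_cache *es)
{
	if (es->bh != es->inline_bh)	/* the IS_DYNAMIC_ES() test */
		free(es->bh);
}

int main(void)
{
	struct set_cache es;

	if (set_get(&es, 8))
		return 1;
	printf("dynamic: %d\n", es.bh != es.inline_bh);
	set_put(&es);
	return 0;
}
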
@@ -812,14 +813,14 @@ struct exfat_dentry *exfat_get_dentry_cached(
* pointer of entry set on success,
* NULL on failure.
*/
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type)
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type)
{
int ret, i, num_bh;
- unsigned int off, byte_offset, clu = 0;
+ unsigned int off;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_entry_set_cache *es;
struct exfat_dentry *ep;
int num_entries;
enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
@@ -827,52 +828,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
if (p_dir->dir == DIR_DELETED) {
exfat_err(sb, "access to deleted dentry");
- return NULL;
+ return -EIO;
}
- byte_offset = EXFAT_DEN_TO_B(entry);
- ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu);
+ ret = exfat_find_location(sb, p_dir, entry, &sec, &off);
if (ret)
- return NULL;
+ return ret;
- es = kzalloc(sizeof(*es), GFP_KERNEL);
- if (!es)
- return NULL;
+ memset(es, 0, sizeof(*es));
es->sb = sb;
es->modified = false;
-
- /* byte offset in cluster */
- byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi);
-
- /* byte offset in sector */
- off = EXFAT_BLK_OFFSET(byte_offset, sb);
es->start_off = off;
-
- /* sector offset in cluster */
- sec = EXFAT_B_TO_BLK(byte_offset, sb);
- sec += exfat_cluster_to_sector(sbi, clu);
+ es->bh = es->__bh;
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ return -EIO;
es->bh[es->num_bh++] = bh;
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
num_entries = type == ES_ALL_ENTRIES ?
ep->dentry.file.num_ext + 1 : type;
es->num_entries = num_entries;
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
+ if (num_bh > ARRAY_SIZE(es->__bh)) {
+ es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL);
+ if (!es->bh) {
+ brelse(bh);
+ return -ENOMEM;
+ }
+ es->bh[0] = bh;
+ }
+
for (i = 1; i < num_bh; i++) {
/* get the next sector */
if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ unsigned int clu = exfat_sector_to_cluster(sbi, sec);
+
if (p_dir->flags == ALLOC_NO_FAT_CHAIN)
clu++;
else if (exfat_get_next_cluster(sb, &clu))
- goto free_es;
+ goto put_es;
sec = exfat_cluster_to_sector(sbi, clu);
} else {
sec++;
@@ -880,21 +880,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ goto put_es;
es->bh[es->num_bh++] = bh;
}
/* validate cached dentries */
- for (i = 1; i < num_entries; i++) {
+ for (i = ES_IDX_STREAM; i < num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
}
- return es;
+ return 0;
+
+put_es:
+ exfat_put_dentry_set(es, false);
+ return -EIO;
+}
-free_es:
- exfat_free_dentry_set(es, false);
- return NULL;
+static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
+{
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ hint_femp->count = 0;
+}
+
+static inline void exfat_set_empty_hint(struct exfat_inode_info *ei,
+ struct exfat_hint_femp *candi_empty, struct exfat_chain *clu,
+ int dentry, int num_entries, int entry_type)
+{
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE ||
+ ei->hint_femp.eidx > dentry) {
+ int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode));
+
+ if (candi_empty->count == 0) {
+ candi_empty->cur = *clu;
+ candi_empty->eidx = dentry;
+ }
+
+ if (entry_type == TYPE_UNUSED)
+ candi_empty->count += total_entries - dentry;
+ else
+ candi_empty->count++;
+
+ if (candi_empty->count == num_entries ||
+ candi_empty->count + candi_empty->eidx == total_entries)
+ ei->hint_femp = *candi_empty;
+ }
}
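
A simplified model of the empty-hint bookkeeping above: consecutive
free slots accumulate into a candidate run, and the first run that is
long enough (or that reaches the end of the directory) is promoted to
the cached hint. This sketch drops the TYPE_UNUSED tail shortcut the
real helper has:

#include <stdio.h>

#define HINT_NONE -1

struct hint { int eidx; int count; };

static void scan(const int *used, int total, int need, struct hint *hint)
{
	struct hint cand = { HINT_NONE, 0 };

	hint->eidx = HINT_NONE;
	hint->count = 0;
	for (int i = 0; i < total; i++) {
		if (used[i]) {			/* run broken: reset */
			cand.eidx = HINT_NONE;
			cand.count = 0;
			continue;
		}
		if (cand.count == 0)
			cand.eidx = i;		/* a new run starts here */
		cand.count++;
		if (hint->eidx == HINT_NONE &&
		    (cand.count == need || cand.eidx + cand.count == total))
			*hint = cand;
	}
}

int main(void)
{
	int used[] = { 1, 0, 0, 1, 0, 0, 0, 1 };
	struct hint h;

	scan(used, 8, 3, &h);
	printf("eidx=%d count=%d\n", h.eidx, h.count);	/* eidx=4 count=3 */
	return 0;
}
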
enum {
@@ -917,17 +947,21 @@ enum {
*/
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt)
+ struct exfat_hint *hint_opt)
{
int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len;
int order, step, name_len = 0;
- int dentries_per_clu, num_empty = 0;
+ int dentries_per_clu;
unsigned int entry_type;
unsigned short *uniname = NULL;
struct exfat_chain clu;
struct exfat_hint *hint_stat = &ei->hint_stat;
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int num_entries = exfat_calc_num_entries(p_uniname);
+
+ if (num_entries < 0)
+ return num_entries;
dentries_per_clu = sbi->dentries_per_clu;
@@ -939,10 +973,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
end_eidx = dentry;
}
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&ei->hint_femp);
+
rewind:
order = 0;
step = DIRENT_STEP_FILE;
+ exfat_reset_empty_hint(&candi_empty);
+
while (clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
for (; i < dentries_per_clu; i++, dentry++) {
@@ -962,26 +999,9 @@ rewind:
entry_type == TYPE_DELETED) {
step = DIRENT_STEP_FILE;
- num_empty++;
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty == 1) {
- exfat_chain_set(&candi_empty.cur,
- clu.dir, clu.size, clu.flags);
- }
-
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty >= num_entries) {
- candi_empty.eidx =
- dentry - (num_empty - 1);
- WARN_ON(candi_empty.eidx < 0);
- candi_empty.count = num_empty;
-
- if (ei->hint_femp.eidx ==
- EXFAT_HINT_NONE ||
- candi_empty.eidx <=
- ei->hint_femp.eidx)
- ei->hint_femp = candi_empty;
- }
+ exfat_set_empty_hint(ei, &candi_empty, &clu,
+ dentry, num_entries,
+ entry_type);
brelse(bh);
if (entry_type == TYPE_UNUSED)
@@ -989,17 +1009,14 @@ rewind:
continue;
}
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&candi_empty);
if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) {
step = DIRENT_STEP_FILE;
hint_opt->clu = clu.dir;
hint_opt->eidx = i;
- if (type == TYPE_ALL || type == entry_type) {
- num_ext = ep->dentry.file.num_ext;
- step = DIRENT_STEP_STRM;
- }
+ num_ext = ep->dentry.file.num_ext;
+ step = DIRENT_STEP_STRM;
brelse(bh);
continue;
}
@@ -1090,12 +1107,19 @@ not_found:
rewind = 1;
dentry = 0;
clu.dir = p_dir->dir;
- /* reset empty hint */
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
goto rewind;
}
+ /*
+	 * Set the EXFAT_EOF_CLUSTER flag to avoid searching from the
+	 * beginning again when a new cluster is allocated.
+ */
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE) {
+ ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ ei->hint_femp.eidx = p_dir->size * dentries_per_clu;
+ ei->hint_femp.count = 0;
+ }
+
/* initialized hint_stat */
hint_stat->clu = p_dir->dir;
hint_stat->eidx = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index a8f8eee4937c..bc6d21d7c5ad 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/ratelimit.h>
#include <linux/nls.h>
+#include <linux/blkdev.h>
#define EXFAT_ROOT_INO 1
@@ -41,6 +42,14 @@ enum {
#define ES_2_ENTRIES 2
#define ES_ALL_ENTRIES 0
+#define ES_IDX_FILE 0
+#define ES_IDX_STREAM 1
+#define ES_IDX_FIRST_FILENAME 2
+#define EXFAT_FILENAME_ENTRY_NUM(name_len) \
+ DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN)
+#define ES_IDX_LAST_FILENAME(name_len) \
+ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1)
+
#define DIR_DELETED 0xFFFF0321
/* type values */
@@ -62,15 +71,11 @@ enum {
#define TYPE_PADDING 0x0402
#define TYPE_ACLTAB 0x0403
#define TYPE_BENIGN_SEC 0x0800
-#define TYPE_ALL 0x0FFF
#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */
#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE)
-/* Enough size to hold 256 dentry (even 512 Byte sector) */
-#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1)
-
#define EXFAT_HINT_NONE -1
#define EXFAT_MIN_SUBDIR 2
@@ -95,12 +100,18 @@ enum {
/*
* helpers for block size to dentry size conversion.
*/
-#define EXFAT_B_TO_DEN_IDX(b, sbi) \
- ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS)
#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS)
/*
+ * helpers for cluster size to dentry size conversion.
+ */
+#define EXFAT_CLU_TO_DEN(clu, sbi) \
+ ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+#define EXFAT_DEN_TO_CLU(dentry, sbi) \
+ ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+
+/*
* helpers for fat entry.
*/
#define FAT_ENT_SIZE (4)
@@ -125,6 +136,17 @@ enum {
#define BITS_PER_BYTE_MASK 0x7
#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1)
+/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */
+#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH)
+
+/*
+ * 19 entries x 32 bytes/entry = 608 bytes.
+ * The 608 bytes span at most 3 sectors (even with 512-byte sectors).
+ */
+#define DIR_CACHE_SIZE \
+ (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1)
+
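
The sizing comment above checks out as compile-time arithmetic; a
standalone translation using the constants it quotes (32-byte dentries,
15 name characters per entry, 255-character names, 512-byte worst-case
sectors), with the trailing +1 covering a set that starts mid-sector:

#include <stdio.h>

#define DENTRY_SIZE		32
#define FILE_NAME_LEN		15
#define MAX_NAME_LENGTH		255
#define SECTOR_SIZE		512
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

#define ES_MAX_ENTRY_NUM \
	(2 + DIV_ROUND_UP(MAX_NAME_LENGTH, FILE_NAME_LEN))
#define DIR_CACHE_SIZE \
	(DIV_ROUND_UP(ES_MAX_ENTRY_NUM * DENTRY_SIZE, SECTOR_SIZE) + 1)

_Static_assert(ES_MAX_ENTRY_NUM == 19, "1 file + 1 stream + 17 names");
_Static_assert(ES_MAX_ENTRY_NUM * DENTRY_SIZE == 608, "608 bytes total");
_Static_assert(DIR_CACHE_SIZE == 3, "at most 3 sectors to pin");

int main(void)
{
	printf("%d entries, %d bh slots\n", ES_MAX_ENTRY_NUM, DIR_CACHE_SIZE);
	return 0;
}
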
struct exfat_dentry_namebuf {
char *lfn;
int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
@@ -166,13 +188,16 @@ struct exfat_hint {
struct exfat_entry_set_cache {
struct super_block *sb;
- bool modified;
unsigned int start_off;
int num_bh;
- struct buffer_head *bh[DIR_CACHE_SIZE];
+ struct buffer_head *__bh[DIR_CACHE_SIZE];
+ struct buffer_head **bh;
unsigned int num_entries;
+ bool modified;
};
+#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh)
+
struct exfat_dir_entry {
struct exfat_chain dir;
int entry;
@@ -375,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
sbi->data_start_sector;
}
-static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
+static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
sector_t sec)
{
return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) +
@@ -423,8 +448,8 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
/* file.c */
extern const struct file_operations exfat_file_operations;
-int __exfat_truncate(struct inode *inode, loff_t new_size);
-void exfat_truncate(struct inode *inode, loff_t size);
+int __exfat_truncate(struct inode *inode);
+void exfat_truncate(struct inode *inode);
int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
@@ -464,15 +489,16 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt);
+ struct exfat_hint *hint_opt);
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu);
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, struct buffer_head **bh);
struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type);
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type);
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 4e0793f35e8f..f5b29072775d 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
}
/* resize the file length */
-int __exfat_truncate(struct inode *inode, loff_t new_size)
+int __exfat_truncate(struct inode *inode)
{
unsigned int num_clusters_new, num_clusters_phys;
unsigned int last_clu = EXFAT_FREE_CLUSTER;
@@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
- if (new_size > 0) {
+ if (i_size_read(inode) > 0) {
/*
* Truncate FAT chain num_clusters after the first cluster
* num_clusters = min(new, phys);
@@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ei->start_clu = EXFAT_EOF_CLUSTER;
}
- i_size_write(inode, new_size);
-
if (ei->type == TYPE_FILE)
ei->attr |= ATTR_ARCHIVE;
@@ -189,7 +187,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
return 0;
}
-void exfat_truncate(struct inode *inode, loff_t size)
+void exfat_truncate(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
goto write_size;
}
- err = __exfat_truncate(inode, i_size_read(inode));
+ err = __exfat_truncate(inode);
if (err)
goto write_size;
@@ -310,7 +308,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* __exfat_write_inode() is called from exfat_truncate(), inode
* is already written by it, so mark_inode_dirty() is unneeded.
*/
- exfat_truncate(inode, attr->ia_size);
+ exfat_truncate(inode);
up_write(&EXFAT_I(inode)->truncate_lock);
} else
mark_inode_dirty(inode);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 5590a1e83126..5b644cb057fa 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
{
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es = NULL;
+ struct exfat_entry_set_cache es;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync)
exfat_set_volume_dirty(sb);
/* get the directory entry of given file or directory */
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode));
@@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- exfat_update_dir_chksum_with_entry_set(es);
- return exfat_free_dentry_set(es, sync);
+ exfat_update_dir_chksum_with_entry_set(&es);
+ return exfat_put_dentry_set(&es, sync);
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -345,11 +344,6 @@ static void exfat_readahead(struct readahead_control *rac)
mpage_readahead(rac, exfat_get_block);
}
-static int exfat_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, exfat_get_block, wbc);
-}
-
static int exfat_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -363,7 +357,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
inode->i_mtime = inode->i_ctime = current_time(inode);
- exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
+ exfat_truncate(inode);
}
}
@@ -473,12 +467,12 @@ static const struct address_space_operations exfat_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = exfat_read_folio,
.readahead = exfat_readahead,
- .writepage = exfat_writepage,
.writepages = exfat_writepages,
.write_begin = exfat_write_begin,
.write_end = exfat_write_end,
.direct_IO = exfat_direct_IO,
- .bmap = exfat_aop_bmap
+ .bmap = exfat_aop_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
static inline unsigned long exfat_hash(loff_t i_pos)
@@ -627,7 +621,7 @@ void exfat_evict_inode(struct inode *inode)
if (!inode->i_nlink) {
i_size_write(inode, 0);
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
- __exfat_truncate(inode, 0);
+ __exfat_truncate(inode);
mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index b617bebc3d0f..5f995eba5dbb 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb,
if (hint_femp->eidx != EXFAT_HINT_NONE) {
dentry = hint_femp->eidx;
- if (num_entries <= hint_femp->count) {
- hint_femp->eidx = EXFAT_HINT_NONE;
- return dentry;
- }
+ /*
+		 * If hint_femp->count is enough, we still need to check
+		 * whether there are actual empty entries.
+		 * Otherwise, if "dentry + hint_femp->count" is also equal
+		 * to "p_dir->size * dentries_per_clu", it means ENOSPC.
+ */
+ if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+ num_entries > hint_femp->count)
+ return -ENOSPC;
+
+ hint_femp->eidx = EXFAT_HINT_NONE;
exfat_chain_dup(&clu, &hint_femp->cur);
} else {
exfat_chain_dup(&clu, p_dir);
@@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb,
}
}
+ hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
+ hint_femp->count = num_empty;
+ if (num_empty == 0)
+ exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
+ clu.flags);
+
return -ENOSPC;
}
@@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_ent_set(sb, last_clu, clu.dir))
return -EIO;
- if (hint_femp.eidx == EXFAT_HINT_NONE) {
- /* the special case that new dentry
- * should be allocated from the start of new cluster
- */
- hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi);
- hint_femp.count = sbi->dentries_per_clu;
-
+ if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER)
exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags);
- }
+
+ hint_femp.count += sbi->dentries_per_clu;
+
hint_femp.cur.size++;
p_dir->size++;
size = EXFAT_CLU_TO_B(p_dir->size, sbi);
@@ -588,14 +597,14 @@ unlock:
static int exfat_find(struct inode *dir, struct qstr *qname,
struct exfat_dir_entry *info)
{
- int ret, dentry, num_entries, count;
+ int ret, dentry, count;
struct exfat_chain cdir;
struct exfat_uni_name uni_name;
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
/* for optimized dir & entry to prevent long traverse of cluster chain */
struct exfat_hint hint_opt;
@@ -607,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (ret)
return ret;
- num_entries = exfat_calc_num_entries(&uni_name);
- if (num_entries < 0)
- return num_entries;
-
/* check the validation of hint_stat and initialize it if required */
if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) {
ei->hint_stat.clu = cdir.dir;
@@ -620,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
}
/* search the file name for directories */
- dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
- num_entries, TYPE_ALL, &hint_opt);
-
+ dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt);
if (dentry < 0)
return dentry; /* -error value */
@@ -635,11 +638,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (cdir.flags & ALLOC_NO_FAT_CHAIN)
cdir.size -= dentry / sbi->dentries_per_clu;
dentry = hint_opt.eidx;
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
@@ -668,7 +670,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
ep->dentry.file.access_time,
ep->dentry.file.access_date,
0);
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
if (ei->start_clu == EXFAT_FREE_CLUSTER) {
exfat_fs_error(sb,
@@ -1167,7 +1169,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_inode_info *new_ei = NULL;
unsigned int new_entry_type = TYPE_UNUSED;
int new_entry = 0;
- struct buffer_head *old_bh, *new_bh = NULL;
+ struct buffer_head *new_bh = NULL;
/* check the validity of pointer parameters */
if (new_path == NULL || strlen(new_path) == 0)
@@ -1183,13 +1185,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
- ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
- if (!ep) {
- ret = -EIO;
- goto out;
- }
- brelse(old_bh);
-
/* check whether new dir is existing directory and empty */
if (new_inode) {
ret = -EIO;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index c648a493faf2..3204bd33e4e8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -18,7 +18,7 @@
#include <linux/sched.h>
#include <linux/cred.h>
-#define dprintk(fmt, args...) do{}while(0)
+#define dprintk(fmt, args...) pr_debug(fmt, ##args)
static int get_name(const struct path *path, char *name, struct dentry *child);
@@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
- dprintk("%s: get_parent of %ld failed, err %d\n",
- __func__, dentry->d_inode->i_ino, PTR_ERR(parent));
+ dprintk("get_parent of %lu failed, err %ld\n",
+ dentry->d_inode->i_ino, PTR_ERR(parent));
return parent;
}
@@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
dprintk("%s: found name: %s\n", __func__, nbuf);
tmp = lookup_one_unlocked(mnt_user_ns(mnt), nbuf, parent, strlen(nbuf));
if (IS_ERR(tmp)) {
- dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+ dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
err = PTR_ERR(tmp);
goto out_err;
}
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index bf298967c5b8..440d5f1e9d47 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -219,11 +219,12 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
* inode->i_mutex: down
*/
int
-ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
if (type == ACL_TYPE_ACCESS && acl) {
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 925ab6287d35..3841becb94ff 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -56,7 +56,7 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu);
-extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int ext2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 5dc0a31f4a08..eca60b747c6b 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -667,7 +667,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
{
ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group);
ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group);
- ext2_grpblk_t start, end;
+ ext2_grpblk_t start, end;
unsigned long num = 0;
start = 0;
@@ -1481,11 +1481,11 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
desc_count, bitmap_count);
return bitmap_count;
#else
- for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
- desc = ext2_get_group_desc (sb, i, NULL);
- if (!desc)
- continue;
- desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+ for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+ desc = ext2_get_group_desc(sb, i, NULL);
+ if (!desc)
+ continue;
+ desc_count += le16_to_cpu(desc->bg_free_blocks_count);
}
return desc_count;
#endif
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 8f597753ac12..e5cbc27ba459 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -81,11 +81,10 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
+static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- int err = 0;
inode_inc_iversion(dir);
block_write_end(NULL, mapping, pos, len, len, page, NULL);
@@ -94,16 +93,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
}
-
- if (IS_DIRSYNC(dir)) {
- err = write_one_page(page);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- } else {
- unlock_page(page);
- }
-
- return err;
+ unlock_page(page);
}
static bool ext2_check_page(struct page *page, int quiet, char *kaddr)
@@ -413,7 +403,7 @@ found:
return de;
}
-/**
+/*
* Return the '..' directory entry and the page in which the entry was found
* (as a parameter - p).
*
@@ -460,6 +450,17 @@ static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ext2_get_block);
}
+
+static int ext2_handle_dirsync(struct inode *dir)
+{
+ int err;
+
+ err = filemap_write_and_wait(dir->i_mapping);
+ if (!err)
+ err = sync_inode_metadata(dir, 1);
+ return err;
+}
+
void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
struct page *page, void *page_addr, struct inode *inode,
int update_times)
@@ -474,11 +475,12 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
BUG_ON(err);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type(de, inode);
- err = ext2_commit_chunk(page, pos, len);
+ ext2_commit_chunk(page, pos, len);
if (update_times)
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ ext2_handle_dirsync(dir);
}
/*
@@ -566,10 +568,11 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
- err = ext2_commit_chunk(page, pos, rec_len);
+ ext2_commit_chunk(page, pos, rec_len);
dir->i_mtime = dir->i_ctime = current_time(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
+ err = ext2_handle_dirsync(dir);
/* OFFSET_CACHE */
out_put:
ext2_put_page(page, page_addr);
@@ -615,10 +618,11 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page,
if (pde)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
- err = ext2_commit_chunk(page, pos, to - from);
+ ext2_commit_chunk(page, pos, to - from);
inode->i_ctime = inode->i_mtime = current_time(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
+ err = ext2_handle_dirsync(inode);
out:
return err;
}
@@ -658,7 +662,8 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
memcpy (de->name, "..\0", 4);
ext2_set_de_type (de, inode);
kunmap_atomic(kaddr);
- err = ext2_commit_chunk(page, 0, chunk_size);
+ ext2_commit_chunk(page, 0, chunk_size);
+ err = ext2_handle_dirsync(inode);
fail:
put_page(page);
return err;
@@ -679,7 +684,7 @@ int ext2_empty_dir (struct inode * inode)
page = ext2_get_page(inode, i, 0, &page_addr);
if (IS_ERR(page))
- goto not_empty;
+ return 0;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
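The dir.c hunks above stop syncing inside ext2_commit_chunk() and instead flush once per directory operation via ext2_handle_dirsync(): data pages first, then inode metadata, keeping the first error. A minimal user-space model of that write-data-then-metadata ordering (all names here are hypothetical stand-ins, not the kernel API):

#include <stdio.h>

/* Stand-ins for filemap_write_and_wait() and sync_inode_metadata(). */
static int flush_data_pages(void)     { puts("data flushed");     return 0; }
static int flush_inode_metadata(void) { puts("metadata flushed"); return 0; }

/* Model of ext2_handle_dirsync(): data first, then metadata,
 * and the first failure wins. */
static int handle_dirsync(void)
{
	int err = flush_data_pages();

	if (!err)
		err = flush_inode_metadata();
	return err;
}

int main(void)
{
	return handle_dirsync();
}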
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index eb97aa3d700e..6b4bebe982ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -200,7 +200,7 @@ const struct inode_operations ext2_file_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.fiemap = ext2_fiemap,
.fileattr_get = ext2_fileattr_get,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f4944c4dee60..78b8686d9a4a 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- parent_group = prandom_u32_max(ngroups);
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
desc = ext2_get_group_desc (sb, group, NULL);
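The prandom_u32_max() callers are migrated to get_random_u32_below(), which draws from the CRNG and, as I understand it, produces an unbiased value in [0, ceil) using Lemire's multiply-then-reject technique. A user-space sketch of that technique (toy generator, not the kernel implementation):

#include <stdint.h>
#include <stdio.h>

static uint32_t rand32(void)
{
	static uint32_t s = 0x12345678;  /* toy xorshift32, NOT crypto */

	s ^= s << 13;
	s ^= s >> 17;
	s ^= s << 5;
	return s;
}

/* Unbiased value in [0, ceil); ceil must be nonzero. */
static uint32_t random_below(uint32_t ceil)
{
	uint64_t mult = (uint64_t)rand32() * ceil;

	if ((uint32_t)mult < ceil) {
		uint32_t bound = -ceil % ceil;   /* 2^32 mod ceil */

		while ((uint32_t)mult < bound)   /* reject biased draws */
			mult = (uint64_t)rand32() * ceil;
	}
	return mult >> 32;
}

int main(void)
{
	printf("%u\n", random_below(10));
	return 0;
}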
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 918ab2f9e4c0..69aed9e2359e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -869,11 +869,6 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
-static int ext2_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, ext2_get_block, wbc);
-}
-
static int ext2_read_folio(struct file *file, struct folio *folio)
{
return mpage_read_folio(folio, ext2_get_block);
@@ -948,7 +943,6 @@ const struct address_space_operations ext2_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = ext2_read_folio,
.readahead = ext2_readahead,
- .writepage = ext2_writepage,
.write_begin = ext2_write_begin,
.write_end = ext2_write_end,
.bmap = ext2_bmap,
@@ -1652,7 +1646,7 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
setattr_copy(&init_user_ns, inode, iattr);
if (iattr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
mark_inode_dirty(inode);
return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 9125eab85146..c056957221a2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -427,7 +427,7 @@ const struct inode_operations ext2_dir_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
.tmpfile = ext2_tmpfile,
.fileattr_get = ext2_fileattr_get,
@@ -438,6 +438,6 @@ const struct inode_operations ext2_special_inode_operations = {
.listxattr = ext2_listxattr,
.getattr = ext2_getattr,
.setattr = ext2_setattr,
- .get_acl = ext2_get_acl,
+ .get_inode_acl = ext2_get_acl,
.set_acl = ext2_set_acl,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 03f2af98b1b4..69c88facfe90 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1648,7 +1648,7 @@ static int __init init_ext2_fs(void)
err = init_inodecache();
if (err)
return err;
- err = register_filesystem(&ext2_fs_type);
+ err = register_filesystem(&ext2_fs_type);
if (err)
goto out;
return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 57e82e25f8e2..a9f89539aeee 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -225,12 +225,13 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type,
}
int
-ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
handle_t *handle;
int error, credits, retries = 0;
size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
int update_mode = 0;
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 3219669732bf..09c4a8a3b716 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -56,7 +56,7 @@ static inline int ext4_acl_count(size_t size)
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu);
-int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ext4_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8d5453852f98..140e1eb300d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -558,7 +558,7 @@ enum {
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
static inline void ext4_check_flag_values(void)
@@ -2964,7 +2964,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode,
typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
+ EXT4_IGET_BAD = 0x0004 /* Allow to iget a bad inode */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -2999,6 +3000,7 @@ extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
@@ -3619,8 +3621,8 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
-extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode);
+extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
struct dentry *dentry);
@@ -3756,8 +3758,7 @@ extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite);
+ int len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 8e1fb18f465e..77f318ec8abb 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -86,15 +86,21 @@ static int ext4_journal_check_start(struct super_block *sb)
return 0;
}
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+handle_t *__ext4_journal_start_sb(struct inode *inode,
+ struct super_block *sb, unsigned int line,
int type, int blocks, int rsv_blocks,
int revoke_creds)
{
journal_t *journal;
int err;
-
- trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
- _RET_IP_);
+ if (inode)
+ trace_ext4_journal_start_inode(inode, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
+ else
+ trace_ext4_journal_start_sb(sb, blocks, rsv_blocks,
+ revoke_creds, type,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index db2ae4a2b38d..0c77697d5e90 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,9 +261,9 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
__ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
(bh))
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks,
- int revoke_creds);
+handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
+ unsigned int line, int type, int blocks,
+ int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -303,7 +303,7 @@ static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
@@ -323,7 +323,7 @@ static inline handle_t *__ext4_journal_start(struct inode *inode,
int blocks, int rsv_blocks,
int revoke_creds)
{
- return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+ return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks,
rsv_blocks, revoke_creds);
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6c399a8b22b3..9de1c9d1a13d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2635,9 +2635,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unwritten, ex_ee_len);
path[depth].p_ext = ex;
- a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block+ex_ee_len - 1 < end ?
- ex_ee_block+ex_ee_len - 1 : end;
+ a = max(ex_ee_block, start);
+ b = min(ex_ee_block + ex_ee_len - 1, end);
ext_debug(inode, " border %u:%u\n", a, b);
@@ -5567,8 +5566,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
- len_lblk, SHIFT_RIGHT);
+ max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
@@ -5799,6 +5797,14 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
struct ext4_extent *extent;
ext4_lblk_t first_lblk, first_lclu, last_lclu;
+ /*
+ * if data can be stored inline, the logical cluster isn't
+ * mapped - no physical clusters have been allocated, and the
+ * file has no extents
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return 0;
+
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
if (IS_ERR(path)) {
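The open-coded ternaries above become max()/min(): the leaf-removal range is just the intersection of the extent [ex_ee_block, ex_ee_block + ex_ee_len - 1] with the requested [start, end]. For example, an extent covering blocks 100..149 trimmed against 120..200 yields a = 120, b = 149. A self-contained sketch of the same clamp:

#include <stdio.h>

static unsigned int max_u(unsigned int x, unsigned int y) { return x > y ? x : y; }
static unsigned int min_u(unsigned int x, unsigned int y) { return x < y ? x : y; }

int main(void)
{
	unsigned int ex_start = 100, ex_len = 50;   /* extent 100..149 */
	unsigned int start = 120, end = 200;        /* requested range */
	unsigned int a = max_u(ex_start, start);
	unsigned int b = min_u(ex_start + ex_len - 1, end);

	printf("clamped to %u..%u\n", a, b);        /* 120..149 */
	return 0;
}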
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index cd0a861853e3..7bc221038c6c 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -155,9 +155,7 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
int __init ext4_init_es(void)
{
- ext4_es_cachep = kmem_cache_create("ext4_extent_status",
- sizeof(struct extent_status),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
if (ext4_es_cachep == NULL)
return -ENOMEM;
return 0;
@@ -1371,7 +1369,7 @@ retry:
if (count_reserved)
count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
&orig_es, &rc);
- goto out;
+ goto out_get_reserved;
}
if (len1 > 0) {
@@ -1413,6 +1411,7 @@ retry:
}
}
+out_get_reserved:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
@@ -1807,9 +1806,7 @@ static void ext4_print_pending_tree(struct inode *inode)
int __init ext4_init_pending(void)
{
- ext4_pending_cachep = kmem_cache_create("ext4_pending_reservation",
- sizeof(struct pending_reservation),
- 0, (SLAB_RECLAIM_ACCOUNT), NULL);
+ ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);
if (ext4_pending_cachep == NULL)
return -ENOMEM;
return 0;
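Both caches above switch to the KMEM_CACHE() helper, which derives the cache name, size, and alignment from the struct itself so they cannot drift apart. Roughly what the macro expands to, paraphrased from my reading of include/linux/slab.h (verify against the tree you build with):

#define KMEM_CACHE_SKETCH(__struct, __flags)                          \
	kmem_cache_create(#__struct, sizeof(struct __struct),         \
			  __alignof__(struct __struct), (__flags), NULL)

/* So KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT) names the cache
 * "extent_status" and sizes it from struct extent_status directly. */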
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0f6d0a80467d..4594b62f147b 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -420,25 +420,34 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
struct __track_dentry_update_args *dentry_update =
(struct __track_dentry_update_args *)arg;
struct dentry *dentry = dentry_update->dentry;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
mutex_unlock(&ei->i_fc_lock);
+
+ if (IS_ENCRYPTED(dir)) {
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
+ NULL);
+ mutex_lock(&ei->i_fc_lock);
+ return -EOPNOTSUPP;
+ }
+
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
node->fcd_op = dentry_update->op;
- node->fcd_parent = dentry->d_parent->d_inode->i_ino;
+ node->fcd_parent = dir->i_ino;
node->fcd_ino = inode->i_ino;
if (dentry->d_name.len > DNAME_INLINE_LEN) {
node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
- ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM, NULL);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -666,18 +675,6 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
/* Ext4 commit path routines */
-/* memzero and update CRC */
-static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
- u32 *crc)
-{
- void *ret;
-
- ret = memset(dst, 0, len);
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
- return ret;
-}
-
/*
* Allocate len bytes on a fast commit buffer.
*
@@ -691,62 +688,60 @@ static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
*/
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
- struct ext4_fc_tl *tl;
+ struct ext4_fc_tl tl;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct buffer_head *bh;
int bsize = sbi->s_journal->j_blocksize;
int ret, off = sbi->s_fc_bytes % bsize;
- int pad_len;
+ int remaining;
+ u8 *dst;
/*
- * After allocating len, we should have space at least for a 0 byte
- * padding.
+ * If 'len' is too long to fit in any block alongside a PAD tlv, then we
+ * cannot fulfill the request.
*/
- if (len + EXT4_FC_TAG_BASE_LEN > bsize)
+ if (len > bsize - EXT4_FC_TAG_BASE_LEN)
return NULL;
- if (bsize - off - 1 > len + EXT4_FC_TAG_BASE_LEN) {
- /*
- * Only allocate from current buffer if we have enough space for
- * this request AND we have space to add a zero byte padding.
- */
- if (!sbi->s_fc_bh) {
- ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
- if (ret)
- return NULL;
- sbi->s_fc_bh = bh;
- }
+ if (!sbi->s_fc_bh) {
+ ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
+ if (ret)
+ return NULL;
+ sbi->s_fc_bh = bh;
+ }
+ dst = sbi->s_fc_bh->b_data + off;
+
+ /*
+ * Allocate the bytes in the current block if we can do so while still
+ * leaving enough space for a PAD tlv.
+ */
+ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
+ if (len <= remaining) {
sbi->s_fc_bytes += len;
- return sbi->s_fc_bh->b_data + off;
+ return dst;
}
- /* Need to add PAD tag */
- tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
- tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
- pad_len = bsize - off - 1 - EXT4_FC_TAG_BASE_LEN;
- tl->fc_len = cpu_to_le16(pad_len);
- if (crc)
- *crc = ext4_chksum(sbi, *crc, tl, EXT4_FC_TAG_BASE_LEN);
- if (pad_len > 0)
- ext4_fc_memzero(sb, tl + 1, pad_len, crc);
+
+ /*
+	 * Otherwise, terminate the current block with a PAD tlv, then allocate
+	 * a new block and place the bytes at the start of that new block.
+ */
+
+ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
+ tl.fc_len = cpu_to_le16(remaining);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
+ *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+
ext4_fc_submit_bh(sb, false);
ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
if (ret)
return NULL;
sbi->s_fc_bh = bh;
- sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
+ sbi->s_fc_bytes += bsize - off + len;
return sbi->s_fc_bh->b_data;
}
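The rewritten ext4_fc_reserve_space() above follows one rule: a request may live in the current block only if a PAD tlv header still fits behind it; otherwise the remainder of the block becomes a single PAD tlv and the request starts a fresh block. A user-space model of just that arithmetic (TAG_BASE_LEN stands in for EXT4_FC_TAG_BASE_LEN; nothing here is kernel code):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define TAG_BASE_LEN 4   /* assumed tag+length header size */

/* Returns true if len bytes fit at offset off of a bsize block while
 * still leaving room for a trailing PAD tlv; otherwise *pad_len is the
 * PAD payload that fills out the current block. */
static bool fits_in_block(int bsize, int off, int len, int *pad_len)
{
	int remaining = bsize - TAG_BASE_LEN - off;

	assert(len <= bsize - TAG_BASE_LEN);   /* oversized: rejected earlier */
	if (len <= remaining)
		return true;
	*pad_len = remaining;
	return false;
}

int main(void)
{
	int pad;

	printf("%d\n", fits_in_block(4096, 100, 200, &pad));  /* 1: fits */
	printf("%d\n", fits_in_block(4096, 4000, 200, &pad)); /* 0: pad = 92 */
	return 0;
}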
-/* memcpy to fc reserved space and update CRC */
-static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
- int len, u32 *crc)
-{
- if (crc)
- *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
- return memcpy(dst, src, len);
-}
-
/*
* Complete a fast commit by writing tail tag.
*
@@ -774,16 +769,20 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
off = sbi->s_fc_bytes % bsize;
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
- tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
+ tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, &crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
- ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
+ memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
+ crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
- ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
+ memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
+ dst += sizeof(tail.fc_crc);
+ memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
ext4_fc_submit_bh(sb, true);
@@ -807,8 +806,8 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
tl.fc_tag = cpu_to_le16(tag);
tl.fc_len = cpu_to_le16(len);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
- ext4_fc_memcpy(sb, dst + EXT4_FC_TAG_BASE_LEN, val, len, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
+ memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
return true;
}
@@ -830,11 +829,11 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
- ext4_fc_memcpy(sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc);
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
+ memcpy(dst, &fcd, sizeof(fcd));
dst += sizeof(fcd);
- ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc);
+ memcpy(dst, fc_dentry->fcd_name.name, dlen);
return true;
}
@@ -872,15 +871,11 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
if (!dst)
goto err;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, EXT4_FC_TAG_BASE_LEN, crc))
- goto err;
+ memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
dst += EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- goto err;
+ memcpy(dst, &fc_inode, sizeof(fc_inode));
dst += sizeof(fc_inode);
- if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
- inode_len, crc))
- goto err;
+ memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
ret = 0;
err:
brelse(iloc.bh);
@@ -986,7 +981,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
finish_wait(&ei->i_fc_wait, &wait);
}
spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_submit_inode_data(ei->jinode);
+ ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
spin_lock(&sbi->s_fc_lock);
@@ -1388,7 +1383,7 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
return 0;
}
- ret = __ext4_unlink(NULL, old_parent, &entry, inode);
+ ret = __ext4_unlink(old_parent, &entry, inode, NULL);
/* -ENOENT ok coz it might not exist anymore. */
if (ret == -ENOENT)
ret = 0;
@@ -1977,32 +1972,31 @@ void ext4_fc_replay_cleanup(struct super_block *sb)
kfree(sbi->s_fc_replay_state.fc_modified_inodes);
}
-static inline bool ext4_fc_tag_len_isvalid(struct ext4_fc_tl *tl,
- u8 *val, u8 *end)
+static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
+ int tag, int len)
{
- if (val + tl->fc_len > end)
- return false;
-
- /* Here only check ADD_RANGE/TAIL/HEAD which will read data when do
- * journal rescan before do CRC check. Other tags length check will
- * rely on CRC check.
- */
- switch (tl->fc_tag) {
+ switch (tag) {
case EXT4_FC_TAG_ADD_RANGE:
- return (sizeof(struct ext4_fc_add_range) == tl->fc_len);
- case EXT4_FC_TAG_TAIL:
- return (sizeof(struct ext4_fc_tail) <= tl->fc_len);
- case EXT4_FC_TAG_HEAD:
- return (sizeof(struct ext4_fc_head) == tl->fc_len);
+ return len == sizeof(struct ext4_fc_add_range);
case EXT4_FC_TAG_DEL_RANGE:
+ return len == sizeof(struct ext4_fc_del_range);
+ case EXT4_FC_TAG_CREAT:
case EXT4_FC_TAG_LINK:
case EXT4_FC_TAG_UNLINK:
- case EXT4_FC_TAG_CREAT:
+ len -= sizeof(struct ext4_fc_dentry_info);
+ return len >= 1 && len <= EXT4_NAME_LEN;
case EXT4_FC_TAG_INODE:
+ len -= sizeof(struct ext4_fc_inode);
+ return len >= EXT4_GOOD_OLD_INODE_SIZE &&
+ len <= sbi->s_inode_size;
case EXT4_FC_TAG_PAD:
- default:
- return true;
+ return true; /* padding can have any length */
+ case EXT4_FC_TAG_TAIL:
+ return len >= sizeof(struct ext4_fc_tail);
+ case EXT4_FC_TAG_HEAD:
+ return len == sizeof(struct ext4_fc_head);
}
+ return false;
}
/*
@@ -2040,7 +2034,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
state = &sbi->s_fc_replay_state;
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
if (state->fc_replay_expected_off == 0) {
state->fc_cur_tag = 0;
@@ -2061,11 +2055,12 @@ static int ext4_fc_replay_scan(journal_t *journal,
}
state->fc_replay_expected_off++;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
- if (!ext4_fc_tag_len_isvalid(&tl, val, end)) {
+ if (tl.fc_len > end - val ||
+ !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
ret = state->fc_replay_num_tags ?
JBD2_FC_REPLAY_STOP : -ECANCELED;
goto out_err;
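The scan loop above is hardened in two ways: the cursor bound becomes cur <= end - EXT4_FC_TAG_BASE_LEN with end now one past the block, and a record's declared length is checked against the space that actually remains before any per-tag validation. A user-space sketch of that bounds discipline (header layout assumed, endianness ignored):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define TAG_BASE_LEN 4   /* 16-bit tag + 16-bit length, assumed */

/* Walk a block of tag-length-value records, refusing to read a header
 * or a value that would overrun the buffer. */
static int walk_tlvs(const uint8_t *buf, size_t bufsize)
{
	const uint8_t *cur = buf;
	const uint8_t *end = buf + bufsize;

	if (bufsize < TAG_BASE_LEN)
		return 0;
	while (cur <= end - TAG_BASE_LEN) {
		uint16_t tag, len;

		memcpy(&tag, cur, sizeof(tag));
		memcpy(&len, cur + sizeof(tag), sizeof(len));
		(void)tag;
		if (len > (size_t)(end - cur - TAG_BASE_LEN))
			return -1;   /* declared value overruns the block */
		/* per-tag length validation would go here */
		cur += TAG_BASE_LEN + len;
	}
	return 0;
}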
@@ -2178,9 +2173,9 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
#endif
start = (u8 *)bh->b_data;
- end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+ end = start + journal->j_blocksize;
- for (cur = start; cur < end - EXT4_FC_TAG_BASE_LEN;
+ for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
ext4_fc_get_tl(&tl, cur);
val = cur + EXT4_FC_TAG_BASE_LEN;
@@ -2249,17 +2244,17 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal)
journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}
-static const char *fc_ineligible_reasons[] = {
- "Extended attributes changed",
- "Cross rename",
- "Journal flag changed",
- "Insufficient memory",
- "Swap boot",
- "Resize",
- "Dir renamed",
- "Falloc range op",
- "Data journalling",
- "FC Commit Failed"
+static const char * const fc_ineligible_reasons[] = {
+ [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
+ [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
+ [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
+ [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
+ [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
+ [EXT4_FC_REASON_RESIZE] = "Resize",
+ [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
+ [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
+ [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
+ [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};
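Indexing the string table by the enum constants means reordering or extending EXT4_FC_REASON_* can no longer silently desynchronize the messages. A short sketch of the pattern, with a compile-time size check one might add:

enum fc_reason {
	REASON_XATTR,
	REASON_CROSS_RENAME,
	REASON_MAX
};

static const char * const reasons[] = {
	[REASON_XATTR]        = "Extended attributes changed",
	[REASON_CROSS_RENAME] = "Cross rename",
};

_Static_assert(sizeof(reasons) / sizeof(reasons[0]) == REASON_MAX,
	       "reason table out of sync with enum");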
int ext4_fc_info_show(struct seq_file *seq, void *v)
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index a6154c3ed135..2fadb2c4780c 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -58,7 +58,7 @@ struct ext4_fc_dentry_info {
__u8 fc_dname[];
};
-/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
+/* Value structure for EXT4_FC_TAG_INODE. */
struct ext4_fc_inode {
__le32 fc_ino;
__u8 fc_raw_inode[];
@@ -96,6 +96,7 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
+ EXT4_FC_REASON_ENCRYPTED_FILENAME,
EXT4_FC_REASON_MAX
};
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a7a597c727e6..7ac0a81bd371 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -955,7 +955,7 @@ const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e9bc46684106..63f9bb6e8851 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
parent_group = hinfo.hash % ngroups;
} else
- parent_group = prandom_u32_max(ngroups);
+ parent_group = get_random_u32_below(ngroups);
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
get_orlov_stats(sb, g, flex_size, &stats);
@@ -870,7 +870,7 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
struct super_block *sb = dir->i_sb;
int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1076,8 +1076,8 @@ repeat_in_this_group:
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
- handle = __ext4_journal_start_sb(dir->i_sb, line_no,
- handle_type, nblocks, 0,
+ handle = __ext4_journal_start_sb(NULL, dir->i_sb,
+ line_no, handle_type, nblocks, 0,
ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 860fc5119009..c68bebe7ff4b 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ unsigned int key;
int ret = -EIO;
*err = 0;
@@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
+ key = le32_to_cpu(p->key);
+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+ /* the block was out of range */
+ ret = -EFSCORRUPTED;
+ goto failure;
+ }
+ bh = sb_getblk(sb, key);
if (unlikely(!bh)) {
ret = -ENOMEM;
goto failure;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a4fbe825694b..2b42ececa46d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -180,8 +180,7 @@ static int ext4_read_inline_data(struct inode *inode, void *buffer,
BUG_ON(len > EXT4_I(inode)->i_inline_size);
- cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
- len : EXT4_MIN_INLINE_DATA_SIZE;
+ cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);
raw_inode = ext4_raw_inode(iloc);
memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2b5ef1b64249..9d9f414f99fe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -222,13 +222,13 @@ void ext4_evict_inode(struct inode *inode)
/*
* For inodes with journalled data, transaction commit could have
- * dirtied the inode. Flush worker is ignoring it because of I_FREEING
- * flag but we still need to remove the inode from the writeback lists.
+	 * dirtied the inode. And for inodes with dioread_nolock, the unwritten
+	 * extent conversion worker could merge extents and also dirty the
+	 * inode. The flush worker ignores it because of the I_FREEING flag, but
+	 * we still need to remove the inode from the writeback lists.
*/
- if (!list_empty_careful(&inode->i_io_list)) {
- WARN_ON_ONCE(!ext4_should_journal_data(inode));
+ if (!list_empty_careful(&inode->i_io_list))
inode_io_list_del(inode);
- }
/*
* Protect us against freezing - iput() caller didn't have to have any
@@ -335,6 +335,12 @@ stop_handle:
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
+ /*
+	 * Check whether something else accidentally dirtied the evicting
+	 * inode; that could cause inode use-after-free issues later.
+ */
+ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));
+
if (!list_empty(&EXT4_I(inode)->i_fc_list))
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
@@ -1309,7 +1315,8 @@ static int ext4_write_end(struct file *file,
trace_ext4_write_end(inode, pos, len, copied);
- if (ext4_has_inline_data(inode))
+ if (ext4_has_inline_data(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
return ext4_write_inline_data_end(inode, pos, len, copied, page);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -1543,9 +1550,12 @@ void ext4_da_release_space(struct inode *inode, int to_free)
*/
struct mpage_da_data {
+ /* These are input fields for ext4_do_writepages() */
struct inode *inode;
struct writeback_control *wbc;
+ unsigned int can_map:1; /* Can writepages call map blocks? */
+ /* These are internal state of ext4_do_writepages() */
pgoff_t first_page; /* The first page to write */
pgoff_t next_page; /* Current page to examine */
pgoff_t last_page; /* Last page to examine */
@@ -2009,7 +2019,6 @@ static int ext4_writepage(struct page *page,
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
- bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
folio_invalidate(folio, 0, folio_size(folio));
@@ -2067,7 +2076,6 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return 0;
}
- keep_towrite = true;
}
if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -2084,7 +2092,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
+ ret = ext4_bio_write_page(&io_submit, page, len);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -2118,7 +2126,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
- err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2551,18 +2559,33 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
+/* Return true if the page needs to be written as part of transaction commit */
+static bool ext4_page_nomap_can_writeout(struct page *page)
+{
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh))
+ return true;
+ } while ((bh = bh->b_this_page) != head);
+ return false;
+}
+
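ext4_page_nomap_can_writeout() above walks the page's buffer_head ring, which is a circular singly linked list, so the idiomatic visit-each-once loop is do/while back to the head. A stand-alone model of that traversal (struct buf is illustrative only):

#include <stdbool.h>
#include <stdio.h>

struct buf {
	bool dirty, mapped, delay;
	struct buf *next;   /* circular: last->next == first */
};

static bool any_writeout_candidate(struct buf *head)
{
	struct buf *bh = head;

	do {
		if (bh->dirty && bh->mapped && !bh->delay)
			return true;
	} while ((bh = bh->next) != head);
	return false;
}

int main(void)
{
	struct buf b[2] = {
		{ .dirty = true, .mapped = false },
		{ .dirty = true, .mapped = true  },
	};

	b[0].next = &b[1];
	b[1].next = &b[0];
	printf("%d\n", any_writeout_candidate(&b[0]));  /* 1 */
	return 0;
}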
/*
* mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
- * and underlying extent to map
+ * needing mapping, submit mapped pages
*
* @mpd - where to look for pages
*
* Walk dirty pages in the mapping. If they are fully mapped, submit them for
- * IO immediately. When we find a page which isn't mapped we start accumulating
- * extent of buffers underlying these pages that needs mapping (formed by
- * either delayed or unwritten buffers). We also lock the pages containing
- * these buffers. The extent found is returned in @mpd structure (starting at
- * mpd->lblk with length mpd->len blocks).
+ * IO immediately. If we cannot map blocks, we submit only the already
+ * mapped buffers in the page for IO and keep the page dirty. When we can
+ * map blocks and we find a page which isn't mapped, we start accumulating
+ * an extent of buffers underlying these pages that needs mapping (formed
+ * by either delayed or unwritten buffers). We also lock the pages
+ * containing these buffers. The extent found is returned in the @mpd
+ * structure (starting at mpd->lblk with length mpd->len blocks).
*
* Note that this function can attach bios to one io_end structure which are
* neither logically nor physically contiguous. Although it may seem as an
@@ -2653,14 +2676,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
- /* Add all dirty buffers to mpd */
- lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_SHIFT - blkbits);
- head = page_buffers(page);
- err = mpage_process_page_bufs(mpd, head, head, lblk);
- if (err <= 0)
- goto out;
- err = 0;
+ /*
+ * Writeout for transaction commit where we cannot
+ * modify metadata is simple. Just submit the page.
+ */
+ if (!mpd->can_map) {
+ if (ext4_page_nomap_can_writeout(page)) {
+ err = mpage_submit_page(mpd, page);
+ if (err < 0)
+ goto out;
+ } else {
+ unlock_page(page);
+ mpd->first_page++;
+ }
+ } else {
+ /* Add all dirty buffers to mpd */
+ lblk = ((ext4_lblk_t)page->index) <<
+ (PAGE_SHIFT - blkbits);
+ head = page_buffers(page);
+ err = mpage_process_page_bufs(mpd, head, head,
+ lblk);
+ if (err <= 0)
+ goto out;
+ err = 0;
+ }
left--;
}
pagevec_release(&pvec);
@@ -2673,25 +2712,27 @@ out:
return err;
}
-static int ext4_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc,
+ void *data)
{
+ return ext4_writepage(page, wbc);
+}
+
+static int ext4_do_writepages(struct mpage_da_data *mpd)
+{
+ struct writeback_control *wbc = mpd->wbc;
pgoff_t writeback_index = 0;
long nr_to_write = wbc->nr_to_write;
int range_whole = 0;
int cycled = 1;
handle_t *handle = NULL;
- struct mpage_da_data mpd;
- struct inode *inode = mapping->host;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
int needed_blocks, rsv_blocks = 0, ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
struct blk_plug plug;
bool give_up_on_write = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
-
- percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2703,7 +2744,9 @@ static int ext4_writepages(struct address_space *mapping,
goto out_writepages;
if (ext4_should_journal_data(inode)) {
- ret = generic_writepages(mapping, wbc);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL);
+ blk_finish_plug(&plug);
goto out_writepages;
}
@@ -2757,19 +2800,18 @@ static int ext4_writepages(struct address_space *mapping,
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd.first_page = writeback_index;
- mpd.last_page = -1;
+ mpd->first_page = writeback_index;
+ mpd->last_page = -1;
} else {
- mpd.first_page = wbc->range_start >> PAGE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->first_page = wbc->range_start >> PAGE_SHIFT;
+ mpd->last_page = wbc->range_end >> PAGE_SHIFT;
}
- mpd.inode = inode;
- mpd.wbc = wbc;
- ext4_io_submit_init(&mpd.io_submit, wbc);
+ ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
+ tag_pages_for_writeback(mapping, mpd->first_page,
+ mpd->last_page);
blk_start_plug(&plug);
/*
@@ -2778,31 +2820,32 @@ retry:
* in the block layer on device congestion while having transaction
* started.
*/
- mpd.do_map = 0;
- mpd.scanned_until_end = 0;
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->do_map = 0;
+ mpd->scanned_until_end = 0;
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
goto unplug;
}
- ret = mpage_prepare_extent_to_map(&mpd);
+ ret = mpage_prepare_extent_to_map(mpd);
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, false);
+ mpage_release_unused_pages(mpd, false);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
- ext4_put_io_end_defer(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_io_submit(&mpd->io_submit);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret < 0)
goto unplug;
- while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
+ while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
/* For each extent of pages we use new io_end */
- mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!mpd.io_submit.io_end) {
+ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!mpd->io_submit.io_end) {
ret = -ENOMEM;
break;
}
+ WARN_ON_ONCE(!mpd->can_map);
/*
* We have two constraints: We find one extent to map and we
* must always write out whole page (makes a difference when
@@ -2822,16 +2865,16 @@ retry:
"%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
break;
}
- mpd.do_map = 1;
+ mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
- ret = mpage_prepare_extent_to_map(&mpd);
- if (!ret && mpd.map.m_len)
- ret = mpage_map_and_submit_extent(handle, &mpd,
+ trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ ret = mpage_prepare_extent_to_map(mpd);
+ if (!ret && mpd->map.m_len)
+ ret = mpage_map_and_submit_extent(handle, mpd,
&give_up_on_write);
/*
* Caution: If the handle is synchronous,
@@ -2846,12 +2889,12 @@ retry:
if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
ext4_journal_stop(handle);
handle = NULL;
- mpd.do_map = 0;
+ mpd->do_map = 0;
}
/* Unlock pages we didn't use */
- mpage_release_unused_pages(&mpd, give_up_on_write);
+ mpage_release_unused_pages(mpd, give_up_on_write);
/* Submit prepared bio */
- ext4_io_submit(&mpd.io_submit);
+ ext4_io_submit(&mpd->io_submit);
/*
* Drop our io_end reference we got from init. We have
@@ -2861,11 +2904,11 @@ retry:
* up doing unwritten extent conversion.
*/
if (handle) {
- ext4_put_io_end_defer(mpd.io_submit.io_end);
+ ext4_put_io_end_defer(mpd->io_submit.io_end);
ext4_journal_stop(handle);
} else
- ext4_put_io_end(mpd.io_submit.io_end);
- mpd.io_submit.io_end = NULL;
+ ext4_put_io_end(mpd->io_submit.io_end);
+ mpd->io_submit.io_end = NULL;
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2885,8 +2928,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd.last_page = writeback_index - 1;
- mpd.first_page = 0;
+ mpd->last_page = writeback_index - 1;
+ mpd->first_page = 0;
goto retry;
}
@@ -2896,15 +2939,51 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd.first_page;
+ mapping->writeback_index = mpd->first_page;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
+static int ext4_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct super_block *sb = mapping->host->i_sb;
+ struct mpage_da_data mpd = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .can_map = 1,
+ };
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ return -EIO;
+
+ percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
+ ret = ext4_do_writepages(&mpd);
+ percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
+
+ return ret;
+}
+
+int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = jinode->i_dirty_start,
+ .range_end = jinode->i_dirty_end,
+ };
+ struct mpage_da_data mpd = {
+ .inode = jinode->i_vfs_inode,
+ .wbc = &wbc,
+ .can_map = 0,
+ };
+ return ext4_do_writepages(&mpd);
+}
+
static int ext4_dax_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -3646,7 +3725,6 @@ static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
static const struct address_space_operations ext4_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
@@ -3664,7 +3742,6 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
@@ -3673,6 +3750,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.invalidate_folio = ext4_journalled_invalidate_folio,
.release_folio = ext4_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
.swap_activate = ext4_iomap_swap_activate,
@@ -3681,7 +3759,6 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.read_folio = ext4_read_folio,
.readahead = ext4_readahead,
- .writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
@@ -4225,7 +4302,8 @@ int ext4_truncate(struct inode *inode)
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
- if (ext4_inode_attach_jinode(inode) < 0)
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
goto out_trace;
}
@@ -4473,9 +4551,17 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((ino - 1) %
EXT4_INODES_PER_GROUP(sb));
- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp);
+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+ ext4_error(sb, "Invalid inode table block %llu in "
+ "block_group %u", block, iloc->block_group);
+ return -EFSCORRUPTED;
+ }
+ block += (inode_offset / inodes_per_block);
+
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
@@ -5044,8 +5130,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- brelse(iloc.bh);
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ ext4_error_inode(inode, function, line, 0,
+ "bad inode without EXT4_IGET_BAD flag");
+ ret = -EUCLEAN;
+ goto bad_inode;
+ }
+ brelse(iloc.bh);
unlock_new_inode(inode);
return inode;
@@ -5550,7 +5642,7 @@ out_mmap_sem:
ext4_orphan_del(NULL, inode);
if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ rc = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
err_out:
if (error)
@@ -5853,6 +5945,14 @@ static int __ext4_expand_extra_isize(struct inode *inode,
return 0;
}
+ /*
+	 * We may need to allocate an external xattr block, so we need quotas
+	 * initialized. We can be called here with various locks held, so we
+	 * cannot afford to initialize quotas ourselves. Just bail instead.
+ */
+ if (dquot_initialize_needed(inode))
+ return -EAGAIN;
+
/* try to expand with EAs present */
error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 95dfea28bf4e..8067ccda34e4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -374,7 +374,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
blkcnt_t blocks;
unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
@@ -424,7 +425,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) {
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
@@ -731,6 +732,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
if (ext4_is_quota_file(inode))
return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
@@ -746,10 +751,6 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
brelse(iloc.bh);
}
- err = dquot_initialize(inode);
- if (err)
- return err;
-
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
@@ -1153,19 +1154,22 @@ static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi,
if (fsuuid.fsu_len == 0) {
fsuuid.fsu_len = UUID_SIZE;
- if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid.fsu_len)))
+ if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len,
+ sizeof(fsuuid.fsu_len)))
return -EFAULT;
- return -EINVAL;
+ return 0;
}
- if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0)
+ if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0)
return -EINVAL;
lock_buffer(sbi->s_sbh);
memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE);
unlock_buffer(sbi->s_sbh);
- if (copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
+ fsuuid.fsu_len = UUID_SIZE;
+ if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) ||
+ copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE))
return -EFAULT;
return 0;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9dad93059945..5b2ae37a8b80 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5204,7 +5204,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
mutex_lock(&ac->ac_lg->lg_mutex);
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
@@ -5253,8 +5253,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
(unsigned) ar->lleft, (unsigned) ar->pleft,
(unsigned) ar->lright, (unsigned) ar->pright,
inode_is_open_for_write(ar->inode) ? "" : "non-");
- return 0;
-
}
static noinline_for_stack void
@@ -5591,11 +5589,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out;
}
- *errp = ext4_mb_initialize_context(ac, ar);
- if (*errp) {
- ar->len = 0;
- goto out;
- }
+ ext4_mb_initialize_context(ac, ar);
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
seq = this_cpu_read(discard_pa_seq);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 588cb09c5291..4681fff6665f 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -262,13 +262,7 @@ void ext4_stop_mmpd(struct ext4_sb_info *sbi)
*/
static unsigned int mmp_new_seq(void)
{
- u32 new_seq;
-
- do {
- new_seq = get_random_u32();
- } while (new_seq > EXT4_MMP_SEQ_MAX);
-
- return new_seq;
+ return get_random_u32_below(EXT4_MMP_SEQ_MAX + 1);
}
/*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 044e34cd835c..8dbb87edf24c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
+ struct folio *folio[2] = {NULL, NULL};
handle_t *handle;
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
@@ -313,6 +314,13 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
+ folio[0] = page_folio(pagep[0]);
+ folio[1] = page_folio(pagep[1]);
+
+ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
+ VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
+ VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
+
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -331,10 +339,10 @@ again:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
- if ((page_has_private(pagep[0]) &&
- !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) &&
- !try_to_release_page(pagep[1], 0))) {
+ if ((folio_has_private(folio[0]) &&
+ !filemap_release_folio(folio[0], 0)) ||
+ (folio_has_private(folio[1]) &&
+ !filemap_release_folio(folio[1], 0))) {
*err = -EBUSY;
goto drop_data_sem;
}
@@ -344,19 +352,21 @@ again:
block_len_in_page, 1, err);
drop_data_sem:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
- goto unlock_pages;
+ goto unlock_folios;
}
data_copy:
- *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+ *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size);
if (*err)
- goto unlock_pages;
+ goto unlock_folios;
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
- if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
- (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
+ if ((folio_has_private(folio[0]) &&
+ !filemap_release_folio(folio[0], 0)) ||
+ (folio_has_private(folio[1]) &&
+ !filemap_release_folio(folio[1], 0))) {
*err = -EBUSY;
- goto unlock_pages;
+ goto unlock_folios;
}
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
@@ -369,13 +379,13 @@ data_copy:
replaced_size =
block_len_in_page << orig_inode->i_blkbits;
} else
- goto unlock_pages;
+ goto unlock_folios;
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- if (!page_has_buffers(pagep[0]))
- create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
- bh = page_buffers(pagep[0]);
+ if (!folio_buffers(folio[0]))
+ create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
+ bh = folio_buffers(folio[0]);
for (i = 0; i < data_offset_in_page; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
@@ -385,7 +395,7 @@ data_copy:
bh = bh->b_this_page;
}
if (!*err)
- *err = block_commit_write(pagep[0], from, from + replaced_size);
+ *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
if (unlikely(*err < 0))
goto repair_branches;
@@ -395,11 +405,11 @@ data_copy:
*err = ext4_jbd2_inode_add_write(handle, orig_inode,
(loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
-unlock_pages:
- unlock_page(pagep[0]);
- put_page(pagep[0]);
- unlock_page(pagep[1]);
- put_page(pagep[1]);
+unlock_folios:
+ folio_unlock(folio[0]);
+ folio_put(folio[0]);
+ folio_unlock(folio[1]);
+ folio_put(folio[1]);
stop_journal:
ext4_journal_stop(handle);
if (*err == -ENOSPC &&
@@ -430,7 +440,7 @@ repair_branches:
*err = -EIO;
}
replaced_count = 0;
- goto unlock_pages;
+ goto unlock_folios;
}
/**
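The move_extent.c conversion keeps the function's single-exit shape: every failure after both folios are held funnels through one relabeled cleanup point (unlock_pages becomes unlock_folios), so the unlock/put pairs can never be skipped or doubled. The same idiom in plain C, with file handles standing in for locked folios:

#include <stdio.h>

static int work_on_pair(const char *a, const char *b)
{
	FILE *fa = NULL, *fb = NULL;
	int err = -1;

	fa = fopen(a, "rb");
	if (!fa)
		goto out;
	fb = fopen(b, "rb");
	if (!fb)
		goto out;
	/* ... operate on both resources; any failure just jumps out ... */
	err = 0;
out:
	if (fb)
		fclose(fb);
	if (fa)
		fclose(fa);
	return err;
}

int main(void)
{
	return work_on_pair("/dev/null", "/dev/null") ? 1 : 0;
}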
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c08c0aba1883..dd28453d6ea3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3204,14 +3204,20 @@ end_rmdir:
return retval;
}
-int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name,
- struct inode *inode)
+int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
+ struct inode *inode,
+ struct dentry *dentry /* NULL during fast_commit recovery */)
{
int retval = -ENOENT;
struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
+ handle_t *handle;
int skip_remove_dentry = 0;
+ /*
+ * Keep this outside the transaction; it may have to set up the
+ * directory's encryption key, which isn't GFP_NOFS-safe.
+ */
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
@@ -3228,7 +3234,14 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
skip_remove_dentry = 1;
else
- goto out;
+ goto out_bh;
+ }
+
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ goto out_bh;
}
if (IS_DIRSYNC(dir))
@@ -3237,12 +3250,12 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
if (!skip_remove_dentry) {
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
- goto out;
+ goto out_handle;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
- goto out;
+ goto out_handle;
} else {
retval = 0;
}
@@ -3255,15 +3268,17 @@ int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
retval = ext4_mark_inode_dirty(handle, inode);
-
-out:
+ if (dentry && !retval)
+ ext4_fc_track_unlink(handle, dentry);
+out_handle:
+ ext4_journal_stop(handle);
+out_bh:
brelse(bh);
return retval;
}
static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
- handle_t *handle;
int retval;
if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
@@ -3281,16 +3296,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (retval)
goto out_trace;
- handle = ext4_journal_start(dir, EXT4_HT_DIR,
- EXT4_DATA_TRANS_BLOCKS(dir->i_sb));
- if (IS_ERR(handle)) {
- retval = PTR_ERR(handle);
- goto out_trace;
- }
-
- retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
- if (!retval)
- ext4_fc_track_unlink(handle, dentry);
+ retval = __ext4_unlink(dir, &dentry->d_name, d_inode(dentry), dentry);
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
 * Case-insensitiveness. Eventually we'll want to avoid
@@ -3301,8 +3307,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- if (handle)
- ext4_journal_stop(handle);
out_trace:
trace_ext4_unlink_exit(dentry, retval);
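/*
 * [Editor's note on the restructuring above, not part of the patch.]
 * ext4_find_entry() may have to set up the directory's fscrypt key,
 * whose allocations are not GFP_NOFS-safe, so the journal handle is
 * now started only after the lookup. __ext4_unlink() then unwinds
 * through two labels, releasing resources in reverse order:
 *
 *	out_handle: ext4_journal_stop(handle);
 *	out_bh:     brelse(bh);
 */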
@@ -3794,6 +3798,9 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = dquot_initialize(old.dir);
if (retval)
return retval;
+ retval = dquot_initialize(old.inode);
+ if (retval)
+ return retval;
retval = dquot_initialize(new.dir);
if (retval)
return retval;
@@ -4194,7 +4201,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
.fileattr_get = ext4_fileattr_get,
@@ -4205,6 +4212,6 @@ const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
- .get_acl = ext4_get_acl,
+ .get_inode_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
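/*
 * [Editor's note, hedged.] The .get_acl -> .get_inode_acl rename above
 * follows the VFS POSIX ACL rework in this release: the inode-based
 * getter used for permission checking becomes ->get_inode_acl(),
 * freeing the ->get_acl() slot for a new dentry-aware variant. The
 * ext4 callback itself, ext4_get_acl, is unchanged.
 */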
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 69a9cf9137a6..e5b47dda3317 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -412,7 +412,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es)
/* don't clear list on RO mount w/ errors */
if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
- "clearing orphan list.\n");
+ "clearing orphan list.");
es->s_last_orphan = 0;
}
ext4_debug("Skipping orphan recovery on fs with errors.\n");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 97fa7b4c645f..beaec6d81074 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -430,25 +430,20 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
- int len,
- bool keep_towrite)
+ int len)
{
struct page *bounce_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start;
struct buffer_head *bh, *head;
int ret = 0;
- int nr_submitted = 0;
int nr_to_submit = 0;
struct writeback_control *wbc = io->io_wbc;
+ bool keep_towrite = false;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- if (keep_towrite)
- set_page_writeback_keepwrite(page);
- else
- set_page_writeback(page);
ClearPageError(page);
/*
@@ -482,16 +477,31 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* A hole? We can safely clear the dirty bit */
if (!buffer_mapped(bh))
clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
+ /*
+			 * Are we keeping some buffer dirty that we cannot
+			 * write? Make sure to redirty the page and keep the
+			 * TOWRITE tag so that racing WB_SYNC_ALL writeback
+			 * does not skip the page. This happens e.g. when
+			 * doing writeout for a transaction commit.
+ */
+ if (buffer_dirty(bh)) {
+ if (!PageDirty(page))
+ redirty_page_for_writepage(wbc, page);
+ keep_towrite = true;
+ }
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
set_buffer_async_write(bh);
+ clear_buffer_dirty(bh);
nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
+ /* Nothing to submit? Just unlock the page... */
+ if (!nr_to_submit)
+ goto unlock;
+
bh = head = page_buffers(page);
/*
@@ -532,27 +542,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
redirty_page_for_writepage(wbc, page);
do {
- clear_buffer_async_write(bh);
+ if (buffer_async_write(bh)) {
+ clear_buffer_async_write(bh);
+ set_buffer_dirty(bh);
+ }
bh = bh->b_this_page;
} while (bh != head);
goto unlock;
}
}
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
+
/* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
io_submit_add_bh(io, inode,
bounce_page ? bounce_page : page, bh);
- nr_submitted++;
- clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
-
unlock:
unlock_page(page);
- /* Nothing submitted - we have to end page writeback */
- if (!nr_submitted)
- end_page_writeback(page);
return ret;
}
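/*
 * [Editor's simplified sketch of the reordered flow above; editor
 * addition, variable names from the patch.] Buffers are classified
 * before any writeback state is taken, so a page with nothing to
 * submit is simply unlocked and never enters writeback:
 */
	bh = head = page_buffers(page);
	do {
		if (!buffer_dirty(bh) || !buffer_mapped(bh))
			continue;			/* hole or clean */
		set_buffer_async_write(bh);
		clear_buffer_dirty(bh);
		nr_to_submit++;
	} while ((bh = bh->b_this_page) != head);

	if (!nr_to_submit)
		goto unlock;			/* no writeback state at all */

	if (keep_towrite)
		set_page_writeback_keepwrite(page);	/* preserve TOWRITE */
	else
		set_page_writeback(page);
	/* ... then submit the buffers marked async_write ... */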
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 3d21eae267fc..d5266932ce6c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,14 +75,10 @@ static void __read_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, iter_all) {
page = bv->bv_page;
- /* PG_error was set if verity failed. */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
unlock_page(page);
}
if (bio->bi_private)
@@ -410,9 +406,8 @@ int ext4_mpage_readpages(struct inode *inode,
int __init ext4_init_post_read_processing(void)
{
- bio_post_read_ctx_cache =
- kmem_cache_create("ext4_bio_post_read_ctx",
- sizeof(struct bio_post_read_ctx), 0, 0, NULL);
+ bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
+
if (!bio_post_read_ctx_cache)
goto fail;
bio_post_read_ctx_pool =
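/*
 * [Editor's note.] KMEM_CACHE() is the stock <linux/slab.h> helper; the
 * replacement above is roughly equivalent to the open-coded call plus
 * SLAB_RECLAIM_ACCOUNT and natural alignment, with the cache now named
 * after the struct ("bio_post_read_ctx" rather than
 * "ext4_bio_post_read_ctx"):
 *
 *	kmem_cache_create("bio_post_read_ctx",
 *			  sizeof(struct bio_post_read_ctx),
 *			  __alignof__(struct bio_post_read_ctx),
 *			  SLAB_RECLAIM_ACCOUNT, NULL);
 */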
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 46b87ffeb304..6b91443d6bf3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1110,6 +1110,16 @@ exit_free:
return err;
}
+static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
+ ext4_group_t group)
+{
+ struct ext4_super_block *es = (struct ext4_super_block *) data;
+
+ es->s_block_group_nr = cpu_to_le16(group);
+ if (ext4_has_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
/*
* Update the backup copies of the ext4 metadata. These don't need to be part
* of the main resize transaction, because e2fsck will re-write them if there
@@ -1158,7 +1168,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
while (group < sbi->s_groups_count) {
struct buffer_head *bh;
ext4_fsblk_t backup_block;
- struct ext4_super_block *es;
+ int has_super = ext4_bg_has_super(sb, group);
+ ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group);
/* Out of journal space, and can't get more - abort - so sad */
err = ext4_resize_ensure_credits_batch(handle, 1);
@@ -1168,8 +1179,7 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
if (meta_bg == 0)
backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
else
- backup_block = (ext4_group_first_block_no(sb, group) +
- ext4_bg_has_super(sb, group));
+ backup_block = first_block + has_super;
bh = sb_getblk(sb, backup_block);
if (unlikely(!bh)) {
@@ -1187,10 +1197,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
memcpy(bh->b_data, data, size);
if (rest)
memset(bh->b_data + size, 0, rest);
- es = (struct ext4_super_block *) bh->b_data;
- es->s_block_group_nr = cpu_to_le16(group);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (has_super && (backup_block == first_block))
+ ext4_set_block_group_nr(sb, bh->b_data, group);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1476,8 +1484,6 @@ static void ext4_update_super(struct super_block *sb,
* active. */
ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
reserved_blocks);
- ext4_superblock_csum_set(sb);
- unlock_buffer(sbi->s_sbh);
/* Update the free space counts */
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -1513,6 +1519,8 @@ static void ext4_update_super(struct super_block *sb,
ext4_calculate_overhead(sb);
es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+ ext4_superblock_csum_set(sb);
+ unlock_buffer(sbi->s_sbh);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
"%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
@@ -1596,8 +1604,8 @@ exit_journal:
int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block), 0);
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
+ (char *)es, sizeof(struct ext4_super_block), 0);
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
@@ -1808,7 +1816,7 @@ errout:
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1831,7 +1839,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_grpblk_t last;
ext4_grpblk_t add;
struct buffer_head *bh;
- int err;
ext4_group_t group;
o_blocks_count = ext4_blocks_count(es);
@@ -1886,8 +1893,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
brelse(bh);
- err = ext4_group_extend_no_check(sb, o_blocks_count, add);
- return err;
+ return ext4_group_extend_no_check(sb, o_blocks_count, add);
} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7cdd2138c897..260c1b3e3ef2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -540,8 +540,7 @@ static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
if (ext4_should_journal_data(jinode->i_vfs_inode))
ret = ext4_journalled_submit_inode_data_buffers(jinode);
else
- ret = jbd2_journal_submit_inode_data_buffers(jinode);
-
+ ret = ext4_normal_submit_inode_data_buffers(jinode);
return ret;
}
@@ -1206,7 +1205,8 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_sysfs(sb);
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount"))
- ext4_msg(sb, KERN_INFO, "unmounting filesystem.");
+ ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.",
+ &sb->s_uuid);
ext4_unregister_li_request(sb);
ext4_quota_off_umount(sb);
@@ -1225,7 +1225,7 @@ static void ext4_put_super(struct super_block *sb)
}
ext4_es_unregister_shrinker(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_shutdown_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
@@ -1323,6 +1323,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
+ ei->i_flags = 0;
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
atomic_set(&ei->i_prealloc_active, 0);
@@ -2247,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
}
- error = fs_lookup_param(fc, param, 1, &path);
+ error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path);
if (error) {
ext4_msg(NULL, KERN_ERR, "error: could not find "
"journal device path");
@@ -3778,7 +3779,7 @@ cont_thread:
}
if (!progress) {
elr->lr_next_sched = jiffies +
- prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ);
+ get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
if (time_before(elr->lr_next_sched, next_wakeup))
next_wakeup = elr->lr_next_sched;
@@ -3925,8 +3926,7 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- elr->lr_next_sched = jiffies + prandom_u32_max(
- EXT4_DEF_LI_MAX_START_DELAY * HZ);
+ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
return elr;
}
@@ -5287,14 +5287,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
+ "journal_async_commit, fs mounted w/o journal");
goto failed_mount3a;
}
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
+ "journal_checksum, fs mounted w/o journal");
goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
@@ -5655,8 +5656,9 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
descr = "out journal";
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
- "Quota mode: %s.", descr, ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. "
+ "Quota mode: %s.", &sb->s_uuid, descr,
+ ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */
ext4_update_overhead(sb, false);
@@ -5723,7 +5725,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
ext4_debug("Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
@@ -6611,8 +6613,8 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.",
+ &sb->s_uuid, ext4_quota_mode(sb));
return 0;
}
@@ -6886,6 +6888,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return err;
}
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
+{
+ switch (type) {
+ case USRQUOTA:
+ return qf_inum == EXT4_USR_QUOTA_INO;
+ case GRPQUOTA:
+ return qf_inum == EXT4_GRP_QUOTA_INO;
+ case PRJQUOTA:
+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
+ default:
+ BUG();
+ }
+}
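/*
 * [Editor's worked example, assuming the standard reserved-inode
 * layout: EXT4_USR_QUOTA_INO == 3, EXT4_GRP_QUOTA_INO == 4,
 * EXT4_GOOD_OLD_FIRST_INO == 11.]
 *
 *	ext4_check_quota_inum(USRQUOTA, 3);	-> true
 *	ext4_check_quota_inum(USRQUOTA, 12);	-> false, caller returns -EUCLEAN
 *	ext4_check_quota_inum(PRJQUOTA, 12);	-> true
 *
 * A corrupted superblock can thus no longer point the hidden user or
 * group quota file at an arbitrary inode.
 */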
+
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
@@ -6902,9 +6918,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) {
+ ext4_error(sb, "Bad quota inum: %lu, type: %d",
+ qf_inums[type], type);
+ return -EUCLEAN;
+ }
+
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ ext4_error(sb, "Bad quota inode: %lu, type: %d",
+ qf_inums[type], type);
return PTR_ERR(qf_inode);
}
@@ -6943,8 +6966,9 @@ int ext4_enable_quotas(struct super_block *sb)
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
- "(type=%d, err=%d). Please run "
- "e2fsck to fix.", type, err);
+ "(type=%d, err=%d, ino=%lu). "
+ "Please run e2fsck to fix.", type,
+ err, qf_inums[type]);
for (type--; type >= 0; type--) {
struct inode *inode;
@@ -7031,8 +7055,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
bh = ext4_bread(NULL, inode, blk, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 3c640bd7ecae..30e3b65798b5 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -79,7 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int res;
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 36d6ba7190b6..7decaaf27e82 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1281,7 +1281,7 @@ retry_ref:
ce = mb_cache_entry_get(ea_block_cache, hash,
bh->b_blocknr);
if (ce) {
- ce->e_reusable = 1;
+ set_bit(MBE_REUSABLE_B, &ce->e_flags);
mb_cache_entry_put(ea_block_cache, ce);
}
}
@@ -1441,6 +1441,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
if (!err)
err = ext4_inode_attach_jinode(ea_inode);
if (err) {
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode,
+ "cleanup dec ref error %d", err);
iput(ea_inode);
return ERR_PTR(err);
}
@@ -1540,7 +1543,8 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
if (err) {
- ext4_xattr_inode_dec_ref(handle, ea_inode);
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
iput(ea_inode);
return err;
}
@@ -2042,7 +2046,7 @@ inserted:
}
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
if (ref == EXT4_XATTR_REFCOUNT_MAX)
- ce->e_reusable = 0;
+ clear_bit(MBE_REUSABLE_B, &ce->e_flags);
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
ext4_xattr_block_csum_set(inode, new_bh);
@@ -2070,19 +2074,11 @@ inserted:
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
-
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
-
ea_idebug(inode, "creating block %llu",
(unsigned long long)block);
@@ -2555,7 +2551,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kmalloc(value_size, GFP_NOFS);
+ buffer = kvmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
if (!is || !bs || !buffer || !b_entry_name) {
error = -ENOMEM;
@@ -2607,7 +2603,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
error = 0;
out:
kfree(b_entry_name);
- kfree(buffer);
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
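/*
 * [Editor's note.] kvmalloc() may satisfy a large xattr value from
 * vmalloc space, and vmalloc-backed memory must not be passed to
 * kfree(); the allocation and free sites therefore switch together:
 *
 *	buffer = kvmalloc(value_size, GFP_NOFS);
 *	...
 *	kvfree(buffer);		frees kmalloc- or vmalloc-backed memory
 */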
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 5bbc44a5216e..c1c74aa658ae 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -276,9 +276,11 @@ static int __f2fs_set_acl(struct user_namespace *mnt_userns,
return error;
}
-int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int f2fs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
+
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index a26e33cab4ff..ea2bbb3f264b 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -34,7 +34,7 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
-extern int f2fs_set_acl(struct user_namespace *, struct inode *,
+extern int f2fs_set_acl(struct user_namespace *, struct dentry *,
struct posix_acl *, int);
extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
struct page *);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0c82dae082aa..56f7d0d6a8b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
+ if (time_to_inject(sbi, FAULT_BLKADDR)) {
+ f2fs_show_injection_info(sbi, FAULT_BLKADDR);
+ return false;
+ }
+
switch (type) {
case META_NAT:
break;
@@ -1897,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+ int err = PTR_ERR(cprc->f2fs_issue_ckpt);
+
cprc->f2fs_issue_ckpt = NULL;
- return -ENOMEM;
+ return err;
}
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
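/*
 * [Editor's note.] The fix above captures the error before the pointer
 * is overwritten: PTR_ERR(NULL) evaluates to 0, so taking PTR_ERR()
 * after the "cprc->f2fs_issue_ckpt = NULL;" line would silently report
 * success. The pattern:
 *
 *	int err = PTR_ERR(cprc->f2fs_issue_ckpt);
 *
 *	cprc->f2fs_issue_ckpt = NULL;
 *	return err;
 */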
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d315c2de136f..2532f369cb10 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
if (!level)
level = F2FS_ZSTD_DEFAULT_CLEVEL;
- params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen);
+ params = zstd_get_params(level, cc->rlen);
workspace_size = zstd_cstream_workspace_bound(&params.cParams);
workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
@@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages,
int f2fs_init_compress_mempool(void)
{
compress_page_pool = mempool_create_page_pool(num_compress_pages, 0);
- if (!compress_page_pool)
- return -ENOMEM;
-
- return 0;
+ return compress_page_pool ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_mempool(void)
@@ -1711,50 +1708,27 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
}
}
-/*
- * Update and unlock the cluster's pagecache pages, and release the reference to
- * the decompress_io_ctx that was being held for I/O completion.
- */
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
- bool in_task)
+static void f2fs_verify_cluster(struct work_struct *work)
{
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, verity_work);
int i;
+ /* Verify, update, and unlock the decompressed pages. */
for (i = 0; i < dic->cluster_size; i++) {
struct page *rpage = dic->rpages[i];
if (!rpage)
continue;
- /* PG_error was set if verity failed. */
- if (failed || PageError(rpage)) {
- ClearPageUptodate(rpage);
- /* will re-read again later */
- ClearPageError(rpage);
- } else {
+ if (fsverity_verify_page(rpage))
SetPageUptodate(rpage);
- }
+ else
+ ClearPageUptodate(rpage);
unlock_page(rpage);
}
- f2fs_put_dic(dic, in_task);
-}
-
-static void f2fs_verify_cluster(struct work_struct *work)
-{
- struct decompress_io_ctx *dic =
- container_of(work, struct decompress_io_ctx, verity_work);
- int i;
-
- /* Verify the cluster's decompressed pages with fs-verity. */
- for (i = 0; i < dic->cluster_size; i++) {
- struct page *rpage = dic->rpages[i];
-
- if (rpage && !fsverity_verify_page(rpage))
- SetPageError(rpage);
- }
-
- __f2fs_decompress_end_io(dic, false, true);
+ f2fs_put_dic(dic, true);
}
/*
@@ -1764,6 +1738,8 @@ static void f2fs_verify_cluster(struct work_struct *work)
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task)
{
+ int i;
+
if (!failed && dic->need_verity) {
/*
* Note that to avoid deadlocks, the verity work can't be done
@@ -1773,9 +1749,28 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
*/
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
- } else {
- __f2fs_decompress_end_io(dic, failed, in_task);
+ return;
}
+
+ /* Update and unlock the cluster's pagecache pages. */
+ for (i = 0; i < dic->cluster_size; i++) {
+ struct page *rpage = dic->rpages[i];
+
+ if (!rpage)
+ continue;
+
+ if (failed)
+ ClearPageUptodate(rpage);
+ else
+ SetPageUptodate(rpage);
+ unlock_page(rpage);
+ }
+
+ /*
+ * Release the reference to the decompress_io_ctx that was being held
+ * for I/O completion.
+ */
+ f2fs_put_dic(dic, in_task);
}
/*
@@ -1983,9 +1978,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
sbi->page_array_slab = f2fs_kmem_cache_create(slab_name,
sbi->page_array_slab_size);
- if (!sbi->page_array_slab)
- return -ENOMEM;
- return 0;
+ return sbi->page_array_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
@@ -1993,53 +1986,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
kmem_cache_destroy(sbi->page_array_slab);
}
-static int __init f2fs_init_cic_cache(void)
+int __init f2fs_init_compress_cache(void)
{
cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry",
sizeof(struct compress_io_ctx));
if (!cic_entry_slab)
return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_cic_cache(void)
-{
- kmem_cache_destroy(cic_entry_slab);
-}
-
-static int __init f2fs_init_dic_cache(void)
-{
dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry",
sizeof(struct decompress_io_ctx));
if (!dic_entry_slab)
- return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_dic_cache(void)
-{
- kmem_cache_destroy(dic_entry_slab);
-}
-
-int __init f2fs_init_compress_cache(void)
-{
- int err;
-
- err = f2fs_init_cic_cache();
- if (err)
- goto out;
- err = f2fs_init_dic_cache();
- if (err)
goto free_cic;
return 0;
free_cic:
- f2fs_destroy_cic_cache();
-out:
+ kmem_cache_destroy(cic_entry_slab);
return -ENOMEM;
}
void f2fs_destroy_compress_cache(void)
{
- f2fs_destroy_dic_cache();
- f2fs_destroy_cic_cache();
+ kmem_cache_destroy(dic_entry_slab);
+ kmem_cache_destroy(cic_entry_slab);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a71e818cd67b..97e816590cd9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset;
int __init f2fs_init_bioset(void)
{
- if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
- 0, BIOSET_NEED_BVECS))
- return -ENOMEM;
- return 0;
+ return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
+ 0, BIOSET_NEED_BVECS);
}
void f2fs_destroy_bioset(void)
@@ -116,43 +114,56 @@ struct bio_post_read_ctx {
struct f2fs_sb_info *sbi;
struct work_struct work;
unsigned int enabled_steps;
+ /*
+	 * decompression_attempted tracks whether
+	 * f2fs_end_read_compressed_page() has yet been called on the pages in
+	 * the bio that belong to a compressed cluster.
+ */
+ bool decompression_attempted;
block_t fs_blkaddr;
};
+/*
+ * Update and unlock a bio's pages, and free the bio.
+ *
+ * This marks pages up-to-date only if there was no error in the bio (I/O error,
+ * decryption error, or verity error), as indicated by bio->bi_status.
+ *
+ * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk)
+ * aren't marked up-to-date here, as decompression is done on a per-compression-
+ * cluster basis rather than a per-bio basis. Instead, we need only do two
+ * things for each compressed page here: call f2fs_end_read_compressed_page()
+ * with failed=true if an error occurred before it would normally have been
+ * called (i.e., I/O error or decryption error, but *not* verity error), and
+ * release the bio's reference to the decompress_io_ctx of the page's cluster.
+ */
static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
+ struct bio_post_read_ctx *ctx = bio->bi_private;
- /*
- * Update and unlock the bio's pagecache pages, and put the
- * decompression context for any compressed pages.
- */
bio_for_each_segment_all(bv, bio, iter_all) {
struct page *page = bv->bv_page;
if (f2fs_is_compressed_page(page)) {
- if (bio->bi_status)
+ if (ctx && !ctx->decompression_attempted)
f2fs_end_read_compressed_page(page, true, 0,
in_task);
f2fs_put_page_dic(page, in_task);
continue;
}
- /* PG_error was set if verity failed. */
- if (bio->bi_status || PageError(page)) {
+ if (bio->bi_status)
ClearPageUptodate(page);
- /* will re-read again later */
- ClearPageError(page);
- } else {
+ else
SetPageUptodate(page);
- }
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
unlock_page(page);
}
- if (bio->bi_private)
- mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+ if (ctx)
+ mempool_free(ctx, bio_post_read_ctx_pool);
bio_put(bio);
}
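/*
 * [Editor's sketch of the decompression_attempted lifecycle described
 * above; names from the patch, flow simplified.]
 *
 *	f2fs_grab_read_bio():
 *		ctx->decompression_attempted = false;
 *	f2fs_handle_step_decompress():		(decompression step ran)
 *		f2fs_end_read_compressed_page(...) per compressed page;
 *		ctx->decompression_attempted = true;
 *	f2fs_finish_read_bio():			(always)
 *		if (ctx && !ctx->decompression_attempted)
 *			f2fs_end_read_compressed_page(page, true, 0, in_task);
 *
 * So the failed=true call happens exactly when an error prevented the
 * decompression step from running, replacing the previous test of
 * bio->bi_status alone.
 */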
@@ -185,8 +196,10 @@ static void f2fs_verify_bio(struct work_struct *work)
struct page *page = bv->bv_page;
if (!f2fs_is_compressed_page(page) &&
- !fsverity_verify_page(page))
- SetPageError(page);
+ !fsverity_verify_page(page)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
} else {
fsverity_verify_bio(bio);
@@ -245,6 +258,8 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
blkaddr++;
}
+ ctx->decompression_attempted = true;
+
/*
* Optimization: if all the bio's pages are compressed, then scheduling
* the per-bio verity work is unnecessary, as verity will be fully
@@ -1062,6 +1077,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
ctx->sbi = sbi;
ctx->enabled_steps = post_read_steps;
ctx->fs_blkaddr = blkaddr;
+ ctx->decompression_attempted = false;
bio->bi_private = ctx;
}
iostat_alloc_and_bind_ctx(sbi, bio, ctx);
@@ -1089,7 +1105,6 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
bio_put(bio);
return -EFAULT;
}
- ClearPageError(page);
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE);
__submit_bio(sbi, bio, DATA);
@@ -1128,7 +1143,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
dn->data_blkaddr = blkaddr;
f2fs_set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
+ f2fs_update_read_extent_cache(dn);
}
/* dn->ofs_in_node will be returned with up-to-date last block pointer */
@@ -1197,7 +1212,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
struct extent_info ei = {0, };
struct inode *inode = dn->inode;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn->data_blkaddr = ei.blk + index - ei.fofs;
return 0;
}
@@ -1206,7 +1221,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
}
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write)
+ blk_opf_t op_flags, bool for_write,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -1218,7 +1234,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!page)
return ERR_PTR(-ENOMEM);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
@@ -1232,12 +1248,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ if (err == -ENOENT && next_pgofs)
+ *next_pgofs = f2fs_get_next_page_offset(&dn, index);
goto put_err;
+ }
f2fs_put_dnode(&dn);
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
err = -ENOENT;
+ if (next_pgofs)
+ *next_pgofs = index + 1;
goto put_err;
}
if (dn.data_blkaddr != NEW_ADDR &&
@@ -1281,7 +1302,8 @@ put_err:
return ERR_PTR(err);
}
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1291,7 +1313,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = f2fs_get_read_data_page(inode, index, 0, false);
+ page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs);
if (IS_ERR(page))
return page;
@@ -1317,7 +1339,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = f2fs_get_read_data_page(inode, index, 0, for_write);
+ page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
if (IS_ERR(page))
return page;
@@ -1480,7 +1502,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
- if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) {
if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1690,7 +1712,7 @@ skip:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -1735,7 +1757,7 @@ sync_out:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -2141,7 +2163,6 @@ submit_and_realloc:
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
F2FS_BLKSIZE);
- ClearPageError(page);
*last_block_in_bio = block_nr;
goto out;
out:
@@ -2162,7 +2183,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
sector_t last_block_in_file;
const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
- struct extent_info ei = {0, };
+ struct extent_info ei = {};
bool from_dnode = true;
int i;
int ret = 0;
@@ -2196,7 +2217,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (f2fs_cluster_is_empty(cc))
goto out;
- if (f2fs_lookup_extent_cache(inode, start_idx, &ei))
+ if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei))
from_dnode = false;
if (!from_dnode)
@@ -2289,7 +2310,6 @@ submit_and_realloc:
inc_page_count(sbi, F2FS_RD_DATA);
f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
- ClearPageError(page);
*last_block_in_bio = blkaddr;
}
@@ -2306,7 +2326,6 @@ out:
for (i = 0; i < cc->cluster_size; i++) {
if (cc->rpages[i]) {
ClearPageUptodate(cc->rpages[i]);
- ClearPageError(cc->rpages[i]);
unlock_page(cc->rpages[i]);
}
}
@@ -2403,7 +2422,6 @@ read_single_page:
#ifdef CONFIG_F2FS_FS_COMPRESSION
set_error_page:
#endif
- SetPageError(page);
zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
}
@@ -2630,7 +2648,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) &&
- f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ f2fs_lookup_read_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
@@ -3354,7 +3372,7 @@ restart:
} else if (locked) {
err = f2fs_get_block(&dn, index);
} else {
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3395,7 +3413,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, ipage, ipage, 0);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3459,6 +3477,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
else if (*blk_addr != NULL_ADDR)
return 0;
+ if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE))
+ goto reserve_block;
+
/* Look for the block in the original inode */
err = __find_data_block(inode, index, &ori_blk_addr);
if (err)
@@ -4080,9 +4101,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
WQ_UNBOUND | WQ_HIGHPRI,
num_online_cpus());
- if (!sbi->post_read_wq)
- return -ENOMEM;
- return 0;
+ return sbi->post_read_wq ? 0 : -ENOMEM;
}
void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
@@ -4095,9 +4114,7 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- if (!bio_entry_slab)
- return -ENOMEM;
- return 0;
+ return bio_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_bio_entry_cache(void)
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a216dcdf6941..32af4f0c5735 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -72,15 +72,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
- /* validation check of the segment numbers */
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]);
+ si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]);
+ si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]);
+ si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i];
+ si->ext_tree[i] = atomic_read(&eti->total_ext_tree);
+ si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree);
+ si->ext_node[i] = atomic_read(&eti->total_ext_node);
+ }
+ /* read extent_cache only */
si->hit_largest = atomic64_read(&sbi->read_hit_largest);
- si->hit_cached = atomic64_read(&sbi->read_hit_cached);
- si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
- si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
- si->total_ext = atomic64_read(&sbi->total_hit_ext);
- si->ext_tree = atomic_read(&sbi->total_ext_tree);
- si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
- si->ext_node = atomic_read(&sbi->total_ext_node);
+ si->hit_total[EX_READ] += si->hit_largest;
+
+ /* block age extent_cache only */
+ si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks);
+
+ /* validation check of the segment numbers */
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
@@ -294,25 +305,32 @@ get_cache:
sizeof(struct nat_entry_set);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->ext_mem[i] = atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree);
- si->cache_mem += atomic_read(&sbi->total_ext_node) *
+ si->ext_mem[i] += atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node);
+ si->cache_mem += si->ext_mem[i];
+ }
si->page_mem = 0;
if (sbi->node_inode) {
- unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ unsigned long npages = NODE_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
if (sbi->meta_inode) {
- unsigned npages = META_MAPPING(sbi)->nrpages;
+ unsigned long npages = META_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (sbi->compress_inode) {
- unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
+ unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages;
+
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#endif
@@ -460,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
- seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
- "Cur time: %4d(ms), Peak time: %4d(ms))\n",
- si->nr_queued_ckpt, si->nr_issued_ckpt,
- si->nr_total_ckpt, si->cur_ckpt_time,
- si->peak_ckpt_time);
+ seq_puts(s, "CP merge:\n");
+ seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt);
+ seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt);
+ seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt);
+ seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time);
+ seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
- seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
- "Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Mid (%d), "
- "Urgent Low (%d)\n",
- si->sbi->gc_reclaimed_segs[GC_NORMAL],
- si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
- si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
- si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
- si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
- si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
- si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
+ seq_puts(s, " - Reclaimed segs :\n");
+ seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]);
+ seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]);
+ seq_printf(s, " - Idle Greedy : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]);
+ seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]);
+ seq_printf(s, " - Urgent High : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]);
+ seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]);
+ seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -490,26 +508,44 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_node_blks);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
- seq_puts(s, "\nExtent Cache:\n");
+ seq_puts(s, "\nExtent Cache (Read):\n");
seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
- si->hit_largest, si->hit_cached,
- si->hit_rbtree);
+ si->hit_largest, si->hit_cached[EX_READ],
+ si->hit_rbtree[EX_READ]);
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
+ !si->total_ext[EX_READ] ? 0 :
+ div64_u64(si->hit_total[EX_READ] * 100,
+ si->total_ext[EX_READ]),
+ si->hit_total[EX_READ], si->total_ext[EX_READ]);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree[EX_READ], si->zombie_tree[EX_READ],
+ si->ext_node[EX_READ]);
+ seq_puts(s, "\nExtent Cache (Block Age):\n");
+ seq_printf(s, " - Allocated Data Blocks: %llu\n",
+ si->allocated_data_blocks);
+ seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n",
+ si->hit_cached[EX_BLOCK_AGE],
+ si->hit_rbtree[EX_BLOCK_AGE]);
seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
- !si->total_ext ? 0 :
- div64_u64(si->hit_total * 100, si->total_ext),
- si->hit_total, si->total_ext);
+ !si->total_ext[EX_BLOCK_AGE] ? 0 :
+ div64_u64(si->hit_total[EX_BLOCK_AGE] * 100,
+ si->total_ext[EX_BLOCK_AGE]),
+ si->hit_total[EX_BLOCK_AGE],
+ si->total_ext[EX_BLOCK_AGE]);
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
- si->ext_tree, si->zombie_tree, si->ext_node);
+ si->ext_tree[EX_BLOCK_AGE],
+ si->zombie_tree[EX_BLOCK_AGE],
+ si->ext_node[EX_BLOCK_AGE]);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - DIO (R: %4d, W: %4d)\n",
si->nr_dio_read, si->nr_dio_write);
seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
- seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
- "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
+ seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ",
si->nr_wb_cp_data, si->nr_wb_data,
si->nr_flushing, si->nr_flushed,
- si->flush_list_empty,
+ si->flush_list_empty);
+ seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
seq_printf(s, " - atomic IO: %4d (Max. %4d)\n",
@@ -566,8 +602,12 @@ static int stat_show(struct seq_file *s, void *v)
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %llu KB\n",
+ seq_printf(s, " - cached all: %llu KB\n",
si->cache_mem >> 10);
+ seq_printf(s, " - read extent cache: %llu KB\n",
+ si->ext_mem[EX_READ] >> 10);
+ seq_printf(s, " - block age extent cache: %llu KB\n",
+ si->ext_mem[EX_BLOCK_AGE] >> 10);
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
@@ -600,10 +640,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
- atomic64_set(&sbi->total_hit_ext, 0);
- atomic64_set(&sbi->read_hit_rbtree, 0);
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ atomic64_set(&sbi->total_hit_ext[i], 0);
+ atomic64_set(&sbi->read_hit_rbtree[i], 0);
+ atomic64_set(&sbi->read_hit_cached[i], 0);
+ }
+
+ /* read extent_cache only */
atomic64_set(&sbi->read_hit_largest, 0);
- atomic64_set(&sbi->read_hit_cached, 0);
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 21960a899b6a..8e025157f35c 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
+ pgoff_t next_pgofs;
bool room = false;
int max_slots;
@@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
le32_to_cpu(fname->hash) % nbucket);
end_block = bidx + nblock;
- for (; bidx < end_block; bidx++) {
+ while (bidx < end_block) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = f2fs_find_data_page(dir, bidx);
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT) {
room = true;
+ bidx = next_pgofs;
continue;
} else {
*res_page = dentry_page;
@@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
if (max_slots >= s)
room = true;
f2fs_put_page(dentry_page, 0);
+
+ bidx++;
}
if (!de && room && F2FS_I(dir)->chash != fname->hash) {
@@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bool f2fs_empty_dir(struct inode *dir)
{
- unsigned long bidx;
+ unsigned long bidx = 0;
struct page *dentry_page;
unsigned int bit_pos;
struct f2fs_dentry_block *dentry_blk;
@@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir)
if (f2fs_has_inline_dentry(dir))
return f2fs_empty_inline_dir(dir);
- for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = f2fs_get_lock_data_page(dir, bidx, false);
+ while (bidx < nblock) {
+ pgoff_t next_pgofs;
+
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
- if (PTR_ERR(dentry_page) == -ENOENT)
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ bidx = next_pgofs;
continue;
- else
+ } else {
return false;
+ }
}
dentry_blk = page_address(dentry_page);
@@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir)
NR_DENTRY_IN_BLOCK,
bit_pos);
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
if (bit_pos < NR_DENTRY_IN_BLOCK)
return false;
+
+ bidx++;
}
return true;
}
@@ -1000,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
- bool readdir_ra = sbi->readdir_ra == 1;
+ bool readdir_ra = sbi->readdir_ra;
bool found_valid_dirent = false;
int err = 0;
@@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}
- for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ pgoff_t next_pgofs;
/* allow readdir() to be interrupted */
if (fatal_signal_pending(current)) {
@@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_find_data_page(inode, n);
+ dentry_page = f2fs_find_data_page(inode, n, &next_pgofs);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
err = 0;
+ n = next_pgofs;
continue;
} else {
goto out_free;
@@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
f2fs_put_page(dentry_page, 0);
+
+ n++;
}
out_free:
fscrypt_fname_free_buffer(&fstr);
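/*
 * [Editor's sketch of the hole-skipping iteration now shared by
 * find_in_level(), f2fs_empty_dir() and f2fs_readdir(); editor
 * addition, with "dir" and "nblock" assumed from context.] On -ENOENT,
 * f2fs_find_data_page() reports the next possibly-mapped index via
 * next_pgofs, so the caller jumps over the whole hole instead of
 * probing it block by block:
 */
	pgoff_t bidx = 0, next_pgofs;

	while (bidx < nblock) {
		struct page *page = f2fs_find_data_page(dir, bidx, &next_pgofs);

		if (IS_ERR(page)) {
			if (PTR_ERR(page) == -ENOENT) {
				bidx = next_pgofs;	/* skip the hole */
				continue;
			}
			return PTR_ERR(page);
		}
		/* ... consume the dentry block ... */
		f2fs_put_page(page, 0);
		bidx++;
	}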
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 932c070173b9..342af24b2f8c 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -6,6 +6,10 @@
* Copyright (c) 2015 Samsung Electronics
* Authors: Jaegeuk Kim <jaegeuk@kernel.org>
* Chao Yu <chao2.yu@samsung.com>
+ *
+ * block_age-based extent cache added by:
+ * Copyright (c) 2022 xiaomi Co., Ltd.
+ * http://www.xiaomi.com/
*/
#include <linux/fs.h>
@@ -15,6 +19,123 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static void __set_extent_info(struct extent_info *ei,
+ unsigned int fofs, unsigned int len,
+ block_t blk, bool keep_clen,
+ unsigned long age, unsigned long last_blocks,
+ enum extent_type type)
+{
+ ei->fofs = fofs;
+ ei->len = len;
+
+ if (type == EX_READ) {
+ ei->blk = blk;
+ if (keep_clen)
+ return;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ ei->c_len = 0;
+#endif
+ } else if (type == EX_BLOCK_AGE) {
+ ei->age = age;
+ ei->last_blocks = last_blocks;
+ }
+}
+
+static bool __may_read_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return false;
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(sbi))
+ return false;
+ return S_ISREG(inode->i_mode);
+}
+
+static bool __may_age_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return false;
+	/* don't cache block age info for cold files */
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+
+ return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
+static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ if (type == EX_READ)
+ return __may_read_extent_tree(inode);
+ else if (type == EX_BLOCK_AGE)
+ return __may_age_extent_tree(inode);
+ return false;
+}
+
+static bool __may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ /*
+	 * For files recovered during mount, do not create extents
+	 * if the shrinker is not registered.
+ */
+ if (list_empty(&F2FS_I_SB(inode)->s_list))
+ return false;
+
+ return __init_may_extent_tree(inode, type);
+}
+
+static void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (et->type != EX_READ)
+ return;
+ if (en->ei.len <= et->largest.len)
+ return;
+
+ et->largest = en->ei;
+ et->largest_updated = true;
+}
+
+static bool __is_extent_mergeable(struct extent_info *back,
+ struct extent_info *front, enum extent_type type)
+{
+ if (type == EX_READ) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (back->c_len && back->len != back->c_len)
+ return false;
+ if (front->c_len && front->len != front->c_len)
+ return false;
+#endif
+ return (back->fofs + back->len == front->fofs &&
+ back->blk + back->len == front->blk);
+ } else if (type == EX_BLOCK_AGE) {
+ return (back->fofs + back->len == front->fofs &&
+ abs(back->age - front->age) <= SAME_AGE_REGION &&
+ abs(back->last_blocks - front->last_blocks) <=
+ SAME_AGE_REGION);
+ }
+ return false;
+}
+
+static bool __is_back_mergeable(struct extent_info *cur,
+ struct extent_info *back, enum extent_type type)
+{
+ return __is_extent_mergeable(back, cur, type);
+}
+
+static bool __is_front_mergeable(struct extent_info *cur,
+ struct extent_info *front, enum extent_type type)
+{
+ return __is_extent_mergeable(cur, front, type);
+}
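/*
 * [Editor's worked example; the numbers are assumptions.] Two
 * EX_BLOCK_AGE extents merge when they are logically contiguous and
 * both age samples lie within SAME_AGE_REGION of each other:
 *
 *	back:  { fofs = 0, len = 8, age = 1000, last_blocks = 5000 }
 *	front: { fofs = 8, len = 4, age = 1020, last_blocks = 5040 }
 *
 * mergeable iff 0 + 8 == 8, |1000 - 1020| <= SAME_AGE_REGION and
 * |5000 - 5040| <= SAME_AGE_REGION. EX_READ extents additionally
 * require block contiguity (back->blk + back->len == front->blk) and
 * reject partially compressed spans.
 */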
+
static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
unsigned int ofs)
{
@@ -237,6 +358,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
struct rb_node *parent, struct rb_node **p,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en;
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
@@ -250,16 +372,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
rb_link_node(&en->rb_node, parent, p);
rb_insert_color_cached(&en->rb_node, &et->root, leftmost);
atomic_inc(&et->node_cnt);
- atomic_inc(&sbi->total_ext_node);
+ atomic_inc(&eti->total_ext_node);
return en;
}
static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
rb_erase_cached(&en->rb_node, &et->root);
atomic_dec(&et->node_cnt);
- atomic_dec(&sbi->total_ext_node);
+ atomic_dec(&eti->total_ext_node);
if (et->cached_en == en)
et->cached_en = NULL;
@@ -275,61 +399,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
static void __release_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
- spin_lock(&sbi->extent_lock);
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
+ spin_lock(&eti->extent_lock);
f2fs_bug_on(sbi, list_empty(&en->list));
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
}
-static struct extent_tree *__grab_extent_tree(struct inode *inode)
+static struct extent_tree *__grab_extent_tree(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et;
nid_t ino = inode->i_ino;
- mutex_lock(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ mutex_lock(&eti->extent_tree_lock);
+ et = radix_tree_lookup(&eti->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab,
GFP_NOFS, true, NULL);
- f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
+ et->type = type;
et->root = RB_ROOT_CACHED;
et->cached_en = NULL;
rwlock_init(&et->lock);
INIT_LIST_HEAD(&et->list);
atomic_set(&et->node_cnt, 0);
- atomic_inc(&sbi->total_ext_tree);
+ atomic_inc(&eti->total_ext_tree);
} else {
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_zombie_tree);
list_del_init(&et->list);
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
/* never dies until evict_inode */
- F2FS_I(inode)->extent_tree = et;
+ F2FS_I(inode)->extent_tree[type] = et;
return et;
}
-static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_info *ei)
-{
- struct rb_node **p = &et->root.rb_root.rb_node;
- struct extent_node *en;
-
- en = __attach_extent_node(sbi, et, ei, NULL, p, true);
- if (!en)
- return NULL;
-
- et->largest = en->ei;
- et->cached_en = en;
- return en;
-}
-
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et)
{
@@ -358,70 +472,89 @@ static void __drop_largest_extent(struct extent_tree *et,
}
}
-/* return true, if inode page is changed */
-static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
+ struct extent_tree_info *eti = &sbi->extent_tree[EX_READ];
+ struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
- if (!f2fs_may_extent_tree(inode)) {
- /* drop largest extent */
+ if (!__may_extent_tree(inode, EX_READ)) {
+ /* drop largest read extent */
if (i_ext && i_ext->len) {
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
i_ext->len = 0;
set_page_dirty(ipage);
- return;
}
- return;
+ goto out;
}
- et = __grab_extent_tree(inode);
+ et = __grab_extent_tree(inode, EX_READ);
if (!i_ext || !i_ext->len)
- return;
+ goto out;
- get_extent_info(&ei, i_ext);
+ get_read_extent_info(&ei, i_ext);
write_lock(&et->lock);
if (atomic_read(&et->node_cnt))
- goto out;
+ goto unlock_out;
- en = __init_extent_tree(sbi, et, &ei);
+ en = __attach_extent_node(sbi, et, &ei, NULL,
+ &et->root.rb_root.rb_node, true);
if (en) {
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
+ et->largest = en->ei;
+ et->cached_en = en;
+
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
+ spin_unlock(&eti->extent_lock);
}
-out:
+unlock_out:
write_unlock(&et->lock);
+out:
+ if (!F2FS_I(inode)->extent_tree[EX_READ])
+ set_inode_flag(inode, FI_NO_EXTENT);
}
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_age_extent_tree(struct inode *inode)
{
- __f2fs_init_extent_tree(inode, ipage);
+ if (!__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ return;
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
+}
- if (!F2FS_I(inode)->extent_tree)
- set_inode_flag(inode, FI_NO_EXTENT);
+void f2fs_init_extent_tree(struct inode *inode)
+{
+ /* initialize read cache */
+ if (__init_may_extent_tree(inode, EX_READ))
+ __grab_extent_tree(inode, EX_READ);
+
+ /* initialize block age cache */
+ if (__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
}
-static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
+static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en;
bool ret = false;
- f2fs_bug_on(sbi, !et);
+ if (!et)
+ return false;
- trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs, type);
read_lock(&et->lock);
- if (et->largest.fofs <= pgofs &&
+ if (type == EX_READ &&
+ et->largest.fofs <= pgofs &&
et->largest.fofs + et->largest.len > pgofs) {
*ei = et->largest;
ret = true;
@@ -435,23 +568,26 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
if (en == et->cached_en)
- stat_inc_cached_node_hit(sbi);
+ stat_inc_cached_node_hit(sbi, type);
else
- stat_inc_rbtree_node_hit(sbi);
+ stat_inc_rbtree_node_hit(sbi, type);
*ei = en->ei;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
ret = true;
out:
- stat_inc_total_hit(sbi);
+ stat_inc_total_hit(sbi, type);
read_unlock(&et->lock);
- trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+ if (type == EX_READ)
+ trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei);
return ret;
}
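
The lookup above tries the cached largest extent first (EX_READ only) before walking the rb-tree. A small standalone sketch of that containment test, with assumed sample values:

#include <assert.h>
#include <stdbool.h>

struct extent { unsigned fofs, len; };	/* file offset + length, in blocks */

/* mirrors the largest-extent test: fofs <= pgofs < fofs + len */
static bool extent_contains(const struct extent *e, unsigned pgofs)
{
	return e->fofs <= pgofs && pgofs < e->fofs + e->len;
}

int main(void)
{
	struct extent largest = { .fofs = 100, .len = 8 };	/* pages 100..107 */

	assert(extent_contains(&largest, 100));
	assert(extent_contains(&largest, 107));
	assert(!extent_contains(&largest, 108));	/* one past the end */
	return 0;
}
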
@@ -460,18 +596,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en = NULL;
- if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+ if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) {
prev_ex->ei.len += ei->len;
ei = &prev_ex->ei;
en = prev_ex;
}
- if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+ if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) {
next_ex->ei.fofs = ei->fofs;
- next_ex->ei.blk = ei->blk;
next_ex->ei.len += ei->len;
+ if (et->type == EX_READ)
+ next_ex->ei.blk = ei->blk;
if (en)
__release_extent_node(sbi, et, prev_ex);
@@ -483,12 +621,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
__try_update_largest_extent(et, en);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
@@ -498,6 +636,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
struct rb_node *insert_parent,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -520,47 +659,54 @@ do_insert:
__try_update_largest_extent(et, en);
/* update in global extent list */
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
et->cached_en = en;
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
-static void f2fs_update_extent_tree_range(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
+static void __update_extent_tree_range(struct inode *inode,
+ struct extent_info *tei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en = NULL, *en1 = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei, dei, prev;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ unsigned int fofs = tei->fofs, len = tei->len;
unsigned int end = fofs + len;
- unsigned int pos = (unsigned int)fofs;
bool updated = false;
bool leftmost = false;
if (!et)
return;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0);
+ if (type == EX_READ)
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
+ tei->blk, 0);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_update_age_extent_tree_range(inode, fofs, len,
+ tei->age, tei->last_blocks);
write_lock(&et->lock);
- if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
- write_unlock(&et->lock);
- return;
- }
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
+ write_unlock(&et->lock);
+ return;
+ }
- prev = et->largest;
- dei.len = 0;
+ prev = et->largest;
+ dei.len = 0;
- /*
- * drop largest extent before lookup, in case it's already
- * been shrunk from extent tree
- */
- __drop_largest_extent(et, fofs, len);
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(et, fofs, len);
+ }
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
@@ -581,26 +727,32 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
dei = en->ei;
org_end = dei.fofs + dei.len;
- f2fs_bug_on(sbi, pos >= org_end);
+ f2fs_bug_on(sbi, fofs >= org_end);
- if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- en->ei.len = pos - en->ei.fofs;
+ if (fofs > dei.fofs && (type != EX_READ ||
+ fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) {
+ en->ei.len = fofs - en->ei.fofs;
prev_en = en;
parts = 1;
}
- if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (end < org_end && (type != EX_READ ||
+ org_end - end >= F2FS_MIN_EXTENT_LEN)) {
if (parts) {
- set_extent_info(&ei, end,
- end - dei.fofs + dei.blk,
- org_end - end);
+ __set_extent_info(&ei,
+ end, org_end - end,
+ end - dei.fofs + dei.blk, false,
+ dei.age, dei.last_blocks,
+ type);
en1 = __insert_extent_tree(sbi, et, &ei,
NULL, NULL, true);
next_en = en1;
} else {
- en->ei.fofs = end;
- en->ei.blk += end - dei.fofs;
- en->ei.len -= end - dei.fofs;
+ __set_extent_info(&en->ei,
+ end, en->ei.len - (end - dei.fofs),
+ en->ei.blk + (end - dei.fofs), true,
+ dei.age, dei.last_blocks,
+ type);
next_en = en;
}
parts++;
@@ -630,10 +782,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
en = next_en;
}
- /* 3. update extent in extent cache */
- if (blkaddr) {
+ if (type == EX_BLOCK_AGE)
+ goto update_age_extent_cache;
- set_extent_info(&ei, fofs, blkaddr, len);
+ /* 3. update extent in read extent cache */
+ BUG_ON(type != EX_READ);
+
+ if (tei->blk) {
+ __set_extent_info(&ei, fofs, len, tei->blk, false,
+ 0, 0, EX_READ);
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
__insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent, leftmost);
@@ -655,7 +812,17 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
et->largest_updated = false;
updated = true;
}
+ goto out_read_extent_cache;
+update_age_extent_cache:
+ if (!tei->last_blocks)
+ goto out_read_extent_cache;
+ __set_extent_info(&ei, fofs, len, 0, false,
+ tei->age, tei->last_blocks, EX_BLOCK_AGE);
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent, leftmost);
+out_read_extent_cache:
write_unlock(&et->lock);
if (updated)
@@ -663,19 +830,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
struct extent_node *en = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len);
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, llen,
+ blkaddr, c_len);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
@@ -692,7 +860,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode,
if (en)
goto unlock_out;
- set_extent_info(&ei, fofs, blkaddr, llen);
+ __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ);
ei.c_len = c_len;
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
@@ -703,24 +871,114 @@ unlock_out:
}
#endif
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+static unsigned long long __calculate_block_age(unsigned long long new,
+ unsigned long long old)
+{
+ unsigned long long diff;
+
+ diff = (new >= old) ? new - (new - old) : new + (old - new);
+
+ return div_u64(diff * LAST_AGE_WEIGHT, 100);
+}
+
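An observation on __calculate_block_age() above: both arms of the conditional reduce algebraically to old (new - (new - old) == old, and new + (old - new) == old), so as written the helper returns old * LAST_AGE_WEIGHT / 100 no matter what new is. If the intent is a weighted blend of the previous age and the new sample, the presumed form is sketched below; this is a hedged reading, not the committed code:

#define LAST_AGE_WEIGHT 30	/* the weight this patch defines in f2fs.h */

/* hedged sketch of the presumed intent: blend the previous age with the new
 * sample, keeping LAST_AGE_WEIGHT percent of the old age; this ignores the
 * div_u64()/overflow handling a kernel version would need */
static unsigned long long blended_age(unsigned long long new_age,
				      unsigned long long old_age)
{
	return (old_age * LAST_AGE_WEIGHT +
		new_age * (100 - LAST_AGE_WEIGHT)) / 100;
}

int main(void)
{
	/* old = 1000, new = 2000: 1000*30/100 + 2000*70/100 = 1700 */
	return blended_age(2000, 1000) == 1700 ? 0 : 1;
}
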
+/* This returns a new age and allocated blocks in ei */
+static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
+ block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ loff_t f_size = i_size_read(inode);
+ unsigned long long cur_blocks =
+ atomic64_read(&sbi->allocated_data_blocks);
+ struct extent_info tei = *ei; /* only fofs and len are valid */
+
+ /*
+ * When I/O is not aligned to PAGE_SIZE, an update lands on the last file
+ * block even during sequential writes. So don't record an age for the
+ * newly extended last file block here.
+ */
+ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) &&
+ blkaddr == NEW_ADDR)
+ return -EINVAL;
+
+ if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) {
+ unsigned long long cur_age;
+
+ if (cur_blocks >= tei.last_blocks)
+ cur_age = cur_blocks - tei.last_blocks;
+ else
+ /* allocated_data_blocks overflow */
+ cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
+
+ if (tei.age)
+ ei->age = __calculate_block_age(cur_age, tei.age);
+ else
+ ei->age = cur_age;
+ ei->last_blocks = cur_blocks;
+ WARN_ON(ei->age > cur_blocks);
+ return 0;
+ }
+
+ f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+ /* the data block was allocated for the first time */
+ if (blkaddr == NEW_ADDR)
+ goto out;
+
+ if (__is_valid_data_blkaddr(blkaddr) &&
+ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
+ f2fs_bug_on(sbi, 1);
+ return -EINVAL;
+ }
+out:
+ /*
+ * initialize the block age with zero; this can happen when the block age
+ * extent was reclaimed due to memory pressure or a system reboot
+ */
+ ei->age = 0;
+ ei->last_blocks = cur_blocks;
+ return 0;
+}
+
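A note on the overflow branch above: for unsigned 64-bit counters, plain subtraction cur_blocks - tei.last_blocks already wraps modulo 2^64, and the explicit arm ULLONG_MAX - tei.last_blocks + cur_blocks comes out exactly one smaller than that modular difference. A compilable check of the arithmetic:

#include <assert.h>
#include <limits.h>

int main(void)
{
	unsigned long long last = ULLONG_MAX - 1, cur = 2;

	/* plain modular subtraction: 2 - (2^64 - 2) wraps to 4 */
	assert(cur - last == 4);

	/* the patch's explicit overflow arm yields one less (3 here) */
	assert(ULLONG_MAX - last + cur == 3);
	return 0;
}
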
+static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type)
{
+ struct extent_info ei = {};
+
+ if (!__may_extent_tree(dn->inode, type))
+ return;
+
+ ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
+ ei.len = 1;
+
+ if (type == EX_READ) {
+ if (dn->data_blkaddr == NEW_ADDR)
+ ei.blk = NULL_ADDR;
+ else
+ ei.blk = dn->data_blkaddr;
+ } else if (type == EX_BLOCK_AGE) {
+ if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr))
+ return;
+ }
+ __update_extent_tree_range(dn->inode, &ei, type);
+}
+
+static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink,
+ enum extent_type type)
+{
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et, *next;
struct extent_node *en;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
- if (!test_opt(sbi, EXTENT_CACHE))
- return 0;
-
- if (!atomic_read(&sbi->total_zombie_tree))
+ if (!atomic_read(&eti->total_zombie_tree))
goto free_node;
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
- list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ list_for_each_entry_safe(et, next, &eti->zombie_list, list) {
if (atomic_read(&et->node_cnt)) {
write_lock(&et->lock);
node_cnt += __free_extent_tree(sbi, et);
@@ -728,61 +986,137 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
}
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
list_del_init(&et->list);
- radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ radix_tree_delete(&eti->extent_tree_root, et->ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_ext_tree);
+ atomic_dec(&eti->total_zombie_tree);
tree_cnt++;
if (node_cnt + tree_cnt >= nr_shrink)
goto unlock_out;
cond_resched();
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
free_node:
/* 2. remove LRU extent entries */
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
remained = nr_shrink - (node_cnt + tree_cnt);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
for (; remained > 0; remained--) {
- if (list_empty(&sbi->extent_list))
+ if (list_empty(&eti->extent_list))
break;
- en = list_first_entry(&sbi->extent_list,
+ en = list_first_entry(&eti->extent_list,
struct extent_node, list);
et = en->et;
if (!write_trylock(&et->lock)) {
/* refresh this extent node's position in extent list */
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
continue;
}
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
write_unlock(&et->lock);
node_cnt++;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
unlock_out:
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
out:
- trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type);
return node_cnt + tree_cnt;
}
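
The LRU pass above never blocks on a contended tree: a failed write_trylock() just rotates the node to the list tail, and the scan budget still shrinks. A standalone sketch of that rotate-on-contention policy (plain C list, not the kernel list API):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; int busy; struct node *next; };
struct queue { struct node *head, *tail; };

static void push_tail(struct queue *q, struct node *n)
{
	n->next = NULL;
	if (q->tail)
		q->tail->next = n;
	else
		q->head = n;
	q->tail = n;
}

static struct node *pop_head(struct queue *q)
{
	struct node *n = q->head;

	if (n) {
		q->head = n->next;
		if (!q->head)
			q->tail = NULL;
	}
	return n;
}

/* reclaim up to nr_shrink entries from the LRU head; a busy entry (standing
 * in for a failed write_trylock) is rotated to the tail instead of blocking */
static int shrink(struct queue *q, int nr_shrink)
{
	int freed = 0, scans;

	for (scans = nr_shrink; scans > 0; scans--) {
		struct node *n = pop_head(q);

		if (!n)
			break;
		if (n->busy) {
			push_tail(q, n);	/* refresh its LRU position */
			continue;
		}
		free(n);	/* the kernel detaches the extent node here */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct queue q = { 0 };

	for (int i = 0; i < 4; i++) {
		struct node *n = calloc(1, sizeof(*n));
		n->id = i;
		n->busy = (i == 0);	/* head entry is contended */
		push_tail(&q, n);
	}
	printf("freed %d\n", shrink(&q, 4));	/* frees 3; the busy one survives */
	return 0;
}
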
-unsigned int f2fs_destroy_extent_node(struct inode *inode)
+/* read extent cache operations */
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_READ))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_READ);
+}
+
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_READ);
+}
+
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ .blk = blkaddr,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_READ))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_READ);
+}
+
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_READ);
+}
+
+/* block age extent cache operations */
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_BLOCK_AGE))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE);
+}
+
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE);
+}
+
+static unsigned int __destroy_extent_node(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et || !atomic_read(&et->node_cnt))
@@ -795,31 +1129,46 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return node_cnt;
}
-void f2fs_drop_extent_tree(struct inode *inode)
+void f2fs_destroy_extent_node(struct inode *inode)
+{
+ __destroy_extent_node(inode, EX_READ);
+ __destroy_extent_node(inode, EX_BLOCK_AGE);
+}
+
+static void __drop_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
bool updated = false;
- if (!f2fs_may_extent_tree(inode))
+ if (!__may_extent_tree(inode, type))
return;
write_lock(&et->lock);
- set_inode_flag(inode, FI_NO_EXTENT);
__free_extent_tree(sbi, et);
- if (et->largest.len) {
- et->largest.len = 0;
- updated = true;
+ if (type == EX_READ) {
+ set_inode_flag(inode, FI_NO_EXTENT);
+ if (et->largest.len) {
+ et->largest.len = 0;
+ updated = true;
+ }
}
write_unlock(&et->lock);
if (updated)
f2fs_mark_inode_dirty_sync(inode, true);
}
-void f2fs_destroy_extent_tree(struct inode *inode)
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ __drop_extent_tree(inode, EX_READ);
+ __drop_extent_tree(inode, EX_BLOCK_AGE);
+}
+
+static void __destroy_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et)
@@ -827,76 +1176,56 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (inode->i_nlink && !is_bad_inode(inode) &&
atomic_read(&et->node_cnt)) {
- mutex_lock(&sbi->extent_tree_lock);
- list_add_tail(&et->list, &sbi->zombie_list);
- atomic_inc(&sbi->total_zombie_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
+ list_add_tail(&et->list, &eti->zombie_list);
+ atomic_inc(&eti->total_zombie_tree);
+ mutex_unlock(&eti->extent_tree_lock);
return;
}
/* free all extent info belong to this extent tree */
- node_cnt = f2fs_destroy_extent_node(inode);
+ node_cnt = __destroy_extent_node(inode, type);
/* delete extent tree entry in radix tree */
- mutex_lock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
- radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ radix_tree_delete(&eti->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ atomic_dec(&eti->total_ext_tree);
+ mutex_unlock(&eti->extent_tree_lock);
- F2FS_I(inode)->extent_tree = NULL;
+ F2FS_I(inode)->extent_tree[type] = NULL;
- trace_f2fs_destroy_extent_tree(inode, node_cnt);
+ trace_f2fs_destroy_extent_tree(inode, node_cnt, type);
}
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- if (!f2fs_may_extent_tree(inode))
- return false;
-
- return f2fs_lookup_extent_tree(inode, pgofs, ei);
-}
-
-void f2fs_update_extent_cache(struct dnode_of_data *dn)
+void f2fs_destroy_extent_tree(struct inode *inode)
{
- pgoff_t fofs;
- block_t blkaddr;
-
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- if (dn->data_blkaddr == NEW_ADDR)
- blkaddr = NULL_ADDR;
- else
- blkaddr = dn->data_blkaddr;
-
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
+ __destroy_extent_tree(inode, EX_READ);
+ __destroy_extent_tree(inode, EX_BLOCK_AGE);
}
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
-
+static void __init_extent_tree_info(struct extent_tree_info *eti)
{
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
+ INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO);
+ mutex_init(&eti->extent_tree_lock);
+ INIT_LIST_HEAD(&eti->extent_list);
+ spin_lock_init(&eti->extent_lock);
+ atomic_set(&eti->total_ext_tree, 0);
+ INIT_LIST_HEAD(&eti->zombie_list);
+ atomic_set(&eti->total_zombie_tree, 0);
+ atomic_set(&eti->total_ext_node, 0);
}
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
{
- INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
- mutex_init(&sbi->extent_tree_lock);
- INIT_LIST_HEAD(&sbi->extent_list);
- spin_lock_init(&sbi->extent_lock);
- atomic_set(&sbi->total_ext_tree, 0);
- INIT_LIST_HEAD(&sbi->zombie_list);
- atomic_set(&sbi->total_zombie_tree, 0);
- atomic_set(&sbi->total_ext_node, 0);
+ __init_extent_tree_info(&sbi->extent_tree[EX_READ]);
+ __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]);
+
+ /* initialize for block age extents */
+ atomic64_set(&sbi->allocated_data_blocks, 0);
+ sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
+ sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
}
int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e6355a5683b7..e8953c3dc81a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -60,6 +60,7 @@ enum {
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
FAULT_LOCK_OP,
+ FAULT_BLKADDR,
FAULT_MAX,
};
@@ -91,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
-#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
#define F2FS_MOUNT_USRQUOTA 0x00080000
@@ -106,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_MOUNT_GC_MERGE 0x20000000
#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000
+#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -202,10 +204,6 @@ struct f2fs_mount_info {
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask)
-#define F2FS_SET_FEATURE(sbi, mask) \
- (sbi->raw_super->feature |= cpu_to_le32(mask))
-#define F2FS_CLEAR_FEATURE(sbi, mask) \
- (sbi->raw_super->feature &= ~cpu_to_le32(mask))
/*
* Default values for user and/or group using reserved blocks
@@ -328,8 +326,12 @@ struct discard_entry {
unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* minimum discard granularity, unit: block count */
+#define MIN_DISCARD_GRANULARITY 1
/* default discard granularity of inner discard thread, unit: block count */
#define DEFAULT_DISCARD_GRANULARITY 16
+/* default maximum discard granularity of ordered discard, unit: block count */
+#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16
/* max discard pend list number */
#define MAX_PLIST_NUM 512
@@ -408,7 +410,9 @@ struct discard_cmd_control {
unsigned int min_discard_issue_time; /* min. interval between discard issue */
unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
unsigned int max_discard_issue_time; /* max. interval between discard issue */
+ unsigned int discard_urgent_util; /* utilization above which discard is issued proactively */
unsigned int discard_granularity; /* discard granularity */
+ unsigned int max_ordered_discard; /* maximum granularity of discards issued in LBA order */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
@@ -593,16 +597,35 @@ enum {
/* dirty segments threshold for triggering CP */
#define DEFAULT_DIRTY_THRESHOLD 4
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
+#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
/* number of extent info in extent cache we try to shrink */
-#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define READ_EXTENT_CACHE_SHRINK_NUMBER 128
-#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
-#define RECOVERY_MIN_RA_BLOCKS 1
+/* number of age extent info in extent cache we try to shrink */
+#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128
+#define LAST_AGE_WEIGHT 30
+#define SAME_AGE_REGION 1024
-#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+/*
+ * Define data blocks with age less than 1GB as hot data, and data blocks
+ * with age between 1GB and 10GB as warm data.
+ */
+#define DEF_HOT_DATA_AGE_THRESHOLD 262144
+#define DEF_WARM_DATA_AGE_THRESHOLD 2621440
+
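For scale, the thresholds above are in f2fs blocks; with the filesystem's fixed 4KiB block size they match the 1GB and 10GB figures in the comment. A compile-time check of that arithmetic:

/* assuming f2fs's fixed 4KiB block size:
 * 262144 * 4KiB = 1GiB and 2621440 * 4KiB = 10GiB */
_Static_assert(262144ULL * 4096 == 1ULL << 30, "hot threshold is 1GiB");
_Static_assert(2621440ULL * 4096 == 10ULL << 30, "warm threshold is 10GiB");
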
+/* extent cache type */
+enum extent_type {
+ EX_READ,
+ EX_BLOCK_AGE,
+ NR_EXTENT_CACHES,
+};
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
@@ -618,10 +641,24 @@ struct rb_entry {
struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
- u32 blk; /* start block address of the extent */
+ union {
+ /* read extent_cache */
+ struct {
+ /* start block address of the extent */
+ block_t blk;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- unsigned int c_len; /* physical extent length of compressed blocks */
+ /* physical extent length of compressed blocks */
+ unsigned int c_len;
#endif
+ };
+ /* block age extent_cache */
+ struct {
+ /* block age of the extent */
+ unsigned long long age;
+ /* last total blocks allocated */
+ unsigned long long last_blocks;
+ };
+ };
};
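
Since a cached extent belongs to exactly one cache type, the read fields and the age fields can share storage, so extent_info does not grow. A standalone sketch of the overlay, using C11 anonymous members as the header does (types approximated; block_t is assumed to be a 32-bit block address as in f2fs):

#include <stdio.h>

typedef unsigned int block_t;	/* 32-bit block address, as in f2fs */

struct extent_info {
	unsigned int fofs;
	unsigned int len;
	union {
		struct {		/* read extent cache */
			block_t blk;
			unsigned int c_len;
		};
		struct {		/* block age extent cache */
			unsigned long long age;
			unsigned long long last_blocks;
		};
	};
};

int main(void)
{
	/* the union costs max(read fields, age fields), not their sum */
	printf("sizeof(struct extent_info) = %zu\n",
	       sizeof(struct extent_info));
	return 0;
}
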
struct extent_node {
@@ -633,13 +670,25 @@ struct extent_node {
struct extent_tree {
nid_t ino; /* inode number */
+ enum extent_type type; /* keep the extent tree type */
struct rb_root_cached root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
- struct extent_info largest; /* largested extent info */
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
bool largest_updated; /* largest extent updated */
+ struct extent_info largest; /* largest cached extent for EX_READ */
+};
+
+struct extent_tree_info {
+ struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+ struct mutex extent_tree_lock; /* locking extent radix tree */
+ struct list_head extent_list; /* lru list for shrinker */
+ spinlock_t extent_lock; /* locking extent lru list */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
+ atomic_t total_ext_node; /* extent info count */
};
/*
@@ -764,6 +813,8 @@ enum {
FI_COMPRESS_RELEASED, /* compressed blocks were released */
FI_ALIGNED_WRITE, /* enable aligned write */
FI_COW_FILE, /* indicate COW file */
+ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed, except for disk sync */
+ FI_ATOMIC_REPLACE, /* indicate atomic replace */
FI_MAX, /* max flag, never be used */
};
@@ -800,7 +851,8 @@ struct f2fs_inode_info {
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
struct task_struct *atomic_write_task; /* store atomic write task */
- struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct extent_tree *extent_tree[NR_EXTENT_CACHES];
+ /* cached extent_tree entry */
struct inode *cow_inode; /* copy-on-write inode for atomic write */
/* avoid racing between foreground op and gc */
@@ -822,9 +874,10 @@ struct f2fs_inode_info {
unsigned int i_cluster_size; /* cluster size */
unsigned int atomic_write_cnt;
+ loff_t original_i_size; /* original i_size before atomic write */
};
-static inline void get_extent_info(struct extent_info *ext,
+static inline void get_read_extent_info(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
ext->fofs = le32_to_cpu(i_ext->fofs);
@@ -832,7 +885,7 @@ static inline void get_extent_info(struct extent_info *ext,
ext->len = le32_to_cpu(i_ext->len);
}
-static inline void set_raw_extent(struct extent_info *ext,
+static inline void set_raw_read_extent(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
i_ext->fofs = cpu_to_le32(ext->fofs);
@@ -840,17 +893,6 @@ static inline void set_raw_extent(struct extent_info *ext,
i_ext->len = cpu_to_le32(ext->len);
}
-static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
- u32 blk, unsigned int len)
-{
- ei->fofs = fofs;
- ei->blk = blk;
- ei->len = len;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- ei->c_len = 0;
-#endif
-}
-
static inline bool __is_discard_mergeable(struct discard_info *back,
struct discard_info *front, unsigned int max_len)
{
@@ -870,41 +912,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur,
return __is_discard_mergeable(cur, front, max_len);
}
-static inline bool __is_extent_mergeable(struct extent_info *back,
- struct extent_info *front)
-{
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (back->c_len && back->len != back->c_len)
- return false;
- if (front->c_len && front->len != front->c_len)
- return false;
-#endif
- return (back->fofs + back->len == front->fofs &&
- back->blk + back->len == front->blk);
-}
-
-static inline bool __is_back_mergeable(struct extent_info *cur,
- struct extent_info *back)
-{
- return __is_extent_mergeable(back, cur);
-}
-
-static inline bool __is_front_mergeable(struct extent_info *cur,
- struct extent_info *front)
-{
- return __is_extent_mergeable(cur, front);
-}
-
-extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
-static inline void __try_update_largest_extent(struct extent_tree *et,
- struct extent_node *en)
-{
- if (en->ei.len > et->largest.len) {
- et->largest = en->ei;
- et->largest_updated = true;
- }
-}
-
/*
* For free nid management
*/
@@ -1062,9 +1069,6 @@ struct f2fs_sm_info {
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
- /* for batched trimming */
- unsigned int trim_sections; /* # of sections to trim */
-
struct list_head sit_entry_set; /* sit entry set list */
unsigned int ipu_policy; /* in-place-update policy */
@@ -1318,6 +1322,7 @@ enum {
MAX_TIME,
};
+/* Note: keep this enum in sync with the gc_mode_names array */
enum {
GC_NORMAL,
GC_IDLE_CB,
@@ -1668,14 +1673,12 @@ struct f2fs_sb_info {
struct mutex flush_lock; /* for flush exclusion */
/* for extent tree cache */
- struct radix_tree_root extent_tree_root;/* cache extent cache entries */
- struct mutex extent_tree_lock; /* locking extent radix tree */
- struct list_head extent_list; /* lru list for shrinker */
- spinlock_t extent_lock; /* locking extent lru list */
- atomic_t total_ext_tree; /* extent tree count */
- struct list_head zombie_list; /* extent zombie tree list */
- atomic_t total_zombie_tree; /* extent zombie tree count */
- atomic_t total_ext_node; /* extent info count */
+ struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
+ atomic64_t allocated_data_blocks; /* for block age extent_cache */
+
+ /* The thresholds used for hot and warm data separation */
+ unsigned int hot_data_age_threshold;
+ unsigned int warm_data_age_threshold;
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -1693,7 +1696,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
int dir_level; /* directory level */
- int readdir_ra; /* readahead inode in readdir */
+ bool readdir_ra; /* readahead inode in readdir */
u64 max_io_bytes; /* max io bytes to merge IOs */
block_t user_block_count; /* # of user blocks */
@@ -1734,8 +1737,9 @@ struct f2fs_sb_info {
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
- spinlock_t gc_urgent_high_lock;
- unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */
+ spinlock_t gc_remaining_trials_lock;
+ /* remaining trial count for GC_URGENT_* and GC_IDLE_* */
+ unsigned int gc_remaining_trials;
/* for skip statistic */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
@@ -1759,10 +1763,14 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- atomic64_t total_hit_ext; /* # of lookup extent cache */
- atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
- atomic64_t read_hit_largest; /* # of hit largest extent node */
- atomic64_t read_hit_cached; /* # of hit cached extent node */
+ /* # of lookup extent cache */
+ atomic64_t total_hit_ext[NR_EXTENT_CACHES];
+ /* # of hit rbtree extent node */
+ atomic64_t read_hit_rbtree[NR_EXTENT_CACHES];
+ /* # of hit cached extent node */
+ atomic64_t read_hit_cached[NR_EXTENT_CACHES];
+ /* # of hit largest extent node in read extent cache */
+ atomic64_t read_hit_largest;
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
@@ -2576,6 +2584,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
}
+extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
@@ -2974,7 +2983,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/* Flags that should be inherited by new inodes from their parent. */
#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
- F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL)
+ F2FS_CASEFOLD_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
@@ -3072,6 +3081,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode,
set_inode_flag(inode, FI_AUTO_RECOVER);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode);
+
static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
{
bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
@@ -3081,6 +3092,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
+
+ if (f2fs_is_atomic_file(inode))
+ return;
+
f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
@@ -3796,8 +3811,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write);
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs);
struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
bool for_write);
struct page *f2fs_get_new_data_page(struct inode *inode,
@@ -3856,9 +3872,19 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- unsigned long long hit_largest, hit_cached, hit_rbtree;
- unsigned long long hit_total, total_ext;
- int ext_tree, zombie_tree, ext_node;
+ unsigned long long hit_cached[NR_EXTENT_CACHES];
+ unsigned long long hit_rbtree[NR_EXTENT_CACHES];
+ unsigned long long total_ext[NR_EXTENT_CACHES];
+ unsigned long long hit_total[NR_EXTENT_CACHES];
+ int ext_tree[NR_EXTENT_CACHES];
+ int zombie_tree[NR_EXTENT_CACHES];
+ int ext_node[NR_EXTENT_CACHES];
+ /* to count memory footprint */
+ unsigned long long ext_mem[NR_EXTENT_CACHES];
+ /* for read extent cache */
+ unsigned long long hit_largest;
+ /* for block age extent cache */
+ unsigned long long allocated_data_blocks;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
@@ -3917,10 +3943,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
-#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type]))
+#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type]))
#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
+#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type]))
#define stat_inc_inline_xattr(inode) \
do { \
if (f2fs_has_inline_xattr(inode)) \
@@ -4043,10 +4069,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
-#define stat_inc_total_hit(sbi) do { } while (0)
-#define stat_inc_rbtree_node_hit(sbi) do { } while (0)
+#define stat_inc_total_hit(sbi, type) do { } while (0)
+#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0)
#define stat_inc_largest_node_hit(sbi) do { } while (0)
-#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi, type) do { } while (0)
#define stat_inc_inline_xattr(inode) do { } while (0)
#define stat_dec_inline_xattr(inode) do { } while (0)
#define stat_inc_inline_inode(inode) do { } while (0)
@@ -4152,20 +4178,34 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
struct rb_root_cached *root, bool check_key);
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage);
+void f2fs_init_extent_tree(struct inode *inode);
void f2fs_drop_extent_tree(struct inode *inode);
-unsigned int f2fs_destroy_extent_node(struct inode *inode);
+void f2fs_destroy_extent_node(struct inode *inode);
void f2fs_destroy_extent_tree(struct inode *inode);
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei);
-void f2fs_update_extent_cache(struct dnode_of_data *dn);
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len);
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_extent_cache(void);
void f2fs_destroy_extent_cache(void);
+/* read extent cache ops */
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage);
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len);
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
+/* block age extent cache ops */
+void f2fs_init_age_extent_tree(struct inode *inode);
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len);
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
/*
* sysfs.c
*/
@@ -4235,9 +4275,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len);
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
@@ -4318,9 +4358,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
-static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len) { }
+static inline void f2fs_update_read_extent_tree_range_compressed(
+ struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len) { }
#endif
static inline int set_compress_context(struct inode *inode)
@@ -4371,7 +4412,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
-static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
+static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
{ \
return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
}
@@ -4391,26 +4432,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
F2FS_FEATURE_FUNCS(readonly, RO);
-static inline bool f2fs_may_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, EXTENT_CACHE) ||
- is_inode_flag_set(inode, FI_NO_EXTENT) ||
- (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi)))
- return false;
-
- /*
- * for recovered files during mount do not create extents
- * if shrinker is not registered.
- */
- if (list_empty(&sbi->s_list))
- return false;
-
- return S_ISREG(inode->i_mode);
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
block_t blkaddr)
@@ -4563,6 +4584,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
}
}
+static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
+{
+ return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 82cda1258227..ecbc8c135b49 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + base + ofs;
- /* Assumption: truncateion starts with cluster */
+ /* Assumption: truncation starts with cluster */
for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
block_t blkaddr = le32_to_cpu(*addr);
@@ -618,7 +618,8 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
*/
fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
dn->inode) + ofs;
- f2fs_update_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_age_extent_cache_range(dn, fofs, nr_free);
dec_valid_block_count(sbi, dn->inode, nr_free);
}
dn->ofs_in_node = ofs;
@@ -1025,7 +1026,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
__setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, dentry, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1046,7 +1047,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const struct inode_operations f2fs_file_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
@@ -1496,7 +1497,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
f2fs_set_data_blkaddr(dn);
}
- f2fs_update_extent_cache_range(dn, start, 0, index - start);
+ f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
return ret;
}
@@ -1915,6 +1916,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
} else {
+ /* try to convert inline_data to support compression */
+ int err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
if (!f2fs_may_compress(inode))
return -EINVAL;
if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
@@ -2030,13 +2035,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
return put_user(inode->i_generation, (int __user *)arg);
}
-static int f2fs_ioc_start_atomic_write(struct file *filp)
+static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
{
struct inode *inode = file_inode(filp);
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inode *pinode;
+ loff_t isize;
int ret;
if (!inode_owner_or_capable(mnt_userns, inode))
@@ -2095,13 +2101,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
goto out;
}
- f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
+
+ f2fs_write_inode(inode, NULL);
stat_inc_atomic_inode(inode);
set_inode_flag(inode, FI_ATOMIC_FILE);
set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+
+ isize = i_size_read(inode);
+ fi->original_i_size = isize;
+ if (truncate) {
+ set_inode_flag(inode, FI_ATOMIC_REPLACE);
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, 0);
+ isize = 0;
+ }
+ f2fs_i_size_write(fi->cow_inode, isize);
+
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
f2fs_update_time(sbi, REQ_TIME);
@@ -2133,16 +2151,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
ret = f2fs_commit_atomic_write(inode);
- if (ret)
- goto unlock_out;
-
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret)
- f2fs_abort_atomic_write(inode, false);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+
+ f2fs_abort_atomic_write(inode, ret);
} else {
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-unlock_out:
+
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
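
Taken together with F2FS_IOC_COMMIT_ATOMIC_WRITE, the new ioctl lets userspace replace a file's contents as one unit. A hedged usage sketch; it assumes the F2FS_IOC_* macros are visible from the f2fs uapi header in a tree that carries this patch, and keeps error handling minimal:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/f2fs.h>	/* assumed to provide the F2FS_IOC_* macros */

/* replace path's contents with buf[0..len) as a single atomic unit */
static int replace_file(const char *path, const void *buf, size_t len)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;

	/* start: the old contents are dropped from the staged view */
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_REPLACE) < 0)
		goto fail;

	if (write(fd, buf, len) != (ssize_t)len)
		goto fail;

	/* commit persists and syncs the new contents as one unit */
	if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
		goto fail;

	return close(fd);
fail:
	close(fd);	/* releasing the file also aborts an uncommitted write */
	return -1;
}
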
@@ -2543,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
struct f2fs_map_blocks map = { .m_next_extent = NULL,
.m_seg_type = NO_CHECK_TYPE,
.m_may_create = false };
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {};
pgoff_t pg_start, pg_end, next_pgofs;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
@@ -2575,7 +2591,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
* lookup mapping info in extent cache, skip defragmenting if physical
* block addresses are continuous.
*/
- if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) {
if (ei.fofs + ei.len >= pg_end)
goto out;
}
@@ -4131,7 +4147,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FS_IOC_GETVERSION:
return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
- return f2fs_ioc_start_atomic_write(filp);
+ return f2fs_ioc_start_atomic_write(filp, false);
+ case F2FS_IOC_START_ATOMIC_REPLACE:
+ return f2fs_ioc_start_atomic_write(filp, true);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_ABORT_ATOMIC_WRITE:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 4546e01b2ee0..6e2cae3d2e71 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -96,16 +96,6 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (sbi->gc_mode == GC_URGENT_HIGH) {
- spin_lock(&sbi->gc_urgent_high_lock);
- if (sbi->gc_urgent_high_remaining) {
- sbi->gc_urgent_high_remaining--;
- if (!sbi->gc_urgent_high_remaining)
- sbi->gc_mode = GC_NORMAL;
- }
- spin_unlock(&sbi->gc_urgent_high_lock);
- }
-
if (sbi->gc_mode == GC_URGENT_HIGH ||
sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
@@ -151,6 +141,10 @@ do_gc:
/* don't bother wait_ms by foreground gc */
if (!foreground)
wait_ms = gc_th->no_gc_sleep_time;
+ } else {
+ /* reset wait_ms to default sleep time */
+ if (wait_ms == gc_th->no_gc_sleep_time)
+ wait_ms = gc_th->min_sleep_time;
}
if (foreground)
@@ -162,6 +156,15 @@ do_gc:
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi, true);
next:
+ if (sbi->gc_mode != GC_NORMAL) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ if (sbi->gc_remaining_trials) {
+ sbi->gc_remaining_trials--;
+ if (!sbi->gc_remaining_trials)
+ sbi->gc_mode = GC_NORMAL;
+ }
+ spin_unlock(&sbi->gc_remaining_trials_lock);
+ }
sb_end_write(sbi->sb);
} while (!kthread_should_stop());
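
The countdown now lives on the common path, so any non-normal GC mode falls back to GC_NORMAL once the remaining trials hit zero. A userspace sketch of the same decrement-under-lock pattern (a pthread mutex standing in for the spinlock; names are illustrative):

#include <pthread.h>

/* illustrative stand-ins for the sbi fields used above */
static pthread_mutex_t trials_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int remaining_trials;
static int gc_mode;			/* 0 stands in for GC_NORMAL */

/* one GC pass done: burn a trial, and drop back to normal mode at zero */
static void consume_gc_trial(void)
{
	pthread_mutex_lock(&trials_lock);
	if (remaining_trials) {
		remaining_trials--;
		if (!remaining_trials)
			gc_mode = 0;	/* GC_NORMAL */
	}
	pthread_mutex_unlock(&trials_lock);
}
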
@@ -172,13 +175,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
- int err = 0;
gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th) {
- err = -ENOMEM;
- goto out;
- }
+ if (!gc_th)
+ return -ENOMEM;
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
@@ -193,12 +193,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
- err = PTR_ERR(gc_th->f2fs_gc_task);
+ int err = PTR_ERR(gc_th->f2fs_gc_task);
+
kfree(gc_th);
sbi->gc_thread = NULL;
+ return err;
}
-out:
- return err;
+
+ return 0;
}
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -282,7 +284,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
/* let's select beginning hot/small space first in no_heap mode*/
if (f2fs_need_rand_seg(sbi))
- p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
else if (test_opt(sbi, NOHEAP) &&
(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
p->offset = 0;
@@ -1079,7 +1081,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node, max_addrs;
+ unsigned int ofs_in_node, max_addrs, base;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -1105,11 +1107,18 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
- max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE :
- DEF_ADDRS_PER_BLOCK;
- if (ofs_in_node >= max_addrs) {
- f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u",
- ofs_in_node, dni->ino, dni->nid, max_addrs);
+ if (IS_INODE(node_page)) {
+ base = offset_in_addr(F2FS_INODE(node_page));
+ max_addrs = DEF_ADDRS_PER_INODE;
+ } else {
+ base = 0;
+ max_addrs = DEF_ADDRS_PER_BLOCK;
+ }
+
+ if (base + ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
+ base, ofs_in_node, max_addrs, dni->ino, dni->nid);
+ f2fs_put_page(node_page, 1);
return false;
}
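
The reworked check accounts for the extra-attribute area: in an inode, block addresses start offset_in_addr() slots past the beginning, so the bound must cover base + ofs_in_node. A standalone sketch with assumed layout constants (the real values come from f2fs.h):

#include <assert.h>
#include <stdbool.h>

/* illustrative constants; the on-disk layout defines the real ones */
#define DEF_ADDRS_PER_INODE 923
#define DEF_ADDRS_PER_BLOCK 1018

/* is the summary's ofs_in_node consistent with this node's address space? */
static bool blkaddr_offset_ok(bool is_inode, unsigned base, unsigned ofs)
{
	unsigned max = is_inode ? DEF_ADDRS_PER_INODE : DEF_ADDRS_PER_BLOCK;

	if (!is_inode)
		base = 0;	/* only inodes carry an extra-attr base */
	return base + ofs < max;
}

int main(void)
{
	/* with 5 reserved slots, an offset that used to pass now fails */
	assert(blkaddr_offset_ok(true, 0, 920));
	assert(!blkaddr_offset_ok(true, 5, 920));
	return 0;
}
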
@@ -1141,7 +1150,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {0, };
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1159,7 +1168,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (!page)
return -ENOMEM;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
@@ -1563,8 +1572,8 @@ next_step:
continue;
}
- data_page = f2fs_get_read_data_page(inode,
- start_bidx, REQ_RAHEAD, true);
+ data_page = f2fs_get_read_data_page(inode, start_bidx,
+ REQ_RAHEAD, true, NULL);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
@@ -1744,8 +1753,9 @@ freed:
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- if (__is_large_section(sbi) && segno + 1 < end_segno)
- sbi->next_victim_seg[gc_type] = segno + 1;
+ if (__is_large_section(sbi))
+ sbi->next_victim_seg[gc_type] =
+ (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO;
skip:
f2fs_put_page(sum_page, 0);
}
@@ -1898,9 +1908,7 @@ int __init f2fs_create_garbage_collection_cache(void)
{
victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
sizeof(struct victim_entry));
- if (!victim_entry_slab)
- return -ENOMEM;
- return 0;
+ return victim_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_garbage_collection_cache(void)
@@ -2133,8 +2141,6 @@ out_unlock:
if (err)
return err;
- set_sbi_flag(sbi, SBI_IS_RESIZEFS);
-
freeze_super(sbi->sb);
f2fs_down_write(&sbi->gc_lock);
f2fs_down_write(&sbi->cp_global_sem);
@@ -2150,6 +2156,7 @@ out_unlock:
if (err)
goto out_err;
+ set_sbi_flag(sbi, SBI_IS_RESIZEFS);
err = free_segment_range(sbi, secs, false);
if (err)
goto recover_out;
@@ -2173,6 +2180,7 @@ out_unlock:
f2fs_commit_super(sbi, false);
}
recover_out:
+ clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
@@ -2185,6 +2193,5 @@ out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
- clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9f0d3864d9f1..ff6cf66ed46b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (fi->extent_tree) {
- struct extent_info *ei = &fi->extent_tree->largest;
+ if (fi->extent_tree[EX_READ]) {
+ struct extent_info *ei = &fi->extent_tree[EX_READ]->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, node_page);
-
get_inline_info(inode, ri);
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
@@ -479,6 +477,11 @@ static int do_read_inode(struct inode *inode)
}
init_idisk_time(inode);
+
+ /* Need all the flag bits */
+ f2fs_init_read_extent_tree(inode, node_page);
+ f2fs_init_age_extent_tree(inode);
+
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -607,7 +610,7 @@ retry:
void f2fs_update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_page_dirty(node_page);
@@ -621,12 +624,15 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_uid = cpu_to_le32(i_uid_read(inode));
ri->i_gid = cpu_to_le32(i_gid_read(inode));
ri->i_links = cpu_to_le32(inode->i_nlink);
- ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
+ if (!f2fs_is_atomic_file(inode) ||
+ is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
+ ri->i_size = cpu_to_le64(i_size_read(inode));
+
if (et) {
read_lock(&et->lock);
- set_raw_extent(&et->largest, &ri->i_ext);
+ set_raw_read_extent(&et->largest, &ri->i_ext);
read_unlock(&et->lock);
} else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
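
The f2fs_update_inode() hunk gates the on-disk i_size behind the atomic-write state: while an atomic batch is in flight and not yet committed, the raw inode keeps the last committed size, so a crash rolls back cleanly. A hedged userspace sketch of that guard; the struct and field names below are illustrative, not the kernel's:

#include <stdbool.h>
#include <stdint.h>

struct sketch_inode {
	uint64_t i_size;		/* in-memory size */
	uint64_t disk_size;		/* what the raw inode currently holds */
	bool     atomic_file;		/* atomic write in progress */
	bool     atomic_committed;	/* batch has committed */
};

static void update_raw_size(struct sketch_inode *inode)
{
	if (!inode->atomic_file || inode->atomic_committed)
		inode->disk_size = inode->i_size;
	/* else: leave disk_size untouched so a crash rolls back */
}
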
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a389772fd212..6032589099ce 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,137 +22,6 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
- struct inode *dir, umode_t mode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- nid_t ino;
- struct inode *inode;
- bool nid_free = false;
- bool encrypt = false;
- int xattr_size = 0;
- int err;
-
- inode = new_inode(dir->i_sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- if (!f2fs_alloc_nid(sbi, &ino)) {
- err = -ENOSPC;
- goto fail;
- }
-
- nid_free = true;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
-
- inode->i_ino = ino;
- inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = get_random_u32();
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_current_depth = 1;
-
- err = insert_inode_locked(inode);
- if (err) {
- err = -EINVAL;
- goto fail;
- }
-
- if (f2fs_sb_has_project_quota(sbi) &&
- (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
- F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
- else
- F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
- F2FS_DEF_PROJID);
-
- err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
- if (err)
- goto fail_drop;
-
- err = f2fs_dquot_initialize(inode);
- if (err)
- goto fail_drop;
-
- set_inode_flag(inode, FI_NEW_INODE);
-
- if (encrypt)
- f2fs_set_encrypted_inode(inode);
-
- if (f2fs_sb_has_extra_attr(sbi)) {
- set_inode_flag(inode, FI_EXTRA_ATTR);
- F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
- }
-
- if (test_opt(sbi, INLINE_XATTR))
- set_inode_flag(inode, FI_INLINE_XATTR);
-
- if (f2fs_may_inline_dentry(inode))
- set_inode_flag(inode, FI_INLINE_DENTRY);
-
- if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
- f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
- if (f2fs_has_inline_xattr(inode))
- xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
- /* Otherwise, will be 0 */
- } else if (f2fs_has_inline_xattr(inode) ||
- f2fs_has_inline_dentry(inode)) {
- xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- }
- F2FS_I(inode)->i_inline_xattr_size = xattr_size;
-
- f2fs_init_extent_tree(inode, NULL);
-
- F2FS_I(inode)->i_flags =
- f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
-
- if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
- set_inode_flag(inode, FI_PROJ_INHERIT);
-
- if (f2fs_sb_has_compression(sbi)) {
- /* Inherit the compression flag in directory */
- if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) &&
- f2fs_may_compress(inode))
- set_compress_context(inode);
- }
-
- /* Should enable inline_data after compression set */
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
-
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
- f2fs_set_inode_flags(inode);
-
- trace_f2fs_new_inode(inode, 0);
- return inode;
-
-fail:
- trace_f2fs_new_inode(inode, err);
- make_bad_inode(inode);
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- iput(inode);
- return ERR_PTR(err);
-fail_drop:
- trace_f2fs_new_inode(inode, err);
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- clear_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- return ERR_PTR(err);
-}
-
static inline int is_extension_exist(const unsigned char *s, const char *sub,
bool tmp_ext)
{
@@ -187,36 +56,6 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub,
return 0;
}
-/*
- * Set file's temperature for hot/cold data separation
- */
-static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
-{
- __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- int i, cold_count, hot_count;
-
- f2fs_down_read(&sbi->sb_lock);
-
- cold_count = le32_to_cpu(sbi->raw_super->extension_count);
- hot_count = sbi->raw_super->hot_ext_count;
-
- for (i = 0; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], true))
- break;
- }
-
- f2fs_up_read(&sbi->sb_lock);
-
- if (i == cold_count + hot_count)
- return;
-
- if (i < cold_count)
- file_set_cold(inode);
- else
- file_set_hot(inode);
-}
-
int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set)
{
@@ -283,56 +122,215 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
return 0;
}
-static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
+static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir,
+ struct inode *inode, const unsigned char *name)
{
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
+ unsigned char (*noext)[F2FS_EXTENSION_LEN] =
+ F2FS_OPTION(sbi).noextensions;
unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
int i, cold_count, hot_count;
- if (!f2fs_sb_has_compression(sbi) ||
- F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
- !f2fs_may_compress(inode) ||
- (!ext_cnt && !noext_cnt))
+ if (!f2fs_sb_has_compression(sbi))
return;
- f2fs_down_read(&sbi->sb_lock);
+ if (S_ISDIR(inode->i_mode))
+ goto inherit_comp;
+ /* This name comes only from normal files. */
+ if (!name)
+ return;
+
+ /* Don't compress hot files. */
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
+ for (i = cold_count; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], false))
+ break;
+ f2fs_up_read(&sbi->sb_lock);
+ if (i < (cold_count + hot_count))
+ return;
+
+ /* Don't compress disallowed extensions. */
+ for (i = 0; i < noext_cnt; i++)
+ if (is_extension_exist(name, noext[i], false))
+ return;
- for (i = cold_count; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], false)) {
- f2fs_up_read(&sbi->sb_lock);
+ /* Compress files with a wanted extension. */
+ for (i = 0; i < ext_cnt; i++) {
+ if (is_extension_exist(name, ext[i], false)) {
+ set_compress_context(inode);
return;
}
}
+inherit_comp:
+ /* Inherit the {no-}compression flag in directory */
+ if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) {
+ F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) {
+ set_compress_context(inode);
+ }
+}
+
+/*
+ * Set file's temperature for hot/cold data separation
+ */
+static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
+ const unsigned char *name)
+{
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int i, cold_count, hot_count;
+ f2fs_down_read(&sbi->sb_lock);
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
+ for (i = 0; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], true))
+ break;
f2fs_up_read(&sbi->sb_lock);
- for (i = 0; i < noext_cnt; i++) {
- if (is_extension_exist(name, noext[i], false)) {
- f2fs_disable_compressed_file(inode);
- return;
- }
+ if (i == cold_count + hot_count)
+ return;
+
+ if (i < cold_count)
+ file_set_cold(inode);
+ else
+ file_set_hot(inode);
+}
+
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode,
+ const char *name)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ bool encrypt = false;
+ int xattr_size = 0;
+ int err;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!f2fs_alloc_nid(sbi, &ino)) {
+ err = -ENOSPC;
+ goto fail;
}
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return;
+ nid_free = true;
- for (i = 0; i < ext_cnt; i++) {
- if (!is_extension_exist(name, ext[i], false))
- continue;
+ inode_init_owner(mnt_userns, inode, dir, mode);
- /* Do not use inline_data with compression */
- stat_dec_inline_inode(inode);
- clear_inode_flag(inode, FI_INLINE_DATA);
- set_compress_context(inode);
- return;
+ inode->i_ino = ino;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ F2FS_I(inode)->i_crtime = inode->i_mtime;
+ inode->i_generation = get_random_u32();
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_current_depth = 1;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (f2fs_sb_has_project_quota(sbi) &&
+ (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
+ F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
+ else
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
+ F2FS_DEF_PROJID);
+
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
+ err = f2fs_dquot_initialize(inode);
+ if (err)
+ goto fail_drop;
+
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (encrypt)
+ f2fs_set_encrypted_inode(inode);
+
+ if (f2fs_sb_has_extra_attr(sbi)) {
+ set_inode_flag(inode, FI_EXTRA_ATTR);
+ F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
+ }
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
+
+ if (f2fs_may_inline_dentry(inode))
+ set_inode_flag(inode, FI_INLINE_DENTRY);
+
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+ if (f2fs_has_inline_xattr(inode))
+ xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
+ /* Otherwise, will be 0 */
+ } else if (f2fs_has_inline_xattr(inode) ||
+ f2fs_has_inline_dentry(inode)) {
+ xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
}
+ F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
+ F2FS_I(inode)->i_flags =
+ f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
+
+ if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ /* Check compression first. */
+ set_compress_new_inode(sbi, dir, inode, name);
+
+ /* Should enable inline_data after compression is set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ set_file_temperature(sbi, inode, name);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
+ f2fs_set_inode_flags(inode);
+
+ f2fs_init_extent_tree(inode);
+
+ trace_f2fs_new_inode(inode, 0);
+ return inode;
+
+fail:
+ trace_f2fs_new_inode(inode, err);
+ make_bad_inode(inode);
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ iput(inode);
+ return ERR_PTR(err);
+fail_drop:
+ trace_f2fs_new_inode(inode, err);
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
@@ -352,15 +350,10 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_file_temperature(sbi, inode, dentry->d_name.name);
-
- set_compress_inode(sbi, inode, dentry->d_name.name);
-
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -632,6 +625,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
f2fs_delete_entry(de, page, dir, inode);
+ f2fs_unlock_op(sbi);
+
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want to avoid
@@ -642,8 +637,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- f2fs_unlock_op(sbi);
-
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
@@ -689,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -760,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -817,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -856,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -1379,7 +1372,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.tmpfile = f2fs_tmpfile,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
.fiemap = f2fs_fiemap,
@@ -1397,7 +1390,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
- .get_acl = f2fs_get_acl,
+ .get_inode_acl = f2fs_get_acl,
.set_acl = f2fs_set_acl,
.listxattr = f2fs_listxattr,
};
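
The reworked set_compress_new_inode() checks a regular file's name in a fixed order: a hot extension vetoes compression, then the nocompress list, then the compress list; anything else falls through to inheriting the directory's flags. A standalone sketch of that ordering, with made-up extension lists purely for illustration:

#include <stdbool.h>
#include <string.h>

static bool has_ext(const char *name, const char *ext)
{
	const char *dot = strrchr(name, '.');

	return dot && !strcmp(dot + 1, ext);
}

static bool should_compress(const char *name)
{
	static const char *hot[]   = { "db", "tmp" };	/* illustrative */
	static const char *noext[] = { "jpg", "mp4" };
	static const char *ext[]   = { "log", "txt" };
	size_t i;

	for (i = 0; i < sizeof(hot) / sizeof(hot[0]); i++)
		if (has_ext(name, hot[i]))
			return false;	/* don't compress hot files */
	for (i = 0; i < sizeof(noext) / sizeof(noext[0]); i++)
		if (has_ext(name, noext[i]))
			return false;	/* explicitly excluded */
	for (i = 0; i < sizeof(ext) / sizeof(ext[0]); i++)
		if (has_ext(name, ext[i]))
			return true;	/* wanted extension */
	return false;			/* fall back to directory inheritance */
}
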
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 983572f23896..dde4c0458704 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
avail_ram = val.totalram - val.totalhigh;
/*
- * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ * give 25%, 25%, 50%, 50%, 25%, 25% memory for each component, respectively
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->nid_cnt[FREE_NID] *
@@ -85,12 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry);
mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == EXTENT_CACHE) {
- mem_size = (atomic_read(&sbi->total_ext_tree) *
+ } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) {
+ enum extent_type etype = type == READ_EXTENT_CACHE ?
+ EX_READ : EX_BLOCK_AGE;
+ struct extent_tree_info *eti = &sbi->extent_tree[etype];
+
+ mem_size = (atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree) +
- atomic_read(&sbi->total_ext_node) *
+ atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
- res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -859,7 +863,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
- f2fs_update_extent_tree_range_compressed(dn->inode,
+ f2fs_update_read_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
@@ -1360,8 +1364,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
return err;
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
- if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
- is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
+ if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) {
ClearPageUptodate(page);
return -ENOENT;
}
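
In f2fs_available_free_memory() above, each extent cache is now held to a quarter of the RAM-threshold slice (>> 2) where the single cache used to get half (>> 1), so the combined budget of the read and block-age caches stays the same. A small sketch of the check; the parameters are illustrative:

#include <stdbool.h>

/* mem_pages must stay under ram_thresh percent of available RAM,
 * quartered (>> 2) now that two extent caches share the budget. */
static bool extent_cache_within_budget(unsigned long mem_pages,
				       unsigned long avail_ram_pages,
				       unsigned int ram_thresh)
{
	return mem_pages < ((avail_ram_pages * ram_thresh / 100) >> 2);
}
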
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3c09cae058b0..99454d46a939 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -146,7 +146,8 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
- EXTENT_CACHE, /* indicates extent cache */
+ READ_EXTENT_CACHE, /* indicates read extent cache */
+ AGE_EXTENT_CACHE, /* indicates age extent cache */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dea95b48b647..77fd453949b1 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void)
{
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
- return 0;
+ return fsync_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_recovery_cache(void)
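
The same create-or-ENOMEM collapse appears elsewhere in this series (gc.c, the casefold cache and init_inodecache in super.c). A minimal userspace analogue of the pattern, with malloc() standing in for f2fs_kmem_cache_create():

#include <errno.h>
#include <stdlib.h>

static void *fsync_entry_cache;

/* Allocate the cache once; fold the NULL check into the return. */
static int create_recovery_cache(void)
{
	fsync_entry_cache = malloc(128);	/* stand-in allocation */
	return fsync_entry_cache ? 0 : -ENOMEM;
}
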
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index acf3d3fa4363..ae3c4e5474ef 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,14 +192,19 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
if (!f2fs_is_atomic_file(inode))
return;
- if (clean)
- truncate_inode_pages_final(inode->i_mapping);
clear_inode_flag(fi->cow_inode, FI_COW_FILE);
iput(fi->cow_inode);
fi->cow_inode = NULL;
release_atomic_write_cnt(inode);
+ clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ clear_inode_flag(inode, FI_ATOMIC_REPLACE);
clear_inode_flag(inode, FI_ATOMIC_FILE);
stat_dec_atomic_inode(inode);
+
+ if (clean) {
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, fi->original_i_size);
+ }
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -257,14 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head,
bool revoke)
{
struct revoke_entry *cur, *tmp;
+ bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
list_for_each_entry_safe(cur, tmp, head, list) {
if (revoke)
__replace_atomic_write_block(inode, cur->index,
cur->old_addr, NULL, true);
+
list_del(&cur->list);
kmem_cache_free(revoke_entry_slab, cur);
}
+
+ if (!revoke && truncate)
+ f2fs_do_truncate_blocks(inode, 0, false);
}
static int __f2fs_commit_atomic_write(struct inode *inode)
@@ -335,10 +345,12 @@ next:
}
out:
- if (ret)
+ if (ret) {
sbi->revoked_atomic_block += fi->atomic_write_cnt;
- else
+ } else {
sbi->committed_atomic_block += fi->atomic_write_cnt;
+ set_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ }
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
@@ -437,8 +449,14 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
return;
/* try to shrink extent cache when there is not enough memory */
- if (!f2fs_available_free_memory(sbi, EXTENT_CACHE))
- f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE))
+ f2fs_shrink_read_extent_tree(sbi,
+ READ_EXTENT_CACHE_SHRINK_NUMBER);
+
+ /* try to shrink age extent cache when there is not enough memory */
+ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE))
+ f2fs_shrink_age_extent_tree(sbi,
+ AGE_EXTENT_CACHE_SHRINK_NUMBER);
/* check the # of cached NAT entries */
if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
@@ -620,12 +638,11 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct flush_cmd_control *fcc;
- int err = 0;
if (SM_I(sbi)->fcc_info) {
fcc = SM_I(sbi)->fcc_info;
if (fcc->f2fs_issue_flush)
- return err;
+ return 0;
goto init_thread;
}
@@ -638,19 +655,19 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
if (!test_opt(sbi, FLUSH_MERGE))
- return err;
+ return 0;
init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
- err = PTR_ERR(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->fcc_info = NULL;
+ int err = PTR_ERR(fcc->f2fs_issue_flush);
+
+ fcc->f2fs_issue_flush = NULL;
return err;
}
- return err;
+ return 0;
}
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
@@ -856,7 +873,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
}
mutex_unlock(&dirty_i->seglist_lock);
- unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE];
+ unusable = max(holes[DATA], holes[NODE]);
if (unusable > ovp_holes)
return unusable - ovp_holes;
return 0;
@@ -1052,8 +1069,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
- if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
- dpolicy->granularity = 1;
+ if (utilization(sbi) > dcc->discard_urgent_util) {
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
dcc->min_discard_issue_time;
@@ -1068,7 +1085,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
- dpolicy->granularity = 1;
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
dpolicy->timeout = true;
}
}
@@ -1126,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
if (time_to_inject(sbi, FAULT_DISCARD)) {
f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
- goto submit;
- }
- err = __blkdev_issue_discard(bdev,
+ } else {
+ err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
GFP_NOFS, &bio);
-submit:
+ }
if (err) {
spin_lock_irqsave(&dc->lock, flags);
if (dc->state == D_PARTIAL)
@@ -1170,7 +1186,7 @@ submit:
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1);
+ f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE);
lstart += len;
start += len;
@@ -1342,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
}
}
-static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+static void __queue_discard_cmd(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
block_t lblkstart = blkstart;
if (!f2fs_bdev_support_discard(bdev))
- return 0;
+ return;
trace_f2fs_queue_discard(bdev, blkstart, blklen);
@@ -1360,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
- return 0;
}
static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
@@ -1448,7 +1463,7 @@ retry:
if (i + 1 < dpolicy->granularity)
break;
- if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
+ if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered)
return __issue_discard_cmd_orderly(sbi, dpolicy);
pend_list = &dcc->pend_list[i];
@@ -1645,6 +1660,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
struct discard_policy dpolicy;
bool dropped;
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ return false;
+
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
__issue_discard_cmd(sbi, &dpolicy);
@@ -1669,6 +1687,11 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
+ wait_event_interruptible_timeout(*q,
+ kthread_should_stop() || freezing(current) ||
+ dcc->discard_wake,
+ msecs_to_jiffies(wait_ms));
+
if (sbi->gc_mode == GC_URGENT_HIGH ||
!f2fs_available_free_memory(sbi, DISCARD_CACHE))
__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
@@ -1676,14 +1699,6 @@ static int issue_discard_thread(void *data)
__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
dcc->discard_granularity);
- if (!atomic_read(&dcc->discard_cmd_cnt))
- wait_ms = dpolicy.max_interval;
-
- wait_event_interruptible_timeout(*q,
- kthread_should_stop() || freezing(current) ||
- dcc->discard_wake,
- msecs_to_jiffies(wait_ms));
-
if (dcc->discard_wake)
dcc->discard_wake = 0;
@@ -1697,12 +1712,11 @@ static int issue_discard_thread(void *data)
continue;
if (kthread_should_stop())
return 0;
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
+ !atomic_read(&dcc->discard_cmd_cnt)) {
wait_ms = dpolicy.max_interval;
continue;
}
- if (!atomic_read(&dcc->discard_cmd_cnt))
- continue;
sb_start_intwrite(sbi->sb);
@@ -1717,6 +1731,8 @@ static int issue_discard_thread(void *data)
} else {
wait_ms = dpolicy.max_interval;
}
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ wait_ms = dpolicy.max_interval;
sb_end_intwrite(sbi->sb);
@@ -1760,7 +1776,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
/* For conventional zones, use regular discard if supported */
- return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ return 0;
}
#endif
@@ -1771,7 +1788,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
- return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ return 0;
}
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -2025,8 +2043,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
- if (IS_ERR(dcc->f2fs_issue_discard))
+ if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
+ dcc->f2fs_issue_discard = NULL;
+ }
return err;
}
@@ -2046,6 +2066,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
+ dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
@@ -2066,6 +2087,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
+ dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2096,8 +2118,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
* Recovery can cache discard commands, so in error path of
* fill_super(), it needs to give a chance to handle them.
*/
- if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
@@ -2534,7 +2555,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
sanity_check_seg_type(sbi, seg_type);
if (f2fs_need_rand_seg(sbi))
- return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec);
+ return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is larger than 1, we need to keep original policy. */
if (__is_large_section(sbi))
@@ -2588,7 +2609,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
curseg->alloc_type = LFS;
if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
curseg->fragment_remained_chunk =
- prandom_u32_max(sbi->max_fragment_chunk) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
}
static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2625,9 +2646,9 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
/* To allocate block chunks in different sizes, use random number */
if (--seg->fragment_remained_chunk <= 0) {
seg->fragment_remained_chunk =
- prandom_u32_max(sbi->max_fragment_chunk) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_chunk);
seg->next_blkoff +=
- prandom_u32_max(sbi->max_fragment_hole) + 1;
+ get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
}
}
@@ -2642,7 +2663,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
+static void change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2650,9 +2671,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
struct f2fs_summary_block *sum_node;
struct page *sum_page;
- if (flush)
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
@@ -2691,7 +2710,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
} else {
/* allocate cold segment by default */
curseg->seg_type = CURSEG_COLD_DATA;
@@ -2835,31 +2854,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
return 0;
}
-/*
- * flush out current segment and replace it with new segment
- * This function should be returned with success, otherwise BUG
- */
-static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
- int type, bool force)
+static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (force)
- new_curseg(sbi, type, true);
- else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
- curseg->seg_type == CURSEG_WARM_NODE)
- new_curseg(sbi, type, false);
- else if (curseg->alloc_type == LFS &&
- is_next_segment_free(sbi, curseg, type) &&
- likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
- new_curseg(sbi, type, false);
- else if (f2fs_need_SSR(sbi) &&
- get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
- else
- new_curseg(sbi, type, false);
-
- stat_inc_seg_type(sbi, curseg);
+ if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
+ curseg->seg_type == CURSEG_WARM_NODE)
+ return true;
+ if (curseg->alloc_type == LFS &&
+ is_next_segment_free(sbi, curseg, type) &&
+ likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return true;
+ if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
+ return true;
+ return false;
}
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
@@ -2877,7 +2885,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
else
new_curseg(sbi, type, true);
@@ -2912,7 +2920,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
return;
alloc:
old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+ new_curseg(sbi, type, true);
+ stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
}
@@ -2943,10 +2952,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
-static const struct segment_allocation default_salloc_ops = {
- .allocate_segment = allocate_segment_by_default,
-};
-
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc)
{
@@ -3152,10 +3157,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio)
}
}
+static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_info ei = {};
+
+ if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) {
+ if (!ei.age)
+ return NO_CHECK_TYPE;
+ if (ei.age <= sbi->hot_data_age_threshold)
+ return CURSEG_HOT_DATA;
+ if (ei.age <= sbi->warm_data_age_threshold)
+ return CURSEG_WARM_DATA;
+ return CURSEG_COLD_DATA;
+ }
+ return NO_CHECK_TYPE;
+}
+
static int __get_segment_type_6(struct f2fs_io_info *fio)
{
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
+ int type;
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return CURSEG_COLD_DATA_PINNED;
@@ -3170,6 +3193,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
+
+ type = __get_age_segment_type(inode, fio->page->index);
+ if (type != NO_CHECK_TYPE)
+ return type;
+
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
f2fs_is_cow_file(inode))
@@ -3266,11 +3294,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
update_sit_entry(sbi, old_blkaddr, -1);
if (!__has_curseg_space(sbi, curseg)) {
- if (from_gc)
+ /*
+ * Flush out the current segment and replace it with a new one.
+ */
+ if (from_gc) {
get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
- else
- sit_i->s_ops->allocate_segment(sbi, type, false);
+ } else {
+ if (need_new_seg(sbi, type))
+ new_curseg(sbi, type, false);
+ else
+ change_curseg(sbi, type);
+ stat_inc_seg_type(sbi, curseg);
+ }
}
/*
* segment dirty status should be updated after segment allocation,
@@ -3280,6 +3316,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
+ if (IS_DATASEG(type))
+ atomic64_inc(&sbi->allocated_data_blocks);
+
up_write(&sit_i->sentry_lock);
if (page && IS_NODESEG(type)) {
@@ -3407,6 +3446,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn,
struct f2fs_summary sum;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
+ if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO)
+ f2fs_update_age_extent_cache(dn);
set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
@@ -3531,7 +3572,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3559,7 +3600,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = old_blkoff;
curseg->alloc_type = old_alloc_type;
@@ -4256,9 +4297,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
#endif
- /* init SIT information */
- sit_i->s_ops = &default_salloc_ops;
-
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
sit_i->written_valid_blocks = 0;
@@ -5099,11 +5137,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
init_f2fs_rwsem(&sm_info->curseg_lock);
- if (!f2fs_readonly(sbi->sb)) {
- err = f2fs_create_flush_cmd_control(sbi);
- if (err)
- return err;
- }
+ err = f2fs_create_flush_cmd_control(sbi);
+ if (err)
+ return err;
err = create_discard_cmd_control(sbi);
if (err)
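
__get_age_segment_type() above maps a cached block age onto a temperature: at or below the hot threshold the data is treated as hot, at or below the warm threshold as warm, otherwise cold, and a missing or zero age yields no opinion. A self-contained sketch of that classification; the enum and thresholds are illustrative, not the kernel defaults:

enum temp { TEMP_HOT, TEMP_WARM, TEMP_COLD, TEMP_NO_CHECK };

static enum temp classify_by_age(unsigned long long age,
				 unsigned long long hot_thresh,
				 unsigned long long warm_thresh)
{
	if (!age)
		return TEMP_NO_CHECK;	/* no age info cached */
	if (age <= hot_thresh)
		return TEMP_HOT;
	if (age <= warm_thresh)
		return TEMP_WARM;
	return TEMP_COLD;
}
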
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index be8f2d7d007b..3ad1b7b6fa94 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -222,10 +222,6 @@ struct sec_entry {
unsigned int valid_blocks; /* # of valid blocks in a section */
};
-struct segment_allocation {
- void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
-};
-
#define MAX_SKIP_GC_COUNT 16
struct revoke_entry {
@@ -235,8 +231,6 @@ struct revoke_entry {
};
struct sit_info {
- const struct segment_allocation *s_ops;
-
block_t sit_base_addr; /* start block address of SIT area */
block_t sit_blocks; /* # of blocks used by SIT area */
block_t written_valid_blocks; /* # of valid blocks in main area */
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index dd3c3c7a90ec..83d6fb97dcae 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
return count > 0 ? count : 0;
}
-static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi,
+ enum extent_type type)
{
- return atomic_read(&sbi->total_zombie_tree) +
- atomic_read(&sbi->total_ext_node);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+
+ return atomic_read(&eti->total_zombie_tree) +
+ atomic_read(&eti->total_ext_node);
}
unsigned long f2fs_shrink_count(struct shrinker *shrink,
@@ -53,8 +56,11 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
}
spin_unlock(&f2fs_list_lock);
- /* count extent cache entries */
- count += __count_extent_cache(sbi);
+ /* count read extent cache entries */
+ count += __count_extent_cache(sbi, EX_READ);
+
+ /* count block age extent cache entries */
+ count += __count_extent_cache(sbi, EX_BLOCK_AGE);
/* count clean nat cache entries */
count += __count_nat_entries(sbi);
@@ -100,7 +106,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
sbi->shrinker_run_no = run_no;
/* shrink extent cache entries */
- freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+ freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2);
+
+ /* shrink read extent cache entries */
+ freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2);
/* shrink clean nat cache entries */
if (freed < nr)
@@ -130,7 +139,9 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
{
- f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+ f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ));
+ f2fs_shrink_age_extent_tree(sbi,
+ __count_extent_cache(sbi, EX_BLOCK_AGE));
spin_lock(&f2fs_list_lock);
list_del_init(&sbi->s_list);
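
f2fs_shrink_scan() now splits its budget: a quarter of nr goes to the block-age tree and a quarter to the read tree before the NAT caches absorb the remainder. A sketch of the split, with the shrink callbacks passed in since the kernel helpers are out of scope here:

/* Spend a quarter of the reclaim budget on each extent tree and
 * report how much was actually freed; the caller tops up from the
 * NAT caches if freed < nr. */
static unsigned long shrink_extent_trees(unsigned long nr,
		unsigned long (*shrink_age)(unsigned long),
		unsigned long (*shrink_read)(unsigned long))
{
	unsigned long freed = 0;

	freed += shrink_age(nr >> 2);	/* block-age extent cache */
	freed += shrink_read(nr >> 2);	/* read extent cache */
	return freed;
}
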
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3834ead04620..1f812b9ce985 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_SLAB_ALLOC] = "slab alloc",
[FAULT_DQUOT_INIT] = "dquot initialize",
[FAULT_LOCK_OP] = "lock_op",
+ [FAULT_BLKADDR] = "invalid blkaddr",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -110,6 +111,7 @@ enum {
Opt_noinline_dentry,
Opt_flush_merge,
Opt_noflush_merge,
+ Opt_barrier,
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
@@ -161,6 +163,7 @@ enum {
Opt_nogc_merge,
Opt_discard_unit,
Opt_memory_mode,
+ Opt_age_extent_cache,
Opt_err,
};
@@ -186,6 +189,7 @@ static match_table_t f2fs_tokens = {
{Opt_noinline_dentry, "noinline_dentry"},
{Opt_flush_merge, "flush_merge"},
{Opt_noflush_merge, "noflush_merge"},
+ {Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
@@ -238,6 +242,7 @@ static match_table_t f2fs_tokens = {
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
{Opt_memory_mode, "memory=%s"},
+ {Opt_age_extent_cache, "age_extent_cache"},
{Opt_err, NULL},
};
@@ -285,9 +290,7 @@ static int __init f2fs_create_casefold_cache(void)
{
f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
F2FS_NAME_LEN);
- if (!f2fs_cf_name_slab)
- return -ENOMEM;
- return 0;
+ return f2fs_cf_name_slab ? 0 : -ENOMEM;
}
static void f2fs_destroy_casefold_cache(void)
@@ -806,14 +809,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
+ case Opt_barrier:
+ clear_opt(sbi, NOBARRIER);
+ break;
case Opt_fastboot:
set_opt(sbi, FASTBOOT);
break;
case Opt_extent_cache:
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noextent_cache:
- clear_opt(sbi, EXTENT_CACHE);
+ clear_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
@@ -1253,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_age_extent_cache:
+ set_opt(sbi, AGE_EXTENT_CACHE);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1347,6 +1356,11 @@ default_check:
return -EINVAL;
}
+ if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) {
+ f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode");
+ return -EINVAL;
+ }
+
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1567,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb)
/* be sure to wait for any on-going discard commands */
dropped = f2fs_issue_discard_timeout(sbi);
- if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
- !sbi->discard_blks && !dropped) {
+ if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) {
struct cp_control cpc = {
.reason = CP_UMOUNT | CP_TRIMMED,
};
@@ -1935,16 +1948,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",inline_dentry");
else
seq_puts(seq, ",noinline_dentry");
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
+ if (test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
+ else
+ seq_puts(seq, ",noflush_merge");
if (test_opt(sbi, NOBARRIER))
seq_puts(seq, ",nobarrier");
+ else
+ seq_puts(seq, ",barrier");
if (test_opt(sbi, FASTBOOT))
seq_puts(seq, ",fastboot");
- if (test_opt(sbi, EXTENT_CACHE))
+ if (test_opt(sbi, READ_EXTENT_CACHE))
seq_puts(seq, ",extent_cache");
else
seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, AGE_EXTENT_CACHE))
+ seq_puts(seq, ",age_extent_cache");
if (test_opt(sbi, DATA_FLUSH))
seq_puts(seq, ",data_flush");
@@ -2043,7 +2062,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <=
+ SMALL_VOLUME_SEGMENTS)
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ else
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
@@ -2059,13 +2082,14 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
set_opt(sbi, NOHEAP);
clear_opt(sbi, DISABLE_CHECKPOINT);
set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
- set_opt(sbi, FLUSH_MERGE);
+ if (!f2fs_is_readonly(sbi))
+ set_opt(sbi, FLUSH_MERGE);
if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi))
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi)) {
@@ -2200,14 +2224,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
- bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+ bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
+ bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool block_unit_discard = f2fs_block_unit_discard(sbi);
- struct discard_cmd_control *dcc;
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2290,11 +2314,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
/* disallow enable/disable extent_cache dynamically */
- if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) {
err = -EINVAL;
f2fs_warn(sbi, "switch extent_cache option is not allowed");
goto restore_opts;
}
+ /* disallow enable/disable age extent_cache dynamically */
+ if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch age_extent_cache option is not allowed");
+ goto restore_opts;
+ }
if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
err = -EINVAL;
@@ -2388,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_flush;
need_stop_discard = true;
} else {
- dcc = SM_I(sbi)->dcc_info;
f2fs_stop_discard_thread(sbi);
- if (atomic_read(&dcc->discard_cmd_cnt))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
need_restart_discard = true;
}
}
@@ -3616,7 +3644,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
- spin_lock_init(&sbi->gc_urgent_high_lock);
+ spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
sbi->dir_level = DEF_DIR_LEVEL;
@@ -4056,18 +4084,16 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
{
- struct f2fs_sm_info *sm_i = SM_I(sbi);
-
/* adjust parameters according to the volume size */
- if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) {
if (f2fs_block_unit_discard(sbi))
- sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ SM_I(sbi)->dcc_info->discard_granularity =
+ MIN_DISCARD_GRANULARITY;
+ SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE |
1 << F2FS_IPU_HONOR_OPU_WRITE;
}
- sbi->readdir_ra = 1;
+ sbi->readdir_ra = true;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -4095,6 +4121,24 @@ try_onemore:
sbi->sb = sb;
+ /* initialize locks within allocated memory */
+ init_f2fs_rwsem(&sbi->gc_lock);
+ mutex_init(&sbi->writepages);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
+ spin_lock_init(&sbi->stat_lock);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
+ init_waitqueue_head(&sbi->cp_wait);
+ spin_lock_init(&sbi->error_lock);
+
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
+ mutex_init(&sbi->flush_lock);
+
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
@@ -4118,6 +4162,8 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
+
/* precompute checksum seed for metadata */
if (f2fs_sb_has_inode_chksum(sbi))
sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
@@ -4174,23 +4220,14 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_f2fs_rwsem(&sbi->gc_lock);
- mutex_init(&sbi->writepages);
- init_f2fs_rwsem(&sbi->cp_global_sem);
- init_f2fs_rwsem(&sbi->node_write);
- init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
- spin_lock_init(&sbi->stat_lock);
err = f2fs_init_write_merge_io(sbi);
if (err)
goto free_bio_info;
- init_f2fs_rwsem(&sbi->cp_rwsem);
- init_f2fs_rwsem(&sbi->quota_sem);
- init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
err = f2fs_init_iostat(sbi);
@@ -4255,9 +4292,6 @@ try_onemore:
goto free_devices;
}
- spin_lock_init(&sbi->error_lock);
- memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
-
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -4271,12 +4305,6 @@ try_onemore:
limit_reserve_root(sbi);
adjust_unusable_cap_perc(sbi);
- for (i = 0; i < NR_INODE_TYPE; i++) {
- INIT_LIST_HEAD(&sbi->inode_list[i]);
- spin_lock_init(&sbi->inode_lock[i]);
- }
- mutex_init(&sbi->flush_lock);
-
f2fs_init_extent_cache_info(sbi);
f2fs_init_ino_entry_info(sbi);
@@ -4523,9 +4551,9 @@ free_nm:
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
- f2fs_destroy_post_read_wq(sbi);
stop_ckpt_thread:
f2fs_stop_ckpt_thread(sbi);
+ f2fs_destroy_post_read_wq(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);
@@ -4626,9 +4654,7 @@ static int __init init_inodecache(void)
f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
sizeof(struct f2fs_inode_info), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
- if (!f2fs_inode_cachep)
- return -ENOMEM;
- return 0;
+ return f2fs_inode_cachep ? 0 : -ENOMEM;
}
static void destroy_inodecache(void)
@@ -4693,7 +4719,7 @@ static int __init init_f2fs_fs(void)
goto free_iostat;
err = f2fs_init_bioset();
if (err)
- goto free_bio_enrty_cache;
+ goto free_bio_entry_cache;
err = f2fs_init_compress_mempool();
if (err)
goto free_bioset;
@@ -4710,7 +4736,7 @@ free_compress_mempool:
f2fs_destroy_compress_mempool();
free_bioset:
f2fs_destroy_bioset();
-free_bio_enrty_cache:
+free_bio_entry_cache:
f2fs_destroy_bio_entry_cache();
free_iostat:
f2fs_destroy_iostat_processing();
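
The fill_super() hunks move every lock and list initialization to the top, right after sbi is allocated, so the error paths that follow can run teardown without ever touching an uninitialized lock. A userspace sketch of the same ordering using pthreads; the two-field struct is purely illustrative:

#include <pthread.h>
#include <stdlib.h>

struct sb_info {
	pthread_mutex_t stat_lock;
	pthread_mutex_t flush_lock;
	/* ... fields filled in by later, fallible setup steps ... */
};

static struct sb_info *fill_super(void)
{
	struct sb_info *sbi = calloc(1, sizeof(*sbi));

	if (!sbi)
		return NULL;
	/* Initialize locks first, before anything that can fail ... */
	pthread_mutex_init(&sbi->stat_lock, NULL);
	pthread_mutex_init(&sbi->flush_lock, NULL);
	/* ... so every subsequent error path may lock/unlock safely. */
	return sbi;
}
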
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index df27afd71ef4..83a366f3ee80 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = {
struct f2fs_attr {
struct attribute attr;
- ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
- ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
- const char *, size_t);
+ ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf);
+ ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi,
+ const char *buf, size_t len);
int struct_type;
int offset;
int id;
@@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
static ssize_t dirty_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(dirty_segments(sbi)));
}
static ssize_t free_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(free_segments(sbi)));
}
static ssize_t ovp_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(overprovision_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
((f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1)));
@@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
static ssize_t sb_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%lx\n", sbi->s_flag);
+ return sysfs_emit(buf, "%lx\n", sbi->s_flag);
}
static ssize_t cp_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
+ return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
}
static ssize_t pending_discard_show(struct f2fs_attr *a,
@@ -139,10 +139,16 @@ static ssize_t pending_discard_show(struct f2fs_attr *a,
{
if (!SM_I(sbi)->dcc_info)
return -EINVAL;
- return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
+static ssize_t gc_mode_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]);
+}
+
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -199,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a,
static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%u\n", sbi->current_reserved_blocks);
+ return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks);
}
static ssize_t unusable_show(struct f2fs_attr *a,
@@ -211,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a,
unusable = sbi->unusable_block_count;
else
unusable = f2fs_get_unusable_blocks(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)unusable);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable);
}
static ssize_t encoding_show(struct f2fs_attr *a,
@@ -226,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a,
(sb->s_encoding->version >> 8) & 0xff,
sb->s_encoding->version & 0xff);
#endif
- return sprintf(buf, "(none)");
+ return sysfs_emit(buf, "(none)\n");
}
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+ return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time);
}
#ifdef CONFIG_F2FS_STAT_FS
@@ -241,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->tot_blks -
(si->bg_data_blks + si->bg_node_blks)));
}
@@ -251,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->bg_data_blks + si->bg_node_blks));
}
@@ -262,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
si->dirty_count = dirty_segments(sbi);
f2fs_update_sit_info(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
}
#endif
@@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
- if (!strcmp(a->attr.name, "gc_urgent"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_mode]);
-
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_segment_mode]);
+ return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -362,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
ui = (unsigned int *)(ptr + a->offset);
- return sprintf(buf, "%u\n", *ui);
+ return sysfs_emit(buf, "%u\n", *ui);
}
static ssize_t __sbi_store(struct f2fs_attr *a,
@@ -483,14 +484,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_ordered_discard")) {
+ if (t == 0 || t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (!f2fs_block_unit_discard(sbi))
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "discard_urgent_util")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
}
- if (!strcmp(a->attr.name, "trim_sections"))
- return -EINVAL;
-
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
@@ -531,10 +545,10 @@ out:
return count;
}
- if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
- spin_lock(&sbi->gc_urgent_high_lock);
- sbi->gc_urgent_high_remaining = t;
- spin_unlock(&sbi->gc_urgent_high_lock);
+ if (!strcmp(a->attr.name, "gc_remaining_trials")) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ sbi->gc_remaining_trials = t;
+ spin_unlock(&sbi->gc_remaining_trials_lock);
return count;
}
@@ -649,6 +663,29 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "readdir_ra")) {
+ sbi->readdir_ra = !!t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "hot_data_age_threshold")) {
+ if (t == 0 || t >= sbi->warm_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "warm_data_age_threshold")) {
+ if (t == 0 || t <= sbi->hot_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -721,7 +758,7 @@ static void f2fs_sb_release(struct kobject *kobj)
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "supported\n");
+ return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
@@ -734,8 +771,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (F2FS_HAS_FEATURE(sbi, a->id))
- return sprintf(buf, "supported\n");
- return sprintf(buf, "unsupported\n");
+ return sysfs_emit(buf, "supported\n");
+ return sysfs_emit(buf, "unsupported\n");
}
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
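
All of the sprintf() conversions in this file follow one pattern: sysfs_emit() clamps its output to PAGE_SIZE and sanity-checks that it was handed the page-aligned buffer sysfs allocated, neither of which a raw sprintf() can do. A generic sketch of the resulting show() shape (not taken from this patch):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	/* sysfs_emit() returns the byte count written, or 0 if the
	 * buffer checks fail, so it can be returned directly.
	 */
	return sysfs_emit(buf, "%u\n", 42U);
}
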
@@ -788,9 +825,10 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -825,7 +863,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
@@ -838,6 +876,7 @@ F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
+F2FS_GENERAL_RO_ATTR(gc_mode);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -902,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block);
+/* For block age extent cache */
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -917,9 +960,11 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
+ ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
+ ATTR_LIST(max_ordered_discard),
ATTR_LIST(pending_discard),
- ATTR_LIST(batched_trim_sections),
+ ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
@@ -952,7 +997,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
- ATTR_LIST(gc_urgent_high_remaining),
+ ATTR_LIST(gc_remaining_trials),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
@@ -995,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(peak_atomic_write),
ATTR_LIST(committed_atomic_block),
ATTR_LIST(revoked_atomic_block),
+ ATTR_LIST(hot_data_age_threshold),
+ ATTR_LIST(warm_data_age_threshold),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1243,6 +1290,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int i, count;
+
+ seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n");
+ if (!f2fs_realtime_discard_enable(sbi))
+ return 0;
+
+ if (dcc) {
+ mutex_lock(&dcc->cmd_lock);
+ for (i = 0; i < MAX_PLIST_NUM; i++) {
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+
+ if (i % 8 == 0)
+ seq_printf(seq, " %-3d", i);
+ count = 0;
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list)
+ count++;
+ if (count)
+ seq_printf(seq, " %7d", count);
+ else
+ seq_puts(seq, " .");
+ if (i % 8 == 7)
+ seq_putc(seq, '\n');
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&dcc->cmd_lock);
+ }
+
+ return 0;
+}
+
int __init f2fs_init_sysfs(void)
{
int ret;
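
For reference, a tiny user-space reader for the proc entry that discard_plist_seq_show() above backs (registered as "discard_plist_info" further down); the device name in the path is an assumption:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/fs/f2fs/sda1/discard_plist_info", "r");

	if (!f) {
		perror("discard_plist_info");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
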
@@ -1313,6 +1398,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
#endif
proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
+ proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
+ discard_plist_seq_show, sb);
}
return 0;
put_feature_list_kobj:
@@ -1336,6 +1423,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);
+ remove_proc_entry("discard_plist_info", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1cbcc4608dc7..d99b8549ec8f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static int fat_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, fat_get_block, wbc);
-}
-
static int fat_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = fat_read_folio,
.readahead = fat_readahead,
- .writepage = fat_writepage,
.writepages = fat_writepages,
.write_begin = fat_write_begin,
.write_end = fat_write_end,
.direct_IO = fat_direct_IO,
- .bmap = _fat_bmap
+ .bmap = _fat_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
/*
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index af191371c352..3626eb585a98 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -17,7 +17,7 @@ struct fat_fid {
#define FAT_FID_SIZE_WITHOUT_PARENT 3
#define FAT_FID_SIZE_WITH_PARENT (sizeof(struct fat_fid)/sizeof(u32))
-/**
+/*
* Look up a directory inode given its starting cluster.
*/
static struct inode *fat_dget(struct super_block *sb, int i_logstart)
@@ -135,7 +135,7 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
return type;
}
-/**
+/*
* Map a NFS file handle to a corresponding dentry.
* The dentry may or may not be connected to the filesystem root.
*/
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9958d4020771..6fba5a52127b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
list_move(&inode->i_io_list, head);
@@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
if (wb != &wb->bdi->wb)
@@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
@@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
+ /*
+ * When the inode is being freed, just don't bother with dirty list
+ * tracking. The flush worker will ignore this inode anyway, and keeping
+ * it on the lists would trigger assertions in inode_io_list_move_locked().
+ */
+ if (inode->i_state & I_FREEING) {
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+ return;
+ }
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
- inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1345,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
-#define EXPIRE_DIRTY_ATIME 0x0001
-
/*
* Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index ed40ce5742fd..edb3712dcfa5 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -138,15 +138,16 @@ EXPORT_SYMBOL(__fs_parse);
* @fc: The filesystem context to log errors through.
* @param: The parameter.
* @want_bdev: T if want a blockdev
+ * @flags: Pathwalk flags passed to filename_lookup()
* @_path: The result of the lookup
*/
int fs_lookup_param(struct fs_context *fc,
struct fs_parameter *param,
bool want_bdev,
+ unsigned int flags,
struct path *_path)
{
struct filename *f;
- unsigned int flags = 0;
bool put_f;
int ret;
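
With the extra parameter, each caller now chooses the pathwalk behaviour that used to be hard-coded as flags = 0. A hedged sketch of an updated call site (the surrounding variables are hypothetical):

/* e.g. let the lookup follow a trailing symlink: */
struct path path;
int err;

err = fs_lookup_param(fc, param, false, LOOKUP_FOLLOW, &path);
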
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 451d8a077e12..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -605,6 +605,14 @@ again:
set_bit(FSCACHE_COOKIE_DO_PREP_TO_WRITE, &cookie->flags);
queue = true;
}
+ /*
+ * We could race with cookie_lru, which may have set the LRU_DISCARD
+ * bit but has yet to run the cookie state machine. If that happens
+ * and another thread tries to use the cookie, clear LRU_DISCARD so
+ * we don't end up withdrawing the cookie while it is in use.
+ */
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags))
+ fscache_see_cookie(cookie, fscache_cookie_see_lru_discard_clear);
break;
case FSCACHE_COOKIE_STATE_FAILED:
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 3af3b08a9bb3..0d2b8dec8f82 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -286,7 +286,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
* taken into account.
*/
- iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
fscache_write(cres, start, &iter, fscache_wreq_done, wreq);
return;
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index 337cb29a8dd5..a4850aee2639 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -53,9 +53,10 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
const char *name;
int ret;
@@ -98,7 +99,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
return ret;
}
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) &&
!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index c7d882a9fe33..a06fbb1a8a5b 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
{
struct fuse_dev *fud = file->private_data;
struct cuse_conn *cc = fc_to_cc(fud->fc);
- int rc;
/* remove from the conntbl, no more access from this point on */
mutex_lock(&cuse_lock);
@@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
cdev_del(cc->cdev);
}
- rc = fuse_dev_release(inode, file); /* puts the base reference */
-
- return rc;
+ return fuse_dev_release(inode, file);
}
static struct file_operations cuse_channel_fops; /* initialized during init */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b4a6e0a1b945..e8b60ce72c9a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
return ncpy;
}
-static int fuse_check_page(struct page *page)
+static int fuse_check_folio(struct folio *folio)
{
- if (page_mapcount(page) ||
- page->mapping != NULL ||
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ if (folio_mapped(folio) ||
+ folio->mapping != NULL ||
+ (folio->flags & PAGE_FLAGS_CHECK_AT_PREP &
~(1 << PG_locked |
1 << PG_referenced |
1 << PG_uptodate |
@@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page)
1 << PG_reclaim |
1 << PG_waiters |
LRU_GEN_MASK | LRU_REFS_MASK))) {
- dump_page(page, "fuse: trying to steal weird page");
+ dump_page(&folio->page, "fuse: trying to steal weird page");
return 1;
}
return 0;
@@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page)
static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
{
int err;
- struct page *oldpage = *pagep;
- struct page *newpage;
+ struct folio *oldfolio = page_folio(*pagep);
+ struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
- get_page(oldpage);
+ folio_get(oldfolio);
err = unlock_request(cs->req);
if (err)
goto out_put_old;
@@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (!pipe_buf_try_steal(cs->pipe, buf))
goto out_fallback;
- newpage = buf->page;
+ newfolio = page_folio(buf->page);
- if (!PageUptodate(newpage))
- SetPageUptodate(newpage);
+ if (!folio_test_uptodate(newfolio))
+ folio_mark_uptodate(newfolio);
- ClearPageMappedToDisk(newpage);
+ folio_clear_mappedtodisk(newfolio);
- if (fuse_check_page(newpage) != 0)
+ if (fuse_check_folio(newfolio) != 0)
goto out_fallback_unlock;
/*
* This is a new and locked page, it shouldn't be mapped or
* have any special flags on it
*/
- if (WARN_ON(page_mapped(oldpage)))
+ if (WARN_ON(folio_mapped(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(page_has_private(oldpage)))
+ if (WARN_ON(folio_has_private(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ if (WARN_ON(folio_test_dirty(oldfolio) ||
+ folio_test_writeback(oldfolio)))
goto out_fallback_unlock;
- if (WARN_ON(PageMlocked(oldpage)))
+ if (WARN_ON(folio_test_mlocked(oldfolio)))
goto out_fallback_unlock;
- replace_page_cache_page(oldpage, newpage);
+ replace_page_cache_folio(oldfolio, newfolio);
- get_page(newpage);
+ folio_get(newfolio);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
- lru_cache_add(newpage);
+ folio_add_lru(newfolio);
/*
* Release while we have extra ref on stolen page. Otherwise
@@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = newpage;
+ *pagep = &newfolio->page;
spin_unlock(&cs->req->waitq.lock);
if (err) {
- unlock_page(newpage);
- put_page(newpage);
+ folio_unlock(newfolio);
+ folio_put(newfolio);
goto out_put_old;
}
- unlock_page(oldpage);
+ folio_unlock(oldfolio);
/* Drop ref for ap->pages[] array */
- put_page(oldpage);
+ folio_put(oldfolio);
cs->len = 0;
err = 0;
out_put_old:
/* Drop ref obtained in this function */
- put_page(oldpage);
+ folio_put(oldfolio);
return err;
out_fallback_unlock:
- unlock_page(newpage);
+ folio_unlock(newfolio);
out_fallback:
cs->pg = buf->page;
cs->offset = buf->offset;
@@ -1498,7 +1499,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -1546,7 +1547,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
buf[outarg.namelen] = 0;
down_read(&fc->killsb);
- err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
+ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
up_read(&fc->killsb);
kfree(buf);
return err;
@@ -2267,8 +2268,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
* Check against file->f_op because CUSE
* uses the same ioctl handler.
*/
- if (old->f_op == file->f_op &&
- old->f_cred->user_ns == file->f_cred->user_ns)
+ if (old->f_op == file->f_op)
fud = fuse_get_dev(old);
if (fud) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bb97a384dc5d..cd1a071b625a 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
if (inode && fuse_is_bad(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
- (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) {
+ (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) {
struct fuse_entry_out outarg;
FUSE_ARGS(args);
struct fuse_forget_link *forget;
@@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
}
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name)
+ u64 child_nodeid, struct qstr *name, u32 flags)
{
int err = -ENOTDIR;
struct inode *parent;
@@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
goto unlock;
fuse_dir_changed(parent);
- fuse_invalidate_entry(entry);
+ if (!(flags & FUSE_EXPIRE_ONLY))
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
inode_lock(d_inode(entry));
@@ -1235,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
return err;
}
+static inline bool fuse_permissible_uidgid(struct fuse_conn *fc)
+{
+ const struct cred *cred = current_cred();
+
+ return (uid_eq(cred->euid, fc->user_id) &&
+ uid_eq(cred->suid, fc->user_id) &&
+ uid_eq(cred->uid, fc->user_id) &&
+ gid_eq(cred->egid, fc->group_id) &&
+ gid_eq(cred->sgid, fc->group_id) &&
+ gid_eq(cred->gid, fc->group_id));
+}
+
/*
* Calling into a user-controlled filesystem gives the filesystem
* daemon ptrace-like capabilities over the current process. This
@@ -1248,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
* for which the owner of the mount has ptrace privilege. This
* excludes processes started by other users, suid or sgid processes.
*/
-int fuse_allow_current_process(struct fuse_conn *fc)
+bool fuse_allow_current_process(struct fuse_conn *fc)
{
- const struct cred *cred;
-
- if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
- return 1;
+ bool allow;
if (fc->allow_other)
- return current_in_userns(fc->user_ns);
+ allow = current_in_userns(fc->user_ns);
+ else
+ allow = fuse_permissible_uidgid(fc);
- cred = current_cred();
- if (uid_eq(cred->euid, fc->user_id) &&
- uid_eq(cred->suid, fc->user_id) &&
- uid_eq(cred->uid, fc->user_id) &&
- gid_eq(cred->egid, fc->group_id) &&
- gid_eq(cred->sgid, fc->group_id) &&
- gid_eq(cred->gid, fc->group_id))
- return 1;
+ if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN))
+ allow = true;
- return 0;
+ return allow;
}
static int fuse_access(struct inode *inode, int mask)
@@ -1935,7 +1942,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
@@ -1957,7 +1964,7 @@ static const struct inode_operations fuse_common_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 89f4741728ba..875314ee6f59 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1313,7 +1313,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
if (fc->handle_killpriv_v2 &&
- should_remove_suid(file_dentry(file))) {
+ setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
goto writethrough;
}
@@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
return res;
}
+static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
+ bool exclusive_lock =
+ !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
+ iocb->ki_flags & IOCB_APPEND ||
+ fuse_direct_write_extending_i_size(iocb, from);
+
+ /*
+ * Take the exclusive lock if
+ * - parallel direct writes are disabled - a user-space decision, or
+ * - parallel direct writes are enabled and i_size is being extended.
+ * The latter case might not need the exclusive lock at all, but
+ * that needs further investigation.
+ */
+ if (exclusive_lock)
+ inode_lock(inode);
+ else {
+ inode_lock_shared(inode);
+
+ /* A race with truncate may have occurred, since the decision on
+ * the lock type was made without holding the lock; check again.
+ */
+ if (fuse_direct_write_extending_i_size(iocb, from)) {
+ inode_unlock_shared(inode);
+ inode_lock(inode);
+ exclusive_lock = true;
+ }
+ }
- /* Don't allow parallel writes to the same file */
- inode_lock(inode);
res = generic_write_checks(iocb, from);
if (res > 0) {
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
@@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
- inode_unlock(inode);
+ if (exclusive_lock)
+ inode_unlock(inode);
+ else
+ inode_unlock_shared(inode);
return res;
}
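
The locking above is an instance of a general optimistic pattern: guess the lock mode without holding the lock, take the cheaper shared lock, then re-check and upgrade if the guess was invalidated in the meantime. A standalone pthreads sketch of the same idea (all names are illustrative, not FUSE code):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t file_lock = PTHREAD_RWLOCK_INITIALIZER;
static long file_size;	/* stands in for i_size */

static bool write_extends(long pos, long len)
{
	return pos + len > file_size;
}

/* Returns true if the exclusive (write) lock was taken. */
static bool lock_for_write(long pos, long len)
{
	bool exclusive = write_extends(pos, len);	/* unlocked guess */

	if (exclusive) {
		pthread_rwlock_wrlock(&file_lock);
		return true;
	}
	pthread_rwlock_rdlock(&file_lock);
	/* The size may have grown before we got the lock: re-check,
	 * and upgrade by dropping and re-taking if we lost the race.
	 */
	if (write_extends(pos, len)) {
		pthread_rwlock_unlock(&file_lock);
		pthread_rwlock_wrlock(&file_lock);
		return true;
	}
	return false;
}
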
@@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (iov_iter_rw(iter) == WRITE) {
fuse_write_update_attr(inode, pos, ret);
+ /* For extending writes we already hold the exclusive lock */
if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 98a9cf531873..c673faefdcb9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr);
/**
* Is current process allowed to perform filesystem operation?
*/
-int fuse_allow_current_process(struct fuse_conn *fc);
+bool fuse_allow_current_process(struct fuse_conn *fc);
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
@@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
* then the dentry is unhashed (d_delete()).
*/
int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
- u64 child_nodeid, struct qstr *name);
+ u64 child_nodeid, struct qstr *name, u32 flags);
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir);
@@ -1269,7 +1269,7 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
-int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
/* readdir.c */
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 61d8afcb10a3..fcce94ace2c2 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -255,7 +255,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.in_pages = true;
err = -EFAULT;
- iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
+ iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
@@ -324,7 +324,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
goto out;
err = -EFAULT;
- iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
+ iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred);
for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
if (c != PAGE_SIZE && iov_iter_count(&ii))
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index e8deaacf1832..dc603479b30e 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -547,9 +547,9 @@ retry_locked:
* Contents of the page are now protected against changing by holding
* the page lock.
*/
- addr = kmap(page);
+ addr = kmap_local_page(page);
res = fuse_parse_cache(ff, addr, size, ctx);
- kunmap(page);
+ kunmap_local(addr);
unlock_page(page);
put_page(page);
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 734d1f05d823..3dcde4912413 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -109,9 +109,10 @@ out:
return error;
}
-int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
bool need_unlock = false;
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index cd180ca7c959..b8de8c148f5c 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -13,7 +13,7 @@
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu);
extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 05bee80ac7de..e782b4f1d104 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -427,8 +427,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return error;
kaddr = kmap_atomic(page);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3bdb2c668a71..e7537fd305dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -61,9 +61,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
void *kaddr = kmap(page);
u64 dsize = i_size_read(inode);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
-
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 60c6fb91fb58..eea5be4fbf0e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1445,14 +1445,13 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
{
- struct gfs2_glock *gl = fl_gh->gh_gl;
+ struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl);
/*
* Make sure gfs2_glock_put() won't sleep under the file->f_lock
* spinlock.
*/
- gfs2_glock_hold(gl);
spin_lock(&file->f_lock);
gfs2_holder_uninit(fl_gh);
spin_unlock(&file->f_lock);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index df335c258eb0..524f3c96b9a4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -186,10 +186,11 @@ void gfs2_glock_free(struct gfs2_glock *gl)
*
*/
-void gfs2_glock_hold(struct gfs2_glock *gl)
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
lockref_get(&gl->gl_lockref);
+ return gl;
}
/**
@@ -205,12 +206,6 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
- /*
- * Note that demote_ok is used for the lru process of disposing of
- * glocks. For this purpose, we don't care if the glock's holders
- * have the HIF_MAY_DEMOTE flag set or not. If someone is using
- * them, don't demote.
- */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -393,7 +388,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -408,45 +403,6 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
- * demote_incompat_holders - demote incompatible demoteable holders
- * @gl: the glock we want to promote
- * @current_gh: the newly promoted holder
- *
- * We're passing the newly promoted holder in @current_gh, but actually, any of
- * the strong holders would do.
- */
-static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *current_gh)
-{
- struct gfs2_holder *gh, *tmp;
-
- /*
- * Demote incompatible holders before we make ourselves eligible.
- * (This holder may or may not allow auto-demoting, but we don't want
- * to demote the new holder before it's even granted.)
- */
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- /*
- * Since holders are at the front of the list, we stop when we
- * find the first non-holder.
- */
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return;
- if (gh == current_gh)
- continue;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, current_gh, gh)) {
- /*
- * We should not recurse into do_promote because
- * __gfs2_glock_dq only calls handle_callback,
- * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
- */
- __gfs2_glock_dq(gh);
- }
- }
-}
-
-/**
* find_first_holder - find the first "holder" gh
* @gl: the glock
*/
@@ -464,26 +420,6 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
return NULL;
}
-/**
- * find_first_strong_holder - find the first non-demoteable holder
- * @gl: the glock
- *
- * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
- */
-static inline struct gfs2_holder *
-find_first_strong_holder(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return NULL;
- if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
/*
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
@@ -540,9 +476,8 @@ done:
static int do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
- bool incompat_holders_demoted = false;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
@@ -561,11 +496,8 @@ static int do_promote(struct gfs2_glock *gl)
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- if (!incompat_holders_demoted) {
+ if (!current_gh)
current_gh = gh;
- demote_incompat_holders(gl, current_gh);
- incompat_holders_demoted = true;
- }
}
return 0;
}
@@ -927,6 +859,48 @@ out_unlock:
return;
}
+/**
+ * glock_set_object - set the gl_object field of a glock
+ * @gl: the glock
+ * @object: the object
+ */
+void glock_set_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = object;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
+/**
+ * glock_clear_object - clear the gl_object field of a glock
+ * @gl: the glock
+ */
+void glock_clear_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd,
+ prev_object == object || prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
{
struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
@@ -980,8 +954,6 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
- struct gfs2_glock *inode_gl = NULL;
-
gl->gl_no_formal_ino = ip->i_no_formal_ino;
set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
d_prune_aliases(&ip->i_inode);
@@ -991,14 +963,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
spin_lock(&gl->gl_lockref.lock);
ip = gl->gl_object;
if (ip) {
- inode_gl = ip->i_gl;
- lockref_get(&inode_gl->gl_lockref);
clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+ if (!igrab(&ip->i_inode))
+ ip = NULL;
}
spin_unlock(&gl->gl_lockref.lock);
- if (inode_gl) {
- gfs2_glock_poke(inode_gl);
- gfs2_glock_put(inode_gl);
+ if (ip) {
+ gfs2_glock_poke(ip->i_gl);
+ iput(&ip->i_inode);
}
evicted = !ip;
}
@@ -1039,6 +1011,7 @@ static void delete_work_func(struct work_struct *work)
if (gfs2_queue_delete_work(gl, 5 * HZ))
return;
}
+ goto out;
}
inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
@@ -1051,6 +1024,7 @@ static void delete_work_func(struct work_struct *work)
d_prune_aliases(inode);
iput(inode);
}
+out:
gfs2_glock_put(gl);
}
@@ -1256,13 +1230,12 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
- gh->gh_gl = gl;
+ gh->gh_gl = gfs2_glock_hold(gl);
gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
- gfs2_glock_hold(gl);
}
/**
@@ -1496,7 +1469,7 @@ __acquires(&gl->gl_lockref.lock)
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
struct gfs2_holder *current_gh;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -1508,8 +1481,6 @@ __acquires(&gl->gl_lockref.lock)
continue;
if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK)
continue;
- if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))
- continue;
if (!pid_is_meaningful(gh2))
continue;
goto trap_recursive;
@@ -1619,69 +1590,28 @@ static inline bool needs_demote(struct gfs2_glock *gl)
static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
/*
- * This while loop is similar to function demote_incompat_holders:
- * If the glock is due to be demoted (which may be from another node
- * or even if this holder is GL_NOCACHE), the weak holders are
- * demoted as well, allowing the glock to be demoted.
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
*/
- while (gh) {
- /*
- * If we're in the process of file system withdraw, we cannot
- * just dequeue any glocks until our journal is recovered, lest
- * we introduce file system corruption. We need two exceptions
- * to this rule: We need to allow unlocking of nondisk glocks
- * and the glock for our own journal that needs recovery.
- */
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- /*
- * This holder should not be cached, so mark it for demote.
- * Note: this should be done before the check for needs_demote
- * below.
- */
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
-
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_glock_queue(gh, 0);
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
- /*
- * If there hasn't been a demote request we are done.
- * (Let the remaining holders, if any, keep holding it.)
- */
- if (!needs_demote(gl)) {
- if (list_empty(&gl->gl_holders))
- fast_path = 1;
- break;
- }
- /*
- * If we have another strong holder (we cannot auto-demote)
- * we are done. It keeps holding it until it is done.
- */
- if (find_first_strong_holder(gl))
- break;
-
- /*
- * If we have a weak holder at the head of the list, it
- * (and all others like it) must be auto-demoted. If there
- * are no more weak holders, we exit the while loop.
- */
- gh = find_first_holder(gl);
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
}
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
@@ -1705,8 +1635,17 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh)
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
spin_lock(&gl->gl_lockref.lock);
+ if (!gfs2_holder_queued(gh)) {
+ /*
+ * May have already been dequeued because the locking request
+ * was GL_ASYNC and it has failed in the meantime.
+ */
+ goto out;
+ }
+
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
spin_unlock(&gl->gl_lockref.lock);
@@ -1715,7 +1654,26 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
spin_lock(&gl->gl_lockref.lock);
}
+ /*
+ * If we're in the process of file system withdraw, we cannot just
+ * dequeue any glocks until our journal is recovered, lest we introduce
+ * file system corruption. We need two exceptions to this rule: We need
+ * to allow unlocking of nondisk glocks and the glock for our own
+ * journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
__gfs2_glock_dq(gh);
+out:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1888,33 +1846,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
- /*
- * Note 1: We cannot call demote_incompat_holders from handle_callback
- * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
- * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
- * Plus, we only want to demote the holders if the request comes from
- * a remote cluster node because local holder conflicts are resolved
- * elsewhere.
- *
- * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
- * request that we set our state to UNLOCKED. Here we mock up a holder
- * to make it look like someone wants the lock EX locally. Any SH
- * and DF requests should be able to share the lock without demoting.
- *
- * Note 3: We only want to demote the demoteable holders when there
- * are no more strong holders. The demoteable holders might as well
- * keep the glock until the last strong holder is done with it.
- */
- if (!find_first_strong_holder(gl)) {
- struct gfs2_holder mock_gh = {
- .gh_gl = gl,
- .gh_state = (state == LM_ST_UNLOCKED) ?
- LM_ST_EXCLUSIVE : state,
- .gh_iflags = BIT(HIF_HOLDER)
- };
-
- demote_incompat_holders(gl, &mock_gh);
- }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -2306,8 +2237,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
- if (test_bit(HIF_MAY_DEMOTE, &iflags))
- *p++ = 'D';
if (flags & GL_SKIP)
*p++ = 's';
*p = 0;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 0d068f4fd7d6..f37ac087e2c1 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -156,8 +156,6 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -196,7 +194,7 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
+extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
@@ -288,6 +286,9 @@ extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
+extern void glock_set_object(struct gfs2_glock *gl, void *object);
+extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+
extern const struct lm_lockops gfs2_dlm_ops;
static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh)
@@ -305,64 +306,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-/**
- * glock_set_object - set the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- */
-static inline void glock_set_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL))
- gfs2_dump_glock(NULL, gl, true);
- gl->gl_object = object;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-/**
- * glock_clear_object - clear the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- *
- * I'd love to similarly add this:
- * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object))
- * gfs2_dump_glock(NULL, gl, true);
- * Unfortunately, that's not possible because as soon as gfs2_delete_inode
- * frees the block in the rgrp, another process can reassign it for an I_NEW
- * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget.
- * That means gfs2_delete_inode may subsequently try to call this function
- * for a glock that's already pointing to a brand new inode. If we clear the
- * new inode's gl_object, we'll introduce metadata corruption. Function
- * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also
- * tries to clear gl_object, so it's more than just gfs2_delete_inode.
- *
- */
-static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_object == object)
- gl->gl_object = NULL;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49210a2e7ce7..d78b61ecc1cd 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -397,38 +397,39 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_state & I_NEW;
+ struct inode *inode = &ip->i_inode;
+ bool is_new = inode->i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
- if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode)))
+ if (unlikely(!is_new && inode_wrong_type(inode, mode)))
goto corrupt;
ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
- ip->i_inode.i_mode = mode;
+ inode->i_mode = mode;
if (is_new) {
- ip->i_inode.i_rdev = 0;
+ inode->i_rdev = 0;
switch (mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
- ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
- be32_to_cpu(str->di_minor));
+ inode->i_rdev = MKDEV(be32_to_cpu(str->di_major),
+ be32_to_cpu(str->di_minor));
break;
}
}
- i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
- i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
- set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
- i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
- gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+ i_uid_write(inode, be32_to_cpu(str->di_uid));
+ i_gid_write(inode, be32_to_cpu(str->di_gid));
+ set_nlink(inode, be32_to_cpu(str->di_nlink));
+ i_size_write(inode, be64_to_cpu(str->di_size));
+ gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0)
- ip->i_inode.i_atime = atime;
- ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+ if (timespec64_compare(&inode->i_atime, &atime) < 0)
+ inode->i_atime = atime;
+ inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+ inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+ inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
ip->i_goal = be64_to_cpu(str->di_goal_meta);
ip->i_generation = be64_to_cpu(str->di_generation);
@@ -436,7 +437,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_diskflags = be32_to_cpu(str->di_flags);
ip->i_eattr = be64_to_cpu(str->di_eattr);
/* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */
- gfs2_set_inode_flags(&ip->i_inode);
+ gfs2_set_inode_flags(inode);
height = be16_to_cpu(str->di_height);
if (unlikely(height > GFS2_MAX_META_HEIGHT))
goto corrupt;
@@ -448,8 +449,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
- if (S_ISREG(ip->i_inode.i_mode))
- gfs2_set_aops(&ip->i_inode);
+ if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip))
+ goto corrupt;
+
+ if (S_ISREG(inode->i_mode))
+ gfs2_set_aops(inode);
return 0;
corrupt:
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d09d9892cd05..c26765080f28 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -252,7 +252,6 @@ struct gfs2_lkstats {
enum {
/* States */
- HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 04a201584fa7..614db3055c02 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -142,6 +142,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail;
+ /*
+ * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is
+ * delete_work_func(). Make sure not to cancel the delete work
+ * from within itself here.
+ */
if (blktype == GFS2_BLKST_UNLINKED)
extra_flags |= LM_FLAG_TRY;
else
@@ -403,12 +408,17 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
goto out_ipreserv;
error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ if (error)
+ goto out_trans_end;
+
ip->i_no_formal_ino = ip->i_generation;
ip->i_inode.i_ino = ip->i_no_addr;
ip->i_goal = ip->i_no_addr;
+ if (*dblocks > 1)
+ ip->i_eattr = ip->i_no_addr + 1;
+out_trans_end:
gfs2_trans_end(sdp);
-
out_ipreserv:
gfs2_inplace_release(ip);
out_quota:
@@ -586,6 +596,12 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
* @size: The initial size of the inode (ignored for directories)
* @excl: Force fail if inode exists
*
+ * FIXME: Change to allocate the disk blocks and write them out in the same
+ * transaction. That way, we can no longer end up in a situation in which an
+ * inode is allocated, the node crashes, and the block looks like a valid
+ * inode. (With atomic creates in place, we will also no longer need to zero
+ * the link count and dirty the inode here on failure.)
+ *
* Returns: 0 on success, or error code
*/
@@ -596,12 +612,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
{
const struct qstr *name = &dentry->d_name;
struct posix_acl *default_acl, *acl;
- struct gfs2_holder ghs[2];
+ struct gfs2_holder d_gh, gh;
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 1;
+ int error;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -617,10 +633,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail;
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh);
if (error)
goto fail;
- gfs2_holder_mark_uninitialized(ghs + 1);
+ gfs2_holder_mark_uninitialized(&gh);
error = create_ok(dip, name, mode);
if (error)
@@ -642,7 +658,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
else
error = finish_no_open(file, NULL);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
goto fail;
} else if (error != -ENOENT) {
goto fail_gunlock;
@@ -656,12 +672,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = -ENOMEM;
if (!inode)
goto fail_gunlock;
+ ip = GFS2_I(inode);
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
goto fail_gunlock;
- ip = GFS2_I(inode);
error = gfs2_qa_get(ip);
if (error)
goto fail_free_acls;
@@ -723,15 +739,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
+retry:
error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
- BUG_ON(error);
+ if (error == -EBUSY)
+ goto retry;
+ if (error)
+ goto fail_gunlock2;
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID,
&ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (error)
goto fail_gunlock3;
@@ -739,10 +759,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock3;
- if (blocks > 1) {
- ip->i_eattr = ip->i_no_addr + 1;
+ if (blocks > 1)
gfs2_init_xattr(ip);
- }
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
@@ -750,9 +768,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
- free_vfs_inode = 0; /* After this point, the inode is no longer
- considered free. Any failures need to undo
- the gfs2 structures. */
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -785,9 +800,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
error = finish_open(file, dentry, gfs2_open_common);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
gfs2_qa_put(ip);
- gfs2_glock_dq_uninit(ghs + 1);
+ gfs2_glock_dq_uninit(&gh);
gfs2_glock_put(io_gl);
gfs2_qa_put(dip);
unlock_new_inode(inode);
@@ -801,10 +816,6 @@ fail_gunlock3:
fail_gunlock2:
gfs2_glock_put(io_gl);
fail_free_inode:
- if (ip->i_gl) {
- if (free_vfs_inode) /* else evict will do the put for us */
- gfs2_glock_put(ip->i_gl);
- }
gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
@@ -812,20 +823,19 @@ fail_free_acls:
posix_acl_release(acl);
fail_gunlock:
gfs2_dir_no_add(&da);
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
+ set_bit(GIF_ALLOC_FAILED, &ip->i_flags);
clear_nlink(inode);
- if (!free_vfs_inode)
+ if (ip->i_no_addr)
mark_inode_dirty(inode);
- set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
- &GFS2_I(inode)->i_flags);
if (inode->i_state & I_NEW)
iget_failed(inode);
else
iput(inode);
}
- if (gfs2_holder_initialized(ghs + 1))
- gfs2_glock_dq_uninit(ghs + 1);
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
fail:
gfs2_qa_put(dip);
return error;
@@ -1997,7 +2007,7 @@ static int gfs2_setattr(struct user_namespace *mnt_userns,
else {
error = gfs2_setattr_simple(inode, attr);
if (!error && attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(&init_user_ns, inode,
+ error = posix_acl_chmod(&init_user_ns, dentry,
inode->i_mode);
}
@@ -2149,7 +2159,7 @@ static const struct inode_operations gfs2_file_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.fileattr_get = gfs2_fileattr_get,
@@ -2171,7 +2181,7 @@ static const struct inode_operations gfs2_dir_iops = {
.getattr = gfs2_getattr,
.listxattr = gfs2_listxattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
+ .get_inode_acl = gfs2_get_acl,
.set_acl = gfs2_set_acl,
.update_time = gfs2_update_time,
.atomic_open = gfs2_atomic_open,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6ed728aae9a5..3c41b864ee5b 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -442,6 +442,12 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
int ty;
+ if (!ip->i_gl) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
ty = REMOVE_META;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b018957a1bb2..999cc146d708 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -379,6 +379,7 @@ out:
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
+ const struct inode *inode = &ip->i_inode;
struct gfs2_dinode *str = buf;
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -386,15 +387,15 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
- str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
- str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
- str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
- str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
- str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+ str->di_mode = cpu_to_be32(inode->i_mode);
+ str->di_uid = cpu_to_be32(i_uid_read(inode));
+ str->di_gid = cpu_to_be32(i_gid_read(inode));
+ str->di_nlink = cpu_to_be32(inode->i_nlink);
+ str->di_size = cpu_to_be64(i_size_read(inode));
+ str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
+ str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+ str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+ str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -402,16 +403,16 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_flags = cpu_to_be32(ip->i_diskflags);
str->di_height = cpu_to_be16(ip->i_height);
- str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+ str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) &&
!(ip->i_diskflags & GFS2_DIF_EXHASH) ?
GFS2_FORMAT_DE : 0);
str->di_depth = cpu_to_be16(ip->i_depth);
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
+ str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
}
/**
@@ -475,6 +476,12 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
+ if (unlikely(!ip->i_gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
if (unlikely(gfs2_withdrawn(sdp)))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
@@ -927,8 +934,7 @@ static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) &&
- inode->i_nlink &&
+ if (inode->i_nlink &&
gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1076,7 +1082,13 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip)
struct inode *inode = &ip->i_inode;
struct gfs2_glock *gl = ip->i_gl;
- truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+ if (unlikely(!gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
+ truncate_inode_pages(gfs2_glock2aspace(gl), 0);
truncate_inode_pages(&inode->i_data, 0);
if (atomic_read(&gl->gl_revokes) == 0) {
@@ -1218,10 +1230,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
struct gfs2_sbd *sdp = sb->s_fs_info;
int ret;
- if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
- BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
+ if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags)))
goto should_delete;
- }
if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags))
return SHOULD_DEFER_EVICTION;
@@ -1294,13 +1304,22 @@ static int evict_unlinked_inode(struct inode *inode)
goto out;
}
- /* We're about to clear the bitmap for the dinode, but as soon as we
- do, gfs2_create_inode can create another inode at the same block
- location and try to set gl_object again. We clear gl_object here so
- that subsequent inode creates don't see an old gl_object. */
- glock_clear_object(ip->i_gl, ip);
+ if (ip->i_gl)
+ gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
+
+ /*
+ * As soon as we clear the bitmap for the dinode, gfs2_create_inode()
+ * can get called to recreate it, or even gfs2_inode_lookup() if the
+ * inode was recreated on another node in the meantime.
+ *
+ * However, inserting the new inode into the inode hash table will not
+ * succeed until the old inode is removed, and that only happens after
+ * ->evict_inode() returns. The new inode is attached to its inode and
+ * iopen glocks after inserting it into the inode hash table, so at
+ * that point we can be sure that both glocks are unused.
+ */
+
ret = gfs2_dinode_dealloc(ip);
- gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
out:
return ret;
}
@@ -1367,12 +1386,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_holder gh;
int ret;
- if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
- clear_inode(inode);
- return;
- }
-
- if (inode->i_nlink || sb_rdonly(sb))
+ if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr)
goto out;
gfs2_holder_mark_uninitialized(&gh);
@@ -1405,12 +1419,9 @@ out:
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
glock_clear_object(gl, ip);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
- }
gfs2_glock_hold(gl);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
if (ip->i_gl) {
@@ -1429,6 +1440,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
+ ip->i_no_addr = 0;
ip->i_flags = 0;
ip->i_gl = NULL;
gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f6a66050380e..518c0677e12a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1412,11 +1412,13 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ }
}
gfs2_trans_end(sdp);
@@ -1445,14 +1447,16 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
- if (error)
- goto out_quota;
-
- if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
- error = ea_dealloc_indirect(ip);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
goto out_quota;
+
+ if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+ error = ea_dealloc_indirect(ip);
+ if (error)
+ goto out_quota;
+ }
}
error = ea_dealloc_block(ip);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c4526f16355d..3a155c1d810e 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfs_read_folio,
- .writepage = hfs_writepage,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
.bmap = hfs_bmap,
.direct_IO = hfs_direct_IO,
.writepages = hfs_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -458,13 +458,16 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ res = -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ goto out;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
- /* panic? */
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
+ if (fd.entrylength < sizeof(struct hfs_cat_dir))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -477,6 +480,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
} else if (HFS_IS_RSRC(inode)) {
+ if (fd.entrylength < sizeof(struct hfs_cat_file))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
hfs_inode_write_fork(inode, rec.file.RExtRec,
@@ -484,7 +489,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
+ if (fd.entrylength < sizeof(struct hfs_cat_file))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
@@ -501,9 +507,10 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
}
+ res = 0;
out:
hfs_find_exit(&fd);
- return 0;
+ return res;
}
static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
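
The hfs_write_inode changes above replace WARN_ON() diagnostics with hard -EIO failures when an on-disk catalog record is shorter than the structure about to be copied out of it. A hedged userspace sketch of that validate-before-copy pattern; struct cat_file and its size are illustrative stand-ins for struct hfs_cat_file:

	#include <errno.h>
	#include <stddef.h>
	#include <string.h>

	struct cat_file { unsigned char raw[102]; };  /* stand-in record */

	static int read_record(const void *node, size_t entry_len,
			       struct cat_file *out)
	{
		if (entry_len < sizeof(*out))  /* corrupt entry: refuse */
			return -EIO;
		memcpy(out, node, sizeof(*out));
		return 0;
	}
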
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index 39f5e343bf4d..fdb0edb8a607 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr
if (nls_io) {
wchar_t ch;
- while (srclen > 0) {
+ while (srclen > 0 && dstlen > 0) {
size = nls_io->char2uni(src, srclen, &ch);
if (size < 0) {
ch = '?';
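
The one-line fix above bounds the conversion loop by the destination length as well as the source length, so a long input can no longer overrun the fixed-size Mac name buffer. A simplified standalone sketch of the dual-bound loop (the real code maps character sets via nls_io->char2uni rather than copying bytes):

	#include <stddef.h>

	static size_t convert_bounded(char *dst, size_t dstlen,
				      const char *src, size_t srclen)
	{
		size_t n = 0;

		while (srclen > 0 && dstlen > 0) {
			*dst++ = *src++;  /* real loop converts charsets here */
			srclen--;
			dstlen--;
			n++;
		}
		return n;
	}
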
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a5db2e3b2980..6aa919e59483 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -198,6 +198,8 @@ struct hfsplus_sb_info {
#define HFSPLUS_SB_HFSX 3
#define HFSPLUS_SB_CASEFOLD 4
#define HFSPLUS_SB_NOBARRIER 5
+#define HFSPLUS_SB_UID 6
+#define HFSPLUS_SB_GID 7
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index aeab83ed1c9c..840577a0c1e7 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfsplus_read_folio,
- .writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
.bmap = hfsplus_bmap,
.direct_IO = hfsplus_direct_IO,
.writepages = hfsplus_writepages,
+ .migrate_folio = buffer_migrate_folio,
};
const struct dentry_operations hfsplus_dentry_operations = {
@@ -192,11 +192,11 @@ static void hfsplus_get_perms(struct inode *inode,
mode = be16_to_cpu(perms->mode);
i_uid_write(inode, be32_to_cpu(perms->owner));
- if (!i_uid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode))
inode->i_uid = sbi->uid;
i_gid_write(inode, be32_to_cpu(perms->group));
- if (!i_gid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode))
inode->i_gid = sbi->gid;
if (dir) {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 047e05c57560..c94a58762ad6 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!uid_valid(sbi->uid)) {
pr_err("invalid uid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
}
break;
case opt_gid:
@@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!gid_valid(sbi->gid)) {
pr_err("invalid gid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
}
break;
case opt_part:
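
The new HFSPLUS_SB_UID/HFSPLUS_SB_GID bits record that a uid= or gid= mount option was explicitly given, so hfsplus_get_perms() can force the override unconditionally instead of only when the on-disk owner looks uninitialised. A sketch of the resulting semantics under those assumptions, with illustrative names:

	#include <stdbool.h>

	struct mount_opts {
		unsigned int uid;
		bool uid_forced;  /* set when "uid=" parsed, like HFSPLUS_SB_UID */
	};

	static unsigned int effective_uid(const struct mount_opts *o,
					  unsigned int disk_uid,
					  unsigned int disk_mode)
	{
		/* A forced option always wins; otherwise only override the
		 * obviously uninitialised (uid == 0 && mode == 0) case. */
		if (o->uid_forced || (disk_uid == 0 && disk_mode == 0))
			return o->uid;
		return disk_uid;
	}
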
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index f7547a62c81f..88952d4a631e 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio)
return mpage_read_folio(folio, hpfs_get_block);
}
-static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hpfs_get_block, wbc);
-}
-
static void hpfs_readahead(struct readahead_control *rac)
{
mpage_readahead(rac, hpfs_get_block);
@@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hpfs_read_folio,
- .writepage = hpfs_writepage,
.readahead = hpfs_readahead,
.writepages = hpfs_writepages,
.write_begin = hpfs_write_begin,
.write_end = hpfs_write_end,
- .bmap = _hpfs_bmap
+ .bmap = _hpfs_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
const struct file_operations hpfs_file_ops =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index df7772335dc0..790d2727141a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -370,11 +370,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void hugetlb_delete_from_page_cache(struct page *page)
+static void hugetlb_delete_from_page_cache(struct folio *folio)
{
- ClearPageDirty(page);
- ClearPageUptodate(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ folio_clear_uptodate(folio);
+ filemap_remove_folio(folio);
}
/*
@@ -580,8 +580,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
* map could fail. Correspondingly, the subpool and global
* reserve usage count can need to be adjusted.
*/
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- hugetlb_delete_from_page_cache(&folio->page);
+ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
+ hugetlb_delete_from_page_cache(folio);
ret = true;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode, index,
@@ -1097,10 +1097,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- if (hugetlb_page_subpool(&src->page)) {
- hugetlb_set_page_subpool(&dst->page,
- hugetlb_page_subpool(&src->page));
- hugetlb_set_page_subpool(&src->page, NULL);
+ if (hugetlb_folio_subpool(src)) {
+ hugetlb_set_folio_subpool(dst,
+ hugetlb_folio_subpool(src));
+ hugetlb_set_folio_subpool(src, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = {
static void init_once(void *foo)
{
- struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
+ struct hugetlbfs_inode_info *ei = foo;
inode_init_once(&ei->vfs_inode);
}
@@ -1377,7 +1377,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1387,7 +1387,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
@@ -1403,7 +1403,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
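
All three hunks above harden the same spot: a size-style option can arrive with no value at all, so param->string must be checked before the leading-digit test that memparse() relies on. A minimal sketch of the check:

	#include <ctype.h>
	#include <stdbool.h>

	static bool size_arg_ok(const char *s)
	{
		/* NULL-safe; isdigit() takes an unsigned char value */
		return s && isdigit((unsigned char)s[0]);
	}
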
diff --git a/fs/inode.c b/fs/inode.c
index b608528efd3a..f453eb58fd03 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1949,40 +1949,12 @@ skip_update:
EXPORT_SYMBOL(touch_atime);
/*
- * The logic we want is
- *
- * if suid or (sgid and xgrp)
- * remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
- umode_t mode = d_inode(dentry)->i_mode;
- int kill = 0;
-
- /* suid always must be killed */
- if (unlikely(mode & S_ISUID))
- kill = ATTR_KILL_SUID;
-
- /*
- * sgid without any exec bits is just a mandatory locking mark; leave
- * it alone. If some exec bits are set, it's a real sgid; kill it.
- */
- if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
- kill |= ATTR_KILL_SGID;
-
- if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
- return kill;
-
- return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-/*
* Return mask of changes for notify_change() that need to be done as a
* response to write or truncate. Return 0 if nothing has to be changed.
* Negative value on error (change should be denied).
*/
-int dentry_needs_remove_privs(struct dentry *dentry)
+int dentry_needs_remove_privs(struct user_namespace *mnt_userns,
+ struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
int mask = 0;
@@ -1991,7 +1963,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
if (IS_NOSEC(inode))
return 0;
- mask = should_remove_suid(dentry);
+ mask = setattr_should_drop_suidgid(mnt_userns, inode);
ret = security_inode_need_killpriv(dentry);
if (ret < 0)
return ret;
@@ -2023,7 +1995,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0;
- kill = dentry_needs_remove_privs(dentry);
+ kill = dentry_needs_remove_privs(file_mnt_user_ns(file), dentry);
if (kill < 0)
return kill;
@@ -2071,9 +2043,6 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
sync_it |= S_VERSION;
- if (!sync_it)
- return 0;
-
return sync_it;
}
@@ -2354,15 +2323,15 @@ EXPORT_SYMBOL(inode_init_owner);
bool inode_owner_or_capable(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- kuid_t i_uid;
+ vfsuid_t vfsuid;
struct user_namespace *ns;
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid))
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return true;
ns = current_user_ns();
- if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER))
+ if (vfsuid_has_mapping(ns, vfsuid) && ns_capable(ns, CAP_FOWNER))
return true;
return false;
}
@@ -2488,6 +2457,28 @@ struct timespec64 current_time(struct inode *inode)
EXPORT_SYMBOL(current_time);
/**
+ * in_group_or_capable - check whether caller is CAP_FSETID privileged
+ * @mnt_userns: user namespace of the mount @inode was found from
+ * @inode: inode to check
+ * @vfsgid: the new/current vfsgid of @inode
+ *
+ * Check whether @vfsgid is in the caller's group list or if the caller is
+ * privileged with CAP_FSETID over @inode. This can be used to determine
+ * whether the setgid bit can be kept or must be dropped.
+ *
+ * Return: true if the caller is sufficiently privileged, false if not.
+ */
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid)
+{
+ if (vfsgid_in_group_p(vfsgid))
+ return true;
+ if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ return true;
+ return false;
+}
+
+/**
* mode_strip_sgid - handle the sgid bit for non-directories
* @mnt_userns: User namespace of the mount the inode was created from
* @dir: parent directory inode
@@ -2508,11 +2499,9 @@ umode_t mode_strip_sgid(struct user_namespace *mnt_userns,
return mode;
if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
return mode;
- if (in_group_p(i_gid_into_mnt(mnt_userns, dir)))
- return mode;
- if (capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID))
+ if (in_group_or_capable(mnt_userns, dir,
+ i_gid_into_vfsgid(mnt_userns, dir)))
return mode;
-
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
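
For reference, the rule that the removed should_remove_suid() implemented, and that setattr_should_drop_suidgid() now owns, can be restated standalone: suid is always killed on write; sgid is killed only when a group exec bit makes it a real setgid file rather than a mandatory-locking marker; and neither is killed for CAP_FSETID holders or non-regular files. A sketch with illustrative constants:

	#include <stdbool.h>

	#define S_ISUID 04000
	#define S_ISGID 02000
	#define S_IXGRP 00010

	enum { KILL_SUID = 1, KILL_SGID = 2 };

	static int suidgid_kill_mask(unsigned int mode, bool has_fsetid,
				     bool is_reg)
	{
		int kill = 0;

		if (mode & S_ISUID)
			kill |= KILL_SUID;
		if ((mode & S_ISGID) && (mode & S_IXGRP))
			kill |= KILL_SGID;
		if (kill && !has_fsetid && is_reg)
			return kill;
		return 0;
	}
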
diff --git a/fs/internal.h b/fs/internal.h
index 6f0386b34fae..a803cc3cf716 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -150,7 +150,9 @@ extern int vfs_open(const struct path *, struct file *);
* inode.c
*/
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
-extern int dentry_needs_remove_privs(struct dentry *dentry);
+int dentry_needs_remove_privs(struct user_namespace *, struct dentry *dentry);
+bool in_group_or_capable(struct user_namespace *mnt_userns,
+ const struct inode *inode, vfsgid_t vfsgid);
/*
* fs-writeback.c
@@ -225,12 +227,39 @@ struct xattr_ctx {
};
-ssize_t do_getxattr(struct user_namespace *mnt_userns,
+ssize_t do_getxattr(struct mnt_idmap *idmap,
struct dentry *d,
struct xattr_ctx *ctx);
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx);
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode);
+
+#ifdef CONFIG_FS_POSIX_ACL
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size);
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size);
+#else
+static inline int do_set_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ const void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
+ struct dentry *dentry, const char *acl_name,
+ void *kvalue, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
+
+/*
+ * fs/attr.c
+ */
+int setattr_should_drop_sgid(struct user_namespace *mnt_userns,
+ const struct inode *inode);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 91ee0b308e13..356193e44cf0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
}
-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
size_t len, struct folio **foliop)
{
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
+
+ /*
+ * Now we have a locked folio, before we do anything with it we need to
+ * check that the iomap we have cached is not stale. The inode extent
+ * mapping can change due to concurrent IO in flight (e.g.
+ * IOMAP_UNWRITTEN state can change and memory reclaim could have
+ * reclaimed a previously partially written page at this index after IO
+ * completion before this write reaches this file offset) and hence we
+ * could do the wrong thing here (zero a page range incorrectly or fail
+ * to zero) and corrupt data.
+ */
+ if (page_ops && page_ops->iomap_valid) {
+ bool iomap_valid = page_ops->iomap_valid(iter->inode,
+ &iter->iomap);
+ if (!iomap_valid) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ status = 0;
+ goto out_unlock;
+ }
+ }
+
if (pos + len > folio_pos(folio) + folio_size(folio))
len = folio_pos(folio) + folio_size(folio) - pos;
@@ -773,6 +794,8 @@ again:
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
break;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(mapping))
@@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceeding range and update the offset from which
+ * the next punch will start from.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been read faults in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those pages will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep delalloc extents over
+ * the dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+ while (start_byte < end_byte) {
+ struct folio *folio;
+
+ /* grab locked page */
+ folio = filemap_lock_folio(inode->i_mapping,
+ start_byte >> PAGE_SHIFT);
+ if (!folio) {
+ start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+ PAGE_SIZE;
+ continue;
+ }
+
+ /* if dirty, punch up to offset */
+ if (folio_test_dirty(folio)) {
+ if (start_byte > *punch_start_byte) {
+ int error;
+
+ error = punch(inode, *punch_start_byte,
+ start_byte - *punch_start_byte);
+ if (error) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return error;
+ }
+ }
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ folio_next_index(folio) << PAGE_SHIFT);
+ }
+
+ /* move offset to start of next folio in range */
+ start_byte = folio_next_index(folio) << PAGE_SHIFT;
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or even
+ * if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. This is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and convert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
+ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
+ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
+ * returns the end of the data range (data_end). Using closed intervals would
+ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
+ * the code to subtle off-by-one bugs....
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+ loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t punch_start_byte = start_byte;
+ loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+ int error = 0;
+
+ /*
+ * Lock the mapping to avoid races with page faults re-instantiating
+ * folios and dirtying them via ->page_mkwrite whilst we walk the
+ * cache and perform delalloc extent removal. Failing to do this can
+ * leave dirty pages with no space reservation in the cache.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ while (start_byte < scan_end_byte) {
+ loff_t data_end;
+
+ start_byte = mapping_seek_hole_data(inode->i_mapping,
+ start_byte, scan_end_byte, SEEK_DATA);
+ /*
+ * If there is no more data to scan, all that is left is to
+ * punch out the remaining range.
+ */
+ if (start_byte == -ENXIO || start_byte == scan_end_byte)
+ break;
+ if (start_byte < 0) {
+ error = start_byte;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(start_byte < punch_start_byte);
+ WARN_ON_ONCE(start_byte > scan_end_byte);
+
+ /*
+ * We find the end of this contiguous cached data range by
+ * seeking from start_byte to the beginning of the next hole.
+ */
+ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+ scan_end_byte, SEEK_HOLE);
+ if (data_end < 0) {
+ error = data_end;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(data_end <= start_byte);
+ WARN_ON_ONCE(data_end > scan_end_byte);
+
+ error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+ start_byte, data_end, punch);
+ if (error)
+ goto out_unlock;
+
+ /* The next data search starts at the end of this one. */
+ start_byte = data_end;
+ }
+
+ if (punch_start_byte < end_byte)
+ error = punch(inode, punch_start_byte,
+ end_byte - punch_start_byte);
+out_unlock:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from its ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write can
+ * race with page faults, there can be dirty pages over the delalloc extent
+ * outside the range of a short write but still within the delalloc extent
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ * inode->i_mapping->invalidate_lock (exclusive)
+ * folio_lock()
+ * ->punch
+ * internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ struct iomap *iomap, loff_t pos, loff_t length,
+ ssize_t written,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t start_byte;
+ loff_t end_byte;
+ int blocksize = i_blocksize(inode);
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (!(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /*
+ * start_byte refers to the first unused block after a short write. If
+ * nothing was written, round offset down to point at the first block in
+ * the range.
+ */
+ if (unlikely(!written))
+ start_byte = round_down(pos, blocksize);
+ else
+ start_byte = round_up(pos + written, blocksize);
+ end_byte = round_up(pos + length, blocksize);
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ if (start_byte >= end_byte)
+ return 0;
+
+ return iomap_write_delalloc_release(inode, start_byte, end_byte,
+ punch);
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
@@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
status = iomap_write_end(iter, pos, bytes, bytes, folio);
if (WARN_ON_ONCE(status == 0))
@@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
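
The core of iomap_write_delalloc_release() above is a SEEK_DATA/SEEK_HOLE walk over a half-open byte range. A hedged userspace restatement of that walk follows; it collapses the per-folio dirty check done by iomap_write_delalloc_scan() into "keep every cached data range", and it omits the invalidate_lock the real code holds, so it is a sketch of the iteration shape only. The callbacks are illustrative:

	#include <stdint.h>

	typedef int64_t loff;

	static int punch_outside_data(loff start, loff end,
				      loff (*seek_data)(loff, loff),
				      loff (*seek_hole)(loff, loff),
				      int (*punch)(loff, loff))
	{
		loff punch_start = start;

		while (start < end) {
			loff data = seek_data(start, end); /* < 0: none left */

			if (data < 0 || data == end)
				break;
			if (data > punch_start) {
				int err = punch(punch_start,
						data - punch_start);

				if (err)
					return err;
			}
			/* keep the cached data range; resume after it */
			punch_start = seek_hole(data, end);
			start = punch_start;
		}
		if (punch_start < end)
			return punch(punch_start, end - punch_start);
		return 0;
	}

The [start, end) intervals mean each punch length is simply a difference of offsets, with no "+ 1"/"- 1" adjustments, exactly as the comment above argues.
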
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4eb559a16c9e..9804714b1751 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -240,7 +240,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
- unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
@@ -252,7 +251,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length) & ((1 << blkbits) - 1) ||
+ if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
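
The direct-io change above drops the blksize_bits() shift and masks against the logical block size directly, which works because logical block sizes are powers of two. One mask test then covers both the offset and the length:

	#include <stdbool.h>
	#include <stdint.h>

	static bool dio_aligned(uint64_t pos, uint64_t len, uint32_t lbs)
	{
		/* lbs must be a power of two; OR-ing pos and len lets a
		 * single mask test reject misalignment in either one */
		return ((pos | len) & (uint64_t)(lbs - 1)) == 0;
	}
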
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index a1c7592d2ade..79a0614eaab7 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,12 +7,28 @@
#include <linux/iomap.h>
#include "trace.h"
+/*
+ * Advance to the next range we need to map.
+ *
+ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
+ * processed - it was aborted because the extent the iomap spanned may have been
+ * changed during the operation. In this case, the iteration behaviour is to
+ * remap the unprocessed range of the iter, and that means we may need to remap
+ * even when we've made no progress (i.e. iter->processed = 0). Hence the
+ * "finished iterating" case needs to distinguish between
+ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
+ * need to remap the entire remaining range.
+ */
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+
/* handle the previous iteration (if any) */
if (iter->iomap.length) {
- if (iter->processed <= 0)
+ if (iter->processed < 0)
return iter->processed;
+ if (!iter->processed && !stale)
+ return 0;
if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
return -EIO;
iter->pos += iter->processed;
@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset > iter->pos);
WARN_ON_ONCE(iter->iomap.length == 0);
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
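
The iomap_iter_advance() change encodes a three-way decision: negative progress is an error, zero progress normally terminates the loop, but zero progress with a stale mapping forces one more mapping attempt over the unprocessed range. A compact sketch of just that decision rule:

	#include <stdbool.h>

	enum step { STOP, ERROR, REMAP };

	static enum step advance(long processed, bool stale)
	{
		if (processed < 0)
			return ERROR;
		if (processed == 0 && !stale)
			return STOP;    /* done iterating */
		return REMAP;           /* progress made, or stale map */
	}
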
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 885a7a6cc53e..4810438b7856 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -207,14 +207,13 @@ int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
}
/* Send all the data buffers related to an inode */
-int jbd2_submit_inode_data(struct jbd2_inode *jinode)
+int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
-
if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
return 0;
trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
- return jbd2_journal_submit_inode_data_buffers(jinode);
+ return journal->j_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index e945e3484788..8bb58ce5c06c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -229,10 +229,11 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
return rc;
}
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc, xprefix;
+ struct inode *inode = d_inode(dentry);
switch (type) {
case ACL_TYPE_ACCESS:
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9d9fb7cf093e..ca36a6eca594 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu);
-int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jffs2_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c0aabbcbfd58..f399b390b5f6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.rmdir = jffs2_rmdir,
.mknod = jffs2_mknod,
.rename = jffs2_rename,
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index ba86acbe12d3..3cf71befa475 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -64,7 +64,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
- .get_acl = jffs2_get_acl,
+ .get_inode_acl = jffs2_get_acl,
.set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.listxattr = jffs2_listxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 39cec28096a7..66af51c41619 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -202,7 +202,7 @@ int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
rc = jffs2_do_setattr(inode, iattr);
if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a653f34c6e26..3b667eccc73b 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -94,12 +94,13 @@ out:
return rc;
}
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int rc;
tid_t tid;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
tid = txBegin(inode->i_sb, 0);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 332dc9ac47a9..88663465aecd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -123,7 +123,7 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE)
- rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ rc = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
return rc;
}
@@ -133,7 +133,7 @@ const struct inode_operations jfs_file_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index d1ec920aa030..8ac10e396050 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
return rc;
}
-static int jfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, jfs_get_block, wbc);
-}
-
static int jfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = jfs_read_folio,
.readahead = jfs_readahead,
- .writepage = jfs_writepage,
.writepages = jfs_writepages,
.write_begin = jfs_write_begin,
.write_end = jfs_write_end,
.bmap = jfs_bmap,
.direct_IO = jfs_direct_IO,
+ .migrate_folio = buffer_migrate_folio,
};
/*
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 3de40286d31f..f0704a25835f 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -8,7 +8,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu);
-int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int jfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 6b838d3ae7c2..765838578a72 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap)
struct bmap *bmp;
struct dbmap_disk *dbmp_le;
struct metapage *mp;
- int i;
+ int i, err;
/*
* allocate/initialize the in-memory bmap descriptor
@@ -170,8 +170,8 @@ int dbMount(struct inode *ipbmap)
BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
PSIZE, 0);
if (mp == NULL) {
- kfree(bmp);
- return -EIO;
+ err = -EIO;
+ goto err_kfree_bmp;
}
/* copy the on-disk bmap descriptor to its in-memory version. */
@@ -181,9 +181,8 @@ int dbMount(struct inode *ipbmap)
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
- release_metapage(mp);
- kfree(bmp);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_release_metapage;
}
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
@@ -194,6 +193,16 @@ int dbMount(struct inode *ipbmap)
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+ if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
+ if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
for (i = 0; i < MAXAG; i++)
bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
@@ -214,6 +223,12 @@ int dbMount(struct inode *ipbmap)
BMAP_LOCK_INIT(bmp);
return (0);
+
+err_release_metapage:
+ release_metapage(mp);
+err_kfree_bmp:
+ kfree(bmp);
+ return err;
}
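
Besides validating db_agl2size and the derived AG count before trusting them, dbMount() now funnels every failure through a single reverse-order unwind ladder instead of duplicating cleanup at each exit. A hedged standalone sketch of that structure, with malloc() standing in for the bmap allocation and metapage read:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdlib.h>

	static int mount_map(void **out, bool bad_geometry)
	{
		int err;
		void *bmp, *mp;

		bmp = malloc(64);
		if (!bmp)
			return -ENOMEM;

		mp = malloc(64);        /* stands in for read_metapage() */
		if (!mp) {
			err = -EIO;
			goto err_free_bmp;
		}

		/* validate on-disk geometry before trusting it */
		if (bad_geometry) {
			err = -EINVAL;
			goto err_free_mp;
		}

		*out = bmp;
		free(mp);
		return 0;

	err_free_mp:
		free(mp);
	err_free_bmp:
		free(bmp);
		return err;
	}
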
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index 1c984214e95e..a0ee4ccea66e 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -10,9 +10,7 @@
(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
extern int extAlloc(struct inode *, s64, s64, xad_t *, bool);
-extern int extFill(struct inode *, xad_t *);
extern int extHint(struct inode *, s64, xad_t *);
-extern int extRealloc(struct inode *, s64, xad_t *, bool);
extern int extRecord(struct inode *, xad_t *);
#endif /* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 799d3837e7c2..390cbfce391f 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -310,8 +310,8 @@ int diRead(struct inode *ip)
iagno = INOTOIAG(ip->i_ino);
/* read the iag */
- imap = JFS_IP(ipimap)->i_imap;
IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
+ imap = JFS_IP(ipimap)->i_imap;
rc = diIAGRead(imap, iagno, &mp);
IREAD_UNLOCK(ipimap);
if (rc) {
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 48d1f70f786c..b83aae56a1f2 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -234,11 +234,15 @@ int jfs_mount_rw(struct super_block *sb, int remount)
truncate_inode_pages(sbi->ipimap->i_mapping, 0);
truncate_inode_pages(sbi->ipbmap->i_mapping, 0);
+
+ IWRITE_LOCK(sbi->ipimap, RDWRLOCK_IMAP);
diUnmount(sbi->ipimap, 1);
if ((rc = diMount(sbi->ipimap))) {
+ IWRITE_UNLOCK(sbi->ipimap);
jfs_err("jfs_mount_rw: diMount failed!");
return rc;
}
+ IWRITE_UNLOCK(sbi->ipimap);
dbUnmount(sbi->ipbmap, 1);
if ((rc = dbMount(sbi->ipbmap))) {
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 3e8b13e6aa01..8ec43f53f686 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -68,7 +68,6 @@ int jfs_umount(struct super_block *sb)
/*
* close secondary aggregate inode allocation map
*/
- ipaimap2 = sbi->ipaimap2;
if (ipaimap2) {
diUnmount(ipaimap2, 0);
diFreeSpecial(ipaimap2);
@@ -78,7 +77,6 @@ int jfs_umount(struct super_block *sb)
/*
* close aggregate inode allocation map
*/
- ipaimap = sbi->ipaimap;
diUnmount(ipaimap, 0);
diFreeSpecial(ipaimap);
sbi->ipaimap = NULL;
@@ -89,7 +87,7 @@ int jfs_umount(struct super_block *sb)
dbUnmount(ipbmap, 0);
diFreeSpecial(ipbmap);
- sbi->ipimap = NULL;
+ sbi->ipbmap = NULL;
/*
* Make sure all metadata makes it to disk before we mark
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index c50167a7bc50..0d33816d251d 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -25,7 +25,7 @@ struct jfs_ea_list {
struct jfs_ea ea[]; /* Variable length list */
};
-/* Macros for defining maxiumum number of bytes supported for EAs */
+/* Macros for defining maximum number of bytes supported for EAs */
#define MAXEASIZE 65535
#define MAXEALISTSIZE MAXEASIZE
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 142caafc73b1..ad7592191d76 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -96,12 +96,8 @@ extern int xtInsert(tid_t tid, struct inode *ip,
extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen,
int flag);
extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad);
-extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen,
- int flag);
extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type);
extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size);
-extern int xtRelocate(tid_t tid, struct inode *ip,
- xad_t * oxad, s64 nxaddr, int xtype);
extern int xtAppend(tid_t tid,
struct inode *ip, int xflag, s64 xoff, int maxblocks,
int *xlenp, s64 * xaddrp, int flag);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 9db4f5789c0e..a38d14eed047 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -946,7 +946,7 @@ static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip,
if (ssize <= IDATASIZE) {
ip->i_op = &jfs_fast_symlink_inode_operations;
- ip->i_link = JFS_IP(ip)->i_inline;
+ ip->i_link = JFS_IP(ip)->i_inline_all;
memcpy(ip->i_link, name, ssize);
ip->i_size = ssize - 1;
@@ -1525,7 +1525,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.fileattr_get = jfs_fileattr_get,
.fileattr_set = jfs_fileattr_set,
#ifdef CONFIG_JFS_POSIX_ACL
- .get_acl = jfs_get_acl,
+ .get_inode_acl = jfs_get_acl,
.set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 85d4f44f2ac4..d2f82cb7db1b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -745,8 +745,7 @@ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
@@ -785,8 +784,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
inode_lock(inode);
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
tmp_bh.b_size = i_blocksize(inode);
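
Both quota hunks above replace an open-coded ternary with min_t(), which converts both operands to one explicit type before comparing so mixed signedness cannot skew the clamp. A simplified stand-in definition (the kernel macro additionally avoids double evaluation via temporaries):

	#include <stddef.h>

	#define min_t(type, a, b) \
		((type)(a) < (type)(b) ? (type)(a) : (type)(b))

	static size_t quota_chunk(size_t blocksize, size_t offset,
				  size_t remaining)
	{
		return min_t(size_t, blocksize - offset, remaining);
	}
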
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index f33b3baad07c..935ef8cb02b2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -125,9 +125,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
- * [3] when @kn_to is NULL result will be "(null)"
+ * [3] when @kn_to is %NULL result will be "(null)"
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -185,10 +185,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy(). It returns the length of @kn's name and if @buf
- * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ * similar to strlcpy().
*
- * Fills buffer with "(null)" if @kn is NULL.
+ * Fills buffer with "(null)" if @kn is %NULL.
+ *
+ * Return: the length of @kn's name and if @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated.
*
* This function can be called from any context.
*/
@@ -215,7 +217,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -287,6 +289,8 @@ out:
*
* Determines @kn's parent, pins and returns it. This function can be
* called from any context.
+ *
+ * Return: parent node of @kn
*/
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
@@ -302,11 +306,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
}
/**
- * kernfs_name_hash
+ * kernfs_name_hash - calculate hash of @ns + @name
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
- * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ * Return: 31-bit hash of ns + name (so it fits in an off_t)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
@@ -354,8 +358,8 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
* Locking:
* kernfs_rwsem held exclusive
*
- * RETURNS:
- * 0 on susccess -EEXIST on failure.
+ * Return:
+ * %0 on success, -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
@@ -394,8 +398,10 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
* @kn: kernfs_node of interest
*
* Try to unlink @kn from its sibling rbtree which starts from
- * kn->parent->dir.children. Returns %true if @kn was actually
- * removed, %false if @kn wasn't on the rbtree.
+ * kn->parent->dir.children.
+ *
+ * Return: %true if @kn was actually removed,
+ * %false if @kn wasn't on the rbtree.
*
* Locking:
* kernfs_rwsem held exclusive
@@ -419,10 +425,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
* @kn: kernfs_node to get an active reference to
*
* Get an active reference of @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*
- * RETURNS:
- * Pointer to @kn on success, NULL on failure.
+ * Return:
+ * Pointer to @kn on success, %NULL on failure.
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
@@ -442,7 +448,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
* @kn: kernfs_node to put an active reference to
*
* Put an active reference to @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*/
void kernfs_put_active(struct kernfs_node *kn)
{
@@ -464,7 +470,7 @@ void kernfs_put_active(struct kernfs_node *kn)
* kernfs_drain - drain kernfs_node
* @kn: kernfs_node to drain
*
- * Drain existing usages and nuke all existing mmaps of @kn. Mutiple
+ * Drain existing usages and nuke all existing mmaps of @kn. Multiple
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
@@ -577,7 +583,7 @@ EXPORT_SYMBOL_GPL(kernfs_put);
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
*
- * Return the kernfs_node associated with @dentry. If @dentry is not a
+ * Return: the kernfs_node associated with @dentry. If @dentry is not a
* kernfs one, %NULL is returned.
*
* While the returned kernfs_node will stay accessible as long as @dentry
@@ -684,8 +690,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
* @id's lower 32bits encode ino and upper gen. If the gen portion is
* zero, all generations are matched.
*
- * RETURNS:
- * NULL on failure. Return a kernfs node with reference counter incremented
+ * Return: %NULL on failure,
+ * otherwise a kernfs node with reference counter incremented.
*/
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
u64 id)
@@ -733,8 +739,8 @@ err_unlock:
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
+ * Return:
+ * %0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
@@ -797,8 +803,9 @@ out_unlock:
* @name: name to look for
* @ns: the namespace tag to use
*
- * Look for kernfs_node with name @name under @parent. Returns pointer to
- * the found kernfs_node on success, %NULL on failure.
+ * Look for kernfs_node with name @name under @parent.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
const unsigned char *name,
@@ -871,8 +878,9 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
@@ -896,8 +904,9 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
* @ns: the namespace tag to use
*
* Look for kernfs_node with path @path under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
@@ -919,7 +928,7 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
* @flags: KERNFS_ROOT_* flags
* @priv: opaque data associated with the new directory
*
- * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * Return: the root of the new hierarchy on success, ERR_PTR() value on
* failure.
*/
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
@@ -991,6 +1000,8 @@ void kernfs_destroy_root(struct kernfs_root *root)
/**
* kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
* @root: root to use to lookup
+ *
+ * Return: @root's kernfs_node
*/
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
{
@@ -1007,7 +1018,7 @@ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
@@ -1041,7 +1052,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
* @parent: parent in which to create a new directory
* @name: name of the new directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
const char *name)
@@ -1083,20 +1094,30 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
+ *
+ * There's nothing special needed here when getting the
+ * dentry parent, even if a concurrent rename is in
+ * progress. That's because the dentry is negative so
+ * it can only be the target of the rename and it will
+ * be doing a d_move() not a replace. Consequently the
+ * dentry d_parent won't change over the d_move().
+ *
+ * Also kernfs negative dentries transitioning from
+ * negative to positive during revalidate won't happen
+ * because they are invalidated on containing directory
+ * changes and the lookup re-done so that a new positive
+ * dentry can be properly created.
*/
- spin_lock(&dentry->d_lock);
+ root = kernfs_root_from_sb(dentry->d_sb);
+ down_read(&root->kernfs_rwsem);
parent = kernfs_dentry_node(dentry->d_parent);
if (parent) {
- spin_unlock(&dentry->d_lock);
- root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
if (kernfs_dir_changed(parent, dentry)) {
up_read(&root->kernfs_rwsem);
return 0;
}
- up_read(&root->kernfs_rwsem);
- } else
- spin_unlock(&dentry->d_lock);
+ }
+ up_read(&root->kernfs_rwsem);
/* The kernfs parent node hasn't changed, leave the
* dentry negative and return success.
@@ -1290,6 +1311,8 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
* Find the next descendant to visit for post-order traversal of @root's
* descendants. @root is included in the iteration and the last node to be
* visited.
+ *
+ * Return: the next descendant to visit or %NULL when done.
*/
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
@@ -1553,6 +1576,8 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
* the whole kernfs_ops which won the arbitration. This can be used to
* guarantee, for example, all concurrent writes to a "delete" file to
* finish only after the whole operation is complete.
+ *
+ * Return: %true if @kn is removed by this call, otherwise %false.
*/
bool kernfs_remove_self(struct kernfs_node *kn)
{
@@ -1613,7 +1638,8 @@ bool kernfs_remove_self(struct kernfs_node *kn)
* @ns: namespace tag of the kernfs_node to remove
*
* Look for the kernfs_node with @name and @ns under @parent and remove it.
- * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ *
+ * Return: %0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
@@ -1651,6 +1677,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
* @new_parent: new parent to put @sd under
* @new_name: new name
* @new_ns: new namespace tag
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
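[Editorial note] The hunks above, and the matching ones in the kernfs files below, migrate free-form "Returns ..." prose into the structured kernel-doc "Return:" section, with %CONST markup for literal values. A minimal sketch of the target shape, using a hypothetical helper name:

/**
 * example_find - look up a child node by name (hypothetical)
 * @parent: directory node to search under
 * @name: name to look for
 *
 * Return: the matching node on success, ERR_PTR(-ENOENT) if @name
 * does not exist under @parent. %NULL is never returned.
 */
struct kernfs_node *example_find(struct kernfs_node *parent, const char *name);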
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9ab6c92e02da..e4a50e4ff0d2 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -33,7 +33,7 @@ struct kernfs_open_node {
* pending queue is implemented as a singly linked list of kernfs_nodes.
* The list is terminated with the self pointer so that whether a
* kernfs_node is on the list or not can be determined by testing the next
- * pointer for NULL.
+ * pointer for %NULL.
*/
#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
@@ -59,8 +59,10 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
}
/**
- * of_on - Return the kernfs_open_node of the specified kernfs_open_file
- * @of: taret kernfs_open_file
+ * of_on - Get the kernfs_open_node of the specified kernfs_open_file
+ * @of: target kernfs_open_file
+ *
+ * Return: the kernfs_open_node of the kernfs_open_file
*/
static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
@@ -82,6 +84,8 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
* outside RCU read-side critical section.
*
* The caller needs to make sure that kernfs_open_file_mutex is held.
+ *
+ * Return: @kn->attr.open when kernfs_open_file_mutex is held.
*/
static struct kernfs_open_node *
kernfs_deref_open_node_locked(struct kernfs_node *kn)
@@ -548,11 +552,11 @@ out_unlock:
* If @kn->attr.open exists, increment its reference count; otherwise,
* create one. @of is chained to the files list.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Return:
+ * %0 on success, -errno on failure.
*/
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
@@ -1024,7 +1028,7 @@ const struct file_operations kernfs_file_fops = {
* @ns: optional namespace tag of the file
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..eac0f210299a 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -94,7 +94,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
* @kn: target node
* @iattr: iattr to set
*
- * Returns 0 on success, -errno on failure.
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
@@ -190,10 +190,8 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct kernfs_root *root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return 0;
@@ -241,11 +239,11 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
* allocated and basics are initialized. New inode is returned
* locked.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * Pointer to allocated inode on success, NULL on failure.
+ * Return:
+ * Pointer to allocated inode on success, %NULL on failure.
*/
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
@@ -288,10 +286,8 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return ret;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index fc5821effd97..9046d9f39e63 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -58,7 +58,7 @@ struct kernfs_root {
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
* @kn: kernfs_node of interest
*
- * Return the kernfs_root @kn belongs to.
+ * Return: the kernfs_root @kn belongs to.
*/
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d0859f72d2d6..e08e8d999807 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -153,7 +153,7 @@ static const struct export_operations kernfs_export_ops = {
* kernfs_root_from_sb - determine kernfs_root associated with a super_block
* @sb: the super_block in question
*
- * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
+ * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one,
* %NULL is returned.
*/
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
@@ -167,7 +167,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
* find the next ancestor in the path down to @child, where @parent was the
* ancestor whose descendant we want to find.
*
- * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
+ * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root
* node. If @parent is b, then we return the node for c.
* Passing in d as @parent is not ok.
*/
@@ -192,6 +192,8 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
* kernfs_node_dentry - get a dentry for the given kernfs_node
* @kn: kernfs_node for which a dentry is needed
* @sb: the kernfs super_block
+ *
+ * Return: the dentry pointer
*/
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
struct super_block *sb)
@@ -296,7 +298,7 @@ static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
* kernfs_super_ns - determine the namespace tag of a kernfs super_block
* @sb: super_block of interest
*
- * Return the namespace tag associated with kernfs super_block @sb.
+ * Return: the namespace tag associated with kernfs super_block @sb.
*/
const void *kernfs_super_ns(struct super_block *sb)
{
@@ -313,6 +315,8 @@ const void *kernfs_super_ns(struct super_block *sb)
* implementation, which should set the specified ->@fs_type and ->@flags, and
* specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
* respectively.
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_get_tree(struct fs_context *fc)
{
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 0ab13824822f..45371a70caa7 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -19,7 +19,7 @@
* @name: name of the symlink
* @target: target node for the symlink to point to
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
* Ownership of the link matches ownership of the target.
*/
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 2a39ffb8423b..6e61b5bc7d86 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -322,7 +322,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
dn_off = le32_to_cpu(authblob->DomainName.BufferOffset);
dn_len = le16_to_cpu(authblob->DomainName.Length);
- if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len)
+ if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len ||
+ nt_len < CIFS_ENCPWD_SIZE)
return -EINVAL;
/* TODO : use the domain name imported from the configuration file */
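[Editorial note] The added check rejects an auth blob whose NT response is shorter than CIFS_ENCPWD_SIZE, alongside the existing offset/length bounds tests. Widening the offset to u64 before the addition is what keeps off + len from wrapping in 32-bit arithmetic; a minimal, self-contained sketch of that pattern (names are illustrative, not the ksmbd API):

#include <stdbool.h>
#include <stdint.h>

/* A field at [off, off + len) must lie entirely inside the blob. */
static bool field_in_blob(uint32_t blob_len, uint32_t off, uint16_t len)
{
        /* u64 arithmetic: off + len cannot overflow before comparing */
        return (uint64_t)off + len <= blob_len;
}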
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 12be8386446a..fd0a288af299 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -316,9 +316,12 @@ int ksmbd_conn_handler_loop(void *p)
/* 4 for rfc1002 length field */
size = pdu_size + 4;
- conn->request_buf = kvmalloc(size, GFP_KERNEL);
+ conn->request_buf = kvmalloc(size,
+ GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (!conn->request_buf)
- continue;
+ break;
memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
if (!ksmbd_smb_request(conn))
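[Editorial note] Because pdu_size is read off the wire, the request-buffer allocation now fails fast instead of fighting for memory a malicious client asked for: __GFP_NORETRY stops the allocator short of the OOM killer and __GFP_NOWARN suppresses the failure splat, while the continue -> break change drops the connection on failure rather than looping. A hedged sketch of the allocation policy:

/*
 * Sketch: allocating a size chosen by the remote peer. Give up
 * early (__GFP_NORETRY) instead of triggering OOM, and don't let
 * the peer spam allocation-failure warnings (__GFP_NOWARN).
 */
static void *alloc_request_buf(size_t size)
{
        return kvmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
}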
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index ff07c67f4565..b6bd8311e6b4 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -74,6 +74,7 @@ struct ksmbd_heartbeat {
#define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0)
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
+#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3)
/*
* IPC request for ksmbd server startup
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 3fa2139a0b30..92b1603b5abe 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
entry->method = method;
entry->id = ksmbd_ipc_id_alloc();
if (entry->id < 0)
- goto error;
+ goto free_entry;
resp = ksmbd_rpc_open(sess, entry->id);
if (!resp)
- goto error;
+ goto free_id;
kvfree(resp);
return entry->id;
-error:
+free_id:
+ ksmbd_rpc_id_free(entry->id);
+free_entry:
list_del(&entry->list);
kfree(entry);
return -EINVAL;
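[Editorial note] The relabeling above replaces a single catch-all error: label with one label per acquired resource, so the id allocated by ksmbd_ipc_id_alloc() is actually returned when ksmbd_rpc_open() fails. The general unwinding idiom, as a sketch with hypothetical helpers:

static int open_two_resources(void)
{
        int id;

        id = alloc_id();                /* hypothetical resource 1 */
        if (id < 0)
                return -EINVAL;         /* nothing acquired yet */

        if (open_backend(id) < 0)       /* hypothetical resource 2 */
                goto free_id;           /* undo exactly resource 1 */

        return id;

free_id:
        release_id(id);
        return -EINVAL;
}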
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index a0d635304754..394b6ceac431 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -432,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
"reset",
"shutdown"
};
-
- ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version,
- state[server_conf.state], server_conf.tcp_port,
- server_conf.ipc_last_active / HZ);
- return sz;
+ return sysfs_emit(buf, "%d %s %d %lu\n", stats_version,
+ state[server_conf.state], server_conf.tcp_port,
+ server_conf.ipc_last_active / HZ);
}
static ssize_t kill_server_store(struct class *class,
@@ -468,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) {
if ((ksmbd_debug_types >> i) & 1) {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "[%s] ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]);
} else {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "%s ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]);
}
sz += pos;
}
- sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+ sz += sysfs_emit_at(buf, sz, "\n");
return sz;
}
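[Editorial note] sysfs_emit() and sysfs_emit_at() replace the open-coded scnprintf(buf + sz, PAGE_SIZE - sz, ...) calls; they encode the sysfs contract (a single PAGE_SIZE buffer, offset bounds-checked centrally) instead of repeating it at every call site. A minimal show() sketch in the same style:

static ssize_t flags_show(struct class *class, struct class_attribute *attr,
                          char *buf)
{
        ssize_t sz = 0;
        int i;

        /* sysfs_emit_at() checks the offset against PAGE_SIZE itself */
        for (i = 0; i < 3; i++)
                sz += sysfs_emit_at(buf, sz, "flag%d ", i);
        sz += sysfs_emit_at(buf, sz, "\n");
        return sz;
}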
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index ab23da2120b9..e401302478c3 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
- if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
- conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
@@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
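[Editorial note] With the new KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF bit, the encryption capability becomes a tri-state: forced on by server config, forced off, or (the new default) negotiated from the client's capability bits. The compound condition above reads more plainly as a helper; a hedged restatement, not the ksmbd source:

static bool want_encryption_cap(unsigned int server_flags,
                                unsigned int cli_cap)
{
        if (server_flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
                return true;                    /* forced on */
        if (server_flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF)
                return false;                   /* forced off */
        return cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION; /* client decides */
}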
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index b2fc85d440d0..38fbda52e06f 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
return;
}
- if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION))
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF)
return;
for (i = 0; i < cph_cnt; i++) {
@@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
/*
* signing is disabled if encryption is enabled
* on this session
@@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
sess->sign = false;
}
@@ -1926,13 +1928,13 @@ int smb2_tree_connect(struct ksmbd_work *work)
if (conn->posix_ext_supported)
status.tree_conn->posix_extensions = true;
-out_err1:
rsp->StructureSize = cpu_to_le16(16);
+ inc_rfc1001_len(work->response_buf, 16);
+out_err1:
rsp->Capabilities = 0;
rsp->Reserved = 0;
/* default manual caching */
rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
- inc_rfc1001_len(work->response_buf, 16);
if (!IS_ERR(treename))
kfree(treename);
@@ -1965,6 +1967,9 @@ out_err1:
rsp->hdr.Status = STATUS_ACCESS_DENIED;
}
+ if (status.ret != KSMBD_TREE_CONN_STATUS_OK)
+ smb2_set_err_rsp(work);
+
return rc;
}
@@ -2487,9 +2492,9 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
fattr->cf_dacls = NULL;
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS);
+ fattr->cf_acls = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (S_ISDIR(inode->i_mode))
- fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT);
+ fattr->cf_dacls = get_inode_acl(inode, ACL_TYPE_DEFAULT);
}
}
@@ -2956,7 +2961,7 @@ int smb2_open(struct ksmbd_work *work)
struct inode *inode = d_inode(path.dentry);
posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns,
- inode,
+ path.dentry,
d_inode(path.dentry->d_parent));
if (posix_acl_rc)
ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc);
@@ -2972,7 +2977,7 @@ int smb2_open(struct ksmbd_work *work)
if (rc) {
if (posix_acl_rc)
ksmbd_vfs_set_init_posix_acl(user_ns,
- inode);
+ path.dentry);
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_ACL_XATTR)) {
@@ -3438,7 +3443,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len;
+ struct_sz = readdir_info_level_struct_sz(info_level) + conv_len;
next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
d_info->last_entry_off_align = next_entry_offset - struct_sz;
@@ -3690,7 +3695,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return -EOPNOTSUPP;
conv_len = (d_info->name_len + 1) * 2;
- next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
+ next_entry_offset = ALIGN(struct_sz + conv_len,
KSMBD_DIR_INFO_ALIGNMENT);
if (next_entry_offset > d_info->out_buf_len) {
@@ -6751,7 +6756,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
case SMB2_LOCKFLAG_UNLOCK:
ksmbd_debug(SMB, "received unlock request\n");
flock->fl_type = F_UNLCK;
- cmd = 0;
+ cmd = F_SETLK;
break;
}
@@ -6855,6 +6860,7 @@ int smb2_lock(struct ksmbd_work *work)
if (lock_start > U64_MAX - lock_length) {
pr_err("Invalid lock range requested\n");
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6874,6 +6880,7 @@ int smb2_lock(struct ksmbd_work *work)
"the end offset(%llx) is smaller than the start offset(%llx)\n",
flock->fl_end, flock->fl_start);
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6885,6 +6892,7 @@ int smb2_lock(struct ksmbd_work *work)
flock->fl_type != F_UNLCK) {
pr_err("conflict two locks in one request\n");
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -6893,6 +6901,7 @@ int smb2_lock(struct ksmbd_work *work)
smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list);
if (!smb_lock) {
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -7129,7 +7138,7 @@ out:
rlock->fl_start = smb_lock->start;
rlock->fl_end = smb_lock->end;
- rc = vfs_lock_file(filp, 0, rlock, NULL);
+ rc = vfs_lock_file(filp, F_SETLK, rlock, NULL);
if (rc)
pr_err("rollback unlock fail : %d\n", rc);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 092fdd3f8750..aa5dbe54f5a1 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -443,7 +443,7 @@ struct smb2_posix_info {
/* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */
u8 SidBuffer[32];
__le32 name_len;
- u8 name[1];
+ u8 name[];
/*
* var sized owner SID
* var sized group SID
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index d96da872d70a..2a4fbbd55b91 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -623,7 +623,7 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (share->force_gid != KSMBD_SHARE_INVALID_GID)
gid = share->force_gid;
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 318c16fa81da..e663ab9ea759 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -277,14 +277,14 @@ struct file_directory_info {
__le64 AllocationSize;
__le32 ExtFileAttributes;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x101 FF resp data */
struct file_names_info {
__le32 NextEntryOffset;
__u32 FileIndex;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0xc FF resp data */
struct file_full_directory_info {
@@ -299,7 +299,7 @@ struct file_full_directory_info {
__le32 ExtFileAttributes;
__le32 FileNameLength;
__le32 EaSize;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x102 FF resp */
struct file_both_directory_info {
@@ -317,7 +317,7 @@ struct file_both_directory_info {
__u8 ShortNameLength;
__u8 Reserved;
__u8 ShortName[24];
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x104 FFrsp data */
struct file_id_both_directory_info {
@@ -337,7 +337,7 @@ struct file_id_both_directory_info {
__u8 ShortName[24];
__le16 Reserved2;
__le64 UniqueId;
- char FileName[1];
+ char FileName[];
} __packed;
struct file_id_full_dir_info {
@@ -354,7 +354,7 @@ struct file_id_full_dir_info {
__le32 EaSize; /* EA size */
__le32 Reserved;
__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x105 FF rsp data */
struct smb_version_values {
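[Editorial note] Replacing the old one-element FileName[1] arrays with C99 flexible array members gives the compiler (and FORTIFY_SOURCE/UBSAN bounds checks) the real object size; it also shrinks sizeof(struct ...) by one byte, which is exactly why the readdir size math in smb2pdu.c above drops its "- 1" terms. Allocation with a flexible array goes through struct_size(); a sketch with a hypothetical struct:

#include <linux/overflow.h>
#include <linux/slab.h>

struct name_resp {
        __le32 FileNameLength;
        char FileName[];        /* flexible array: not counted in sizeof() */
};

static struct name_resp *alloc_name_resp(size_t name_len)
{
        struct name_resp *resp;

        /* struct_size(): overflow-checked sizeof(*resp) + name_len */
        resp = kzalloc(struct_size(resp, FileName, name_len), GFP_KERNEL);
        if (resp)
                resp->FileNameLength = cpu_to_le32(name_len);
        return resp;
}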
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
index b05ff9b146b5..ab5c68cc0e13 100644
--- a/fs/ksmbd/smbacl.c
+++ b/fs/ksmbd/smbacl.c
@@ -1289,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
}
if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) {
- posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
+ posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS);
if (posix_acls && !found) {
unsigned int id = -1;
@@ -1386,14 +1386,14 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry);
/* Update posix acls */
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_ACCESS, fattr.cf_acls);
if (rc < 0)
ksmbd_debug(SMB,
"Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) {
- rc = set_posix_acl(user_ns, inode,
+ rc = set_posix_acl(user_ns, path->dentry,
ACL_TYPE_DEFAULT, fattr.cf_dacls);
if (rc)
ksmbd_debug(SMB,
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 63d55f543bd2..4c6bd0b69979 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -295,6 +295,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
struct msghdr ksmbd_msg;
struct kvec *iov;
struct ksmbd_conn *conn = KSMBD_TRANS(t)->conn;
+ int max_retry = 2;
iov = get_conn_iovec(t, nr_segs);
if (!iov)
@@ -321,9 +322,11 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
} else if (conn->status == KSMBD_SESS_NEED_RECONNECT) {
total_read = -EAGAIN;
break;
- } else if (length == -ERESTARTSYS || length == -EAGAIN) {
+ } else if ((length == -ERESTARTSYS || length == -EAGAIN) &&
+ max_retry) {
usleep_range(1000, 2000);
length = 0;
+ max_retry--;
continue;
} else if (length <= 0) {
total_read = -EAGAIN;
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 94b8ed4ef870..ff0e7a4fcd4d 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -321,7 +321,7 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
unsigned char type)
{
struct file_lock *flock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(file_inode(filp));
int error = 0;
if (!ctx || list_empty_careful(&ctx->flc_posix))
@@ -1321,7 +1321,7 @@ int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns,
sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) ||
!strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) {
- err = ksmbd_vfs_remove_xattr(user_ns, dentry, name);
+ err = vfs_remove_acl(user_ns, dentry, name);
if (err)
ksmbd_debug(SMB,
"remove acl xattr failed : %s\n", name);
@@ -1375,7 +1375,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return NULL;
- posix_acls = get_acl(inode, acl_type);
+ posix_acls = get_inode_acl(inode, acl_type);
if (!posix_acls)
return NULL;
@@ -1824,10 +1824,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock)
}
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode)
+ struct dentry *dentry)
{
struct posix_acl_state acl_state;
struct posix_acl *acls;
+ struct inode *inode = d_inode(dentry);
int rc;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
@@ -1856,14 +1857,13 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
return -ENOMEM;
}
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
else if (S_ISDIR(inode->i_mode)) {
posix_state_to_acl(&acl_state, acls->a_entries);
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
- acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
rc);
@@ -1874,16 +1874,17 @@ int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
}
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode, struct inode *parent_inode)
+ struct dentry *dentry, struct inode *parent_inode)
{
struct posix_acl *acls;
struct posix_acl_entry *pace;
+ struct inode *inode = d_inode(dentry);
int rc, i;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL))
return -EOPNOTSUPP;
- acls = get_acl(parent_inode, ACL_TYPE_DEFAULT);
+ acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT);
if (!acls)
return -ENOENT;
pace = acls->a_entries;
@@ -1895,12 +1896,12 @@ int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
}
}
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls);
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_ACCESS, acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n",
rc);
if (S_ISDIR(inode->i_mode)) {
- rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT,
+ rc = set_posix_acl(user_ns, dentry, ACL_TYPE_DEFAULT,
acls);
if (rc < 0)
ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n",
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 593059ca8511..0d73d735cc39 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -160,8 +160,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns,
struct dentry *dentry,
struct xattr_dos_attrib *da);
int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns,
- struct inode *inode);
+ struct dentry *dentry);
int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns,
- struct inode *inode,
+ struct dentry *dentry,
struct inode *parent_inode);
#endif /* __KSMBD_VFS_H__ */
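[Editorial note] The ksmbd vfs.c and smbacl.c changes track the VFS-wide move from inode-based to dentry-based POSIX ACL entry points, so stacking filesystems such as overlayfs can intercept ACL operations. As used by the call sites above, the signatures are (stated from context here, not quoted from the headers):

struct posix_acl *get_inode_acl(struct inode *inode, int type);
int set_posix_acl(struct user_namespace *mnt_userns,
                  struct dentry *dentry, int type, struct posix_acl *acl);
int vfs_remove_acl(struct user_namespace *mnt_userns,
                   struct dentry *dentry, const char *acl_name);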
diff --git a/fs/libfs.c b/fs/libfs.c
index 682d56345a1c..aada4e7c8713 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -995,8 +995,8 @@ out:
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
-ssize_t simple_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct simple_attr *attr;
unsigned long long val;
@@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- ret = kstrtoull(attr->set_buf, 0, &val);
+ if (is_signed)
+ ret = kstrtoll(attr->set_buf, 0, &val);
+ else
+ ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
@@ -1027,8 +1030,21 @@ out:
mutex_unlock(&attr->mutex);
return ret;
}
+
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(simple_attr_write);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(simple_attr_write_signed);
+
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
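[Editorial note] simple_attr_write() parses with kstrtoull(), which rejects a leading minus sign, so attributes holding negative values could never be written; simple_attr_write_signed() routes through kstrtoll() instead. Assuming the companion DEFINE_SIMPLE_ATTRIBUTE_SIGNED macro from the same series, a usage sketch:

static s64 threshold = -1;

static int threshold_get(void *data, u64 *val)
{
        *val = *(s64 *)data;    /* sign-preserving round trip via u64 */
        return 0;
}

static int threshold_set(void *data, u64 val)
{
        *(s64 *)data = (s64)val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE_SIGNED(threshold_fops, threshold_get,
                               threshold_set, "%lld\n");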
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 284b019cb652..b72023a6b4c1 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -52,6 +52,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
*filp = file;
/* Set up the missing parts of the file_lock structure */
+ lock->fl.fl_flags = FL_POSIX;
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_start = (loff_t)lock->lock_start;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 9c1aa75441e1..4e30f3c50970 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -659,11 +659,13 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
nlmsvc_cancel_blocked(net, file, lock);
lock->fl.fl_type = F_UNLCK;
- if (file->f_file[O_RDONLY])
- error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK,
+ lock->fl.fl_file = file->f_file[O_RDONLY];
+ if (lock->fl.fl_file)
+ error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
&lock->fl, NULL);
- if (file->f_file[O_WRONLY])
- error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK,
+ lock->fl.fl_file = file->f_file[O_WRONLY];
+ if (lock->fl.fl_file)
+ error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
&lock->fl, NULL);
return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
@@ -697,9 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
if (block != NULL) {
- mode = lock_to_openmode(&lock->fl);
- vfs_cancel_lock(block->b_file->f_file[mode],
- &block->b_call->a_args.lock.fl);
+ struct file_lock *fl = &block->b_call->a_args.lock.fl;
+
+ mode = lock_to_openmode(fl);
+ vfs_cancel_lock(block->b_file->f_file[mode], fl);
status = nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index e35c05e27806..32784f508c81 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -77,6 +77,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Set up the missing parts of the file_lock structure */
mode = lock_to_openmode(&lock->fl);
+ lock->fl.fl_flags = FL_POSIX;
lock->fl.fl_file = file->f_file[mode];
lock->fl.fl_pid = current->tgid;
lock->fl.fl_lmops = &nlmsvc_lock_operations;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e1c4617de771..e3b6229e7ae5 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file)
}
}
-static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
+static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
{
struct file_lock lock;
@@ -184,12 +184,15 @@ static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- lock.fl_owner = owner;
- if (file->f_file[O_RDONLY] &&
- vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
+ lock.fl_owner = fl->fl_owner;
+ lock.fl_pid = fl->fl_pid;
+ lock.fl_flags = FL_POSIX;
+
+ lock.fl_file = file->f_file[O_RDONLY];
+ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
goto out_err;
- if (file->f_file[O_WRONLY] &&
- vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL))
+ lock.fl_file = file->f_file[O_WRONLY];
+ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
goto out_err;
return 0;
out_err:
@@ -207,7 +210,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct nlm_host *lockhost;
if (!flctx || list_empty_careful(&flctx->flc_posix))
@@ -226,7 +229,7 @@ again:
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
- if (nlm_unlock_files(file, fl->fl_owner))
+ if (nlm_unlock_files(file, fl))
return 1;
goto again;
}
@@ -262,7 +265,7 @@ nlm_file_inuse(struct nlm_file *file)
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
return 1;
diff --git a/fs/locks.c b/fs/locks.c
index 607f94a0e789..8f01bee17715 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -175,7 +175,7 @@ locks_get_lock_context(struct inode *inode, int type)
struct file_lock_context *ctx;
/* paired with cmpxchg() below */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (likely(ctx) || type == F_UNLCK)
goto out;
@@ -194,7 +194,7 @@ locks_get_lock_context(struct inode *inode, int type)
*/
if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
kmem_cache_free(flctx_cache, ctx);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
}
out:
trace_locks_get_lock_context(inode, type, ctx);
@@ -247,7 +247,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list,
void
locks_free_lock_context(struct inode *inode)
{
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx = locks_inode_context(inode);
if (unlikely(ctx)) {
locks_check_ctx_lists(inode);
@@ -891,7 +891,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
void *owner;
void (*func)(void);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -1483,7 +1483,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
WARN_ON_ONCE(1);
goto free_lock;
@@ -1588,7 +1588,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
fl = list_first_entry_or_null(&ctx->flc_lease,
@@ -1634,7 +1634,7 @@ int fcntl_getlease(struct file *filp)
int type = F_UNLCK;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -1823,7 +1823,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
struct file_lock_context *ctx;
LIST_HEAD(dispose);
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -2096,7 +2096,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
* throw a warning to let people know that they don't actually work.
*/
if (cmd & LOCK_MAND) {
- pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
+ pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
return 0;
}
@@ -2146,6 +2146,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
@@ -2295,6 +2296,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
@@ -2561,7 +2563,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2634,7 +2636,7 @@ void locks_remove_file(struct file *filp)
{
struct file_lock_context *ctx;
- ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
+ ctx = locks_inode_context(locks_inode(filp));
if (!ctx)
return;
@@ -2663,12 +2665,36 @@ void locks_remove_file(struct file *filp)
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
+ WARN_ON_ONCE(filp != fl->fl_file);
if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);
+/**
+ * vfs_inode_has_locks - are any file locks held on @inode?
+ * @inode: inode to check for locks
+ *
+ * Return true if there are any FL_POSIX or FL_FLOCK locks currently
+ * set on @inode.
+ */
+bool vfs_inode_has_locks(struct inode *inode)
+{
+ struct file_lock_context *ctx;
+ bool ret;
+
+ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return false;
+
+ spin_lock(&ctx->flc_lock);
+ ret = !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_flock);
+ spin_unlock(&ctx->flc_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vfs_inode_has_locks);
+
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -2839,7 +2865,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = smp_load_acquire(&inode->i_flctx);
+ ctx = locks_inode_context(inode);
if (!ctx)
return;
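[Editorial note] Every open-coded smp_load_acquire(&inode->i_flctx) becomes locks_inode_context(). The helper centralizes the acquire that pairs with the cmpxchg() publishing a new context in locks_get_lock_context(); its assumed shape, matching the call sites above:

static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        /* Pairs with the cmpxchg() that publishes a new context. */
        return smp_load_acquire(&inode->i_flctx);
}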
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e272ad738faf..2a4b8b549e93 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -100,8 +100,9 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
- entry->e_reusable = reusable;
- entry->e_referenced = 0;
+ entry->e_flags = 0;
+ if (reusable)
+ set_bit(MBE_REUSABLE_B, &entry->e_flags);
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -165,7 +166,8 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable &&
+ if (entry->e_key == key &&
+ test_bit(MBE_REUSABLE_B, &entry->e_flags) &&
atomic_inc_not_zero(&entry->e_refcnt))
goto out;
node = node->next;
@@ -284,7 +286,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
void mb_cache_entry_touch(struct mb_cache *cache,
struct mb_cache_entry *entry)
{
- entry->e_referenced = 1;
+ set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);
@@ -309,9 +311,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
/* Drop initial hash reference if there is no user */
- if (entry->e_referenced ||
+ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) ||
atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
- entry->e_referenced = 0;
+ clear_bit(MBE_REFERENCED_B, &entry->e_flags);
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
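[Editorial note] e_referenced and e_reusable were adjacent bitfields, so flipping one could race with a concurrent update of the other; the conversion moves them into an unsigned long driven by atomic set_bit()/clear_bit()/test_bit(). The entry layout this implies (assumed, to match the bit names used above):

enum {
        MBE_REFERENCED_B = 0,   /* recently used; shrinker skips it once */
        MBE_REUSABLE_B,         /* entry may be handed to new users */
};

struct mb_cache_entry {
        /* ... linkage, e_refcnt, e_key, e_value as before ... */
        unsigned long e_flags;  /* atomic bitops replace the bitfields */
};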
diff --git a/fs/namei.c b/fs/namei.c
index 9155ecb547ce..309ae6fc8c99 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -297,13 +297,13 @@ static int check_acl(struct user_namespace *mnt_userns,
acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
if (!acl)
return -EAGAIN;
- /* no ->get_acl() calls in RCU mode... */
+ /* no ->get_inode_acl() calls in RCU mode... */
if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(mnt_userns, inode, acl, mask);
}
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl) {
@@ -336,11 +336,11 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
unsigned int mode = inode->i_mode;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
/* Are we the owner? If so, ACL's don't matter */
- i_uid = i_uid_into_mnt(mnt_userns, inode);
- if (likely(uid_eq(current_fsuid(), i_uid))) {
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
mask &= 7;
mode >>= 6;
return (mask & ~mode) ? -EACCES : 0;
@@ -362,8 +362,8 @@ static int acl_permission_check(struct user_namespace *mnt_userns,
* about? Need to check group ownership if so.
*/
if (mask & (mode ^ (mode >> 3))) {
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (in_group_p(kgid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (vfsgid_in_group_p(vfsgid))
mode >>= 3;
}
@@ -581,7 +581,7 @@ struct nameidata {
struct nameidata *saved;
unsigned root_seq;
int dfd;
- kuid_t dir_uid;
+ vfsuid_t dir_vfsuid;
umode_t dir_mode;
} __randomize_layout;
@@ -1095,15 +1095,15 @@ fs_initcall(init_fs_namei_sysctls);
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
struct user_namespace *mnt_userns;
- kuid_t i_uid;
+ vfsuid_t vfsuid;
if (!sysctl_protected_symlinks)
return 0;
mnt_userns = mnt_user_ns(nd->path.mnt);
- i_uid = i_uid_into_mnt(mnt_userns, inode);
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
/* Allowed if owner and follower match. */
- if (uid_eq(current_cred()->fsuid, i_uid))
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
return 0;
/* Allowed if parent directory not sticky and world-writable. */
@@ -1111,7 +1111,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid))
+ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1183,8 +1183,8 @@ int may_linkat(struct user_namespace *mnt_userns, const struct path *link)
struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
if (!sysctl_protected_hardlinks)
@@ -1232,13 +1232,13 @@ static int may_create_in_sticky(struct user_namespace *mnt_userns,
struct nameidata *nd, struct inode *const inode)
{
umode_t dir_mode = nd->dir_mode;
- kuid_t dir_uid = nd->dir_uid;
+ vfsuid_t dir_vfsuid = nd->dir_vfsuid;
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
likely(!(dir_mode & S_ISVTX)) ||
- uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) ||
- uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ vfsuid_eq(i_uid_into_vfsuid(mnt_userns, inode), dir_vfsuid) ||
+ vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return 0;
if (likely(dir_mode & 0002) ||
@@ -2307,7 +2307,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
OK:
/* pathname or trailing symlink, done */
if (!depth) {
- nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
+ nd->dir_vfsuid = i_uid_into_vfsuid(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
@@ -2885,9 +2885,9 @@ int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir,
{
kuid_t fsuid = current_fsuid();
- if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), fsuid))
return 0;
- if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, dir), fsuid))
return 0;
return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER);
}
@@ -2926,8 +2926,8 @@ static int may_delete(struct user_namespace *mnt_userns, struct inode *dir,
BUG_ON(victim->d_parent->d_inode != dir);
/* Inode writeback is not safe when the uid or gid are invalid. */
- if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ if (!vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
@@ -3211,7 +3211,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp)
if (error)
return error;
- error = security_path_truncate(path);
+ error = security_file_truncate(filp);
if (!error) {
error = do_truncate(mnt_userns, path->dentry, 0,
ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
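[Editorial note] Switching the permission checks from kuid_t to vfsuid_t/vfsgid_t makes "uid already mapped through an idmapped mount" a distinct type, so it can no longer be compared against an unmapped kuid_t by accident; the mix-up becomes a compile error. The wrapper and comparison helper presumably look like this (sketch of the mnt_idmapping infrastructure, not quoted from it):

typedef struct {
        kuid_t val;
} vfsuid_t;

static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
{
        /* invalid vfsuids never compare equal */
        return uid_valid(vfsuid.val) && uid_eq(vfsuid.val, kuid);
}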
diff --git a/fs/namespace.c b/fs/namespace.c
index df137ba19d37..ab467ee58341 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+struct mnt_idmap {
+ struct user_namespace *owner;
+ refcount_t count;
+};
+
+/*
+ * Carries the initial idmapping of 0:0:4294967295 which is an identity
+ * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
+ * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
+ */
+struct mnt_idmap nop_mnt_idmap = {
+ .owner = &init_user_ns,
+ .count = REFCOUNT_INIT(1),
+};
+EXPORT_SYMBOL_GPL(nop_mnt_idmap);
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
@@ -82,6 +98,7 @@ struct mount_kattr {
unsigned int lookup_flags;
bool recurse;
struct user_namespace *mnt_userns;
+ struct mnt_idmap *mnt_idmap;
};
/* /sys/fs */
@@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt)
#endif
}
+/**
+ * mnt_idmap_owner - retrieve owner of the mount's idmapping
+ * @idmap: mount idmapping
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapping.
+ */
+struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap)
+{
+ return idmap->owner;
+}
+EXPORT_SYMBOL_GPL(mnt_idmap_owner);
+
+/**
+ * mnt_user_ns - retrieve owner of an idmapped mount
+ * @mnt: the relevant vfsmount
+ *
+ * This helper will go away once the conversion to use struct mnt_idmap
+ * everywhere has finished at which point the helper will be unexported.
+ *
+ * Only code that needs to perform permission checks based on the owner of the
+ * idmapping will get access to it. All other code will solely rely on
+ * idmappings. This will get us type safety so it's impossible to conflate
+ * filesystems idmappings with mount idmappings.
+ *
+ * Return: The owner of the idmapped mount.
+ */
+struct user_namespace *mnt_user_ns(const struct vfsmount *mnt)
+{
+ struct mnt_idmap *idmap = mnt_idmap(mnt);
+
+ /* Return the actual owner of the filesystem instead of the nop. */
+ if (idmap == &nop_mnt_idmap &&
+ !initial_idmapping(mnt->mnt_sb->s_user_ns))
+ return mnt->mnt_sb->s_user_ns;
+ return mnt_idmap_owner(idmap);
+}
+EXPORT_SYMBOL_GPL(mnt_user_ns);
+
+/**
+ * alloc_mnt_idmap - allocate a new idmapping for the mount
+ * @mnt_userns: owning userns of the idmapping
+ *
+ * Allocate a new struct mnt_idmap which carries the idmapping of the mount.
+ *
+ * Return: On success a new idmap, on error an error pointer is returned.
+ */
+static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
+{
+ struct mnt_idmap *idmap;
+
+ idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
+ if (!idmap)
+ return ERR_PTR(-ENOMEM);
+
+ idmap->owner = get_user_ns(mnt_userns);
+ refcount_set(&idmap->count, 1);
+ return idmap;
+}
+
+/**
+ * mnt_idmap_get - get a reference to an idmapping
+ * @idmap: the idmap to bump the reference on
+ *
+ * If @idmap is not the @nop_mnt_idmap bump the reference count.
+ *
+ * Return: @idmap with reference count bumped if @nop_mnt_idmap isn't passed.
+ */
+static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap)
+ refcount_inc(&idmap->count);
+
+ return idmap;
+}
+
+/**
+ * mnt_idmap_put - put a reference to an idmapping
+ * @idmap: the idmap to put the reference on
+ *
+ * If this is a non-initial idmapping, put the reference count when a mount is
+ * released and free it if we're the last user.
+ */
+static inline void mnt_idmap_put(struct mnt_idmap *idmap)
+{
+ if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
+ put_user_ns(idmap->owner);
+ kfree(idmap);
+ }
+}
+
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
- mnt->mnt.mnt_userns = &init_user_ns;
+ mnt->mnt.mnt_idmap = &nop_mnt_idmap;
}
return mnt;
@@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
- struct user_namespace *mnt_userns;
-
- mnt_userns = mnt_user_ns(&mnt->mnt);
- if (!initial_idmapping(mnt_userns))
- put_user_ns(mnt_userns);
+ mnt_idmap_put(mnt_idmap(&mnt->mnt));
kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
@@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p)
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
struct mount *mnt;
- struct user_namespace *fs_userns;
if (!fc->root)
return ERR_PTR(-EINVAL);
@@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- fs_userns = mnt->mnt.mnt_sb->s_user_ns;
- if (!initial_idmapping(fs_userns))
- mnt->mnt.mnt_userns = get_user_ns(fs_userns);
-
lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
unlock_mount_hash();
@@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active);
- mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
- if (!initial_idmapping(mnt->mnt.mnt_userns))
- mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
+ mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
+
mnt->mnt.mnt_sb = sb;
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
@@ -3515,8 +3620,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
q = next_mnt(q, new);
if (!q)
break;
+ // an mntns binding we'd skipped?
while (p->mnt.mnt_root != q->mnt.mnt_root)
- p = next_mnt(p, old);
+ p = next_mnt(skip_mnt_tree(p), old);
}
namespace_unlock();
@@ -3981,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
struct vfsmount *m = &mnt->mnt;
struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return 0;
/*
* Creating an idmapped mount with the filesystem wide idmapping
* doesn't make sense so block that. We don't allow mushy semantics.
*/
- if (kattr->mnt_userns == fs_userns)
+ if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns)
return -EINVAL;
/*
@@ -4028,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
{
return (!(kattr->attr_set & MNT_READONLY) ||
(mnt->mnt.mnt_flags & MNT_READONLY)) &&
- !kattr->mnt_userns;
+ !kattr->mnt_idmap;
}
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
@@ -4082,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
- struct user_namespace *mnt_userns, *old_mnt_userns;
-
- if (!kattr->mnt_userns)
+ if (!kattr->mnt_idmap)
return;
/*
- * We're the only ones able to change the mount's idmapping. So
- * mnt->mnt.mnt_userns is stable and we can retrieve it directly.
- */
- old_mnt_userns = mnt->mnt.mnt_userns;
-
- mnt_userns = get_user_ns(kattr->mnt_userns);
- /* Pairs with smp_load_acquire() in mnt_user_ns(). */
- smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
-
- /*
- * If this is an idmapped filesystem drop the reference we've taken
- * in vfs_create_mount() before.
+ * Pairs with smp_load_acquire() in mnt_idmap().
+ *
+ * Since we only allow a mount to change the idmapping once and
+ * verified this in can_idmap_mount() we know that the mount has
+ * @nop_mnt_idmap attached to it. So there's no need to drop any
+ * references.
*/
- if (!initial_idmapping(old_mnt_userns))
- put_user_ns(old_mnt_userns);
+ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4136,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
if (path->dentry != mnt->mnt.mnt_root)
return -EINVAL;
+ if (kattr->mnt_userns) {
+ struct mnt_idmap *mnt_idmap;
+
+ mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
+ if (IS_ERR(mnt_idmap))
+ return PTR_ERR(mnt_idmap);
+ kattr->mnt_idmap = mnt_idmap;
+ }
+
if (kattr->propagation) {
/*
* Only take namespace_lock() if we're actually changing
@@ -4323,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr)
{
put_user_ns(kattr->mnt_userns);
kattr->mnt_userns = NULL;
+
+ if (kattr->mnt_idmap)
+ mnt_idmap_put(kattr->mnt_idmap);
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
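[Editorial note] The barrier comment above pairs do_idmap_mount()'s smp_store_release() with an smp_load_acquire() in the mnt_idmap() accessor, so a reader that observes the new pointer also observes a fully initialized idmap. The accessor's assumed shape:

static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{
        /* Pairs with smp_store_release() in do_idmap_mount(). */
        return smp_load_acquire(&mnt->mnt_idmap);
}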
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index e374767d1b68..7f753380e047 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -23,7 +23,7 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
struct iov_iter iter;
- iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
iov_iter_zero(iov_iter_count(&iter), &iter);
@@ -49,7 +49,7 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
@@ -208,7 +208,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
continue;
}
- iov_iter_xarray(&iter, WRITE, &rreq->mapping->i_pages,
+ iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages,
subreq->start, subreq->len);
atomic_inc(&rreq->nr_copy_ops);
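[Editorial note] READ/WRITE were ambiguous as iov_iter direction arguments (a write to a file *reads* the iterator), so ITER_DEST/ITER_SOURCE name the iterator's own role in the copy instead. A small sketch of the reader side:

static void zero_unread_tail(struct address_space *mapping,
                             loff_t pos, size_t count)
{
        struct iov_iter iter;

        /*
         * ITER_DEST: the xarray pages are the destination of the
         * copy (a read); ITER_SOURCE would mark them as the data
         * source for a write-out to the cache or server.
         */
        iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
        iov_iter_zero(count, &iter);
}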
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..1ead5bd740c2 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -209,8 +209,8 @@ config NFS_DISABLE_UDP_SUPPORT
config NFS_V4_2_READ_PLUS
bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
depends on NFS_V4_2
- default n
+ default y
help
- This is intended for developers only. The READ_PLUS operation has
- been shown to have issues under specific conditions and should not
- be used in production.
+ Choose Y here to enable the use of READ_PLUS over NFS v4.2. READ_PLUS
+ attempts to improve read performance by compressing out sparse holes
+ in the file contents.
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ead8a0e06abf..cf7365581031 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -146,7 +146,7 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
{
struct inode *inode = state->inode;
struct file_lock *fl;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
int status = 0;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f594dac436a7..f7e4a88d5d92 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1074,6 +1074,8 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
return res;
}
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
/*
* Once we've found the start of the dirent within a page: fill 'er up...
*/
@@ -1083,6 +1085,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
struct file *file = desc->file;
struct nfs_cache_array *array;
unsigned int i;
+ bool first_emit = !desc->dir_cookie;
array = kmap_local_page(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -1106,6 +1109,10 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
+ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) {
+ desc->eob = true;
+ break;
+ }
}
if (array->page_is_eof)
desc->eof = !desc->eob;
@@ -1187,8 +1194,6 @@ out:
return status;
}
-#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
-
static bool nfs_readdir_handle_cache_misses(struct inode *inode,
struct nfs_readdir_descriptor *desc,
unsigned int cache_misses,
@@ -2948,9 +2953,30 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co
return NULL;
}
+static u64 nfs_access_login_time(const struct task_struct *task,
+ const struct cred *cred)
+{
+ const struct task_struct *parent;
+ const struct cred *pcred;
+ u64 ret;
+
+ rcu_read_lock();
+ for (;;) {
+ parent = rcu_dereference(task->real_parent);
+ pcred = rcu_dereference(parent->cred);
+ if (parent == task || cred_fscmp(pcred, cred) != 0)
+ break;
+ task = parent;
+ }
+ ret = task->start_time;
+ rcu_read_unlock();
+ return ret;
+}
+
static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
struct nfs_access_entry *cache;
bool retry = true;
int err;
@@ -2978,6 +3004,9 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *
spin_lock(&inode->i_lock);
retry = false;
}
+ err = -ENOENT;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
*mask = cache->mask;
list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
err = 0;
@@ -2996,6 +3025,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
* but do it without locking.
*/
struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
struct nfs_access_entry *cache;
int err = -ECHILD;
struct list_head *lh;
@@ -3010,6 +3040,8 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
cache = NULL;
if (cache == NULL)
goto out;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
goto out;
*mask = cache->mask;
@@ -3057,6 +3089,7 @@ static void nfs_access_add_rbtree(struct inode *inode,
else
goto found;
}
+ set->timestamp = ktime_get_ns();
rb_link_node(&set->rb_node, parent, p);
rb_insert_color(&set->rb_node, root_node);
list_add_tail(&set->lru, &nfsi->access_cache_entry_lru);
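[Editorial note] A cached access entry is now ignored if it predates the requester's "login time" (the start time of the oldest ancestor task still running under the same cred), forcing a fresh ACCESS call after re-login. The (s64) cast on the u64 difference is the standard wraparound-safe ordering test, the same trick time_after64() uses; a stand-alone sketch:

#include <stdbool.h>
#include <stdint.h>

/* "a is after b" even if the 64-bit counter wrapped in between. */
static bool after64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) > 0;
}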
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index ad34a33b0737..4974cd18ca46 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -783,6 +783,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
return &fl->generic_hdr;
}
+static bool
+filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg)
+{
+ return flseg->num_fh > 1;
+}
+
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
@@ -803,6 +809,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
size = pnfs_generic_pg_test(pgio, prev, req);
if (!size)
return 0;
+ else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
/* see if req and prev are in the same stripe */
if (prev) {
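
With only one data server in the layout (num_fh <= 1) nothing is striped, so the new helper lets filelayout_pg_test() return whatever size the generic pNFS test already approved instead of running the stripe-boundary check. As a rough illustration of what "same stripe" means in the check that follows, assuming a hypothetical helper and stripe unit:

	#include <linux/math64.h>

	/* Illustrative only: two file offsets share a stripe when they
	 * map to the same stripe-unit index. */
	static bool in_same_stripe(u64 off1, u64 off2, u32 stripe_unit)
	{
		return div_u64(off1, stripe_unit) == div_u64(off2, stripe_unit);
	}
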
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 1ec79ccf89ad..7deb3cd76abe 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -493,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
gid = make_kgid(&init_user_ns, id);
if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
else {
unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
memalloc_nofs_restore(nofs_flags);
}
rc = -ENOMEM;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 09833ec102fc..9bcd53d5c7d4 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -684,6 +684,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
return ret;
break;
case Opt_vers:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
ret = nfs_parse_version_string(fc, param->string);
if (ret < 0)
@@ -696,6 +698,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_proto:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
protofamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
@@ -732,6 +736,8 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
break;
case Opt_mountproto:
+ if (!param->string)
+ goto out_invalid_value;
trace_nfs_mount_assign(param->key, param->string);
mountfamily = AF_INET;
switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) {
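
All three hunks above handle value-taking mount options passed as bare flags (for example "-o vers" with no "=value"); fs_context then supplies a NULL param->string, which the old code dereferenced. The defensive shape, sketched against the generic fs_context parser with a hypothetical option and parse helper:

	case Opt_example:			/* hypothetical string-valued option */
		if (!param->string)		/* no "=value" was supplied */
			goto out_invalid_value;	/* reject rather than oops */
		trace_nfs_mount_assign(param->key, param->string);
		ret = parse_example_string(fc, param->string);
		break;
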
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index e861d7bae305..e731c00a9fcb 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -252,7 +252,7 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page)
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_read_operation(&cres, cookie);
if (ret < 0)
@@ -282,7 +282,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
bvec[0].bv_page = page;
bvec[0].bv_offset = 0;
bvec[0].bv_len = PAGE_SIZE;
- iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+ iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
ret = fscache_begin_write_operation(&cres, cookie);
if (ret < 0)
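
These two hunks follow the tree-wide iov_iter rename: the READ/WRITE direction arguments to iov_iter_bvec() and friends were replaced by the self-describing ITER_DEST (the iterator is the destination of the transfer) and ITER_SOURCE (it is the source). A minimal sketch of building a one-page bvec iterator for a read, mirroring the code above:

	struct bio_vec bvec[1];
	struct iov_iter iter;

	bvec[0].bv_page   = page;
	bvec[0].bv_offset = 0;
	bvec[0].bv_len    = PAGE_SIZE;
	/* ITER_DEST: data read from the cache lands in this bvec */
	iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
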
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6b2cfa59a1a2..e98ee7599eeb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1168,7 +1168,8 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(filp->f_flags), filp);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 647fc3f547cb..ae7d4a8c728c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -739,12 +739,10 @@ unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto)
iosize = NFS_DEF_FILE_IO_SIZE;
else if (iosize >= NFS_MAX_FILE_IO_SIZE)
iosize = NFS_MAX_FILE_IO_SIZE;
- else
- iosize = iosize & PAGE_MASK;
- if (proto == XPRT_TRANSPORT_UDP)
+ if (proto == XPRT_TRANSPORT_UDP || iosize < PAGE_SIZE)
return nfs_block_bits(iosize, NULL);
- return iosize;
+ return iosize & PAGE_MASK;
}
/*
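
The rewritten helper fixes the order of operations in the old clamp: an iosize below PAGE_SIZE used to be rounded with PAGE_MASK, which could truncate it to zero. Now sub-page (and UDP) sizes are rounded to block granularity, and only page-sized-or-larger TCP values are page-masked. A standalone model of the resulting logic, with illustrative constants standing in for NFS_DEF_FILE_IO_SIZE, NFS_MAX_FILE_IO_SIZE and nfs_block_bits():

	/* Illustrative model, not the kernel function. */
	#define EX_PAGE_SIZE	4096UL
	#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

	static unsigned long ex_io_size(unsigned long iosize, int is_udp)
	{
		if (iosize < 1024)			/* stand-in lower bound */
			iosize = 4096;			/* stand-in default */
		else if (iosize > 1048576)		/* stand-in upper bound */
			iosize = 1048576;
		if (is_udp || iosize < EX_PAGE_SIZE)
			return iosize;			/* real code: nfs_block_bits() */
		return iosize & EX_PAGE_MASK;		/* e.g. 70000 -> 69632 */
	}
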
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2f336ace7555..b0ef7e7ddb30 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -147,7 +147,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct nfs_fs_context *ctx;
struct fs_context *fc;
struct vfsmount *mnt = ERR_PTR(-ENOMEM);
- struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
+ struct nfs_server *server = NFS_SB(path->dentry->d_sb);
struct nfs_client *client = server->nfs_client;
int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
int ret;
@@ -354,7 +354,7 @@ static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
num = (num + (HZ - 1)) / HZ;
} else
num = -1;
- return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+ return sysfs_emit(buffer, "%li\n", num);
}
static const struct kernel_param_ops param_ops_nfs_timeout = {
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 03a4e679fd99..df9ca56db347 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -12,7 +12,7 @@
*/
#ifdef CONFIG_NFS_V3_ACL
extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu);
-extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
struct posix_acl *dfacl);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 93de0b58647a..74d11e3c4205 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -255,23 +255,24 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
}
-int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int nfs3_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
+ struct inode *inode = d_inode(dentry);
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
case ACL_TYPE_ACCESS:
- alloc = get_acl(inode, ACL_TYPE_DEFAULT);
+ alloc = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(alloc))
goto fail;
dfacl = alloc;
break;
case ACL_TYPE_DEFAULT:
- alloc = get_acl(inode, ACL_TYPE_ACCESS);
+ alloc = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(alloc))
goto fail;
dfacl = acl;
@@ -312,7 +313,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
struct posix_acl *acl;
char *p = data + *result;
- acl = get_acl(inode, type);
+ acl = get_inode_acl(inode, type);
if (IS_ERR_OR_NULL(acl))
return 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2e7579626cf0..4bf208a0a8e9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -998,7 +998,7 @@ static const struct inode_operations nfs3_dir_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
@@ -1009,7 +1009,7 @@ static const struct inode_operations nfs3_file_inode_operations = {
.setattr = nfs_setattr,
#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .get_acl = nfs3_get_acl,
+ .get_inode_acl = nfs3_get_acl,
.set_acl = nfs3_set_acl,
#endif
};
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index fe1aeb0f048f..d80ee88ca996 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -47,13 +47,14 @@
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + 3)
-#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \
+#define NFS42_READ_PLUS_DATA_SEGMENT_SIZE \
+ (1 /* data_content4 */ + \
2 /* data_info4.di_offset */ + \
- 2 /* data_info4.di_length */)
+ 1 /* data_info4.di_length */)
#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
1 /* rpr_eof */ + \
1 /* rpr_contents count */ + \
- 2 * NFS42_READ_PLUS_SEGMENT_SIZE)
+ NFS42_READ_PLUS_DATA_SEGMENT_SIZE)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -1142,7 +1143,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
if (!segs)
return -ENOMEM;
- xdr_set_scratch_buffer(xdr, &scratch_buf, 32);
+ xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
status = -EIO;
for (i = 0; i < segments; i++) {
status = decode_read_plus_segment(xdr, &segs[i]);
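
The scratch-buffer hunk is the usual sizeof-over-magic-number fix: the literal 32 matched the buffer's current declaration, but sizeof() keeps the registration from drifting if the array is ever resized. The pattern in isolation (the declaration shown is illustrative):

	__be32 scratch_buf[8];	/* size stated once, at the declaration */

	/* consumed via sizeof, so a resize can never desynchronize them */
	xdr_set_scratch_buffer(xdr, &scratch_buf, sizeof(scratch_buf));
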
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index cfef738d765e..5edd1704f735 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -149,6 +149,7 @@ struct nfs4_lock_state {
struct nfs4_state * ls_state; /* Pointer to open state */
#define NFS_LOCK_INITIALIZED 0
#define NFS_LOCK_LOST 1
+#define NFS_LOCK_UNLOCKING 2
unsigned long ls_flags;
struct nfs_seqid_counter ls_seqid;
nfs4_stateid ls_stateid;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 9eb181287879..2563ed8580f3 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,7 +32,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
- fmode_t f_mode;
struct iattr attr;
int err;
@@ -51,17 +50,14 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
- f_mode = filp->f_mode;
- if ((openflags & O_ACCMODE) == 3)
- f_mode |= flags_to_mode(openflags);
-
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp),
+ flags_to_mode(openflags), filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -366,8 +362,8 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
goto out_free_name;
}
- ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode,
- filep);
+ ctx = alloc_nfs_open_context(filep->f_path.dentry,
+ flags_to_mode(filep->f_flags), filep);
if (IS_ERR(ctx)) {
res = ERR_CAST(ctx);
goto out_filep;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index e3fdd2f45b01..25a7c771cfd8 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -203,7 +203,7 @@ int nfs_idmap_init(void)
printk(KERN_NOTICE "NFS: Registering the %s key type\n",
key_type_id_resolver.name);
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 86ed5c0142c3..40d749f29ed3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -122,6 +122,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
return NULL;
+ label->lfs = 0;
+ label->pi = 0;
+ label->len = 0;
+ label->label = NULL;
+
err = security_dentry_init_security(dentry, sattr->ia_mode,
&dentry->d_name, NULL,
(void **)&label->label, &label->len);
@@ -2126,18 +2131,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
}
static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
- fmode_t fmode)
+ fmode_t fmode)
{
struct nfs4_state *newstate;
+ struct nfs_server *server = NFS_SB(opendata->dentry->d_sb);
+ int openflags = opendata->o_arg.open_flags;
int ret;
if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
return 0;
- opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
- opendata->o_arg.share_access = nfs4_map_atomic_open_share(
- NFS_SB(opendata->dentry->d_sb),
- fmode, 0);
+ opendata->o_arg.share_access =
+ nfs4_map_atomic_open_share(server, fmode, openflags);
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -2625,8 +2630,7 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
*/
static int nfs4_opendata_access(const struct cred *cred,
struct nfs4_opendata *opendata,
- struct nfs4_state *state, fmode_t fmode,
- int openflags)
+ struct nfs4_state *state, fmode_t fmode)
{
struct nfs_access_entry cache;
u32 mask, flags;
@@ -2637,11 +2641,7 @@ static int nfs4_opendata_access(const struct cred *cred,
return 0;
mask = 0;
- /*
- * Use openflags to check for exec, because fmode won't
- * always have FMODE_EXEC set when file open for exec.
- */
- if (openflags & __FMODE_EXEC) {
+ if (fmode & FMODE_EXEC) {
/* ONLY check for exec rights */
if (S_ISDIR(state->inode->i_mode))
mask = NFS4_ACCESS_LOOKUP;
@@ -2719,10 +2719,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
struct nfs4_opendata *opendata;
int ret;
- opendata = nfs4_open_recoverdata_alloc(ctx, state,
- NFS4_OPEN_CLAIM_FH);
+ opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
+ /*
+ * We're not recovering a delegation, so ask for no delegation.
+ * Otherwise the recovery thread could deadlock with an outstanding
+ * delegation return.
+ */
+ opendata->o_arg.open_flags = O_DIRECT;
ret = nfs4_open_recover(opendata, state);
if (ret == -ESTALE)
d_drop(ctx->dentry);
@@ -3024,7 +3029,7 @@ static unsigned nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
- int flags, struct nfs_open_context *ctx)
+ struct nfs_open_context *ctx)
{
struct nfs4_state_owner *sp = opendata->owner;
struct nfs_server *server = sp->so_server;
@@ -3085,8 +3090,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
/* Parse layoutget results before we check for access */
pnfs_parse_lgopen(state->inode, opendata->lgp, ctx);
- ret = nfs4_opendata_access(sp->so_cred, opendata, state,
- acc_mode, flags);
+ ret = nfs4_opendata_access(sp->so_cred, opendata, state, acc_mode);
if (ret != 0)
goto out;
@@ -3160,7 +3164,7 @@ static int _nfs4_do_open(struct inode *dir,
if (d_really_is_positive(dentry))
opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
- status = _nfs4_open_and_get_state(opendata, flags, ctx);
+ status = _nfs4_open_and_get_state(opendata, ctx);
if (status != 0)
goto err_opendata_put;
state = ctx->state;
@@ -3796,7 +3800,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
int open_flags, struct iattr *attr, int *opened)
{
struct nfs4_state *state;
- struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+ struct nfs4_label l, *label;
label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
@@ -4013,7 +4017,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
page = alloc_page(GFP_KERNEL);
if (!page)
- return -ENOMEM;
+ goto out_put_cred;
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
if (!locations)
goto out_free;
@@ -4035,6 +4039,8 @@ out_free_2:
kfree(locations);
out_free:
__free_page(page);
+out_put_cred:
+ put_cred(cred);
return status;
}
@@ -4682,7 +4688,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
struct nfs_server *server = NFS_SERVER(dir);
- struct nfs4_label l, *ilabel = NULL;
+ struct nfs4_label l, *ilabel;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -5033,7 +5039,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5074,7 +5080,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5193,7 +5199,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -7017,12 +7023,13 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
}
+ lsp = request->fl_u.nfs4_fl.owner;
+ set_bit(NFS_LOCK_UNLOCKING, &lsp->ls_flags);
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
if (status != 0)
goto out;
/* Is this a delegated lock? */
- lsp = request->fl_u.nfs4_fl.owner;
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
goto out;
alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
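
Among the hunks above, the _nfs4_discover_trunking() change plugs a credential leak: the early "return -ENOMEM" bypassed the put_cred() performed by the later error paths, so the failure case now funnels through a new out_put_cred label. The canonical unwind shape, sketched with hypothetical names:

	/* Illustrative unwind ordering, not the upstream function. */
	static int example_discover(const struct cred *cred)
	{
		struct page *page;
		int status = -ENOMEM;

		page = alloc_page(GFP_KERNEL);
		if (!page)
			goto out_put_cred;	/* was: return -ENOMEM, leaking cred */
		/* ... use the page ... */
		status = 0;
		__free_page(page);
	out_put_cred:
		put_cred(cred);			/* every exit drops the reference */
		return status;
	}
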
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a2d2d5d1b088..2a0ca5c7f082 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1230,6 +1230,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
if (IS_ERR(task)) {
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
+ if (!nfs_client_init_is_complete(clp))
+ nfs_mark_client_ready(clp, PTR_ERR(task));
nfs4_clear_state_manager_bit(clp);
clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
@@ -1501,7 +1503,7 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
struct file_lock *fl;
struct nfs4_lock_state *lsp;
int status = 0;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct list_head *list;
if (flctx == NULL)
@@ -1619,7 +1621,8 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st
spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
trace_nfs4_state_lock_reclaim(state, lock);
- if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
+ if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags) &&
+ !test_bit(NFS_LOCK_UNLOCKING, &lock->ls_flags))
*lost_locks += 1;
}
spin_unlock(&state->state_lock);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2cff5901c689..214bc56f92d2 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -9,10 +9,10 @@
#define _TRACE_NFS4_H
#include <linux/tracepoint.h>
-#include <trace/events/sunrpc_base.h>
+#include <trace/misc/sunrpc.h>
-#include <trace/events/fs.h>
-#include <trace/events/nfs.h>
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
#define show_nfs_fattr_flags(valid) \
__print_flags((unsigned long)valid, "|", \
@@ -1815,7 +1815,7 @@ TRACE_EVENT(pnfs_update_layout,
__entry->count = count;
__entry->iomode = iomode;
__entry->reason = reason;
- if (lo != NULL) {
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
__entry->layoutstateid_seq =
be32_to_cpu(lo->plh_stateid.seqid);
__entry->layoutstateid_hash =
@@ -1869,7 +1869,7 @@ DECLARE_EVENT_CLASS(pnfs_layout_event,
__entry->pos = pos;
__entry->count = count;
__entry->iomode = iomode;
- if (lo != NULL) {
+ if (lo != NULL && pnfs_layout_is_valid(lo)) {
__entry->layoutstateid_seq =
be32_to_cpu(lo->plh_stateid.seqid);
__entry->layoutstateid_hash =
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index acfe5f4bda48..deec76cf5afe 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4234,19 +4234,17 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
if (len < NFS4_MAXLABELLEN) {
- if (label) {
- if (label->len) {
- if (label->len < len)
- return -ERANGE;
- memcpy(label->label, p, len);
- }
+ if (label && label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
label->len = len;
label->pi = pi;
label->lfs = lfs;
status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
}
- bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
} else
printk(KERN_WARNING "%s: label too long (%u)!\n",
__func__, len);
@@ -4755,12 +4753,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
if (status < 0)
goto xdr_error;
- if (fattr->label) {
- status = decode_attr_security_label(xdr, bitmap, fattr->label);
- if (status < 0)
- goto xdr_error;
- fattr->valid |= status;
- }
+ status = decode_attr_security_label(xdr, bitmap, fattr->label);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 8c6cc58679ff..642f6921852f 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -11,9 +11,9 @@
#include <linux/tracepoint.h>
#include <linux/iversion.h>
-#include <trace/events/fs.h>
-#include <trace/events/nfs.h>
-#include <trace/events/sunrpc_base.h>
+#include <trace/misc/fs.h>
+#include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
#define nfs_show_cache_validity(v) \
__print_flags(v, "|", \
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 317cedfa52bf..16be6dae524f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1055,7 +1055,7 @@ static unsigned int nfs_coalesce_size(struct nfs_page *prev,
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
return 0;
- flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
+ flctx = locks_inode_context(d_inode(nfs_req_openctx(req)->dentry));
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index a6f740366963..0cbcd2dfa732 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -26,7 +26,7 @@ static void nfs_netns_object_release(struct kobject *kobj)
}
static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
- struct kobject *kobj)
+ const struct kobject *kobj)
{
return &net_ns_type_operations;
}
@@ -82,7 +82,7 @@ static ssize_t nfs_netns_identifier_show(struct kobject *kobj,
ssize_t ret;
rcu_read_lock();
- ret = scnprintf(buf, PAGE_SIZE, "%s\n", rcu_dereference(c->identifier));
+ ret = sysfs_emit(buf, "%s\n", rcu_dereference(c->identifier));
rcu_read_unlock();
return ret;
}
@@ -130,7 +130,7 @@ static void nfs_netns_client_release(struct kobject *kobj)
kfree(c);
}
-static const void *nfs_netns_client_namespace(struct kobject *kobj)
+static const void *nfs_netns_client_namespace(const struct kobject *kobj)
{
return container_of(kobj, struct nfs_netns_client, kobject)->net;
}
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 9697cd5d2561..150a953a8be9 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -139,6 +139,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf
*/
spin_lock(&alias->d_lock);
if (d_really_is_positive(alias) &&
+ !nfs_compare_fh(NFS_FH(inode), NFS_FH(d_inode(alias))) &&
!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
devname_garbage = alias->d_fsdata;
alias->d_fsdata = data;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f41d24b54fd1..80c240e50952 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1185,7 +1185,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
- struct file_lock_context *flctx = file_inode(file)->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(file_inode(file));
struct nfs_page *req;
int do_flush, status;
/*
@@ -1321,7 +1321,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page,
struct inode *inode, unsigned int pagelen)
{
int ret;
- struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock_context *flctx = locks_inode_context(inode);
struct file_lock *fl;
if (file->f_flags & O_DSYNC)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f6a2fd3015e7..7c441f2bd444 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -8,6 +8,7 @@ config NFSD
select SUNRPC
select EXPORTFS
select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ select NFS_ACL_SUPPORT if NFSD_V3_ACL
depends on MULTIUSER
help
Choose Y here if you want to allow other computers to access
@@ -26,19 +27,29 @@ config NFSD
Below you can choose which versions of the NFS protocol are
available to clients mounting the NFS server on this system.
- Support for NFS version 2 (RFC 1094) is always available when
+ Support for NFS version 3 (RFC 1813) is always available when
CONFIG_NFSD is selected.
If unsure, say N.
-config NFSD_V2_ACL
- bool
+config NFSD_V2
+ bool "NFS server support for NFS version 2 (DEPRECATED)"
depends on NFSD
+ default n
+ help
+ NFSv2 (RFC 1094) was the first publicly-released version of NFS.
+ Unless you are hosting ancient (1990s era) NFS clients, you don't
+ need this.
+
+ If unsure, say N.
+
+config NFSD_V2_ACL
+ bool "NFS server support for the NFSv2 ACL protocol extension"
+ depends on NFSD_V2
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
depends on NFSD
- select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
never became an official part of the NFS version 3 protocol.
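
With NFSv2 serving split out behind CONFIG_NFSD_V2 (and defaulting to off), a server that only speaks v3 and v4 no longer carries the legacy XDR and proc code at all; the Makefile hunk below makes nfsproc.o/nfsxdr.o conditional on it. An illustrative .config fragment for such a build (module versus builtin is the builder's choice):

	CONFIG_NFSD=m
	# CONFIG_NFSD_V2 is not set
	CONFIG_NFSD_V3_ACL=y
	CONFIG_NFSD_V4=y
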
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 805c06d5f1b4..6fffc8f03f74 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -10,9 +10,10 @@ obj-$(CONFIG_NFSD) += nfsd.o
# this one should be compiled first, as the tracing macros can easily blow up
nfsd-y += trace.o
-nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
- export.o auth.o lockd.o nfscache.o nfsxdr.o \
+nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
+ export.o auth.o lockd.o nfscache.o \
stats.o filecache.o nfs3proc.o nfs3xdr.o
+nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index b6d01d51a746..04697f8dc37d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -12,6 +12,7 @@
#include "blocklayoutxdr.h"
#include "pnfs.h"
#include "filecache.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 442543304930..8e9c1a0f8d38 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -9,6 +9,7 @@
#include "nfsd.h"
#include "blocklayoutxdr.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index ee0e3aba4a6e..d03f7f6a8642 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -115,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
int exp_rootfh(struct net *, struct auth_domain *,
char *path, struct knfsd_fh *, int maxsize);
__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
-__be32 nfserrno(int errno);
static inline void exp_put(struct svc_export *exp)
{
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ec3fceb92236..0ef070349014 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1,7 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * Open file cache.
+ * The NFSD open file cache.
*
* (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
+ *
+ * An nfsd_file object is a per-file collection of open state that binds
+ * together:
+ * - a struct file *
+ * - a user credential
+ * - a network namespace
+ * - a read-ahead context
+ * - monitoring for writeback errors
+ *
+ * nfsd_file objects are reference-counted. Consumers acquire a new
+ * object via the nfsd_file_acquire API. They manage their interest in
+ * the acquired object, and hence the object's reference count, via
+ * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file
+ * object:
+ *
+ * * non-garbage-collected: When a consumer wants to precisely control
+ * the lifetime of a file's open state, it acquires a non-garbage-
+ * collected nfsd_file. The final nfsd_file_put releases the open
+ * state immediately.
+ *
+ * * garbage-collected: When a consumer does not control the lifetime
+ * of open state, it acquires a garbage-collected nfsd_file. The
+ * final nfsd_file_put allows the open state to linger for a period
+ * during which it may be re-used.
*/
#include <linux/hash.h>
@@ -33,7 +58,6 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
-static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed);
static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
struct nfsd_fcache_disposal {
@@ -63,6 +87,7 @@ struct nfsd_file_lookup_key {
struct net *net;
const struct cred *cred;
unsigned char need;
+ bool gc;
enum nfsd_file_lookup_type type;
};
@@ -162,6 +187,8 @@ static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg,
return 1;
if (!nfsd_match_cred(nf->nf_cred, key->cred))
return 1;
+ if (!!test_bit(NFSD_FILE_GC, &nf->nf_flags) != key->gc)
+ return 1;
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
return 1;
break;
@@ -184,12 +211,9 @@ static const struct rhashtable_params nfsd_file_rhash_params = {
static void
nfsd_file_schedule_laundrette(void)
{
- if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) ||
- test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
- return;
-
- queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
- NFSD_LAUNDRETTE_DELAY);
+ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
+ queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+ NFSD_LAUNDRETTE_DELAY);
}
static void
@@ -297,56 +321,28 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
nf->nf_flags = 0;
__set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
__set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+ if (key->gc)
+ __set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
return nf;
}
-static bool
-nfsd_file_free(struct nfsd_file *nf)
-{
- s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- bool flush = false;
-
- this_cpu_inc(nfsd_file_releases);
- this_cpu_add(nfsd_file_total_age, age);
-
- trace_nfsd_file_put_final(nf);
- if (nf->nf_mark)
- nfsd_file_mark_put(nf->nf_mark);
- if (nf->nf_file) {
- get_file(nf->nf_file);
- filp_close(nf->nf_file, NULL);
- fput(nf->nf_file);
- flush = true;
- }
-
- /*
- * If this item is still linked via nf_lru, that's a bug.
- * WARN and leak it to preserve system stability.
- */
- if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
- return flush;
-
- call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
-}
-
-static bool
-nfsd_file_check_writeback(struct nfsd_file *nf)
+static void
+nfsd_file_fsync(struct nfsd_file *nf)
{
struct file *file = nf->nf_file;
- struct address_space *mapping;
+ int ret;
if (!file || !(file->f_mode & FMODE_WRITE))
- return false;
- mapping = file->f_mapping;
- return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
- mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+ return;
+ ret = vfs_fsync(file, 1);
+ trace_nfsd_file_fsync(nf, ret);
+ if (ret)
+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
}
static int
@@ -360,31 +356,6 @@ nfsd_file_check_write_error(struct nfsd_file *nf)
}
static void
-nfsd_file_flush(struct nfsd_file *nf)
-{
- struct file *file = nf->nf_file;
-
- if (!file || !(file->f_mode & FMODE_WRITE))
- return;
- this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages);
- if (vfs_fsync(file, 1) != 0)
- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
-}
-
-static void nfsd_file_lru_add(struct nfsd_file *nf)
-{
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
- trace_nfsd_file_lru_add(nf);
-}
-
-static void nfsd_file_lru_remove(struct nfsd_file *nf)
-{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
- trace_nfsd_file_lru_del(nf);
-}
-
-static void
nfsd_file_hash_remove(struct nfsd_file *nf)
{
trace_nfsd_file_unhash(nf);
@@ -406,60 +377,76 @@ nfsd_file_unhash(struct nfsd_file *nf)
}
static void
-nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose)
+nfsd_file_free(struct nfsd_file *nf)
{
- trace_nfsd_file_unhash_and_dispose(nf);
- if (nfsd_file_unhash(nf)) {
- /* caller must call nfsd_file_dispose_list() later */
- nfsd_file_lru_remove(nf);
- list_add(&nf->nf_lru, dispose);
+ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
+
+ trace_nfsd_file_free(nf);
+
+ this_cpu_inc(nfsd_file_releases);
+ this_cpu_add(nfsd_file_total_age, age);
+
+ nfsd_file_unhash(nf);
+
+ /*
+ * We call fsync here in order to catch writeback errors. It's not
+ * strictly required by the protocol, but an nfsd_file could get
+ * evicted from the cache before a COMMIT comes in. If another
+ * task were to open that file in the interim and scrape the error,
+ * then the client may never see it. By calling fsync here, we ensure
+ * that writeback happens before the entry is freed, and that any
+ * errors reported result in the write verifier changing.
+ */
+ nfsd_file_fsync(nf);
+
+ if (nf->nf_mark)
+ nfsd_file_mark_put(nf->nf_mark);
+ if (nf->nf_file) {
+ get_file(nf->nf_file);
+ filp_close(nf->nf_file, NULL);
+ fput(nf->nf_file);
}
+
+ /*
+ * If this item is still linked via nf_lru, that's a bug.
+ * WARN and leak it to preserve system stability.
+ */
+ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return;
+
+ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
}
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
+static bool
+nfsd_file_check_writeback(struct nfsd_file *nf)
{
- trace_nfsd_file_put(nf);
+ struct file *file = nf->nf_file;
+ struct address_space *mapping;
- if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
- }
+ if (!file || !(file->f_mode & FMODE_WRITE))
+ return false;
+ mapping = file->f_mapping;
+ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
-void
-nfsd_file_put(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
- might_sleep();
-
- nfsd_file_lru_add(nf);
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
}
-/**
- * nfsd_file_close - Close an nfsd_file
- * @nf: nfsd_file to close
- *
- * If this is the final reference for @nf, free it immediately.
- * This reflects an on-the-wire CLOSE or DELEGRETURN into the
- * VFS and exported filesystem.
- */
-void nfsd_file_close(struct nfsd_file *nf)
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- nfsd_file_put(nf);
- if (refcount_dec_if_one(&nf->nf_ref)) {
- nfsd_file_unhash(nf);
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+ trace_nfsd_file_lru_del(nf);
+ return true;
}
+ return false;
}
struct nfsd_file *
@@ -470,36 +457,60 @@ nfsd_file_get(struct nfsd_file *nf)
return NULL;
}
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the non-GC case, we just put the
+ * reference immediately. In the GC case, if the reference would be
+ * the last one, then put it on the LRU instead to be cleaned up later.
+ */
+void
+nfsd_file_put(struct nfsd_file *nf)
{
- struct nfsd_file *nf;
+ might_sleep();
+ trace_nfsd_file_put(nf);
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- nfsd_file_put_noref(nf);
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
+ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then try to
+ * transfer it to the LRU.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref))
+ return;
+
+ /* Try to add it to the LRU. If that fails, decrement. */
+ if (nfsd_file_lru_add(nf)) {
+ /* If it's still hashed, we're done */
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_schedule_laundrette();
+ return;
+ }
+
+ /*
+ * We're racing with unhashing, so try to remove it from
+ * the LRU. If removal fails, then someone else already
+ * has our reference.
+ */
+ if (!nfsd_file_lru_remove(nf))
+ return;
+ }
}
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
}
static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- bool flush = false;
struct nfsd_file *nf;
- while(!list_empty(dispose)) {
+ while (!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_flush(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}
static void
@@ -569,21 +580,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with GC entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
@@ -594,40 +592,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}
+ /* If it was recently added to the list, skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the reference held on behalf of the LRU. If it wasn't the last
+ * one, then just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}
+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}
-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -637,14 +625,15 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
nfsd_file_gc_worker(struct work_struct *work)
{
nfsd_file_gc();
- nfsd_file_schedule_laundrette();
+ if (list_lru_count(&nfsd_file_lru))
+ nfsd_file_schedule_laundrette();
}
static unsigned long
@@ -662,7 +651,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}
@@ -672,72 +661,111 @@ static struct shrinker nfsd_file_shrinker = {
.seeks = 1,
};
-/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+/**
+ * nfsd_file_queue_for_close - try to close out any open nfsd_files for an inode
+ * @inode: inode on which to close out nfsd_files
+ * @dispose: list on which to gather nfsd_files to close out
+ *
+ * An nfsd_file represents a struct file being held open on behalf of nfsd. An
+ * open file however can block other activity (such as leases), or cause
+ * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
+ *
+ * This function is intended to find open nfsd_files when this sort of
+ * conflicting access occurs and then attempt to close those files out.
+ *
+ * Populates the dispose list with entries that have already had their
+ * refcounts go to zero. The actual free of an nfsd_file can be expensive,
+ * so we leave it up to the caller whether it wants to wait or not.
*/
-static unsigned int
-__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
+static void
+nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_INODE,
.inode = inode,
};
- unsigned int count = 0;
struct nfsd_file *nf;
rcu_read_lock();
do {
+ int decrement = 1;
+
nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key,
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_dispose(nf, dispose);
- count++;
+
+ /* If we raced with someone else unhashing, ignore it */
+ if (!nfsd_file_unhash(nf))
+ continue;
+
+ /* If we can't get a reference, ignore it */
+ if (!nfsd_file_get(nf))
+ continue;
+
+ /* Extra decrement if we remove from the LRU */
+ if (nfsd_file_lru_remove(nf))
+ ++decrement;
+
+ /* If refcount goes to 0, then put on the dispose list */
+ if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
+ list_add(&nf->nf_lru, dispose);
+ trace_nfsd_file_closing(nf);
+ }
} while (1);
rcu_read_unlock();
- return count;
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * actual freeing is deferred to the dispose_list_delayed infrastructure.
+ *
+ * This is used by the fsnotify callbacks and setlease notifier.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_close_inode(struct inode *inode)
{
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ nfsd_file_queue_for_close(inode, &dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * nfsd_files are closed out synchronously.
+ *
+ * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames
+ * when reexporting NFS.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ trace_nfsd_file_close(inode);
+
+ nfsd_file_queue_for_close(inode, &dispose);
+ while (!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ nfsd_file_free(nf);
+ }
+ flush_delayed_fput();
}
/**
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and close any entries that have not been used since
+ * Walk the LRU list and destroy any entries that have not been used since
* the last scan.
*/
static void
@@ -759,7 +787,7 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
/* Only close files for F_SETLEASE leases */
if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ nfsd_file_close_inode(file_inode(fl->fl_file));
return 0;
}
@@ -880,6 +908,13 @@ out_err:
goto out;
}
+/**
+ * __nfsd_file_cache_purge - clean out the cache for shutdown
+ * @net: net-namespace to shut down the cache (may be NULL)
+ *
+ * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
+ * then close out everything. Called when an nfsd instance is being shut down.
+ */
static void
__nfsd_file_cache_purge(struct net *net)
{
@@ -893,8 +928,11 @@ __nfsd_file_cache_purge(struct net *net)
nf = rhashtable_walk_next(&iter);
while (!IS_ERR_OR_NULL(nf)) {
- if (!net || nf->nf_net == net)
- nfsd_file_unhash_and_dispose(nf, &dispose);
+ if (!net || nf->nf_net == net) {
+ nfsd_file_unhash(nf);
+ nfsd_file_lru_remove(nf);
+ list_add(&nf->nf_lru, &dispose);
+ }
nf = rhashtable_walk_next(&iter);
}
@@ -1000,7 +1038,6 @@ nfsd_file_cache_shutdown(void)
per_cpu(nfsd_file_acquisitions, i) = 0;
per_cpu(nfsd_file_releases, i) = 0;
per_cpu(nfsd_file_total_age, i) = 0;
- per_cpu(nfsd_file_pages_flushed, i) = 0;
per_cpu(nfsd_file_evictions, i) = 0;
}
}
@@ -1034,12 +1071,14 @@ nfsd_file_is_cached(struct inode *inode)
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf, bool open)
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **pnf, bool want_gc)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_FULL,
.need = may_flags & NFSD_FILE_MAY_MASK,
.net = SVC_NET(rqstp),
+ .gc = want_gc,
};
bool open_retry = true;
struct nfsd_file *nf;
@@ -1060,8 +1099,12 @@ retry:
if (nf)
nf = nfsd_file_get(nf);
rcu_read_unlock();
- if (nf)
+
+ if (nf) {
+ if (nfsd_file_lru_remove(nf))
+ WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
goto wait_for_construction;
+ }
nf = nfsd_file_alloc(&key, may_flags);
if (!nf) {
@@ -1094,49 +1137,53 @@ wait_for_construction:
goto out;
}
open_retry = false;
- nfsd_file_put_noref(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
goto retry;
}
- nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
out:
if (status == nfs_ok) {
- if (open)
- this_cpu_inc(nfsd_file_acquisitions);
+ this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
- nfsd_file_put(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
nf = NULL;
}
out_status:
put_cred(key.cred);
- if (open)
- trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
+ trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
return status;
open_file:
trace_nfsd_file_alloc(nf);
nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
if (nf->nf_mark) {
- if (open) {
+ if (file) {
+ get_file(file);
+ nf->nf_file = file;
+ status = nfs_ok;
+ trace_nfsd_file_opened(nf, status);
+ } else {
status = nfsd_open_verified(rqstp, fhp, may_flags,
&nf->nf_file);
trace_nfsd_file_open(nf, status);
- } else
- status = nfs_ok;
+ }
} else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || key.inode->i_nlink == 0)
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
+ if (status == nfs_ok && key.inode->i_nlink == 0)
+ status = nfserr_jukebox;
+ if (status != nfs_ok)
+ nfsd_file_unhash(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
@@ -1144,12 +1191,38 @@ open_file:
}
/**
+ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
+ * @rqstp: the RPC transaction being executed
+ * @fhp: the NFS filehandle of the file to be opened
+ * @may_flags: NFSD_MAY_ settings for the file
+ * @pnf: OUT: new or found "struct nfsd_file" object
+ *
+ * The nfsd_file object returned by this API is reference-counted
+ * and garbage-collected. The object is retained for a few
+ * seconds after the final nfsd_file_put() in case the caller
+ * wants to re-use it.
+ *
+ * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
+ * network byte order is returned.
+ */
+__be32
+nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **pnf)
+{
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, true);
+}
+
+/**
* nfsd_file_acquire - Get a struct nfsd_file with an open file
* @rqstp: the RPC transaction being executed
* @fhp: the NFS filehandle of the file to be opened
* @may_flags: NFSD_MAY_ settings for the file
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * The nfsd_file object returned by this API is reference-counted
+ * but not garbage-collected. The object is unhashed after the
+ * final nfsd_file_put().
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
@@ -1157,24 +1230,30 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, false);
}
/**
- * nfsd_file_create - Get a struct nfsd_file, do not open
+ * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file
* @rqstp: the RPC transaction being executed
* @fhp: the NFS filehandle of the file just created
* @may_flags: NFSD_MAY_ settings for the file
+ * @file: cached, already-open file (may be NULL)
* @pnf: OUT: new or found "struct nfsd_file" object
*
+ * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist,
+ * and @file is non-NULL, use it to instantiate a new nfsd_file instead of
+ * opening a new one.
+ *
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
__be32
-nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, file, pnf, false);
}
/*
@@ -1184,7 +1263,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
{
- unsigned long releases = 0, pages_flushed = 0, evictions = 0;
+ unsigned long releases = 0, evictions = 0;
unsigned long hits = 0, acquisitions = 0;
unsigned int i, count = 0, buckets = 0;
unsigned long lru = 0, total_age = 0;
@@ -1212,7 +1291,6 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
releases += per_cpu(nfsd_file_releases, i);
total_age += per_cpu(nfsd_file_total_age, i);
evictions += per_cpu(nfsd_file_evictions, i);
- pages_flushed += per_cpu(nfsd_file_pages_flushed, i);
}
seq_printf(m, "total entries: %u\n", count);
@@ -1226,6 +1304,5 @@ int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
else
seq_printf(m, "mean age (ms): -\n");
- seq_printf(m, "pages flushed: %lu\n", pages_flushed);
return 0;
}
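
The heart of the filecache rework above is the new reference scheme: the hash table no longer owns a reference of its own, and for garbage-collected entries the final put tries to park the last reference on the LRU rather than freeing immediately. A condensed sketch of the nfsd_file_put() decision tree, with hypothetical flag-test helpers standing in for the real test_bit() calls:

	/* Illustrative condensation of the new put path. */
	void example_put(struct nfsd_file *nf)
	{
		if (nf_is_gc(nf) && nf_is_hashed(nf)) {	/* hypothetical tests */
			if (refcount_dec_not_one(&nf->nf_ref))
				return;			/* not the last reference */
			if (nfsd_file_lru_add(nf)) {	/* park last ref on the LRU */
				if (nf_is_hashed(nf))
					return;		/* laundrette reaps it later */
				if (!nfsd_file_lru_remove(nf))
					return;		/* racing unhash owns the ref */
			}
		}
		if (refcount_dec_and_test(&nf->nf_ref))
			nfsd_file_free(nf);
	}
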
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 357832bac736..41516a4263ea 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -38,6 +38,7 @@ struct nfsd_file {
#define NFSD_FILE_HASHED (0)
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_REFERENCED (2)
+#define NFSD_FILE_GC (3)
unsigned long nf_flags;
struct inode *nf_inode; /* don't deref */
refcount_t nf_ref;
@@ -52,13 +53,15 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
-void nfsd_file_close(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
bool nfsd_file_is_cached(struct inode *inode);
-__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
-__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
+__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **nfp);
int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index 070f90ed09b6..3ca5304440ff 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -15,6 +15,7 @@
#include "flexfilelayoutxdr.h"
#include "pnfs.h"
+#include "vfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PNFS
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 8c854ba3285b..51a4b7885cae 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -195,7 +195,7 @@ struct nfsd_net {
atomic_t nfsd_courtesy_clients;
struct shrinker nfsd_client_shrinker;
- struct delayed_work nfsd_shrinker_work;
+ struct work_struct nfsd_shrinker_work;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 13e6e6897f6c..1457f59f447a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -55,7 +55,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
goto out;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -69,7 +69,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -113,11 +113,11 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
if (error)
goto out_drop_lock;
@@ -246,7 +246,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
struct inode *inode;
- int w;
if (!svcxdr_encode_stat(xdr, resp->status))
return false;
@@ -260,15 +259,6 @@ nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return false;
- rqstp->rq_res.page_len = w = nfsacl_size(
- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
- while (w > 0) {
- if (!*(rqstp->rq_next_page++))
- return true;
- w -= PAGE_SIZE;
- }
-
if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
resp->mask & NFS_ACL, 0))
return false;
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 2fb9ee356455..647108138e8a 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,7 +47,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
resp->mask = argp->mask;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -61,7 +61,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst *rqstp)
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
resp->status = nfserrno(PTR_ERR(acl));
goto fail;
@@ -103,11 +103,11 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
inode_lock(inode);
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_ACCESS,
argp->acl_access);
if (error)
goto out_drop_lock;
- error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT,
+ error = set_posix_acl(&init_user_ns, fh->fh_dentry, ACL_TYPE_DEFAULT,
argp->acl_default);
out_drop_lock:
@@ -171,11 +171,7 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
{
struct nfsd3_getaclres *resp = rqstp->rq_resp;
struct dentry *dentry = resp->fh.fh_dentry;
- struct kvec *head = rqstp->rq_res.head;
struct inode *inode;
- unsigned int base;
- int n;
- int w;
if (!svcxdr_encode_nfsstat3(xdr, resp->status))
return false;
@@ -187,26 +183,12 @@ nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
return false;
- base = (char *)xdr->p - (char *)head->iov_base;
-
- rqstp->rq_res.page_len = w = nfsacl_size(
- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
- while (w > 0) {
- if (!*(rqstp->rq_next_page++))
- return false;
- w -= PAGE_SIZE;
- }
-
- n = nfsacl_encode(&rqstp->rq_res, base, inode,
- resp->acl_access,
- resp->mask & NFS_ACL, 0);
- if (n > 0)
- n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
- resp->acl_default,
- resp->mask & NFS_DFACL,
- NFS_ACL_DEFAULT);
- if (n <= 0)
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
+ resp->mask & NFS_ACL, 0))
+ return false;
+ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
+ resp->mask & NFS_DFACL,
+ NFS_ACL_DEFAULT))
return false;
break;
default:
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 923d9a80df92..d01b29aba662 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -13,6 +13,7 @@
#include "cache.h"
#include "xdr3.h"
#include "vfs.h"
+#include "filecache.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -763,6 +764,7 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
{
struct nfsd3_commitargs *argp = rqstp->rq_argp;
struct nfsd3_commitres *resp = rqstp->rq_resp;
+ struct nfsd_file *nf;
dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
SVCFH_fmt(&argp->fh),
@@ -770,8 +772,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
(unsigned long long) argp->offset);
fh_copy(&resp->fh, &argp->fh);
- resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
+ resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (resp->status)
+ goto out;
+ resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
argp->count, resp->verf);
+ nfsd_file_put(nf);
+out:
return rpc_success;
}
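
For illustration: the v3 COMMIT change above and the v4 change later in this series follow the same acquire/commit/put shape, so nfsd_commit() can rely on a caller-held nfsd_file instead of opening the file itself. A minimal sketch of the pattern, where fhp, offset, count, and verf are placeholder parameter names:

	struct nfsd_file *nf;
	__be32 status;

	status = nfsd_file_acquire_gc(rqstp, fhp,
			NFSD_MAY_WRITE | NFSD_MAY_NOT_BREAK_LEASE, &nf);
	if (status != nfs_ok)
		return status;
	status = nfsd_commit(rqstp, fhp, nf, offset, count, verf);
	nfsd_file_put(nf);	/* drop the reference taken by acquire */
	return status;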
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index bb8e2f6d7d03..518203821790 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -135,7 +135,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
unsigned int flags = 0;
int size = 0;
- pacl = get_acl(inode, ACL_TYPE_ACCESS);
+ pacl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (!pacl)
pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
@@ -147,7 +147,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
if (S_ISDIR(inode->i_mode)) {
flags = NFS4_ACL_DIR;
- dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ dpacl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(dpacl)) {
error = PTR_ERR(dpacl);
goto rel_pacl;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f0e69edf5f0f..2a815f5a52c4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p)
* 1 Protocol"
*/
+static void encode_uint32(struct xdr_stream *xdr, u32 n)
+{
+ WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0);
+}
+
+static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
+ size_t len)
+{
+ WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+}
+
/*
* nfs_cb_opnum4
*
@@ -329,6 +340,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
}
/*
+ * CB_RECALLANY4args
+ *
+ * struct CB_RECALLANY4args {
+ * uint32_t craa_objects_to_keep;
+ * bitmap4 craa_type_mask;
+ * };
+ */
+static void
+encode_cb_recallany4args(struct xdr_stream *xdr,
+ struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra)
+{
+ encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY);
+ encode_uint32(xdr, ra->ra_keep);
+ encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval));
+ hdr->nops++;
+}
+
+/*
* CB_SEQUENCE4args
*
* struct CB_SEQUENCE4args {
@@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_cb_nops(&hdr);
}
+/*
+ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
+ */
+static void
+nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req,
+ struct xdr_stream *xdr, const void *data)
+{
+ const struct nfsd4_callback *cb = data;
+ struct nfsd4_cb_recall_any *ra;
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = cb->cb_clp->cl_cb_ident,
+ .minorversion = cb->cb_clp->cl_minorversion,
+ };
+
+ ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb);
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_recallany4args(xdr, &hdr, ra);
+ encode_cb_nops(&hdr);
+}
/*
* NFSv4.0 and NFSv4.1 XDR decode functions
@@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
}
+/*
+ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
+ */
+static int
+nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfsd4_callback *cb = data;
+ struct nfs4_cb_compound_hdr hdr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ return status;
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status || cb->cb_seq_status))
+ return status;
+ status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status);
+ return status;
+}
+
#ifdef CONFIG_NFSD_PNFS
/*
* CB_LAYOUTRECALL4args
@@ -783,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
#endif
PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
+ PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any),
};
static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -870,7 +942,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r
} else {
struct cred *kcred;
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
if (!kcred)
return NULL;
@@ -916,7 +988,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
} else {
if (!conn->cb_xprt)
return -EINVAL;
- clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
@@ -936,6 +1007,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
rpc_shutdown_client(client);
return -ENOMEM;
}
+
+ if (clp->cl_minorversion != 0)
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
rcu_read_lock();
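
As a worked example of the new encoder (layout inferred from encode_cb_recallany4args() and the one-word ra_bmval that deleg_reaper() fills in later in this series), a CB_RECALL_ANY asking the client to keep zero objects and to recall read delegations emits four 4-byte XDR words:

/*
 *	OP_CB_RECALL_ANY			encode_nfs_cb_opnum4()
 *	craa_objects_to_keep = 0		encode_uint32(xdr, ra->ra_keep)
 *	bitmap4 length = 1			encode_bitmap4() array header
 *	word 0 = BIT(RCA4_TYPE_MASK_RDATA_DLG)	bitmap contents
 */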
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e70a1a2999b7..5e9809aff37e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -41,6 +41,7 @@
#include "idmap.h"
#include "nfsd.h"
#include "netns.h"
+#include "vfs.h"
/*
* Turn off idmapping when using AUTH_SYS.
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8beb2bc4c328..f189ba7995f5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -731,10 +731,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
union nfsd4_op_u *u)
{
struct nfsd4_commit *commit = &u->commit;
+ struct nfsd_file *nf;
+ __be32 status;
+
+ status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE |
+ NFSD_MAY_NOT_BREAK_LEASE, &nf);
+ if (status != nfs_ok)
+ return status;
- return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
+ status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset,
commit->co_count,
(__be32 *)commit->co_verf.data);
+ nfsd_file_put(nf);
+ return status;
}
static __be32
@@ -928,18 +937,13 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&read->rd_stateid, RD_STATE,
&read->rd_nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
- goto out;
- }
- status = nfs_ok;
-out:
+
read->rd_rqstp = rqstp;
read->rd_fhp = &cstate->current_fh;
return status;
@@ -1108,10 +1112,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate,
&cstate->current_fh, &setattr->sa_stateid,
WR_STATE, NULL, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
+ if (status)
return status;
- }
}
err = fh_want_write(&cstate->current_fh);
if (err)
@@ -1133,6 +1135,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
0, (time64_t)0);
if (!status)
status = nfserrno(attrs.na_labelerr);
+ if (!status)
+ status = nfserrno(attrs.na_aclerr);
out:
nfsd_attrs_free(&attrs);
fh_drop_write(&cstate->current_fh);
@@ -1159,10 +1163,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
write->wr_offset, cnt);
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
stateid, WR_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
+ if (status)
return status;
- }
write->wr_how_written = write->wr_stable_how;
@@ -1193,17 +1195,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
src_stateid, RD_STATE, src, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ if (status)
goto out;
- }
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
dst_stateid, WR_STATE, dst, NULL);
- if (status) {
- dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ if (status)
goto out_put_src;
- }
/* fix up for NFS-specific error code */
if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
@@ -1320,6 +1318,7 @@ try_again:
/* allow 20secs for mount/unmount for now - revisit */
if (signal_pending(current) ||
(schedule_timeout(20*HZ) == 0)) {
+ finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
return nfserr_eagain;
}
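
The one-line addition above closes a wait-queue leak; a hedged reconstruction of the surrounding wait loop (details elided) shows why every exit path needs it:

	prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_INTERRUPTIBLE);
	/* ... lookup of the in-progress mount elided ... */
	if (signal_pending(current) || (schedule_timeout(20*HZ) == 0)) {
		/* unlink the on-stack wait_queue_entry before the
		 * early return, or nfsd_ssc_waitq keeps pointing
		 * into a dead stack frame */
		finish_wait(&nn->nfsd_ssc_waitq, &wait);
		kfree(work);
		return nfserr_eagain;
	}
	finish_wait(&nn->nfsd_ssc_waitq, &wait);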
@@ -1463,13 +1462,6 @@ out_err:
return status;
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
- nfs_do_sb_deactive(ss_mnt->mnt_sb);
- mntput(ss_mnt);
-}
-
/*
* Verify COPY destination stateid.
*
@@ -1572,11 +1564,6 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
{
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
-}
-
static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh,
nfs4_stateid *stateid)
@@ -1644,6 +1631,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
u64 src_pos = copy->cp_src_pos;
u64 dst_pos = copy->cp_dst_pos;
int status;
+ loff_t end;
/* See RFC 7862 p.67: */
if (bytes_total == 0)
@@ -1663,8 +1651,8 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
/* for a non-zero asynchronous copy do a commit of data */
if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) {
since = READ_ONCE(dst->f_wb_err);
- status = vfs_fsync_range(dst, copy->cp_dst_pos,
- copy->cp_res.wr_bytes_written, 0);
+ end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1;
+ status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0);
if (!status)
status = filemap_check_wb_err(dst->f_mapping, since);
if (!status)
@@ -1771,7 +1759,7 @@ static int nfsd4_do_async_copy(void *data)
default:
nfserr = nfserr_offload_denied;
}
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ /* ss_mnt will be unmounted by the laundromat */
goto do_callback;
}
nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
@@ -1852,8 +1840,10 @@ out_err:
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- if (nfsd4_ssc_is_inter(copy))
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ /*
+ * the source's vfsmount for an inter-server copy will be
+ * unmounted by the laundromat
+ */
goto out;
}
@@ -1948,10 +1938,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &nf, NULL);
- if (status != nfs_ok) {
- dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
+ if (status != nfs_ok)
return status;
- }
status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
fallocate->falloc_offset,
@@ -2007,10 +1995,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &nf, NULL);
- if (status) {
- dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
+ if (status)
return status;
- }
switch (seek->seek_whence) {
case NFS4_CONTENT_DATA:
@@ -2622,12 +2608,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
cstate->minorversion = args->minorversion;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
-
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -2749,7 +2734,7 @@ encode_op:
out:
cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
- __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
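
A note on the __clear_bit()/__set_bit() to clear_bit()/set_bit() conversions in this file: the double-underscored variants are non-atomic read-modify-write operations, so a concurrent atomic update to another bit in the same rq_flags word can be lost. An illustrative interleaving (RQ_BUSY is another rq_flags bit):

/*
 * CPU0: __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags)
 *         loads the word, clears the bit in a register, ...
 * CPU1: set_bit(RQ_BUSY, &rqstp->rq_flags)   - atomic, lands
 * CPU0: ... stores the stale word back, wiping RQ_BUSY
 *
 * clear_bit()/set_bit() perform the whole read-modify-write
 * atomically and cannot lose the concurrent update.
 */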
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 836bd825ca4a..4ef529379065 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,7 +44,9 @@
#include <linux/jhash.h>
#include <linux/string_helpers.h>
#include <linux/fsnotify.h>
+#include <linux/rhashtable.h>
#include <linux/nfs_ssc.h>
+
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -84,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
void nfsd4_end_grace(struct nfsd_net *nn);
static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
+static void nfsd4_file_hash_remove(struct nfs4_file *fi);
/* Locking: */
@@ -588,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
void
put_nfs4_file(struct nfs4_file *fi)
{
- might_lock(&state_lock);
-
- if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
- hlist_del_rcu(&fi->fi_hash);
- spin_unlock(&state_lock);
+ if (refcount_dec_and_test(&fi->fi_ref)) {
+ nfsd4_file_hash_remove(fi);
WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
@@ -675,15 +675,26 @@ find_any_file(struct nfs4_file *f)
return ret;
}
-static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+static struct nfsd_file *find_any_file_locked(struct nfs4_file *f)
{
- struct nfsd_file *ret = NULL;
+ lockdep_assert_held(&f->fi_lock);
+
+ if (f->fi_fds[O_RDWR])
+ return f->fi_fds[O_RDWR];
+ if (f->fi_fds[O_WRONLY])
+ return f->fi_fds[O_WRONLY];
+ if (f->fi_fds[O_RDONLY])
+ return f->fi_fds[O_RDONLY];
+ return NULL;
+}
+
+static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f)
+{
+ lockdep_assert_held(&f->fi_lock);
- spin_lock(&f->fi_lock);
if (f->fi_deleg_file)
- ret = nfsd_file_get(f->fi_deleg_file);
- spin_unlock(&f->fi_lock);
- return ret;
+ return f->fi_deleg_file;
+ return NULL;
}
static atomic_long_t num_delegations;
@@ -706,19 +717,20 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
return ret & OWNER_HASH_MASK;
}
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS 8
-#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
+static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
-static unsigned int file_hashval(struct svc_fh *fh)
-{
- struct inode *inode = d_inode(fh->fh_dentry);
+static const struct rhashtable_params nfs4_file_rhash_params = {
+ .key_len = sizeof_field(struct nfs4_file, fi_inode),
+ .key_offset = offsetof(struct nfs4_file, fi_inode),
+ .head_offset = offsetof(struct nfs4_file, fi_rlist),
- /* XXX: why not (here & in file cache) use inode? */
- return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS);
-}
-
-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
+ /*
+ * Start with a single page hash table to reduce resizing churn
+ * on light workloads.
+ */
+ .min_size = 256,
+ .automatic_shrinking = true,
+};
/*
* Check if courtesy clients have conflicting access and resolve it if possible
@@ -831,9 +843,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
swap(f2, fp->fi_fds[O_RDWR]);
spin_unlock(&fp->fi_lock);
if (f1)
- nfsd_file_close(f1);
+ nfsd_file_put(f1);
if (f2)
- nfsd_file_close(f2);
+ nfsd_file_put(f2);
}
}
@@ -1355,6 +1367,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ trace_nfsd_stid_revoke(&dp->dl_stid);
+
if (clp->cl_minorversion) {
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
@@ -1819,13 +1833,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
int numslots = fattrs->maxreqs;
int slotsize = slot_bytes(fattrs);
struct nfsd4_session *new;
- int mem, i;
+ int i;
- BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
- + sizeof(struct nfsd4_session) > PAGE_SIZE);
- mem = numslots * sizeof(struct nfsd4_slot *);
+ BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION)
+ > PAGE_SIZE);
- new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
+ new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL);
if (!new)
return NULL;
/* allocate each struct nfsd4_slot and data cache in one piece */
@@ -2131,6 +2144,7 @@ static void __free_client(struct kref *k)
kfree(clp->cl_nii_domain.data);
kfree(clp->cl_nii_name.data);
idr_destroy(&clp->cl_stateids);
+ kfree(clp->cl_ra);
kmem_cache_free(client_slab, clp);
}
@@ -2613,9 +2627,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2637,8 +2653,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2652,9 +2668,10 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2674,8 +2691,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2687,9 +2704,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = find_deleg_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_deleg_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- ");
nfs4_show_stateid(s, &st->sc_stateid);
@@ -2705,8 +2723,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_fname(s, file);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2854,6 +2872,37 @@ static const struct tree_descr client_files[] = {
[3] = {""},
};
+static int
+nfsd4_cb_recall_any_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+{
+ trace_nfsd_cb_recall_any_done(cb, task);
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ rpc_delay(task, 2 * HZ);
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void
+nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
+ put_client_renew_locked(clp);
+ spin_unlock(&nn->client_lock);
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
+ .done = nfsd4_cb_recall_any_done,
+ .release = nfsd4_cb_recall_any_release,
+};
+
static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
@@ -2891,6 +2940,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
free_client(clp);
return NULL;
}
+ clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL);
+ if (!clp->cl_ra) {
+ free_client(clp);
+ return NULL;
+ }
+ clp->cl_ra_time = 0;
+ nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops,
+ NFSPROC4_CLNT_CB_RECALL_ANY);
return clp;
}
@@ -4260,11 +4317,9 @@ static struct nfs4_file *nfsd4_alloc_file(void)
}
/* OPEN Share state helper functions */
-static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
- struct nfs4_file *fp)
-{
- lockdep_assert_held(&state_lock);
+static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
+{
refcount_set(&fp->fi_ref, 1);
spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
@@ -4282,7 +4337,6 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval,
INIT_LIST_HEAD(&fp->fi_lo_states);
atomic_set(&fp->fi_lo_recalls, 0);
#endif
- hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
void
@@ -4347,25 +4401,27 @@ out:
}
static unsigned long
-nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
+nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
- int cnt;
+ int count;
struct nfsd_net *nn = container_of(shrink,
struct nfsd_net, nfsd_client_shrinker);
- cnt = atomic_read(&nn->nfsd_courtesy_clients);
- if (cnt > 0)
- mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
- return (unsigned long)cnt;
+ count = atomic_read(&nn->nfsd_courtesy_clients);
+ if (!count)
+ count = atomic_long_read(&num_delegations);
+ if (count)
+ queue_work(laundry_wq, &nn->nfsd_shrinker_work);
+ return (unsigned long)count;
}
static unsigned long
-nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
+nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
{
return SHRINK_STOP;
}
-int
+void
nfsd4_init_leases_net(struct nfsd_net *nn)
{
struct sysinfo si;
@@ -4387,16 +4443,6 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
atomic_set(&nn->nfsd_courtesy_clients, 0);
- nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
- nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
- nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
- return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
-}
-
-void
-nfsd4_leases_net_shutdown(struct nfsd_net *nn)
-{
- unregister_shrinker(&nn->nfsd_client_shrinker);
}
static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -4667,71 +4713,80 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
nfs4_put_stid(&last->st_stid);
}
-/* search file_hashtbl[] for file */
-static struct nfs4_file *
-find_file_locked(struct svc_fh *fh, unsigned int hashval)
+static noinline_for_stack struct nfs4_file *
+nfsd4_file_hash_lookup(const struct svc_fh *fhp)
{
- struct nfs4_file *fp;
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ struct rhlist_head *tmp, *list;
+ struct nfs4_file *fi;
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- return fp;
+ rcu_read_lock();
+ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (refcount_inc_not_zero(&fi->fi_ref)) {
+ rcu_read_unlock();
+ return fi;
+ }
}
}
+ rcu_read_unlock();
return NULL;
}
-static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh,
- unsigned int hashval)
+/*
+ * On hash insertion, identify entries with the same inode but
+ * distinct filehandles. They will all be on the list returned
+ * by rhltable_lookup().
+ *
+ * inode->i_lock prevents racing insertions from adding an entry
+ * for the same inode/fhp pair twice.
+ */
+static noinline_for_stack struct nfs4_file *
+nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp)
{
- struct nfs4_file *fp;
+ struct inode *inode = d_inode(fhp->fh_dentry);
+ struct rhlist_head *tmp, *list;
struct nfs4_file *ret = NULL;
bool alias_found = false;
+ struct nfs4_file *fi;
+ int err;
- spin_lock(&state_lock);
- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
- lockdep_is_held(&state_lock)) {
- if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) {
- if (refcount_inc_not_zero(&fp->fi_ref))
- ret = fp;
- } else if (d_inode(fh->fh_dentry) == fp->fi_inode)
- fp->fi_aliased = alias_found = true;
- }
- if (likely(ret == NULL)) {
- nfsd4_init_file(fh, hashval, new);
- new->fi_aliased = alias_found;
- ret = new;
+ rcu_read_lock();
+ spin_lock(&inode->i_lock);
+
+ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
+ nfs4_file_rhash_params);
+ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
+ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
+ if (refcount_inc_not_zero(&fi->fi_ref))
+ ret = fi;
+ } else
+ fi->fi_aliased = alias_found = true;
}
- spin_unlock(&state_lock);
- return ret;
-}
+ if (ret)
+ goto out_unlock;
-static struct nfs4_file * find_file(struct svc_fh *fh)
-{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
+ nfsd4_file_init(fhp, new);
+ err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist,
+ nfs4_file_rhash_params);
+ if (err)
+ goto out_unlock;
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
+ new->fi_aliased = alias_found;
+ ret = new;
+
+out_unlock:
+ spin_unlock(&inode->i_lock);
rcu_read_unlock();
- return fp;
+ return ret;
}
-static struct nfs4_file *
-find_or_add_file(struct nfs4_file *new, struct svc_fh *fh)
+static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi)
{
- struct nfs4_file *fp;
- unsigned int hashval = file_hashval(fh);
-
- rcu_read_lock();
- fp = find_file_locked(fh, hashval);
- rcu_read_unlock();
- if (fp)
- return fp;
-
- return insert_file(new, fh, hashval);
+ rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
+ nfs4_file_rhash_params);
}
/*
@@ -4744,9 +4799,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
struct nfs4_file *fp;
__be32 ret = nfs_ok;
- fp = find_file(current_fh);
+ fp = nfsd4_file_hash_lookup(current_fh);
if (!fp)
return ret;
+
/* Check for conflicting share reservations */
spin_lock(&fp->fi_lock);
if (fp->fi_share_deny & deny_type)
@@ -4758,7 +4814,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
static bool nfsd4_deleg_present(const struct inode *inode)
{
- struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
+ struct file_lock_context *ctx = locks_inode_context(inode);
return ctx && !list_empty_careful(&ctx->flc_lease);
}
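
For reference, the helper this hunk (and the two later ones in this file) switch to is, on my reading of include/linux/fs.h at the time, just the acquire-load wrapper that the removed line spelled out by hand:

static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
	/* pairs with the release store that publishes i_flctx */
	return smp_load_acquire(&inode->i_flctx);
}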
@@ -5196,18 +5252,10 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- if (!open->op_filp) {
- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
- if (status != nfs_ok)
- goto out_put_access;
- } else {
- status = nfsd_file_create(rqstp, cur_fh, access, &nf);
- if (status != nfs_ok)
- goto out_put_access;
- nf->nf_file = open->op_filp;
- open->op_filp = NULL;
- trace_nfsd_file_create(rqstp, access, nf);
- }
+ status = nfsd_file_acquire_opened(rqstp, cur_fh, access,
+ open->op_filp, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
@@ -5620,7 +5668,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* and check for delegations in the process of being recalled.
* If not found, create the nfs4_file struct
*/
- fp = find_or_add_file(open->op_file, current_fh);
+ fp = nfsd4_file_hash_insert(open->op_file, current_fh);
+ if (unlikely(!fp))
+ return nfserr_jukebox;
if (fp != open->op_file) {
status = nfs4_check_deleg(cl, open, &dp);
if (status)
@@ -5897,7 +5947,7 @@ nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
nf = stp->st_stid.sc_file;
- ctx = nf->fi_inode->i_flctx;
+ ctx = locks_inode_context(nf->fi_inode);
if (!ctx)
continue;
if (locks_owner_has_blockers(ctx, lo))
@@ -6125,17 +6175,63 @@ laundromat_main(struct work_struct *laundry)
}
static void
-courtesy_client_reaper(struct work_struct *reaper)
+courtesy_client_reaper(struct nfsd_net *nn)
{
struct list_head reaplist;
- struct delayed_work *dwork = to_delayed_work(reaper);
- struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
- nfsd_shrinker_work);
nfs4_get_courtesy_client_reaplist(nn, &reaplist);
nfs4_process_client_reaplist(&reaplist);
}
+static void
+deleg_reaper(struct nfsd_net *nn)
+{
+ struct list_head *pos, *next;
+ struct nfs4_client *clp;
+ struct list_head cblist;
+
+ INIT_LIST_HEAD(&cblist);
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
+ clp = list_entry(pos, struct nfs4_client, cl_lru);
+ if (clp->cl_state != NFSD4_ACTIVE ||
+ list_empty(&clp->cl_delegations) ||
+ atomic_read(&clp->cl_delegs_in_recall) ||
+ test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) ||
+ (ktime_get_boottime_seconds() -
+ clp->cl_ra_time < 5)) {
+ continue;
+ }
+ list_add(&clp->cl_ra_cblist, &cblist);
+
+ /* release in nfsd4_cb_recall_any_release */
+ atomic_inc(&clp->cl_rpc_users);
+ set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
+ clp->cl_ra_time = ktime_get_boottime_seconds();
+ }
+ spin_unlock(&nn->client_lock);
+
+ while (!list_empty(&cblist)) {
+ clp = list_first_entry(&cblist, struct nfs4_client,
+ cl_ra_cblist);
+ list_del_init(&clp->cl_ra_cblist);
+ clp->cl_ra->ra_keep = 0;
+ clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
+ trace_nfsd_cb_recall_any(clp->cl_ra);
+ nfsd4_run_cb(&clp->cl_ra->ra_cb);
+ }
+}
+
+static void
+nfsd4_state_shrinker_worker(struct work_struct *work)
+{
+ struct nfsd_net *nn = container_of(work, struct nfsd_net,
+ nfsd_shrinker_work);
+
+ courtesy_client_reaper(nn);
+ deleg_reaper(nn);
+}
+
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
{
if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
@@ -6902,6 +6998,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (status)
goto put_stateid;
+ trace_nfsd_deleg_return(stateid);
wake_up_var(d_inode(cstate->current_fh.fh_dentry));
destroy_delegation(dp);
put_stateid:
@@ -7713,7 +7810,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
inode = locks_inode(nf->nf_file);
- flctx = inode->i_flctx;
+ flctx = locks_inode_context(inode);
if (flctx && !list_empty_careful(&flctx->flc_posix)) {
spin_lock(&flctx->flc_lock);
@@ -7958,11 +8055,20 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->blocked_locks_lru);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
- INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper);
+ INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
+ nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+
+ if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"))
+ goto err_shrinker;
return 0;
+err_shrinker:
+ put_net(net);
+ kfree(nn->sessionid_hashtbl);
err_sessionid:
kfree(nn->unconf_id_hashtbl);
err_unconf_id:
@@ -8034,10 +8140,16 @@ nfs4_state_start(void)
{
int ret;
- ret = nfsd4_create_callback_queue();
+ ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
if (ret)
return ret;
+ ret = nfsd4_create_callback_queue();
+ if (ret) {
+ rhltable_destroy(&nfs4_file_rhltable);
+ return ret;
+ }
+
set_max_delegations();
return 0;
}
@@ -8049,6 +8161,8 @@ nfs4_state_shutdown_net(struct net *net)
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_shrinker(&nn->nfsd_client_shrinker);
+ cancel_work(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
@@ -8068,6 +8182,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ rhltable_destroy(&nfs4_file_rhltable);
#ifdef CONFIG_NFSD_V4_2_INTER_SSC
nfsd4_ssc_shutdown_umount(nn);
#endif
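
Two points worth drawing out of the nfs4state.c shrinker rework above. First, ->count_objects only reports a count and kicks nfsd_shrinker_work, while ->scan_objects returns SHRINK_STOP: the actual reaping takes client_lock and issues backchannel RPCs, which is not safe from direct-reclaim context, so all real work is deferred to the laundry_wq worker. Second, the teardown order in nfs4_state_shutdown_net() matters; a condensed sketch, with the lines taken from the hunks above:

	/* unregister first so ->count_objects can no longer queue
	 * new work, then cancel anything already queued */
	unregister_shrinker(&nn->nfsd_client_shrinker);
	cancel_work(&nn->nfsd_shrinker_work);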
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index bcfeb1a922c0..97edb32be77f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -770,16 +770,18 @@ nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp,
- struct nfsd4_access *access)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_access *access = &u->access;
if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0)
return nfserr_bad_xdr;
return nfs_ok;
}
static __be32
-nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
+nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_close *close = &u->close;
if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0)
return nfserr_bad_xdr;
return nfsd4_decode_stateid4(argp, &close->cl_stateid);
@@ -787,8 +789,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
static __be32
-nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
+nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_commit *commit = &u->commit;
if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0)
return nfserr_bad_xdr;
if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
@@ -798,8 +801,9 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
}
static __be32
-nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
+nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_create *create = &u->create;
__be32 *p, status;
memset(create, 0, sizeof(*create));
@@ -844,22 +848,25 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
}
static inline __be32
-nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
+nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_delegreturn *dr = &u->delegreturn;
return nfsd4_decode_stateid4(argp, &dr->dr_stateid);
}
static inline __be32
-nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
+nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_getattr *getattr = &u->getattr;
memset(getattr, 0, sizeof(*getattr));
return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
ARRAY_SIZE(getattr->ga_bmval));
}
static __be32
-nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
+nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_link *link = &u->link;
memset(link, 0, sizeof(*link));
return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
}
@@ -907,8 +914,9 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
}
static __be32
-nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
+nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lock *lock = &u->lock;
memset(lock, 0, sizeof(*lock));
if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
return nfserr_bad_xdr;
@@ -924,8 +932,9 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
}
static __be32
-nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
+nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lockt *lockt = &u->lockt;
memset(lockt, 0, sizeof(*lockt));
if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
return nfserr_bad_xdr;
@@ -940,8 +949,9 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
}
static __be32
-nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
+nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_locku *locku = &u->locku;
__be32 status;
if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0)
@@ -962,8 +972,9 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
}
static __be32
-nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
+nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_lookup *lookup = &u->lookup;
return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len);
}
@@ -1143,8 +1154,9 @@ nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp,
}
static __be32
-nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
+nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_open *open = &u->open;
__be32 status;
u32 dummy;
@@ -1171,8 +1183,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
}
static __be32
-nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
+nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_confirm *open_conf = &u->open_confirm;
__be32 status;
if (argp->minorversion >= 1)
@@ -1190,8 +1204,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
}
static __be32
-nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
+nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_downgrade *open_down = &u->open_downgrade;
__be32 status;
memset(open_down, 0, sizeof(*open_down));
@@ -1209,8 +1225,9 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
}
static __be32
-nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
+nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_putfh *putfh = &u->putfh;
__be32 *p;
if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0)
@@ -1229,7 +1246,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
}
static __be32
-nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
if (argp->minorversion == 0)
return nfs_ok;
@@ -1237,8 +1254,9 @@ nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
}
static __be32
-nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
+nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_read *read = &u->read;
__be32 status;
memset(read, 0, sizeof(*read));
@@ -1254,8 +1272,9 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
}
static __be32
-nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
+nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_readdir *readdir = &u->readdir;
__be32 status;
memset(readdir, 0, sizeof(*readdir));
@@ -1276,15 +1295,17 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
}
static __be32
-nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
+nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_remove *remove = &u->remove;
memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo));
return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
}
static __be32
-nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
+nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_rename *rename = &u->rename;
__be32 status;
memset(rename, 0, sizeof(*rename));
@@ -1295,22 +1316,25 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
}
static __be32
-nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ clientid_t *clientid = &u->renew;
return nfsd4_decode_clientid4(argp, clientid);
}
static __be32
nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
- struct nfsd4_secinfo *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo *secinfo = &u->secinfo;
secinfo->si_exp = NULL;
return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
}
static __be32
-nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
+nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_setattr *setattr = &u->setattr;
__be32 status;
memset(setattr, 0, sizeof(*setattr));
@@ -1324,8 +1348,9 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
}
static __be32
-nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
+nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid *setclientid = &u->setclientid;
__be32 *p, status;
memset(setclientid, 0, sizeof(*setclientid));
@@ -1367,8 +1392,10 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
}
static __be32
-nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
+nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm;
__be32 status;
if (argp->minorversion >= 1)
@@ -1382,8 +1409,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
/* Also used for NVERIFY */
static __be32
-nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
+nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_verify *verify = &u->verify;
__be32 *p, status;
memset(verify, 0, sizeof(*verify));
@@ -1409,8 +1437,9 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
}
static __be32
-nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
+nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_write *write = &u->write;
__be32 status;
status = nfsd4_decode_stateid4(argp, &write->wr_stateid);
@@ -1434,8 +1463,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
}
static __be32
-nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
+nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
__be32 status;
if (argp->minorversion >= 1)
@@ -1452,16 +1483,20 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
return nfs_ok;
}
-static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl;
memset(bc, 0, sizeof(*bc));
if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
return nfserr_bad_xdr;
return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
}
-static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
+static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
u32 use_conn_in_rdma_mode;
__be32 status;
@@ -1603,8 +1638,9 @@ nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
- struct nfsd4_exchange_id *exid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
__be32 status;
memset(exid, 0, sizeof(*exid));
@@ -1656,8 +1692,9 @@ nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_create_session *sess)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create_session *sess = &u->create_session;
__be32 status;
memset(sess, 0, sizeof(*sess));
@@ -1681,23 +1718,26 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
- struct nfsd4_destroy_session *destroy_session)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_destroy_session *destroy_session = &u->destroy_session;
return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid);
}
static __be32
nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
- struct nfsd4_free_stateid *free_stateid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_free_stateid *free_stateid = &u->free_stateid;
return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
}
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
- struct nfsd4_getdeviceinfo *gdev)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
__be32 status;
memset(gdev, 0, sizeof(*gdev));
@@ -1717,8 +1757,9 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutcommit *lcp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
__be32 *p, status;
memset(lcp, 0, sizeof(*lcp));
@@ -1753,8 +1794,9 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutget *lgp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
__be32 status;
memset(lgp, 0, sizeof(*lgp));
@@ -1781,8 +1823,9 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
- struct nfsd4_layoutreturn *lrp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
memset(lrp, 0, sizeof(*lrp));
if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
return nfserr_bad_xdr;
@@ -1795,8 +1838,9 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
#endif /* CONFIG_NFSD_PNFS */
static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
- struct nfsd4_secinfo_no_name *sin)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name;
if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
return nfserr_bad_xdr;
@@ -1806,8 +1850,9 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
- struct nfsd4_sequence *seq)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_sequence *seq = &u->sequence;
__be32 *p, status;
status = nfsd4_decode_sessionid4(argp, &seq->sessionid);
@@ -1826,8 +1871,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
}
static __be32
-nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
+nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
struct nfsd4_test_stateid_id *stateid;
__be32 status;
u32 i;
@@ -1852,14 +1899,16 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
}
static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp,
- struct nfsd4_destroy_clientid *dc)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_destroy_clientid *dc = &u->destroy_clientid;
return nfsd4_decode_clientid4(argp, &dc->clientid);
}
static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
- struct nfsd4_reclaim_complete *rc)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0)
return nfserr_bad_xdr;
return nfs_ok;
@@ -1867,8 +1916,9 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
- struct nfsd4_fallocate *fallocate)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_fallocate *fallocate = &u->allocate;
__be32 status;
status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid);
@@ -1924,8 +1974,9 @@ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
}
static __be32
-nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_copy *copy = &u->copy;
u32 consecutive, i, count, sync;
struct nl4_server *ns_dummy;
__be32 status;
@@ -1982,8 +2033,9 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
static __be32
nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
- struct nfsd4_copy_notify *cn)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
__be32 status;
memset(cn, 0, sizeof(*cn));
@@ -2002,16 +2054,18 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
- struct nfsd4_offload_status *os)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_offload_status *os = &u->offload_status;
os->count = 0;
os->status = 0;
return nfsd4_decode_stateid4(argp, &os->stateid);
}
static __be32
-nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_seek *seek = &u->seek;
__be32 status;
status = nfsd4_decode_stateid4(argp, &seek->seek_stateid);
@@ -2028,8 +2082,9 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
}
static __be32
-nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
{
+ struct nfsd4_clone *clone = &u->clone;
__be32 status;
status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid);
@@ -2154,8 +2209,9 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
*/
static __be32
nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_getxattr *getxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getxattr *getxattr = &u->getxattr;
__be32 status;
u32 maxcount;
@@ -2173,8 +2229,9 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_setxattr *setxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setxattr *setxattr = &u->setxattr;
u32 flags, maxcount, size;
__be32 status;
@@ -2214,8 +2271,9 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
- struct nfsd4_listxattrs *listxattrs)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
u32 maxcount;
memset(listxattrs, 0, sizeof(*listxattrs));
@@ -2245,113 +2303,114 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
- struct nfsd4_removexattr *removexattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_removexattr *removexattr = &u->removexattr;
memset(removexattr, 0, sizeof(*removexattr));
return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
}
static __be32
-nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
return nfs_ok;
}
static __be32
-nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
{
return nfserr_notsupp;
}
-typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u);
static const nfsd4_dec nfsd4_dec_ops[] = {
- [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
- [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
- [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
- [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
- [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
- [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
- [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
- [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
- [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
- [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
- [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
- [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
- [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
- [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
- [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh,
- [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
- [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
- [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
- [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
- [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew,
- [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
- [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
- [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid,
- [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
- [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
- [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
+ [OP_ACCESS] = nfsd4_decode_access,
+ [OP_CLOSE] = nfsd4_decode_close,
+ [OP_COMMIT] = nfsd4_decode_commit,
+ [OP_CREATE] = nfsd4_decode_create,
+ [OP_DELEGPURGE] = nfsd4_decode_notsupp,
+ [OP_DELEGRETURN] = nfsd4_decode_delegreturn,
+ [OP_GETATTR] = nfsd4_decode_getattr,
+ [OP_GETFH] = nfsd4_decode_noop,
+ [OP_LINK] = nfsd4_decode_link,
+ [OP_LOCK] = nfsd4_decode_lock,
+ [OP_LOCKT] = nfsd4_decode_lockt,
+ [OP_LOCKU] = nfsd4_decode_locku,
+ [OP_LOOKUP] = nfsd4_decode_lookup,
+ [OP_LOOKUPP] = nfsd4_decode_noop,
+ [OP_NVERIFY] = nfsd4_decode_verify,
+ [OP_OPEN] = nfsd4_decode_open,
+ [OP_OPENATTR] = nfsd4_decode_notsupp,
+ [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade,
+ [OP_PUTFH] = nfsd4_decode_putfh,
+ [OP_PUTPUBFH] = nfsd4_decode_putpubfh,
+ [OP_PUTROOTFH] = nfsd4_decode_noop,
+ [OP_READ] = nfsd4_decode_read,
+ [OP_READDIR] = nfsd4_decode_readdir,
+ [OP_READLINK] = nfsd4_decode_noop,
+ [OP_REMOVE] = nfsd4_decode_remove,
+ [OP_RENAME] = nfsd4_decode_rename,
+ [OP_RENEW] = nfsd4_decode_renew,
+ [OP_RESTOREFH] = nfsd4_decode_noop,
+ [OP_SAVEFH] = nfsd4_decode_noop,
+ [OP_SECINFO] = nfsd4_decode_secinfo,
+ [OP_SETATTR] = nfsd4_decode_setattr,
+ [OP_SETCLIENTID] = nfsd4_decode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm,
+ [OP_VERIFY] = nfsd4_decode_verify,
+ [OP_WRITE] = nfsd4_decode_write,
+ [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner,
/* new operations for NFSv4.1 */
- [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
- [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
- [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
- [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
- [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
- [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl,
+ [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id,
+ [OP_CREATE_SESSION] = nfsd4_decode_create_session,
+ [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session,
+ [OP_FREE_STATEID] = nfsd4_decode_free_stateid,
+ [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp,
#ifdef CONFIG_NFSD_PNFS
- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+ [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo,
+ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn,
#else
- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_GETDEVICEINFO] = nfsd4_decode_notsupp,
+ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp,
+ [OP_LAYOUTGET] = nfsd4_decode_notsupp,
+ [OP_LAYOUTRETURN] = nfsd4_decode_notsupp,
#endif
- [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
- [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
- [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
- [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
- [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
+ [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name,
+ [OP_SEQUENCE] = nfsd4_decode_sequence,
+ [OP_SET_SSV] = nfsd4_decode_notsupp,
+ [OP_TEST_STATEID] = nfsd4_decode_test_stateid,
+ [OP_WANT_DELEGATION] = nfsd4_decode_notsupp,
+ [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid,
+ [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete,
/* new operations for NFSv4.2 */
- [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
- [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify,
- [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
- [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
- [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
- [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
+ [OP_ALLOCATE] = nfsd4_decode_fallocate,
+ [OP_COPY] = nfsd4_decode_copy,
+ [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify,
+ [OP_DEALLOCATE] = nfsd4_decode_fallocate,
+ [OP_IO_ADVISE] = nfsd4_decode_notsupp,
+ [OP_LAYOUTERROR] = nfsd4_decode_notsupp,
+ [OP_LAYOUTSTATS] = nfsd4_decode_notsupp,
+ [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status,
+ [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status,
+ [OP_READ_PLUS] = nfsd4_decode_read,
+ [OP_SEEK] = nfsd4_decode_seek,
+ [OP_WRITE_SAME] = nfsd4_decode_notsupp,
+ [OP_CLONE] = nfsd4_decode_clone,
/* RFC 8276 extended attributes operations */
- [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr,
- [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr,
- [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs,
- [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr,
+ [OP_GETXATTR] = nfsd4_decode_getxattr,
+ [OP_SETXATTR] = nfsd4_decode_setxattr,
+ [OP_LISTXATTRS] = nfsd4_decode_listxattrs,
+ [OP_REMOVEXATTR] = nfsd4_decode_removexattr,
};
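
Dropping the (nfsd4_dec) casts is more than tidying: calling a function through a pointer whose type does not match the function's real prototype is undefined behaviour in C, and it trips control-flow-integrity checking (kCFI) at the indirect call site. With every decoder taking union nfsd4_op_u *, the table entries assign without casts and the indirect calls are prototype-correct. A minimal userspace sketch of the pattern (all names invented for illustration):

    #include <stdio.h>

    struct op_access { unsigned int ac_req; };
    struct op_close  { unsigned int cl_seq; };

    /* Every operation's arguments live in one union... */
    union op_u {
            struct op_access access;
            struct op_close  close;
    };

    /* ...so every decoder can share one honest prototype. */
    typedef int (*dec_fn)(union op_u *u);

    static int decode_access(union op_u *u)
    {
            struct op_access *access = &u->access;  /* first line picks the member */
            return (int)access->ac_req;
    }

    static const dec_fn dec_ops[] = {
            [0] = decode_access,    /* assigned without a (dec_fn) cast */
    };

    int main(void)
    {
            union op_u u = { .access = { .ac_req = 7 } };
            printf("%d\n", dec_ops[0](&u));  /* indirect call matches the prototype */
            return 0;
    }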
static inline bool
@@ -2464,7 +2523,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
return true;
}
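
__clear_bit() is the non-atomic bit operation; rq_flags is shared state that other contexts update with atomic set_bit()/clear_bit(), so a plain read-modify-write here can silently undo a concurrent update to a neighbouring flag. A compressed userspace sketch (not the kernel implementation) of why the non-atomic form is unsafe on shared words:

    #include <stdio.h>

    /* Non-atomic variant: a plain read-modify-write, as __clear_bit() compiles to. */
    static void nonatomic_clear_bit(int nr, unsigned long *addr)
    {
            unsigned long old = *addr;      /* 1: load   */
            old &= ~(1UL << nr);            /* 2: modify */
            *addr = old;                    /* 3: store  */
            /*
             * If another CPU sets a different bit in *addr between steps
             * 1 and 3, its update is overwritten by the stale store.
             * clear_bit() performs the whole RMW as one atomic operation.
             */
    }

    int main(void)
    {
            unsigned long flags = 0x5;      /* bits 0 and 2 set */
            nonatomic_clear_bit(0, &flags);
            printf("%#lx\n", flags);        /* 0x4 */
            return 0;
    }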
@@ -3570,6 +3629,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
case nfserr_noent:
xdr_truncate_encode(xdr, start_offset);
goto skip_entry;
+ case nfserr_jukebox:
+ /*
+ * The pseudoroot should only display dentries that lead to
+ * exports. If we get EJUKEBOX here, then we can't tell whether
+ * this entry should be included. Just fail the whole READDIR
+ * with NFS4ERR_DELAY in that case, and hope that the situation
+ * will resolve itself by the client's next attempt.
+ */
+ if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT)
+ goto fail;
+ fallthrough;
default:
/*
* If the client requested the RDATTR_ERROR attribute,
@@ -3630,8 +3700,10 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
}
static __be32
-nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_access *access = &u->access;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3643,8 +3715,10 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
return 0;
}
-static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
+static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3660,8 +3734,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
}
static __be32
-nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_close *close = &u->close;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &close->cl_stateid);
@@ -3669,8 +3745,10 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
static __be32
-nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_commit *commit = &u->commit;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3683,8 +3761,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create *create = &u->create;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3697,8 +3777,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getattr *getattr = &u->getattr;
struct svc_fh *fhp = getattr->ga_fhp;
struct xdr_stream *xdr = resp->xdr;
@@ -3707,8 +3789,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
static __be32
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct svc_fh **fhpp = &u->getfh;
struct xdr_stream *xdr = resp->xdr;
struct svc_fh *fhp = *fhpp;
unsigned int len;
@@ -3762,8 +3846,10 @@ again:
}
static __be32
-nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_lock *lock = &u->lock;
struct xdr_stream *xdr = resp->xdr;
if (!nfserr)
@@ -3775,8 +3861,10 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
}
static __be32
-nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_lockt *lockt = &u->lockt;
struct xdr_stream *xdr = resp->xdr;
if (nfserr == nfserr_denied)
@@ -3785,8 +3873,10 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
}
static __be32
-nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_locku *locku = &u->locku;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
@@ -3794,8 +3884,10 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
-nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_link *link = &u->link;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3808,8 +3900,10 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open *open = &u->open;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -3902,16 +3996,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
}
static __be32
-nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_confirm *oc = &u->open_confirm;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
}
static __be32
-nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_open_downgrade *od = &u->open_downgrade;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_encode_stateid(xdr, &od->od_stateid);
@@ -4010,8 +4108,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_read *read = &u->read;
bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
unsigned long maxcount;
struct xdr_stream *xdr = resp->xdr;
@@ -4052,8 +4151,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_readlink *readlink = &u->readlink;
__be32 *p, *maxcount_p, zero = xdr_zero;
struct xdr_stream *xdr = resp->xdr;
int length_offset = xdr->buf->len;
@@ -4097,8 +4198,10 @@ out_err:
}
static __be32
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_readdir *readdir = &u->readdir;
int maxcount;
int bytes_left;
loff_t offset;
@@ -4188,8 +4291,10 @@ err_no_verf:
}
static __be32
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_remove *remove = &u->remove;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4201,8 +4306,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
}
static __be32
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_rename *rename = &u->rename;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4284,8 +4391,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_secinfo *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo *secinfo = &u->secinfo;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp);
@@ -4293,8 +4401,9 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_secinfo_no_name *secinfo)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name;
struct xdr_stream *xdr = resp->xdr;
return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
@@ -4305,8 +4414,10 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
* regardless of the error status.
*/
static __be32
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setattr *setattr = &u->setattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4329,8 +4440,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
}
static __be32
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setclientid *scd = &u->setclientid;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4353,8 +4466,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
}
static __be32
-nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *u)
{
+ struct nfsd4_write *write = &u->write;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4370,8 +4485,9 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_exchange_id *exid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_exchange_id *exid = &u->exchange_id;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
char *major_id;
@@ -4448,8 +4564,9 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_create_session *sess)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_create_session *sess = &u->create_session;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4501,8 +4618,9 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_sequence *seq)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_sequence *seq = &u->sequence;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4524,8 +4642,9 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_test_stateid *test_stateid)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
struct xdr_stream *xdr = resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
@@ -4545,8 +4664,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
#ifdef CONFIG_NFSD_PNFS
static __be32
nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_getdeviceinfo *gdev)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
@@ -4601,8 +4721,9 @@ toosmall:
static __be32
nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutget *lgp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutget *lgp = &u->layoutget;
struct xdr_stream *xdr = resp->xdr;
const struct nfsd4_layout_ops *ops;
__be32 *p;
@@ -4628,8 +4749,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutcommit *lcp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4649,8 +4771,9 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_layoutreturn *lrp)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4735,8 +4858,9 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
static __be32
nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_copy *copy)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy *copy = &u->copy;
__be32 *p;
nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
@@ -4752,8 +4876,9 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_offload_status *os)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_offload_status *os = &u->offload_status;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4767,156 +4892,83 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof,
- loff_t *pos)
+ struct nfsd4_read *read)
{
- struct xdr_stream *xdr = resp->xdr;
+ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
struct file *file = read->rd_nf->nf_file;
- int starting_len = xdr->buf->len;
- loff_t hole_pos;
- __be32 nfserr;
- __be32 *p, tmp;
- __be64 tmp64;
-
- hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- if (hole_pos > read->rd_offset)
- *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
- *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
+ struct xdr_stream *xdr = resp->xdr;
+ unsigned long maxcount;
+ __be32 nfserr, *p;
/* Content type, offset, byte count */
p = xdr_reserve_space(xdr, 4 + 8 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
+ if (resp->xdr->buf->page_len && splice_ok) {
+ WARN_ON_ONCE(splice_ok);
+ return nfserr_serverfault;
+ }
- read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
- if (read->rd_vlen < 0)
- return nfserr_resource;
+ maxcount = min_t(unsigned long, read->rd_length,
+ (xdr->buf->buflen - xdr->buf->len));
- nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
- resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
+ if (file->f_op->splice_read && splice_ok)
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
-
- tmp = htonl(NFS4_CONTENT_DATA);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp64 = cpu_to_be64(read->rd_offset);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
- tmp = htonl(*maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
-
- tmp = xdr_zero;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
- xdr_pad_size(*maxcount));
- return nfs_ok;
-}
-
-static __be32
-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
- struct nfsd4_read *read,
- unsigned long *maxcount, u32 *eof)
-{
- struct file *file = read->rd_nf->nf_file;
- loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
- loff_t f_size = i_size_read(file_inode(file));
- unsigned long count;
- __be32 *p;
- if (data_pos == -ENXIO)
- data_pos = f_size;
- else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
- return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
- count = data_pos - read->rd_offset;
-
- /* Content type, offset, byte count */
- p = xdr_reserve_space(resp->xdr, 4 + 8 + 8);
- if (!p)
- return nfserr_resource;
-
- *p++ = htonl(NFS4_CONTENT_HOLE);
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
- p = xdr_encode_hyper(p, count);
+ *p = cpu_to_be32(read->rd_length);
- *eof = (read->rd_offset + count) >= f_size;
- *maxcount = min_t(unsigned long, count, *maxcount);
return nfs_ok;
}
static __be32
nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+ union nfsd4_op_u *u)
{
- unsigned long maxcount, count;
+ struct nfsd4_read *read = &u->read;
+ struct file *file = read->rd_nf->nf_file;
struct xdr_stream *xdr = resp->xdr;
- struct file *file;
int starting_len = xdr->buf->len;
- int last_segment = xdr->buf->len;
- int segments = 0;
- __be32 *p, tmp;
- bool is_data;
- loff_t pos;
- u32 eof;
+ u32 segments = 0;
+ __be32 *p;
if (nfserr)
return nfserr;
- file = read->rd_nf->nf_file;
/* eof flag, segment count */
p = xdr_reserve_space(xdr, 4 + 4);
if (!p)
- return nfserr_resource;
+ return nfserr_io;
xdr_commit_encode(xdr);
- maxcount = min_t(unsigned long, read->rd_length,
- (xdr->buf->buflen - xdr->buf->len));
- count = maxcount;
-
- eof = read->rd_offset >= i_size_read(file_inode(file));
- if (eof)
+ read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
+ if (read->rd_eof)
goto out;
- pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- is_data = pos > read->rd_offset;
-
- while (count > 0 && !eof) {
- maxcount = count;
- if (is_data)
- nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
- segments == 0 ? &pos : NULL);
- else
- nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
- if (nfserr)
- goto out;
- count -= maxcount;
- read->rd_offset += maxcount;
- is_data = !is_data;
- last_segment = xdr->buf->len;
- segments++;
- }
-
-out:
- if (nfserr && segments == 0)
+ nfserr = nfsd4_encode_read_plus_data(resp, read);
+ if (nfserr) {
xdr_truncate_encode(xdr, starting_len);
- else {
- if (nfserr) {
- xdr_truncate_encode(xdr, last_segment);
- nfserr = nfs_ok;
- eof = 0;
- }
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
- tmp = htonl(segments);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+ return nfserr;
}
+ segments++;
+
+out:
+ p = xdr_encode_bool(p, read->rd_eof);
+ *p = cpu_to_be32(segments);
return nfserr;
}
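
The rewritten READ_PLUS path no longer probes for holes with vfs_llseek(SEEK_HOLE)/(SEEK_DATA) and alternates DATA and HOLE segments; it encodes at most one CONTENT_DATA segment (holes are simply read back as zeroes), which keeps the reply a fixed shape and lets the splice path work. A userspace sketch of the wire layout the encoder now produces, following RFC 7862 (illustrative only, not the kernel encoder):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>

    /* Sketch: lay out a one-segment READ_PLUS reply body. */
    static size_t encode_one_data_segment(uint8_t *buf, uint32_t eof,
                                          uint64_t offset,
                                          const uint8_t *data, uint32_t len)
    {
            uint8_t *p = buf;
            uint32_t be;

            be = htonl(eof);              memcpy(p, &be, 4); p += 4; /* rpr_eof */
            be = htonl(1);                memcpy(p, &be, 4); p += 4; /* one segment */
            be = htonl(0);                memcpy(p, &be, 4); p += 4; /* NFS4_CONTENT_DATA */
            be = htonl(offset >> 32);     memcpy(p, &be, 4); p += 4; /* offset, hi */
            be = htonl((uint32_t)offset); memcpy(p, &be, 4); p += 4; /* offset, lo */
            be = htonl(len);              memcpy(p, &be, 4); p += 4; /* byte count */
            memcpy(p, data, len);
            memset(p + len, 0, (4 - (len & 3)) & 3);  /* XDR zero padding */
            return (p - buf) + ((len + 3) & ~3u);
    }

    int main(void)
    {
            uint8_t buf[64];
            size_t n = encode_one_data_segment(buf, 1, 0, (const uint8_t *)"hi", 2);
            printf("%zu bytes\n", n);     /* 24-byte header + padded data = 28 */
            return 0;
    }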
static __be32
nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_copy_notify *cn)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_copy_notify *cn = &u->copy_notify;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -4950,8 +5002,9 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_seek *seek)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_seek *seek = &u->seek;
__be32 *p;
p = xdr_reserve_space(resp->xdr, 4 + 8);
@@ -4962,7 +5015,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
-nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr,
+ union nfsd4_op_u *p)
{
return nfserr;
}
@@ -5013,8 +5067,9 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen)
static __be32
nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_getxattr *getxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_getxattr *getxattr = &u->getxattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p, err;
@@ -5037,8 +5092,9 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_setxattr *setxattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_setxattr *setxattr = &u->setxattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -5078,8 +5134,9 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
static __be32
nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_listxattrs *listxattrs)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
struct xdr_stream *xdr = resp->xdr;
u32 cookie_offset, count_offset, eof;
u32 left, xdrleft, slen, count;
@@ -5189,8 +5246,9 @@ out:
static __be32
nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_removexattr *removexattr)
+ union nfsd4_op_u *u)
{
+ struct nfsd4_removexattr *removexattr = &u->removexattr;
struct xdr_stream *xdr = resp->xdr;
__be32 *p;
@@ -5202,7 +5260,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
return 0;
}
-typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
/*
* Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
@@ -5210,93 +5268,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
* done in the decoding phase.
*/
static const nfsd4_enc nfsd4_enc_ops[] = {
- [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
- [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
- [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit,
- [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create,
- [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr,
- [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh,
- [OP_LINK] = (nfsd4_enc)nfsd4_encode_link,
- [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock,
- [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt,
- [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku,
- [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
- [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
- [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
- [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_READ] = (nfsd4_enc)nfsd4_encode_read,
- [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir,
- [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink,
- [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove,
- [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename,
- [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo,
- [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr,
- [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid,
- [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
- [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_ACCESS] = nfsd4_encode_access,
+ [OP_CLOSE] = nfsd4_encode_close,
+ [OP_COMMIT] = nfsd4_encode_commit,
+ [OP_CREATE] = nfsd4_encode_create,
+ [OP_DELEGPURGE] = nfsd4_encode_noop,
+ [OP_DELEGRETURN] = nfsd4_encode_noop,
+ [OP_GETATTR] = nfsd4_encode_getattr,
+ [OP_GETFH] = nfsd4_encode_getfh,
+ [OP_LINK] = nfsd4_encode_link,
+ [OP_LOCK] = nfsd4_encode_lock,
+ [OP_LOCKT] = nfsd4_encode_lockt,
+ [OP_LOCKU] = nfsd4_encode_locku,
+ [OP_LOOKUP] = nfsd4_encode_noop,
+ [OP_LOOKUPP] = nfsd4_encode_noop,
+ [OP_NVERIFY] = nfsd4_encode_noop,
+ [OP_OPEN] = nfsd4_encode_open,
+ [OP_OPENATTR] = nfsd4_encode_noop,
+ [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade,
+ [OP_PUTFH] = nfsd4_encode_noop,
+ [OP_PUTPUBFH] = nfsd4_encode_noop,
+ [OP_PUTROOTFH] = nfsd4_encode_noop,
+ [OP_READ] = nfsd4_encode_read,
+ [OP_READDIR] = nfsd4_encode_readdir,
+ [OP_READLINK] = nfsd4_encode_readlink,
+ [OP_REMOVE] = nfsd4_encode_remove,
+ [OP_RENAME] = nfsd4_encode_rename,
+ [OP_RENEW] = nfsd4_encode_noop,
+ [OP_RESTOREFH] = nfsd4_encode_noop,
+ [OP_SAVEFH] = nfsd4_encode_noop,
+ [OP_SECINFO] = nfsd4_encode_secinfo,
+ [OP_SETATTR] = nfsd4_encode_setattr,
+ [OP_SETCLIENTID] = nfsd4_encode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop,
+ [OP_VERIFY] = nfsd4_encode_noop,
+ [OP_WRITE] = nfsd4_encode_write,
+ [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop,
/* NFSv4.1 operations */
- [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
- [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
- [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
- [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop,
+ [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session,
+ [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id,
+ [OP_CREATE_SESSION] = nfsd4_encode_create_session,
+ [OP_DESTROY_SESSION] = nfsd4_encode_noop,
+ [OP_FREE_STATEID] = nfsd4_encode_noop,
+ [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop,
#ifdef CONFIG_NFSD_PNFS
- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+ [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo,
+ [OP_GETDEVICELIST] = nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn,
#else
- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETDEVICEINFO] = nfsd4_encode_noop,
+ [OP_GETDEVICELIST] = nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = nfsd4_encode_noop,
+ [OP_LAYOUTGET] = nfsd4_encode_noop,
+ [OP_LAYOUTRETURN] = nfsd4_encode_noop,
#endif
- [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
- [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
- [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid,
- [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name,
+ [OP_SEQUENCE] = nfsd4_encode_sequence,
+ [OP_SET_SSV] = nfsd4_encode_noop,
+ [OP_TEST_STATEID] = nfsd4_encode_test_stateid,
+ [OP_WANT_DELEGATION] = nfsd4_encode_noop,
+ [OP_DESTROY_CLIENTID] = nfsd4_encode_noop,
+ [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop,
/* NFSv4.2 operations */
- [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
- [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify,
- [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
- [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
- [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_ALLOCATE] = nfsd4_encode_noop,
+ [OP_COPY] = nfsd4_encode_copy,
+ [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify,
+ [OP_DEALLOCATE] = nfsd4_encode_noop,
+ [OP_IO_ADVISE] = nfsd4_encode_noop,
+ [OP_LAYOUTERROR] = nfsd4_encode_noop,
+ [OP_LAYOUTSTATS] = nfsd4_encode_noop,
+ [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop,
+ [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status,
+ [OP_READ_PLUS] = nfsd4_encode_read_plus,
+ [OP_SEEK] = nfsd4_encode_seek,
+ [OP_WRITE_SAME] = nfsd4_encode_noop,
+ [OP_CLONE] = nfsd4_encode_noop,
/* RFC 8276 extended attributes operations */
- [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr,
- [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr,
- [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs,
- [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr,
+ [OP_GETXATTR] = nfsd4_encode_getxattr,
+ [OP_SETXATTR] = nfsd4_encode_setxattr,
+ [OP_LISTXATTRS] = nfsd4_encode_listxattrs,
+ [OP_REMOVEXATTR] = nfsd4_encode_removexattr,
};
/*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dc74a947a440..c2577ee7ffb2 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -581,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
switch(num) {
+#ifdef CONFIG_NFSD_V2
case 2:
+#endif
case 3:
nfsd_vers(nn, num, cmd);
break;
@@ -601,7 +603,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
}
break;
default:
- return -EINVAL;
+ /* Ignore requests to disable non-existent versions */
+ if (cmd == NFSD_SET)
+ return -EINVAL;
}
vers += len + 1;
} while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -1453,9 +1457,7 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_idmap_error;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
- retval = nfsd4_init_leases_net(nn);
- if (retval)
- goto out_drc_error;
+ nfsd4_init_leases_net(nn);
retval = nfsd_reply_cache_init(nn);
if (retval)
goto out_cache_error;
@@ -1465,8 +1467,6 @@ static __net_init int nfsd_init_net(struct net *net)
return 0;
out_cache_error:
- nfsd4_leases_net_shutdown(nn);
-out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
nfsd_export_shutdown(net);
@@ -1482,7 +1482,6 @@ static __net_exit void nfsd_exit_net(struct net *net)
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
- nfsd4_leases_net_shutdown(nn);
}
static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 09726c5b9a31..fa0144a74267 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -64,8 +64,7 @@ struct readdir_cd {
extern struct svc_program nfsd_program;
-extern const struct svc_version nfsd_version2, nfsd_version3,
- nfsd_version4;
+extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
extern struct mutex nfsd_mutex;
extern spinlock_t nfsd_drc_lock;
extern unsigned long nfsd_drc_max_mem;
@@ -505,8 +504,7 @@ extern void unregister_cld_notifier(void);
extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
#endif
-extern int nfsd4_init_leases_net(struct nfsd_net *nn);
-extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);
+extern void nfsd4_init_leases_net(struct nfsd_net *nn);
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -514,8 +512,7 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
return 0;
}
-static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
-static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {};
+static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { };
#define register_cld_notifier() 0
#define unregister_cld_notifier() do { } while(0)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index c3ae6414fc5c..513e028b0bbe 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -220,7 +220,7 @@ __be32 fh_update(struct svc_fh *);
void fh_put(struct svc_fh *);
static __inline__ struct svc_fh *
-fh_copy(struct svc_fh *dst, struct svc_fh *src)
+fh_copy(struct svc_fh *dst, const struct svc_fh *src)
{
WARN_ON(src->fh_dentry);
@@ -229,7 +229,7 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src)
}
static inline void
-fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src)
{
dst->fh_size = src->fh_size;
memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
@@ -243,7 +243,8 @@ fh_init(struct svc_fh *fhp, int maxsize)
return fhp;
}
-static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_size != fh2->fh_size)
return false;
@@ -252,7 +253,8 @@ static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
-static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
+ const struct knfsd_fh *fh2)
{
if (fh1->fh_fsid_type != fh2->fh_fsid_type)
return false;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 82b3ddeacc33..9744443c3965 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- return rpc_drop_reply;
+ set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
@@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- return rpc_drop_reply;
+ set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
@@ -848,65 +848,3 @@ const struct svc_version nfsd_version2 = {
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS2_SVC_XDRSIZE,
};
-
-/*
- * Map errnos to NFS errnos.
- */
-__be32
-nfserrno (int errno)
-{
- static struct {
- __be32 nfserr;
- int syserr;
- } nfs_errtbl[] = {
- { nfs_ok, 0 },
- { nfserr_perm, -EPERM },
- { nfserr_noent, -ENOENT },
- { nfserr_io, -EIO },
- { nfserr_nxio, -ENXIO },
- { nfserr_fbig, -E2BIG },
- { nfserr_stale, -EBADF },
- { nfserr_acces, -EACCES },
- { nfserr_exist, -EEXIST },
- { nfserr_xdev, -EXDEV },
- { nfserr_mlink, -EMLINK },
- { nfserr_nodev, -ENODEV },
- { nfserr_notdir, -ENOTDIR },
- { nfserr_isdir, -EISDIR },
- { nfserr_inval, -EINVAL },
- { nfserr_fbig, -EFBIG },
- { nfserr_nospc, -ENOSPC },
- { nfserr_rofs, -EROFS },
- { nfserr_mlink, -EMLINK },
- { nfserr_nametoolong, -ENAMETOOLONG },
- { nfserr_notempty, -ENOTEMPTY },
-#ifdef EDQUOT
- { nfserr_dquot, -EDQUOT },
-#endif
- { nfserr_stale, -ESTALE },
- { nfserr_jukebox, -ETIMEDOUT },
- { nfserr_jukebox, -ERESTARTSYS },
- { nfserr_jukebox, -EAGAIN },
- { nfserr_jukebox, -EWOULDBLOCK },
- { nfserr_jukebox, -ENOMEM },
- { nfserr_io, -ETXTBSY },
- { nfserr_notsupp, -EOPNOTSUPP },
- { nfserr_toosmall, -ETOOSMALL },
- { nfserr_serverfault, -ESERVERFAULT },
- { nfserr_serverfault, -ENFILE },
- { nfserr_io, -EREMOTEIO },
- { nfserr_stale, -EOPENSTALE },
- { nfserr_io, -EUCLEAN },
- { nfserr_perm, -ENOKEY },
- { nfserr_no_grace, -ENOGRACE},
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
- if (nfs_errtbl[i].syserr == errno)
- return nfs_errtbl[i].nfserr;
- }
- WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
- return nfserr_io;
-}
-
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index bfbd9f672f59..325d3d3f1211 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -91,8 +91,12 @@ unsigned long nfsd_drc_mem_used;
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
static const struct svc_version *nfsd_acl_version[] = {
+# if defined(CONFIG_NFSD_V2_ACL)
[2] = &nfsd_acl_version2,
+# endif
+# if defined(CONFIG_NFSD_V3_ACL)
[3] = &nfsd_acl_version3,
+# endif
};
#define NFSD_ACL_MINVERS 2
@@ -116,7 +120,9 @@ static struct svc_stat nfsd_acl_svcstats = {
#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
static const struct svc_version *nfsd_version[] = {
+#if defined(CONFIG_NFSD_V2)
[2] = &nfsd_version2,
+#endif
[3] = &nfsd_version3,
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
@@ -447,8 +453,8 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfsd_file_cache_shutdown_net(net);
nfs4_state_shutdown_net(net);
+ nfsd_file_cache_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
nn->lockd_up = false;
@@ -1054,7 +1060,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
svcxdr_init_encode(rqstp);
*statp = proc->pc_func(rqstp);
- if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
+ if (test_bit(RQ_DROPME, &rqstp->rq_flags))
goto out_update_drop;
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
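
The drop-reply convention changes shape here: instead of returning the magic rpc_drop_reply status up the call chain, a handler sets the RQ_DROPME bit in rq_flags and returns rpc_success, and nfsd_dispatch() tests the bit in one place. A userspace sketch of the two sides of that contract (bit number and names invented for illustration):

    #include <stdio.h>

    #define RQ_DROPME 0     /* bit number; illustrative only */

    struct rqst { unsigned long rq_flags; };

    /* Handler side: mark the request instead of returning a magic status. */
    static int proc_read(struct rqst *rq, int status_is_jukebox)
    {
            if (status_is_jukebox)
                    rq->rq_flags |= 1UL << RQ_DROPME;   /* set_bit() in the kernel */
            return 0;                                   /* always rpc_success */
    }

    /* Dispatcher side: one uniform check. */
    static void dispatch(struct rqst *rq)
    {
            proc_read(rq, 1);
            if (rq->rq_flags & (1UL << RQ_DROPME))
                    printf("dropping reply\n");
            else
                    printf("encoding reply\n");
    }

    int main(void) { struct rqst rq = { 0 }; dispatch(&rq); return 0; }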
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e2daef3cc003..e94634d30591 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -368,6 +368,7 @@ struct nfs4_client {
#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
+#define NFSD4_CLIENT_CB_RECALL_ANY (6)
unsigned long cl_flags;
const struct cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
@@ -411,6 +412,10 @@ struct nfs4_client {
unsigned int cl_state;
atomic_t cl_delegs_in_recall;
+
+ struct nfsd4_cb_recall_any *cl_ra;
+ time64_t cl_ra_time;
+ struct list_head cl_ra_cblist;
};
/* struct nfs4_client_reset
@@ -536,16 +541,13 @@ struct nfs4_clnt_odstate {
* inode can have multiple filehandles associated with it, so there is
* (potentially) a many to one relationship between this struct and struct
* inode.
- *
- * These are hashed by filehandle in the file_hashtbl, which is protected by
- * the global state_lock spinlock.
*/
struct nfs4_file {
refcount_t fi_ref;
struct inode * fi_inode;
bool fi_aliased;
spinlock_t fi_lock;
- struct hlist_node fi_hash; /* hash on fi_fhandle */
+ struct rhlist_head fi_rlist;
struct list_head fi_stateids;
union {
struct list_head fi_delegations;
@@ -639,6 +641,7 @@ enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_OFFLOAD,
NFSPROC4_CLNT_CB_SEQUENCE,
NFSPROC4_CLNT_CB_NOTIFY_LOCK,
+ NFSPROC4_CLNT_CB_RECALL_ANY,
};
/* Returns true iff a is later than b: */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index d4b6839bb459..8f9c82d9e075 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,9 +9,12 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include <linux/sunrpc/xprt.h>
+#include <trace/misc/nfs.h>
#include "export.h"
#include "nfsfh.h"
+#include "xdr4.h"
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
@@ -604,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release);
DEFINE_STATEID_EVENT(open);
DEFINE_STATEID_EVENT(deleg_read);
+DEFINE_STATEID_EVENT(deleg_return);
DEFINE_STATEID_EVENT(deleg_recall);
DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
@@ -636,6 +640,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
DEFINE_STATESEQID_EVENT(preprocess);
DEFINE_STATESEQID_EVENT(open_confirm);
+TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
+TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
+TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
+TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
+TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
+
+#define show_stid_type(x) \
+ __print_flags(x, "|", \
+ { NFS4_OPEN_STID, "OPEN" }, \
+ { NFS4_LOCK_STID, "LOCK" }, \
+ { NFS4_DELEG_STID, "DELEG" }, \
+ { NFS4_CLOSED_STID, "CLOSED" }, \
+ { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \
+ { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \
+ { NFS4_LAYOUT_STID, "LAYOUT" })
+
+DECLARE_EVENT_CLASS(nfsd_stid_class,
+ TP_PROTO(
+ const struct nfs4_stid *stid
+ ),
+ TP_ARGS(stid),
+ TP_STRUCT__entry(
+ __field(unsigned long, sc_type)
+ __field(int, sc_count)
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ const stateid_t *stp = &stid->sc_stateid;
+
+ __entry->sc_type = stid->sc_type;
+ __entry->sc_count = refcount_read(&stid->sc_count);
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+ __entry->cl_boot, __entry->cl_id,
+ __entry->si_id, __entry->si_generation,
+ __entry->sc_count, show_stid_type(__entry->sc_type)
+ )
+);
+
+#define DEFINE_STID_EVENT(name) \
+DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \
+ TP_PROTO(const struct nfs4_stid *stid), \
+ TP_ARGS(stid))
+
+DEFINE_STID_EVENT(revoke);
+
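
DEFINE_STID_EVENT(revoke) expands to a tracepoint callable as trace_nfsd_stid_revoke(); a call site, which is not shown in this hunk, would look roughly like the line below (the dl_stid argument is an assumption for illustration). The event can then be enabled through tracefs under events/nfsd/nfsd_stid_revoke.

    /* Hypothetical call site: record a delegation stateid being revoked. */
    trace_nfsd_stid_revoke(&dp->dl_stid);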
DECLARE_EVENT_CLASS(nfsd_clientid_class,
TP_PROTO(const clientid_t *clid),
TP_ARGS(clid),
@@ -817,7 +876,8 @@ DEFINE_CLID_EVENT(confirmed_r);
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \
+ { 1 << NFSD_FILE_GC, "GC" })
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
@@ -849,10 +909,11 @@ DEFINE_EVENT(nfsd_file_class, name, \
TP_PROTO(struct nfsd_file *nf), \
TP_ARGS(nf))
-DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
@@ -920,43 +981,6 @@ TRACE_EVENT(nfsd_file_acquire,
)
);
-TRACE_EVENT(nfsd_file_create,
- TP_PROTO(
- const struct svc_rqst *rqstp,
- unsigned int may_flags,
- const struct nfsd_file *nf
- ),
-
- TP_ARGS(rqstp, may_flags, nf),
-
- TP_STRUCT__entry(
- __field(const void *, nf_inode)
- __field(const void *, nf_file)
- __field(unsigned long, may_flags)
- __field(unsigned long, nf_flags)
- __field(unsigned long, nf_may)
- __field(unsigned int, nf_ref)
- __field(u32, xid)
- ),
-
- TP_fast_assign(
- __entry->nf_inode = nf->nf_inode;
- __entry->nf_file = nf->nf_file;
- __entry->may_flags = may_flags;
- __entry->nf_flags = nf->nf_flags;
- __entry->nf_may = nf->nf_may;
- __entry->nf_ref = refcount_read(&nf->nf_ref);
- __entry->xid = be32_to_cpu(rqstp->rq_xid);
- ),
-
- TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
- __entry->xid, __entry->nf_inode,
- show_nfsd_may_flags(__entry->may_flags),
- __entry->nf_ref, show_nf_flags(__entry->nf_flags),
- show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
- )
-);
-
TRACE_EVENT(nfsd_file_insert_err,
TP_PROTO(
const struct svc_rqst *rqstp,
@@ -1018,8 +1042,8 @@ TRACE_EVENT(nfsd_file_cons_err,
)
);
-TRACE_EVENT(nfsd_file_open,
- TP_PROTO(struct nfsd_file *nf, __be32 status),
+DECLARE_EVENT_CLASS(nfsd_file_open_class,
+ TP_PROTO(const struct nfsd_file *nf, __be32 status),
TP_ARGS(nf, status),
TP_STRUCT__entry(
__field(void *, nf_inode) /* cannot be dereferenced */
@@ -1043,34 +1067,16 @@ TRACE_EVENT(nfsd_file_open,
__entry->nf_file)
)
-DECLARE_EVENT_CLASS(nfsd_file_search_class,
- TP_PROTO(
- const struct inode *inode,
- unsigned int count
- ),
- TP_ARGS(inode, count),
- TP_STRUCT__entry(
- __field(const struct inode *, inode)
- __field(unsigned int, count)
- ),
- TP_fast_assign(
- __entry->inode = inode;
- __entry->count = count;
- ),
- TP_printk("inode=%p count=%u",
- __entry->inode, __entry->count)
-);
-
-#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
-DEFINE_EVENT(nfsd_file_search_class, name, \
+#define DEFINE_NFSD_FILE_OPEN_EVENT(name) \
+DEFINE_EVENT(nfsd_file_open_class, name, \
TP_PROTO( \
- const struct inode *inode, \
- unsigned int count \
+ const struct nfsd_file *nf, \
+ __be32 status \
), \
- TP_ARGS(inode, count))
+ TP_ARGS(nf, status))
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_open);
+DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_opened);
TRACE_EVENT(nfsd_file_is_cached,
TP_PROTO(
@@ -1149,7 +1155,6 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
@@ -1181,6 +1186,53 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
+TRACE_EVENT(nfsd_file_close,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+ TP_ARGS(inode),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ ),
+ TP_printk("inode=%p",
+ __entry->inode
+ )
+);
+
+TRACE_EVENT(nfsd_file_fsync,
+ TP_PROTO(
+ const struct nfsd_file *nf,
+ int ret
+ ),
+ TP_ARGS(nf, ret),
+ TP_STRUCT__entry(
+ __field(void *, nf_inode)
+ __field(int, nf_ref)
+ __field(int, ret)
+ __field(unsigned long, nf_flags)
+ __field(unsigned char, nf_may)
+ __field(struct file *, nf_file)
+ ),
+ TP_fast_assign(
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->ret = ret;
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+ TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d",
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file, __entry->ret
+ )
+);
+
#include "cache.h"
TRACE_DEFINE_ENUM(RC_DROPIT);
@@ -1474,6 +1526,32 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash, __entry->count, __entry->status)
);
+TRACE_EVENT(nfsd_cb_recall_any,
+ TP_PROTO(
+ const struct nfsd4_cb_recall_any *ra
+ ),
+ TP_ARGS(ra),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, keep)
+ __field(unsigned long, bmval0)
+ __sockaddr(addr, ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = ra->ra_cb.cb_clp->cl_clientid.cl_boot;
+ __entry->cl_id = ra->ra_cb.cb_clp->cl_clientid.cl_id;
+ __entry->keep = ra->ra_keep;
+ __entry->bmval0 = ra->ra_bmval[0];
+ __assign_sockaddr(addr, &ra->ra_cb.cb_clp->cl_addr,
+ ra->ra_cb.cb_clp->cl_cb_conn.cb_addrlen);
+ ),
+ TP_printk("addr=%pISpc client %08x:%08x keep=%u bmval0=%s",
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->keep, show_rca_mask(__entry->bmval0)
+ )
+);
+
DECLARE_EVENT_CLASS(nfsd_cb_done_class,
TP_PROTO(
const stateid_t *stp,
@@ -1513,6 +1591,27 @@ DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
+TRACE_EVENT(nfsd_cb_recall_any_done,
+ TP_PROTO(
+ const struct nfsd4_callback *cb,
+ const struct rpc_task *task
+ ),
+ TP_ARGS(cb, task),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(int, status)
+ ),
+ TP_fast_assign(
+ __entry->status = task->tk_status;
+ __entry->cl_boot = cb->cb_clp->cl_clientid.cl_boot;
+ __entry->cl_id = cb->cb_clp->cl_clientid.cl_id;
+ ),
+ TP_printk("client %08x:%08x status=%d",
+ __entry->cl_boot, __entry->cl_id, __entry->status
+ )
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 849a720ab43f..4c3a0d84043c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -49,6 +49,69 @@
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
+/**
+ * nfserrno - Map Linux errnos to NFS errnos
+ * @errno: POSIX(-ish) error code to be mapped
+ *
+ * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If
+ * it's an error we don't expect, log it once and return nfserr_io.
+ */
+__be32
+nfserrno (int errno)
+{
+ static struct {
+ __be32 nfserr;
+ int syserr;
+ } nfs_errtbl[] = {
+ { nfs_ok, 0 },
+ { nfserr_perm, -EPERM },
+ { nfserr_noent, -ENOENT },
+ { nfserr_io, -EIO },
+ { nfserr_nxio, -ENXIO },
+ { nfserr_fbig, -E2BIG },
+ { nfserr_stale, -EBADF },
+ { nfserr_acces, -EACCES },
+ { nfserr_exist, -EEXIST },
+ { nfserr_xdev, -EXDEV },
+ { nfserr_mlink, -EMLINK },
+ { nfserr_nodev, -ENODEV },
+ { nfserr_notdir, -ENOTDIR },
+ { nfserr_isdir, -EISDIR },
+ { nfserr_inval, -EINVAL },
+ { nfserr_fbig, -EFBIG },
+ { nfserr_nospc, -ENOSPC },
+ { nfserr_rofs, -EROFS },
+ { nfserr_mlink, -EMLINK },
+ { nfserr_nametoolong, -ENAMETOOLONG },
+ { nfserr_notempty, -ENOTEMPTY },
+ { nfserr_dquot, -EDQUOT },
+ { nfserr_stale, -ESTALE },
+ { nfserr_jukebox, -ETIMEDOUT },
+ { nfserr_jukebox, -ERESTARTSYS },
+ { nfserr_jukebox, -EAGAIN },
+ { nfserr_jukebox, -EWOULDBLOCK },
+ { nfserr_jukebox, -ENOMEM },
+ { nfserr_io, -ETXTBSY },
+ { nfserr_notsupp, -EOPNOTSUPP },
+ { nfserr_toosmall, -ETOOSMALL },
+ { nfserr_serverfault, -ESERVERFAULT },
+ { nfserr_serverfault, -ENFILE },
+ { nfserr_io, -EREMOTEIO },
+ { nfserr_stale, -EOPENSTALE },
+ { nfserr_io, -EUCLEAN },
+ { nfserr_perm, -ENOKEY },
+ { nfserr_no_grace, -ENOGRACE},
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
+ if (nfs_errtbl[i].syserr == errno)
+ return nfs_errtbl[i].nfserr;
+ }
+ WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
+ return nfserr_io;
+}
+
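
With nfserrno() now living beside its heaviest callers in vfs.c, the usual pattern is to funnel every negative VFS return through it; note the scan is first-match, which is how -ETIMEDOUT, -EAGAIN, and friends all land on nfserr_jukebox. A small userspace sketch of the same table-driven lookup (the wire values shown are the classic NFS status codes, reproduced here for illustration):

    #include <stddef.h>
    #include <stdio.h>
    #include <errno.h>

    typedef unsigned int be32;

    static be32 sketch_nfserrno(int err)
    {
            static const struct { be32 nfserr; int syserr; } tbl[] = {
                    { 0, 0       },         /* NFS_OK       */
                    { 1, -EPERM  },         /* NFSERR_PERM  */
                    { 2, -ENOENT },         /* NFSERR_NOENT */
                    { 5, -EIO    },         /* NFSERR_IO    */
            };
            for (size_t i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
                    if (tbl[i].syserr == err)
                            return tbl[i].nfserr;       /* first match wins */
            return 5;   /* unexpected errno maps to NFSERR_IO, logged once in nfsd */
    }

    int main(void)
    {
            printf("%u\n", sketch_nfserrno(-ENOENT));   /* 2 */
            return 0;
    }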
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -480,12 +543,12 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
attr->na_seclabel->data, attr->na_seclabel->len);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_ACCESS,
+ dentry, ACL_TYPE_ACCESS,
attr->na_pacl);
if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
!attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
attr->na_aclerr = set_posix_acl(&init_user_ns,
- inode, ACL_TYPE_DEFAULT,
+ dentry, ACL_TYPE_DEFAULT,
attr->na_dpacl);
inode_unlock(inode);
if (size_change)
@@ -943,7 +1006,7 @@ __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, READ, vec, vlen, *count);
+ iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1033,7 +1096,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && !use_wgather)
flags |= RWF_SYNC;
- iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+ iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
@@ -1085,7 +1148,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err;
trace_nfsd_read_start(rqstp, fhp, offset, *count);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
if (err)
return err;
@@ -1117,7 +1180,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
if (err)
goto out;
@@ -1133,6 +1196,7 @@ out:
* nfsd_commit - Commit pending writes to stable storage
* @rqstp: RPC request being processed
* @fhp: NFS filehandle
+ * @nf: target file
* @offset: raw offset from beginning of file
* @count: raw count of bytes to sync
* @verf: filled in with the server's current write verifier
@@ -1149,19 +1213,13 @@ out:
* An nfsstat value in network byte order.
*/
__be32
-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
- u32 count, __be32 *verf)
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ u64 offset, u32 count, __be32 *verf)
{
+ __be32 err = nfs_ok;
u64 maxbytes;
loff_t start, end;
struct nfsd_net *nn;
- struct nfsd_file *nf;
- __be32 err;
-
- err = nfsd_file_acquire(rqstp, fhp,
- NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
- if (err)
- goto out;
/*
* Convert the client-provided (offset, count) range to a
@@ -1202,8 +1260,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
} else
nfsd_copy_write_verifier(verf, nn);
- nfsd_file_put(nf);
-out:
return err;
}
@@ -1305,7 +1361,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode &= ~current_umask();
err = 0;
- host_err = 0;
switch (type) {
case S_IFREG:
host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 120521bc7b24..dbdfef7ae85b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -60,6 +60,7 @@ static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
posix_acl_release(attrs->na_dpacl);
}
+__be32 nfserrno(int errno);
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
@@ -88,7 +89,8 @@ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct svc_fh *resfhp, struct nfsd_attrs *iap);
__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
- u64 offset, u32 count, __be32 *verf);
+ struct nfsd_file *nf, u64 offset, u32 count,
+ __be32 *verf);
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0eb00105d845..4fd2cf6d1d2d 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -896,5 +896,10 @@ struct nfsd4_operation {
union nfsd4_op_u *);
};
+struct nfsd4_cb_recall_any {
+ struct nfsd4_callback ra_cb;
+ u32 ra_keep;
+ u32 ra_bmval[1];
+};
#endif
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 547cf07cf4e0..0d39af1b00a0 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -48,3 +48,9 @@
#define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 1 + 1)
+#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b9d15c3df3cc..40ce92a332fe 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -480,9 +480,18 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh,
&submit_ptr);
if (ret) {
- if (ret != -EEXIST)
- return ret;
- goto out_check;
+ if (likely(ret == -EEXIST))
+ goto out_check;
+ if (ret == -ENOENT) {
+ /*
+ * Block address translation failed due to invalid
+ * value of 'ptr'. In this case, return internal code
+ * -EINVAL (broken bmap) to notify bmap layer of fatal
+ * metadata corruption.
+ */
+ ret = -EINVAL;
+ }
+ return ret;
}
if (ra) {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3335ef352915..76c3bd88b858 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2752,7 +2752,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
down_write(&nilfs->ns_segctor_sem);
- del_timer_sync(&sci->sc_timer);
+ timer_shutdown_sync(&sci->sc_timer);
kfree(sci);
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c8b89b4f94e0..2064e6473d30 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/random.h>
+#include <linux/log2.h>
#include <linux/crc32.h>
#include "nilfs.h"
#include "segment.h"
@@ -193,6 +194,34 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
}
/**
+ * nilfs_get_blocksize - get block size from raw superblock data
+ * @sb: super block instance
+ * @sbp: superblock raw data buffer
+ * @blocksize: place to store block size
+ *
+ * nilfs_get_blocksize() calculates the block size from the block size
+ * exponent information written in @sbp and stores it in @blocksize,
+ * or aborts with an error message if it's too large.
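+ * For example, an exponent of 2 yields BLOCK_SIZE << 2 = 4 KiB.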
+ *
+ * Return Value: On success, 0 is returned. If the block size is too
+ * large, -EINVAL is returned.
+ */
+static int nilfs_get_blocksize(struct super_block *sb,
+ struct nilfs_super_block *sbp, int *blocksize)
+{
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+
+ if (unlikely(shift_bits >
+ ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)) {
+ nilfs_err(sb, "too large filesystem blocksize: 2 ^ %u KiB",
+ shift_bits);
+ return -EINVAL;
+ }
+ *blocksize = BLOCK_SIZE << shift_bits;
+ return 0;
+}
+
+/**
* load_nilfs - load and recover the nilfs
* @nilfs: the_nilfs structure to be released
* @sb: super block instance used to recover past segment
@@ -245,11 +274,15 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
/* verify consistency between two super blocks */
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
+ err = nilfs_get_blocksize(sb, sbp[0], &blocksize);
+ if (err)
+ goto scan_error;
+
if (blocksize != nilfs->ns_blocksize) {
nilfs_warn(sb,
"blocksize differs between two super blocks (%d != %d)",
blocksize, nilfs->ns_blocksize);
+ err = -EINVAL;
goto scan_error;
}
@@ -443,11 +476,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp)
return crc == le32_to_cpu(sbp->s_sum);
}
-static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+/**
+ * nilfs_sb2_bad_offset - check the location of the second superblock
+ * @sbp: superblock raw data buffer
+ * @offset: byte offset of second superblock calculated from device size
+ *
+ * nilfs_sb2_bad_offset() checks if the position of the second
+ * superblock is valid or not based on the filesystem parameters
+ * stored in @sbp. If @offset points to a location within the segment
+ * area, or if the parameters themselves are not normal, it is
+ * determined to be invalid.
+ *
+ * Return Value: true if invalid, false if valid.
+ */
+static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
{
- return offset < ((le64_to_cpu(sbp->s_nsegments) *
- le32_to_cpu(sbp->s_blocks_per_segment)) <<
- (le32_to_cpu(sbp->s_log_block_size) + 10));
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+ u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+ u64 nsegments = le64_to_cpu(sbp->s_nsegments);
+ u64 index;
+
+ if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS ||
+ shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)
+ return true;
+
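+ /*
+ * Compute the index of the segment that would contain @offset;
+ * a valid second superblock must lie beyond the segment area.
+ */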
+ index = offset >> (shift_bits + BLOCK_SIZE_BITS);
+ do_div(index, blocks_per_segment);
+ return index < nsegments;
}
static void nilfs_release_super_block(struct the_nilfs *nilfs)
@@ -586,9 +641,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
- blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
- if (blocksize < NILFS_MIN_BLOCK_SIZE ||
- blocksize > NILFS_MAX_BLOCK_SIZE) {
+ err = nilfs_get_blocksize(sb, sbp, &blocksize);
+ if (err)
+ goto failed_sbh;
+
+ if (blocksize < NILFS_MIN_BLOCK_SIZE) {
nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 71f870d497ae..5e6bafb10f42 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -55,33 +55,6 @@ static inline u64 get_pre_allocated(u64 size)
}
/*
- * attr_must_be_resident
- *
- * Return: True if attribute must be resident.
- */
-static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE type)
-{
- const struct ATTR_DEF_ENTRY *de;
-
- switch (type) {
- case ATTR_STD:
- case ATTR_NAME:
- case ATTR_ID:
- case ATTR_LABEL:
- case ATTR_VOL_INFO:
- case ATTR_ROOT:
- case ATTR_EA_INFO:
- return true;
- default:
- de = ntfs_query_def(sbi, type);
- if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT))
- return true;
- return false;
- }
-}
-
-/*
* attr_load_runs - Load all runs stored in @attr.
*/
static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
@@ -101,6 +74,10 @@ static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
asize = le32_to_cpu(attr->size);
run_off = le16_to_cpu(attr->nres.run_off);
+
+ if (run_off > asize)
+ return -EINVAL;
+
err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn,
vcn ? *vcn : svcn, Add2Ptr(attr, run_off),
asize - run_off);
@@ -172,7 +149,7 @@ out:
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn)
+ CLST *new_lcn, CLST *new_len)
{
int err;
CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
@@ -192,20 +169,36 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
if (err)
goto out;
- if (new_lcn && vcn == vcn0)
- *new_lcn = lcn;
+ if (vcn == vcn0) {
+ /* Return the first fragment. */
+ if (new_lcn)
+ *new_lcn = lcn;
+ if (new_len)
+ *new_len = flen;
+ }
/* Add new fragment into run storage. */
- if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
+ if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) {
/* Undo last 'ntfs_look_for_free_space' */
mark_as_free_ex(sbi, lcn, len, false);
err = -ENOMEM;
goto out;
}
+ if (opt & ALLOCATE_ZERO) {
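+ /* blkdev_issue_zeroout() works in 512-byte sectors; the shift converts clusters. */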
+ u8 shift = sbi->cluster_bits - SECTOR_SHIFT;
+
+ err = blkdev_issue_zeroout(sbi->sb->s_bdev,
+ (sector_t)lcn << shift,
+ (sector_t)flen << shift,
+ GFP_NOFS, 0);
+ if (err)
+ goto out;
+ }
+
vcn += flen;
- if (flen >= len || opt == ALLOCATE_MFT ||
+ if (flen >= len || (opt & ALLOCATE_MFT) ||
(fr && run->count - cnt >= fr)) {
*alen = vcn - vcn0;
return 0;
@@ -280,7 +273,8 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
const char *data = resident_data(attr);
err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL,
- ALLOCATE_DEF, &alen, 0, NULL);
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out1;
@@ -420,6 +414,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn;
CLST next_svcn, pre_alloc = -1, done = 0;
bool is_ext, is_bad = false;
+ bool dirty = false;
u32 align;
struct MFT_REC *rec;
@@ -440,8 +435,10 @@ again:
return err;
/* Return if file is still resident. */
- if (!attr_b->non_res)
+ if (!attr_b->non_res) {
+ dirty = true;
goto ok1;
+ }
/* Layout of records may be changed, so do a full search. */
goto again;
@@ -464,7 +461,7 @@ again_1:
if (keep_prealloc && new_size < old_size) {
attr_b->nres.data_size = cpu_to_le64(new_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto ok;
}
@@ -510,7 +507,7 @@ next_le:
if (new_alloc <= old_alloc) {
attr_b->nres.data_size = cpu_to_le64(new_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto ok;
}
@@ -575,13 +572,13 @@ add_alloc_in_same_attr_seg:
/* ~3 bytes per fragment. */
err = attr_allocate_clusters(
sbi, run, vcn, lcn, to_allocate, &pre_alloc,
- is_mft ? ALLOCATE_MFT : 0, &alen,
+ is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen,
is_mft ? 0
: (sbi->record_size -
le32_to_cpu(rec->used) + 8) /
3 +
1,
- NULL);
+ NULL, NULL);
if (err)
goto out;
}
@@ -601,7 +598,7 @@ pack_runs:
next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
new_alloc_tmp = (u64)next_svcn << cluster_bits;
attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
if (next_svcn >= vcn && !to_allocate) {
/* Normal way. Update attribute and exit. */
@@ -687,7 +684,7 @@ pack_runs:
old_valid = old_size = old_alloc = (u64)vcn << cluster_bits;
attr_b->nres.valid_size = attr_b->nres.data_size =
attr_b->nres.alloc_size = cpu_to_le64(old_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto again_1;
}
@@ -749,7 +746,7 @@ pack_runs:
attr_b->nres.valid_size =
attr_b->nres.alloc_size;
}
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen,
true);
@@ -810,16 +807,9 @@ ok1:
if (ret)
*ret = attr_b;
- /* Update inode_set_bytes. */
if (((type == ATTR_DATA && !name_len) ||
(type == ATTR_ALLOC && name == I30_NAME))) {
- bool dirty = false;
-
- if (ni->vfs_inode.i_size != new_size) {
- ni->vfs_inode.i_size = new_size;
- dirty = true;
- }
-
+ /* Update inode_set_bytes. */
if (attr_b->non_res) {
new_alloc = le64_to_cpu(attr_b->nres.alloc_size);
if (inode_get_bytes(&ni->vfs_inode) != new_alloc) {
@@ -828,6 +818,7 @@ ok1:
}
}
+ /* Don't forget to update duplicate information in parent. */
if (dirty) {
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
@@ -878,8 +869,19 @@ bad_inode:
return err;
}
+/*
+ * attr_data_get_block - Return 'lcn' and 'len' for a given 'vcn'.
+ *
+ * @new == NULL means just get the current mapping for 'vcn'.
+ * @new != NULL means allocate a real cluster if 'vcn' maps to a hole.
+ * @zero - zero out newly allocated clusters.
+ *
+ * NOTE:
+ * - @new != NULL is used only for sparsed or compressed attributes.
+ * - newly allocated clusters are zeroed via blkdev_issue_zeroout.
+ */
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new)
+ CLST *len, bool *new, bool zero)
{
int err = 0;
struct runs_tree *run = &ni->file.run;
@@ -888,29 +890,29 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end;
- u64 total_size;
- u32 clst_per_frame;
- bool ok;
+ CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
+ CLST alloc, evcn;
+ unsigned fr;
+ u64 total_size, total_size0;
+ int step = 0;
if (new)
*new = false;
+ /* Try to find in cache. */
down_read(&ni->file.run_lock);
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
+ if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+ *len = 0;
up_read(&ni->file.run_lock);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- return 0;
+ if (*len) {
+ if (*lcn != SPARSE_LCN || !new)
+ return 0; /* Fast normal way without allocation. */
+ else if (clen > *len)
+ clen = *len;
}
- if (!clen)
- clen = 1;
-
- if (ok && clen > *len)
- clen = *len;
-
+ /* No cluster in cache, or we need to allocate a cluster in a hole. */
sbi = ni->mi.sbi;
cluster_bits = sbi->cluster_bits;
@@ -932,16 +934,15 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits;
if (vcn >= asize) {
- err = -EINVAL;
+ if (new) {
+ err = -EINVAL;
+ } else {
+ *len = 1;
+ *lcn = SPARSE_LCN;
+ }
goto out;
}
- clst_per_frame = 1u << attr_b->nres.c_unit;
- to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1);
-
- if (vcn + to_alloc > asize)
- to_alloc = asize - vcn;
-
svcn = le64_to_cpu(attr_b->nres.svcn);
evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
@@ -960,36 +961,68 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
}
+ /* Load the actual run information into the cache. */
err = attr_load_runs(attr, ni, run, NULL);
if (err)
goto out;
- if (!ok) {
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- err = 0;
- goto ok;
- }
+ if (!*len) {
+ if (run_lookup_entry(run, vcn, lcn, len, NULL)) {
+ if (*lcn != SPARSE_LCN || !new)
+ goto ok; /* Slow normal way without allocation. */
- if (!ok && !new) {
- *len = 0;
- err = 0;
+ if (clen > *len)
+ clen = *len;
+ } else if (!new) {
+ /*
+ * Here we may return -ENOENT.
+ * In any case the caller gets zero length.
+ */
goto ok;
}
-
- if (ok && clen > *len) {
- clen = *len;
- to_alloc = (clen + clst_per_frame - 1) &
- ~(clst_per_frame - 1);
- }
}
if (!is_attr_ext(attr_b)) {
+ /* The code below is only for sparsed or compressed attributes. */
err = -EINVAL;
goto out;
}
+ vcn0 = vcn;
+ to_alloc = clen;
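+ /* Cap the number of new fragments so packed runs (~3 bytes each) still fit in the record. */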
+ fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1;
+ /*
+ * Allocate frame-aligned clusters.
+ * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed.
+ * ntfs3 uses 1 cluster per frame for newly created sparsed files.
+ */
+ if (attr_b->nres.c_unit) {
+ CLST clst_per_frame = 1u << attr_b->nres.c_unit;
+ CLST cmask = ~(clst_per_frame - 1);
+
+ /* Get frame aligned vcn and to_alloc. */
+ vcn = vcn0 & cmask;
+ to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn;
+ if (fr < clst_per_frame)
+ fr = clst_per_frame;
+ zero = true;
+
+ /* Check if 'vcn' and 'vcn0' in different attribute segments. */
+ if (vcn < svcn || evcn1 <= vcn) {
+ /* Load attribute for truncated vcn. */
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0,
+ &vcn, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (vcn + to_alloc > asize)
+ to_alloc = asize - vcn;
+
/* Get the last LCN to allocate from. */
hint = 0;
@@ -1003,18 +1036,35 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
hint = -1;
}
- err = attr_allocate_clusters(
- sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len,
- (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1,
- lcn);
+ /* Allocate and zeroout new clusters. */
+ err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL,
+ zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen,
+ fr, lcn, len);
if (err)
goto out;
*new = true;
+ step = 1;
- end = vcn + *len;
+ end = vcn + alen;
+ /* Save 'total_size0' to restore if error. */
+ total_size0 = le64_to_cpu(attr_b->nres.total_size);
+ total_size = total_size0 + ((u64)alen << cluster_bits);
- total_size = le64_to_cpu(attr_b->nres.total_size) +
- ((u64)*len << cluster_bits);
+ if (vcn != vcn0) {
+ if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (*lcn == SPARSE_LCN) {
+ /* Internal error. Should not happen. */
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out;
+ }
+ /* Check the case when vcn0 + len overlaps the newly allocated clusters. */
+ if (vcn0 + *len > end)
+ *len = end - vcn0;
+ }
repack:
err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
@@ -1040,7 +1090,7 @@ repack:
if (!ni->attr_list.size) {
err = ni_create_attr_list(ni);
if (err)
- goto out;
+ goto undo1;
/* Layout of records is changed. */
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
@@ -1057,67 +1107,83 @@ repack:
}
}
+ /*
+ * The code below may require an additional cluster (to extend the
+ * attribute list) and/or one MFT record.
+ * It is too complex to undo operations if -ENOSPC occurs deep inside
+ * 'ni_insert_nonresident'.
+ * Return -ENOSPC in advance here if there is no free cluster and no
+ * free MFT record.
+ */
+ if (!ntfs_check_for_free_space(sbi, 1, 1)) {
+ /* Undo step 1. */
+ err = -ENOSPC;
+ goto undo1;
+ }
+
+ step = 2;
svcn = evcn1;
/* Estimate next attribute. */
attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi);
- if (attr) {
- CLST alloc = bytes_to_cluster(
- sbi, le64_to_cpu(attr_b->nres.alloc_size));
- CLST evcn = le64_to_cpu(attr->nres.evcn);
-
- if (end < next_svcn)
- end = next_svcn;
- while (end > evcn) {
- /* Remove segment [svcn : evcn). */
- mi_remove_attr(NULL, mi, attr);
-
- if (!al_remove_le(ni, le)) {
- err = -EINVAL;
- goto out;
- }
+ if (!attr) {
+ /* Insert new attribute segment. */
+ goto ins_ext;
+ }
- if (evcn + 1 >= alloc) {
- /* Last attribute segment. */
- evcn1 = evcn + 1;
- goto ins_ext;
- }
+ /* Try to update the existing attribute segment. */
+ alloc = bytes_to_cluster(sbi, le64_to_cpu(attr_b->nres.alloc_size));
+ evcn = le64_to_cpu(attr->nres.evcn);
- if (ni_load_mi(ni, le, &mi)) {
- attr = NULL;
- goto out;
- }
+ if (end < next_svcn)
+ end = next_svcn;
+ while (end > evcn) {
+ /* Remove segment [svcn : evcn). */
+ mi_remove_attr(NULL, mi, attr);
- attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0,
- &le->id);
- if (!attr) {
- err = -EINVAL;
- goto out;
- }
- svcn = le64_to_cpu(attr->nres.svcn);
- evcn = le64_to_cpu(attr->nres.evcn);
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto out;
}
- if (end < svcn)
- end = svcn;
+ if (evcn + 1 >= alloc) {
+ /* Last attribute segment. */
+ evcn1 = evcn + 1;
+ goto ins_ext;
+ }
- err = attr_load_runs(attr, ni, run, &end);
- if (err)
+ if (ni_load_mi(ni, le, &mi)) {
+ attr = NULL;
goto out;
+ }
- evcn1 = evcn + 1;
- attr->nres.svcn = cpu_to_le64(next_svcn);
- err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
- if (err)
+ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id);
+ if (!attr) {
+ err = -EINVAL;
goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ }
- le->vcn = cpu_to_le64(next_svcn);
- ni->attr_list.dirty = true;
- mi->dirty = true;
+ if (end < svcn)
+ end = svcn;
+
+ err = attr_load_runs(attr, ni, run, &end);
+ if (err)
+ goto out;
+
+ evcn1 = evcn + 1;
+ attr->nres.svcn = cpu_to_le64(next_svcn);
+ err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
+ if (err)
+ goto out;
+
+ le->vcn = cpu_to_le64(next_svcn);
+ ni->attr_list.dirty = true;
+ mi->dirty = true;
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
- next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
- }
ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
@@ -1129,10 +1195,26 @@ ins_ext:
ok:
run_truncate_around(run, vcn);
out:
+ if (err && step > 1) {
+ /* Too complex to restore. */
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
up_write(&ni->file.run_lock);
ni_unlock(ni);
return err;
+
+undo1:
+ /* Undo step 1: restore sizes and release the newly allocated clusters. */
+ attr_b->nres.total_size = cpu_to_le64(total_size0);
+ inode_set_bytes(&ni->vfs_inode, total_size0);
+
+ if (run_deallocate_ex(sbi, run, vcn, alen, NULL, false) ||
+ !run_add_entry(run, vcn, SPARSE_LCN, alen, false) ||
+ mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn)) {
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
+ goto out;
}
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page)
@@ -1217,6 +1299,11 @@ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
CLST svcn, evcn;
u16 ro;
+ if (!ni) {
+ /* Is record corrupted? */
+ return -ENOENT;
+ }
+
attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL);
if (!attr) {
/* Is record corrupted? */
@@ -1232,6 +1319,10 @@ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
}
ro = le16_to_cpu(attr->nres.run_off);
+
+ if (ro > le32_to_cpu(attr->size))
+ return -EINVAL;
+
err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro);
if (err < 0)
@@ -1530,7 +1621,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST svcn, evcn1, next_svcn, lcn, len;
+ CLST svcn, evcn1, next_svcn, len;
CLST vcn, end, clst_data;
u64 total_size, valid_size, data_size;
@@ -1606,8 +1697,9 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
}
err = attr_allocate_clusters(sbi, run, vcn + clst_data,
- hint + 1, len - clst_data, NULL, 0,
- &alen, 0, &lcn);
+ hint + 1, len - clst_data, NULL,
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out;
@@ -1901,6 +1993,11 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
u16 le_sz;
u16 roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn,
evcn1 - 1, svcn, Add2Ptr(attr, roff),
le32_to_cpu(attr->size) - roff);
@@ -2020,7 +2117,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
return -ENOENT;
if (!attr_b->non_res) {
- u32 data_size = le32_to_cpu(attr->res.data_size);
+ u32 data_size = le32_to_cpu(attr_b->res.data_size);
u32 from, to;
if (vbo > data_size)
@@ -2290,7 +2387,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
if (!attr_b->non_res) {
/* Still resident. */
- char *data = Add2Ptr(attr_b, attr_b->res.data_off);
+ char *data = Add2Ptr(attr_b,
+ le16_to_cpu(attr_b->res.data_off));
memmove(data + bytes, data, bytes);
memset(data, 0, bytes);
@@ -2382,8 +2480,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
if (vbo <= ni->i_valid)
ni->i_valid += bytes;
- attr_b->nres.data_size = le64_to_cpu(data_size + bytes);
- attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes);
+ attr_b->nres.data_size = cpu_to_le64(data_size + bytes);
+ attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes);
/* ni->valid may be not equal valid_size (temporary). */
if (ni->i_valid > data_size + bytes)
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index bad6d8a849a2..c0c6bcbc8c05 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -68,6 +68,11 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
run_init(&ni->attr_list.run);
+ if (run_off > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno,
0, le64_to_cpu(attr->nres.evcn), 0,
Add2Ptr(attr, run_off),
diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c
index 50d838093790..25a4d4896aa9 100644
--- a/fs/ntfs3/bitfunc.c
+++ b/fs/ntfs3/bitfunc.c
@@ -30,7 +30,7 @@ static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0,
*
* Return: True if all bits [bit, bit+nbits) are zeros "0".
*/
-bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
+bool are_bits_clear(const void *lmap, size_t bit, size_t nbits)
{
size_t pos = bit & 7;
const u8 *map = (u8 *)lmap + (bit >> 3);
@@ -78,7 +78,7 @@ bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
*
* Return: True if all bits [bit, bit+nbits) are ones "1".
*/
-bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
+bool are_bits_set(const void *lmap, size_t bit, size_t nbits)
{
u8 mask;
size_t pos = bit & 7;
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index e92bbd754365..723fb64e6531 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -59,14 +59,14 @@ void ntfs3_exit_bitmap(void)
*
* Return: -1 if not found.
*/
-static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend,
+static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend,
size_t to_alloc, size_t *prev_tail, size_t *b_pos,
size_t *b_len)
{
while (wpos < wend) {
size_t free_len;
u32 free_bits, end;
- u32 used = find_next_zero_bit(buf, wend, wpos);
+ u32 used = find_next_zero_bit_le(buf, wend, wpos);
if (used >= wend) {
if (*b_len < *prev_tail) {
@@ -92,7 +92,7 @@ static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend,
* Now we have a fragment [wpos, wend) starting with 0.
*/
end = wpos + to_alloc - *prev_tail;
- free_bits = find_next_bit(buf, min(end, wend), wpos);
+ free_bits = find_next_bit_le(buf, min(end, wend), wpos);
free_len = *prev_tail + free_bits - wpos;
@@ -504,7 +504,6 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
u8 cluster_bits = sbi->cluster_bits;
u32 wbits = 8 * sb->s_blocksize;
u32 used, frb;
- const ulong *buf;
size_t wpos, wbit, iw, vbo;
struct buffer_head *bh = NULL;
CLST lcn, clen;
@@ -558,9 +557,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
goto out;
}
- buf = (ulong *)bh->b_data;
-
- used = bitmap_weight(buf, wbits);
+ used = ntfs_bitmap_weight_le(bh->b_data, wbits);
if (used < wbits) {
frb = wbits - used;
wnd->free_bits[iw] = frb;
@@ -574,7 +571,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
wbits = wnd->nbits - wbit;
do {
- used = find_next_zero_bit(buf, wbits, wpos);
+ used = find_next_zero_bit_le(bh->b_data, wbits, wpos);
if (used > wpos && prev_tail) {
wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
@@ -590,7 +587,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
break;
}
- frb = find_next_bit(buf, wbits, wpos);
+ frb = find_next_bit_le(bh->b_data, wbits, wpos);
if (frb >= wbits) {
/* Keep last free block. */
prev_tail += frb - wpos;
@@ -661,7 +658,7 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
if (!wnd->bits_last)
wnd->bits_last = wbits;
- wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS);
+ wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
if (!wnd->free_bits)
return -ENOMEM;
@@ -718,7 +715,6 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
while (iw < wnd->nwnd && bits) {
u32 tail, op;
- ulong *buf;
if (iw + 1 == wnd->nwnd)
wbits = wnd->bits_last;
@@ -732,11 +728,9 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
break;
}
- buf = (ulong *)bh->b_data;
-
lock_buffer(bh);
- __bitmap_clear(buf, wbit, op);
+ ntfs_bitmap_clear_le(bh->b_data, wbit, op);
wnd->free_bits[iw] += op;
@@ -771,7 +765,6 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
while (iw < wnd->nwnd && bits) {
u32 tail, op;
- ulong *buf;
if (unlikely(iw + 1 == wnd->nwnd))
wbits = wnd->bits_last;
@@ -784,11 +777,10 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
err = PTR_ERR(bh);
break;
}
- buf = (ulong *)bh->b_data;
lock_buffer(bh);
- __bitmap_set(buf, wbit, op);
+ ntfs_bitmap_set_le(bh->b_data, wbit, op);
wnd->free_bits[iw] -= op;
set_buffer_uptodate(bh);
@@ -809,6 +801,44 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
}
/*
+ * wnd_set_used_safe - Mark the bit range from bit to bit + bits as used.
+ *
+ * Unlike wnd_set_used/wnd_set_free, this function is not fully trusted.
+ * It scans every bit in the bitmap and marks each free bit as used.
+ * @done - how many bits were marked as used.
+ *
+ * NOTE: normally *done should be 0.
+ */
+int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits,
+ size_t *done)
+{
+ size_t i, from = 0, len = 0;
+ int err = 0;
+
+ *done = 0;
+ for (i = 0; i < bits; i++) {
+ if (wnd_is_free(wnd, bit + i, 1)) {
+ if (!len)
+ from = bit + i;
+ len += 1;
+ } else if (len) {
+ err = wnd_set_used(wnd, from, len);
+ *done += len;
+ len = 0;
+ if (err)
+ break;
+ }
+ }
+
+ if (len) {
+ /* Last fragment. */
+ err = wnd_set_used(wnd, from, len);
+ *done += len;
+ }
+ return err;
+}
+
+/*
* wnd_is_free_hlp
*
* Return: True if all clusters [bit, bit+bits) are free (bitmap only).
@@ -836,7 +866,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits)
if (IS_ERR(bh))
return false;
- ret = are_bits_clear((ulong *)bh->b_data, wbit, op);
+ ret = are_bits_clear(bh->b_data, wbit, op);
put_bh(bh);
if (!ret)
@@ -928,7 +958,7 @@ use_wnd:
if (IS_ERR(bh))
goto out;
- ret = are_bits_set((ulong *)bh->b_data, wbit, op);
+ ret = are_bits_set(bh->b_data, wbit, op);
put_bh(bh);
if (!ret)
goto out;
@@ -959,7 +989,6 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
size_t fnd, max_alloc, b_len, b_pos;
size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend;
size_t to_alloc0 = to_alloc;
- const ulong *buf;
const struct e_node *e;
const struct rb_node *pr, *cr;
u8 log2_bits;
@@ -1185,14 +1214,13 @@ Again:
continue;
}
- buf = (ulong *)bh->b_data;
-
/* Scan range [wbit, zbit). */
if (wpos < wzbit) {
/* Scan range [wpos, zbit). */
- fnd = wnd_scan(buf, wbit, wpos, wzbit,
- to_alloc, &prev_tail,
- &b_pos, &b_len);
+ fnd = wnd_scan(bh->b_data, wbit, wpos,
+ wzbit, to_alloc,
+ &prev_tail, &b_pos,
+ &b_len);
if (fnd != MINUS_ONE_T) {
put_bh(bh);
goto found;
@@ -1203,7 +1231,7 @@ Again:
/* Scan range [zend, ebit). */
if (wzend < wbits) {
- fnd = wnd_scan(buf, wbit,
+ fnd = wnd_scan(bh->b_data, wbit,
max(wzend, wpos), wbits,
to_alloc, &prev_tail,
&b_pos, &b_len);
@@ -1242,11 +1270,9 @@ Again:
continue;
}
- buf = (ulong *)bh->b_data;
-
/* Scan range [wpos, eBits). */
- fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail,
- &b_pos, &b_len);
+ fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc,
+ &prev_tail, &b_pos, &b_len);
put_bh(bh);
if (fnd != MINUS_ONE_T)
goto found;
@@ -1324,7 +1350,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
new_last = wbits;
if (new_wnd != wnd->nwnd) {
- new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS);
+ new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS);
if (!new_free)
return -ENOMEM;
@@ -1344,7 +1370,6 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
size_t frb;
u64 vbo, lbo, bytes;
struct buffer_head *bh;
- ulong *buf;
if (iw + 1 == new_wnd)
wbits = new_last;
@@ -1361,10 +1386,9 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
return -EIO;
lock_buffer(bh);
- buf = (ulong *)bh->b_data;
- __bitmap_clear(buf, b0, blocksize * 8 - b0);
- frb = wbits - bitmap_weight(buf, wbits);
+ ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0);
+ frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits);
wnd->total_zeroes += frb - wnd->free_bits[iw];
wnd->free_bits[iw] = frb;
@@ -1411,7 +1435,6 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
CLST lcn_from = bytes_to_cluster(sbi, range->start);
size_t iw = lcn_from >> (sb->s_blocksize_bits + 3);
u32 wbit = lcn_from & (wbits - 1);
- const ulong *buf;
CLST lcn_to;
if (!minlen)
@@ -1424,7 +1447,7 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
- for (; iw < wnd->nbits; iw++, wbit = 0) {
+ for (; iw < wnd->nwnd; iw++, wbit = 0) {
CLST lcn_wnd = iw * wbits;
struct buffer_head *bh;
@@ -1446,10 +1469,8 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
break;
}
- buf = (ulong *)bh->b_data;
-
for (; wbit < wbits; wbit++) {
- if (!test_bit(wbit, buf)) {
+ if (!test_bit_le(wbit, bh->b_data)) {
if (!len)
lcn = lcn_wnd + wbit;
len += 1;
@@ -1481,3 +1502,70 @@ out:
return err;
}
+
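+/* On-disk NTFS bitmaps are little-endian; use a word type matching BITS_PER_LONG. */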
+#if BITS_PER_LONG == 64
+typedef __le64 bitmap_ulong;
+#define cpu_to_ul(x) cpu_to_le64(x)
+#define ul_to_cpu(x) le64_to_cpu(x)
+#else
+typedef __le32 bitmap_ulong;
+#define cpu_to_ul(x) cpu_to_le32(x)
+#define ul_to_cpu(x) le32_to_cpu(x)
+#endif
+
+void ntfs_bitmap_set_le(void *map, unsigned int start, int len)
+{
+ bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start);
+ const unsigned int size = start + len;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start));
+
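+ /* The first word uses a partial mask, middle words are set whole; the tail is masked below. */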
+ while (len - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = cpu_to_ul(~0UL);
+ p++;
+ }
+ if (len) {
+ mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size));
+ *p |= mask_to_set;
+ }
+}
+
+void ntfs_bitmap_clear_le(void *map, unsigned int start, int len)
+{
+ bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start);
+ const unsigned int size = start + len;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start));
+
+ while (len - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = cpu_to_ul(~0UL);
+ p++;
+ }
+ if (len) {
+ mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size));
+ *p &= ~mask_to_clear;
+ }
+}
+
+unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits)
+{
+ const ulong *bmp = bitmap;
+ unsigned int k, lim = bits / BITS_PER_LONG;
+ unsigned int w = 0;
+
+ for (k = 0; k < lim; k++)
+ w += hweight_long(bmp[k]);
+
+ if (bits % BITS_PER_LONG) {
+ w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) &
+ BITMAP_LAST_WORD_MASK(bits));
+ }
+
+ return w;
+}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index fb438d604040..063a6654199b 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -26,8 +26,8 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
if (!nls) {
/* UTF-16 -> UTF-8 */
- ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf,
- buf_len);
+ ret = utf16s_to_utf8s((wchar_t *)name, len, UTF16_LITTLE_ENDIAN,
+ buf, buf_len);
buf[ret] = '\0';
return ret;
}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 4f2ffc7ef296..d294cd975688 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -122,31 +122,15 @@ static int ntfs_extend_initialized_size(struct file *file,
bits = sbi->cluster_bits;
vcn = pos >> bits;
- err = attr_data_get_block(ni, vcn, 0, &lcn, &clen,
- NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL,
+ false);
if (err)
goto out;
if (lcn == SPARSE_LCN) {
- loff_t vbo = (loff_t)vcn << bits;
- loff_t to = vbo + ((loff_t)clen << bits);
-
- if (to <= new_valid) {
- ni->i_valid = to;
- pos = to;
- goto next;
- }
-
- if (vbo < pos) {
- pos = vbo;
- } else {
- to = (new_valid >> bits) << bits;
- if (pos < to) {
- ni->i_valid = to;
- pos = to;
- goto next;
- }
- }
+ pos = ((loff_t)clen + vcn) << bits;
+ ni->i_valid = pos;
+ goto next;
}
}
@@ -196,18 +180,18 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct address_space *mapping = inode->i_mapping;
u32 blocksize = 1 << inode->i_blkbits;
pgoff_t idx = vbo >> PAGE_SHIFT;
- u32 z_start = vbo & (PAGE_SIZE - 1);
+ u32 from = vbo & (PAGE_SIZE - 1);
pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT;
loff_t page_off;
struct buffer_head *head, *bh;
- u32 bh_next, bh_off, z_end;
+ u32 bh_next, bh_off, to;
sector_t iblock;
struct page *page;
- for (; idx < idx_end; idx += 1, z_start = 0) {
+ for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
- z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
- : PAGE_SIZE;
+ to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
+ : PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
page = find_or_create_page(mapping, idx,
@@ -224,7 +208,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
do {
bh_next = bh_off + blocksize;
- if (bh_next <= z_start || bh_off >= z_end)
+ if (bh_next <= from || bh_off >= to)
continue;
if (!buffer_mapped(bh)) {
@@ -258,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, z_start, z_end);
+ zero_user_segment(page, from, to);
unlock_page(page);
put_page(page);
@@ -270,81 +254,6 @@ out:
}
/*
- * ntfs_sparse_cluster - Helper function to zero a new allocated clusters.
- *
- * NOTE: 512 <= cluster size <= 2M
- */
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
- u64 vbo = (u64)vcn << sbi->cluster_bits;
- u64 bytes = (u64)len << sbi->cluster_bits;
- u32 blocksize = 1 << inode->i_blkbits;
- pgoff_t idx0 = page0 ? page0->index : -1;
- loff_t vbo_clst = vbo & sbi->cluster_mask_inv;
- loff_t end = ntfs_up_cluster(sbi, vbo + bytes);
- pgoff_t idx = vbo_clst >> PAGE_SHIFT;
- u32 from = vbo_clst & (PAGE_SIZE - 1);
- pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- loff_t page_off;
- u32 to;
- bool partial;
- struct page *page;
-
- for (; idx < idx_end; idx += 1, from = 0) {
- page = idx == idx0 ? page0 : grab_cache_page(mapping, idx);
-
- if (!page)
- continue;
-
- page_off = (loff_t)idx << PAGE_SHIFT;
- to = (page_off + PAGE_SIZE) > end ? (end - page_off)
- : PAGE_SIZE;
- partial = false;
-
- if ((from || PAGE_SIZE != to) &&
- likely(!page_has_buffers(page))) {
- create_empty_buffers(page, blocksize, 0);
- }
-
- if (page_has_buffers(page)) {
- struct buffer_head *head, *bh;
- u32 bh_off = 0;
-
- bh = head = page_buffers(page);
- do {
- u32 bh_next = bh_off + blocksize;
-
- if (from <= bh_off && bh_next <= to) {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- } else if (!buffer_uptodate(bh)) {
- partial = true;
- }
- bh_off = bh_next;
- } while (head != (bh = bh->b_this_page));
- }
-
- zero_user_segment(page, from, to);
-
- if (!partial) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- set_page_dirty(page);
- }
-
- if (idx != idx0) {
- unlock_page(page);
- put_page(page);
- }
- cond_resched();
- }
- mark_inode_dirty(inode);
-}
-
-/*
* ntfs_file_mmap - file_operations::mmap
*/
static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -385,13 +294,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
for (; vcn < end; vcn += len) {
err = attr_data_get_block(ni, vcn, 1, &lcn,
- &len, &new);
+ &len, &new, true);
if (err)
goto out;
-
- if (!new)
- continue;
- ntfs_sparse_cluster(inode, NULL, vcn, 1);
}
}
@@ -432,7 +337,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = ntfs_set_size(inode, end);
if (err)
goto out;
- inode->i_size = end;
}
if (extend_init && !is_compressed(ni)) {
@@ -486,10 +390,10 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size));
- ni_lock(ni);
-
truncate_setsize(inode, new_size);
+ ni_lock(ni);
+
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
&new_valid, ni->mi.sbi->options->prealloc, NULL);
@@ -535,7 +439,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
loff_t end = vbo + len;
- loff_t vbo_down = round_down(vbo, PAGE_SIZE);
+ loff_t vbo_down = round_down(vbo, max_t(unsigned long,
+ sbi->cluster_size, PAGE_SIZE));
bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
loff_t i_size, new_size;
bool map_locked;
@@ -588,11 +493,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
u32 frame_size;
loff_t mask, vbo_a, end_a, tmp;
- err = filemap_write_and_wait_range(mapping, vbo, end - 1);
- if (err)
- goto out;
-
- err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
if (err)
goto out;
@@ -685,47 +587,45 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
if (err)
goto out;
- /*
- * Allocate clusters, do not change 'valid' size.
- */
- err = ntfs_set_size(inode, new_size);
- if (err)
- goto out;
+ if (new_size > i_size) {
+ /*
+ * Allocate clusters, do not change 'valid' size.
+ */
+ err = ntfs_set_size(inode, new_size);
+ if (err)
+ goto out;
+ }
if (is_supported_holes) {
- CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
CLST vcn = vbo >> sbi->cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
+ CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
CLST lcn, clen;
bool new;
+ if (cend_v > cend)
+ cend_v = cend;
+
/*
- * Allocate but do not zero new clusters. (see below comments)
- * This breaks security: One can read unused on-disk areas.
+ * Allocate and zero new clusters.
* Zeroing these clusters may be too long.
- * Maybe we should check here for root rights?
+ */
+ for (; vcn < cend_v; vcn += clen) {
+ err = attr_data_get_block(ni, vcn, cend_v - vcn,
+ &lcn, &clen, &new,
+ true);
+ if (err)
+ goto out;
+ }
+ /*
+ * Allocate but do not zero new clusters.
*/
for (; vcn < cend; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn,
- &lcn, &clen, &new);
+ &lcn, &clen, &new,
+ false);
if (err)
goto out;
- if (!new || vcn >= vcn_v)
- continue;
-
- /*
- * Unwritten area.
- * NTFS is not able to store several unwritten areas.
- * Activate 'ntfs_sparse_cluster' to zero new allocated clusters.
- *
- * Dangerous in case:
- * 1G of sparsed clusters + 1 cluster of data =>
- * valid_size == 1G + 1 cluster
- * fallocate(1G) will zero 1G and this can be very long
- * xfstest 016/086 will fail without 'ntfs_sparse_cluster'.
- */
- ntfs_sparse_cluster(inode, NULL, vcn,
- min(vcn_v - vcn, clen));
}
}
@@ -736,6 +636,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
&ni->file.run, i_size, &ni->i_valid,
true, NULL);
ni_unlock(ni);
+ } else if (new_size > i_size) {
+ inode->i_size = new_size;
}
}
@@ -779,7 +681,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto out;
if (ia_valid & ATTR_SIZE) {
- loff_t oldsize = inode->i_size;
+ loff_t newsize, oldsize;
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open(). */
@@ -787,22 +689,25 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto out;
}
inode_dio_wait(inode);
+ oldsize = inode->i_size;
+ newsize = attr->ia_size;
- if (attr->ia_size <= oldsize)
- err = ntfs_truncate(inode, attr->ia_size);
- else if (attr->ia_size > oldsize)
- err = ntfs_extend(inode, attr->ia_size, 0, NULL);
+ if (newsize <= oldsize)
+ err = ntfs_truncate(inode, newsize);
+ else
+ err = ntfs_extend(inode, newsize, 0, NULL);
if (err)
goto out;
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ inode->i_size = newsize;
}
setattr_copy(mnt_userns, inode, attr);
if (mode != inode->i_mode) {
- err = ntfs_acl_chmod(mnt_userns, inode);
+ err = ntfs_acl_chmod(mnt_userns, dentry);
if (err)
goto out;
@@ -946,8 +851,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = valid & ~(frame_size - 1);
off = valid & (frame_size - 1);
- err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn,
- &clen, NULL);
+ err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 1, &lcn,
+ &clen, NULL, false);
if (err)
goto out;
@@ -1255,7 +1160,7 @@ const struct inode_operations ntfs_file_inode_operations = {
.setattr = ntfs3_setattr,
.listxattr = ntfs_listxattr,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
};
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 381a38a06ec2..f1df52dfab74 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -557,7 +557,7 @@ static int ni_repack(struct ntfs_inode *ni)
}
if (!mi_p) {
- /* Do not try if not enogh free space. */
+ /* Do not try if not enough free space. */
if (le32_to_cpu(mi->mrec->used) + 8 >= rs)
continue;
@@ -568,6 +568,12 @@ static int ni_repack(struct ntfs_inode *ni)
}
roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ break;
+ }
+
err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff),
le32_to_cpu(attr->size) - roff);
@@ -1589,6 +1595,9 @@ int ni_delete_all(struct ntfs_inode *ni)
asize = le32_to_cpu(attr->size);
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize)
+ return -EINVAL;
+
/* run==1 means unpack and deallocate. */
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff), asize - roff);
@@ -1636,6 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
{
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
+ struct le_str *fns;
if (le)
*le = NULL;
@@ -1659,8 +1669,8 @@ next:
if (uni->len != fname->name_len)
goto next;
- if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL,
- false))
+ fns = (struct le_str *)&fname->name_len;
+ if (ntfs_cmp_names_cpu(uni, fns, NULL, false))
goto next;
return fname;
@@ -2214,7 +2224,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn, &lcn,
- &clen, &new);
+ &clen, &new, false);
if (err)
goto out;
}
@@ -2291,6 +2301,11 @@ remove_wof:
asize = le32_to_cpu(attr->size);
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
/*run==1 Means unpack and deallocate. */
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff), asize - roff);
@@ -2997,6 +3012,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de)
{
int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
struct mft_inode *mi;
@@ -3004,6 +3020,19 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
u16 de_key_size = le16_to_cpu(de->key_size);
+ if (sbi->options->windows_names &&
+ !valid_windows_name(sbi, (struct le_str *)&de_name->name_len))
+ return -EINVAL;
+
+ /* If the "hide_dot_files" option is set, set the hidden attribute for dot files. */
+ if (ni->mi.sbi->options->hide_dot_files) {
+ if (de_name->name_len > 0 &&
+ le16_to_cpu(de_name->name[0]) == '.')
+ ni->std_fa |= FILE_ATTRIBUTE_HIDDEN;
+ else
+ ni->std_fa &= ~FILE_ATTRIBUTE_HIDDEN;
+ }
+
mi_get_ref(&ni->mi, &de->ref);
mi_get_ref(&dir_ni->mi, &de_name->home);
@@ -3022,7 +3051,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);
/* Insert new name into directory. */
- err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0);
+ err = indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 0);
if (err)
ni_remove_attr_le(ni, attr, mi, le);
@@ -3265,6 +3294,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
+ /* The std attribute is always in the primary MFT record. */
if (modified)
ni->mi.dirty = true;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 0d611a6c5511..c6eb371a3695 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -1132,7 +1132,7 @@ static int read_log_page(struct ntfs_log *log, u32 vbo,
return -EINVAL;
if (!*buffer) {
- to_free = kmalloc(bytes, GFP_NOFS);
+ to_free = kmalloc(log->page_size, GFP_NOFS);
if (!to_free)
return -ENOMEM;
*buffer = to_free;
@@ -1180,10 +1180,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
struct restart_info *info)
{
u32 skip, vbo;
- struct RESTART_HDR *r_page = kmalloc(DefaultLogPageSize, GFP_NOFS);
-
- if (!r_page)
- return -ENOMEM;
+ struct RESTART_HDR *r_page = NULL;
/* Determine which restart area we are looking for. */
if (first) {
@@ -1197,7 +1194,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
/* Loop continuously until we succeed. */
for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
bool usa_error;
- u32 sys_page_size;
bool brst, bchk;
struct RESTART_AREA *ra;
@@ -1251,24 +1247,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
goto check_result;
}
- /* Read the entire restart area. */
- sys_page_size = le32_to_cpu(r_page->sys_page_size);
- if (DefaultLogPageSize != sys_page_size) {
- kfree(r_page);
- r_page = kzalloc(sys_page_size, GFP_NOFS);
- if (!r_page)
- return -ENOMEM;
-
- if (read_log_page(log, vbo,
- (struct RECORD_PAGE_HDR **)&r_page,
- &usa_error)) {
- /* Ignore any errors. */
- kfree(r_page);
- r_page = NULL;
- continue;
- }
- }
-
if (is_client_area_valid(r_page, usa_error)) {
info->valid_page = true;
ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off));
@@ -2727,6 +2705,9 @@ static inline bool check_attr(const struct MFT_REC *rec,
return false;
}
+ if (run_off > asize)
+ return false;
+
if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn,
Add2Ptr(attr, run_off), asize - run_off) < 0) {
return false;
@@ -3048,7 +3029,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
struct NEW_ATTRIBUTE_SIZES *new_sz;
struct ATTR_FILE_NAME *fname;
struct OpenAttr *oa, *oa2;
- u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits;
+ u32 nsize, t32, asize, used, esize, off, bits;
u16 id, id2;
u32 record_size = sbi->record_size;
u64 t64;
@@ -3635,30 +3616,28 @@ move_data:
break;
case SetBitsInNonresidentBitMap:
- bmp_off =
- le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
- bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+ off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
- if (cbo + (bmp_off + 7) / 8 > lco ||
- cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ if (cbo + (off + 7) / 8 > lco ||
+ cbo + ((off + bits + 7) / 8) > lco) {
goto dirty_vol;
}
- __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ ntfs_bitmap_set_le(Add2Ptr(buffer_le, roff), off, bits);
a_dirty = true;
break;
case ClearBitsInNonresidentBitMap:
- bmp_off =
- le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
- bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+ off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
- if (cbo + (bmp_off + 7) / 8 > lco ||
- cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ if (cbo + (off + 7) / 8 > lco ||
+ cbo + ((off + bits + 7) / 8) > lco) {
goto dirty_vol;
}
- __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ ntfs_bitmap_clear_le(Add2Ptr(buffer_le, roff), off, bits);
a_dirty = true;
break;
@@ -4771,6 +4750,12 @@ fake_attr:
u16 roff = le16_to_cpu(attr->nres.run_off);
CLST svcn = le64_to_cpu(attr->nres.svcn);
+ if (roff > t32) {
+ kfree(oa->attr);
+ oa->attr = NULL;
+ goto fake_attr;
+ }
+
err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn,
le64_to_cpu(attr->nres.evcn), svcn,
Add2Ptr(attr, roff), t32 - roff);
@@ -4839,8 +4824,7 @@ next_dirty_page_vcn:
goto out;
}
attr = oa->attr;
- t64 = le64_to_cpu(attr->nres.alloc_size);
- if (size > t64) {
+ if (size > le64_to_cpu(attr->nres.alloc_size)) {
attr->nres.valid_size = attr->nres.data_size =
attr->nres.alloc_size = cpu_to_le64(size);
}
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4ed15f64b17f..567563771bf8 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -98,6 +98,30 @@ const __le16 WOF_NAME[17] = {
};
#endif
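+/* Reserved MS-DOS device names, checked by is_reserved_name() below. */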
+static const __le16 CON_NAME[3] = {
+ cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('N'),
+};
+
+static const __le16 NUL_NAME[3] = {
+ cpu_to_le16('N'), cpu_to_le16('U'), cpu_to_le16('L'),
+};
+
+static const __le16 AUX_NAME[3] = {
+ cpu_to_le16('A'), cpu_to_le16('U'), cpu_to_le16('X'),
+};
+
+static const __le16 PRN_NAME[3] = {
+ cpu_to_le16('P'), cpu_to_le16('R'), cpu_to_le16('N'),
+};
+
+static const __le16 COM_NAME[3] = {
+ cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('M'),
+};
+
+static const __le16 LPT_NAME[3] = {
+ cpu_to_le16('L'), cpu_to_le16('P'), cpu_to_le16('T'),
+};
+
// clang-format on
/*
@@ -322,35 +346,6 @@ out:
}
/*
- * ntfs_query_def
- *
- * Return: Current ATTR_DEF_ENTRY for given attribute type.
- */
-const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE type)
-{
- int type_in = le32_to_cpu(type);
- size_t min_idx = 0;
- size_t max_idx = sbi->def_entries - 1;
-
- while (min_idx <= max_idx) {
- size_t i = min_idx + ((max_idx - min_idx) >> 1);
- const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i;
- int diff = le32_to_cpu(entry->type) - type_in;
-
- if (!diff)
- return entry;
- if (diff < 0)
- min_idx = i + 1;
- else if (i)
- max_idx = i - 1;
- else
- return NULL;
- }
- return NULL;
-}
-
-/*
* ntfs_look_for_free_space - Look for a free space in bitmap.
*/
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
@@ -449,6 +444,39 @@ up_write:
}
/*
+ * ntfs_check_for_free_space
+ *
+ * Check if it is possible to allocate 'clen' clusters and 'mlen' MFT records.
+ */
+bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen)
+{
+ size_t free, zlen, avail;
+ struct wnd_bitmap *wnd;
+
+ wnd = &sbi->used.bitmap;
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+ free = wnd_zeroes(wnd);
+ zlen = min_t(size_t, NTFS_MIN_MFT_ZONE, wnd_zone_len(wnd));
+ up_read(&wnd->rw_lock);
+
+ if (free < zlen + clen)
+ return false;
+
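+ /* Clusters still free after reserving the MFT zone and this request. */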
+ avail = free - (zlen + clen);
+
+ wnd = &sbi->mft.bitmap;
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+ free = wnd_zeroes(wnd);
+ zlen = wnd_zone_len(wnd);
+ up_read(&wnd->rw_lock);
+
+ if (free >= zlen + mlen)
+ return true;
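+ /* Not enough free MFT records; check if the MFT can grow into the remaining clusters. */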
+
+ return avail >= bytes_to_cluster(sbi, mlen << sbi->record_bits);
+}
+
+/*
* ntfs_extend_mft - Allocate additional MFT records.
*
* sbi->mft.bitmap is locked for write.
@@ -475,7 +503,7 @@ static int ntfs_extend_mft(struct ntfs_sb_info *sbi)
struct ATTRIB *attr;
struct wnd_bitmap *wnd = &sbi->mft.bitmap;
- new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127;
+ new_mft_total = ALIGN(wnd->nbits + NTFS_MFT_INCREASE_STEP, 128);
new_mft_bytes = (u64)new_mft_total << sbi->record_bits;
/* Step 1: Resize $MFT::DATA. */
@@ -618,13 +646,13 @@ next:
NULL, 0, NULL, NULL))
goto next;
- __clear_bit(ir - MFT_REC_RESERVED,
+ __clear_bit_le(ir - MFT_REC_RESERVED,
&sbi->mft.reserved_bitmap);
}
}
/* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */
- zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
+ zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap,
MFT_REC_FREE, MFT_REC_RESERVED);
if (zbit >= MFT_REC_FREE) {
sbi->mft.next_reserved = MFT_REC_FREE;
@@ -692,7 +720,7 @@ found:
if (*rno >= MFT_REC_FREE)
wnd_set_used(wnd, *rno, 1);
else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
- __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
out:
if (!mft)
@@ -720,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
else
wnd_set_free(wnd, rno, 1);
} else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) {
- __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
}
if (rno < wnd_zone_bit(wnd))
@@ -830,7 +858,6 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
return;
- err = 0;
bytes = sbi->mft.recs_mirr << sbi->record_bits;
block1 = sbi->mft.lbo >> sb->s_blocksize_bits;
block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits;
@@ -860,8 +887,7 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
put_bh(bh1);
bh1 = NULL;
- if (wait)
- err = sync_dirty_buffer(bh2);
+ err = wait ? sync_dirty_buffer(bh2) : 0;
put_bh(bh2);
if (err)
@@ -1849,9 +1875,10 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
goto out;
}
- root_sdh = resident_data(attr);
+ root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
if (root_sdh->type != ATTR_ZERO ||
- root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH) {
+ root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
+ offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root_sdh->ihdr.used) > le32_to_cpu(attr->res.data_size)) {
err = -EINVAL;
goto out;
}
@@ -1867,9 +1894,10 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
goto out;
}
- root_sii = resident_data(attr);
+ root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
if (root_sii->type != ATTR_ZERO ||
- root_sii->rule != NTFS_COLLATION_TYPE_UINT) {
+ root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
+ offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root_sii->ihdr.used) > le32_to_cpu(attr->res.data_size)) {
err = -EINVAL;
goto out;
}
@@ -2502,3 +2530,83 @@ int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim)
return 0;
}
+
+static inline bool name_has_forbidden_chars(const struct le_str *fname)
+{
+ int i, ch;
+
+ /* Check for forbidden chars. */
+ for (i = 0; i < fname->len; ++i) {
+ ch = le16_to_cpu(fname->name[i]);
+
+ /* Control chars. */
+ if (ch < 0x20)
+ return true;
+
+ switch (ch) {
+ /* Disallowed by Windows. */
+ case '\\':
+ case '/':
+ case ':':
+ case '*':
+ case '?':
+ case '<':
+ case '>':
+ case '|':
+ case '\"':
+ return true;
+
+ default:
+ /* Allowed char. */
+ break;
+ }
+ }
+
+ /* File names cannot end with a space or a dot. */
+ if (fname->len > 0) {
+ ch = le16_to_cpu(fname->name[fname->len - 1]);
+ if (ch == ' ' || ch == '.')
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool is_reserved_name(struct ntfs_sb_info *sbi,
+ const struct le_str *fname)
+{
+ int port_digit;
+ const __le16 *name = fname->name;
+ int len = fname->len;
+ u16 *upcase = sbi->upcase;
+
+ /* Check for 3-char reserved names (device names). */
+ /* The name by itself or with any extension is forbidden. */
+ if (len == 3 || (len > 3 && le16_to_cpu(name[3]) == '.'))
+ if (!ntfs_cmp_names(name, 3, CON_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, NUL_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, AUX_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, PRN_NAME, 3, upcase, false))
+ return true;
+
+ /* Check for 4-char reserved names (port name followed by 1..9). */
+ /* The name by itself or with any extension is forbidden. */
+ if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) {
+ port_digit = le16_to_cpu(name[3]);
+ if (port_digit >= '1' && port_digit <= '9')
+ if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * valid_windows_name - Check if a file name is valid in Windows.
+ */
+bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname)
+{
+ return !name_has_forbidden_chars(fname) &&
+ !is_reserved_name(sbi, fname);
+}
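
For orientation, the two helpers above combine to classify names like this (illustrative samples; the ASCII-to-le_str conversion is not shown):

    /*
     * "report.txt" -> valid
     * "aux"        -> invalid (reserved device name; comparison is case-insensitive)
     * "AUX.log"    -> invalid (a reserved name stays reserved with any extension)
     * "COM1.txt"   -> invalid (port names COM1..COM9 and LPT1..LPT9)
     * "notes."     -> invalid (names cannot end with a dot or a space)
     * "a<b"        -> invalid ('<' is one of the chars Windows disallows)
     */
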
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 440328147e7e..51ab75954640 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -47,7 +47,7 @@ static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2,
if (l2 < fsize2)
return -1;
- both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/;
+ both_case = f2->type != FILE_NAME_DOS && !sbi->options->nocase;
if (!l1) {
const struct le_str *s2 = (struct le_str *)&f2->name_len;
@@ -323,7 +323,7 @@ static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
- __set_bit(bit - bbuf.bit, bbuf.buf);
+ __set_bit_le(bit - bbuf.bit, bbuf.buf);
bmp_buf_put(&bbuf, true);
@@ -343,7 +343,7 @@ static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
- __clear_bit(bit - bbuf.bit, bbuf.buf);
+ __clear_bit_le(bit - bbuf.bit, bbuf.buf);
bmp_buf_put(&bbuf, true);
@@ -457,7 +457,7 @@ next_run:
static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
- size_t pos = find_next_zero_bit(buf, bits, bit);
+ size_t pos = find_next_zero_bit_le(buf, bits, bit);
if (pos >= bits)
return false;
@@ -489,7 +489,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
if (!b->non_res) {
u32 nbits = 8 * le32_to_cpu(b->res.data_size);
- size_t pos = find_next_zero_bit(resident_data(b), nbits, 0);
+ size_t pos = find_next_zero_bit_le(resident_data(b), nbits, 0);
if (pos < nbits)
*bit = pos;
@@ -505,7 +505,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
- size_t pos = find_next_bit(buf, bits, bit);
+ size_t pos = find_next_bit_le(buf, bits, bit);
if (pos >= bits)
return false;
@@ -536,7 +536,7 @@ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit)
if (!b->non_res) {
u32 nbits = le32_to_cpu(b->res.data_size) * 8;
- size_t pos = find_next_bit(resident_data(b), nbits, from);
+ size_t pos = find_next_bit_le(resident_data(b), nbits, from);
if (pos < nbits)
*bit = pos;
@@ -605,11 +605,58 @@ static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr,
return e;
}
+/*
+ * index_hdr_check
+ *
+ * Return: True if INDEX_HDR is valid.
+ */
+static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
+{
+ u32 end = le32_to_cpu(hdr->used);
+ u32 tot = le32_to_cpu(hdr->total);
+ u32 off = le32_to_cpu(hdr->de_off);
+
+ if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
+ off + sizeof(struct NTFS_DE) > end) {
+ /* Incorrect index header. */
+ return false;
+ }
+
+ return true;
+}
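
A worked pass through the checks above, assuming bytes == 0x1000 (values are illustrative):

    /*
     * de_off = 0x28, used = 0x58, total = 0xff8 -> accepted:
     *   0x28 is 8-aligned, 0xff8 <= 0x1000, 0x58 <= 0xff8, and
     *   0x28 + sizeof(struct NTFS_DE) fits within 'used'.
     * de_off = 0x2a -> rejected (not 8-aligned).
     * used = 0x1200 -> rejected (used > total).
     */
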
+
+/*
+ * index_buf_check
+ *
+ * Return: True if INDEX_BUFFER seems valid.
+ */
+static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes,
+ const CLST *vbn)
+{
+ const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr;
+ u16 fo = le16_to_cpu(rhdr->fix_off);
+ u16 fn = le16_to_cpu(rhdr->fix_num);
+
+ if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) ||
+ rhdr->sign != NTFS_INDX_SIGNATURE ||
+ fo < sizeof(struct INDEX_BUFFER)
+ /* Check index buffer vbn. */
+ || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) ||
+ fo + fn * sizeof(short) >= bytes ||
+ fn != ((bytes >> SECTOR_SHIFT) + 1)) {
+ /* Incorrect index buffer. */
+ return false;
+ }
+
+ return index_hdr_check(&ib->ihdr,
+ bytes - offsetof(struct INDEX_BUFFER, ihdr));
+}
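
As a worked check of the fix-up constraint above: each 512-byte sector of an index buffer carries one saved __le16 of the update sequence array, plus the update sequence number itself, so a 4K buffer must have fn == (4096 >> SECTOR_SHIFT) + 1 == 9, and those nine shorts starting at fix_off must land inside the buffer.
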
+
void fnd_clear(struct ntfs_fnd *fnd)
{
int i;
- for (i = 0; i < fnd->level; i++) {
+ for (i = fnd->level - 1; i >= 0; i--) {
struct indx_node *n = fnd->nodes[i];
if (!n)
@@ -625,9 +672,8 @@ void fnd_clear(struct ntfs_fnd *fnd)
static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n,
struct NTFS_DE *e)
{
- int i;
+ int i = fnd->level;
- i = fnd->level;
if (i < 0 || i >= ARRAY_SIZE(fnd->nodes))
return -EINVAL;
fnd->nodes[i] = n;
@@ -820,9 +866,16 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
u32 t32;
const struct INDEX_ROOT *root = resident_data(attr);
+ t32 = le32_to_cpu(attr->res.data_size);
+ if (t32 <= offsetof(struct INDEX_ROOT, ihdr) ||
+ !index_hdr_check(&root->ihdr,
+ t32 - offsetof(struct INDEX_ROOT, ihdr))) {
+ goto out;
+ }
+
/* Check root fields. */
if (!root->index_block_clst)
- return -EINVAL;
+ goto out;
indx->type = type;
indx->idx2vbn_bits = __ffs(root->index_block_clst);
@@ -834,19 +887,19 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
if (t32 < sbi->cluster_size) {
/* Index record is smaller than a cluster, use 512 blocks. */
if (t32 != root->index_block_clst * SECTOR_SIZE)
- return -EINVAL;
+ goto out;
/* Check alignment to a cluster. */
if ((sbi->cluster_size >> SECTOR_SHIFT) &
(root->index_block_clst - 1)) {
- return -EINVAL;
+ goto out;
}
indx->vbn2vbo_bits = SECTOR_SHIFT;
} else {
/* Index record must be a multiple of cluster size. */
if (t32 != root->index_block_clst << sbi->cluster_bits)
- return -EINVAL;
+ goto out;
indx->vbn2vbo_bits = sbi->cluster_bits;
}
@@ -854,7 +907,14 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
init_rwsem(&indx->run_lock);
indx->cmp = get_cmp_func(root);
- return indx->cmp ? 0 : -EINVAL;
+ if (!indx->cmp)
+ goto out;
+
+ return 0;
+
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+ return -EINVAL;
}
static struct indx_node *indx_new(struct ntfs_index *indx,
@@ -1012,11 +1072,24 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn,
goto out;
ok:
+ if (!index_buf_check(ib, bytes, &vbn)) {
+ ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ goto out;
+ }
+
if (err == -E_NTFS_FIXUP) {
ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0);
err = 0;
}
+ /* Check the index header length. */
+ if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) > bytes) {
+ err = -EINVAL;
+ goto out;
+ }
+
in->index = ib;
*node = in;
@@ -1341,8 +1414,8 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
run_init(&run);
- err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0,
- NULL);
+ err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out;
@@ -1440,6 +1513,9 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out1;
}
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = data_size;
+
*vbn = bit << indx->idx2vbn_bits;
return 0;
@@ -1593,9 +1669,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err) {
/* Restore root. */
- if (mi_resize_attr(mi, attr, -ds_root))
+ if (mi_resize_attr(mi, attr, -ds_root)) {
memcpy(attr, a_root, asize);
- else {
+ } else {
/* Bug? */
ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
@@ -1947,7 +2023,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
if (bit >= nbits)
return 0;
- pos = find_next_bit(bm, nbits, bit);
+ pos = find_next_bit_le(bm, nbits, bit);
if (pos < nbits)
return 0;
} else {
@@ -1973,6 +2049,9 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = new_data;
+
bpb = bitmap_size(bit);
if (bpb * 8 == nbits)
return 0;
@@ -2115,9 +2194,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
fnd->de[level] = e;
indx_write(indx, ni, n, 0);
- /* Check to see if this action created an empty leaf. */
- if (ib_is_leaf(ib) && ib_is_empty(ib))
+ if (ib_is_leaf(ib) && ib_is_empty(ib)) {
+ /* An empty leaf. */
return 0;
+ }
out:
fnd_clear(fnd);
@@ -2455,6 +2535,9 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
&indx->alloc_run, 0, NULL, false, NULL);
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = 0;
+
err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
false, NULL);
run_close(&indx->alloc_run);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d5a3afbbbfd8..20b953871574 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -81,7 +81,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
le16_to_cpu(ref->seq), le16_to_cpu(rec->seq));
goto out;
} else if (!is_rec_inuse(rec)) {
- err = -EINVAL;
+ err = -ESTALE;
ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino);
goto out;
}
@@ -92,8 +92,10 @@ static struct inode *ntfs_read_mft(struct inode *inode,
goto out;
}
- if (!is_rec_base(rec))
- goto Ok;
+ if (!is_rec_base(rec)) {
+ err = -EINVAL;
+ goto out;
+ }
/* Record should contain $I30 root. */
is_dir = rec->flags & RECORD_FLAG_DIR;
@@ -129,6 +131,16 @@ next_attr:
rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
asize = le32_to_cpu(attr->size);
+ if (le16_to_cpu(attr->name_off) + attr->name_len > asize)
+ goto out;
+
+ if (attr->non_res) {
+ t64 = le64_to_cpu(attr->nres.alloc_size);
+ if (le64_to_cpu(attr->nres.data_size) > t64 ||
+ le64_to_cpu(attr->nres.valid_size) > t64)
+ goto out;
+ }
+
switch (attr->type) {
case ATTR_STD:
if (attr->non_res ||
@@ -364,7 +376,13 @@ next_attr:
attr_unpack_run:
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
t64 = le64_to_cpu(attr->nres.svcn);
+
err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn),
t64, Add2Ptr(attr, roff), asize - roff);
if (err < 0)
@@ -450,7 +468,6 @@ end_enum:
inode->i_flags |= S_NOSEC;
}
-Ok:
if (ino == MFT_REC_MFT && !sb->s_root)
sbi->mft.ni = NULL;
@@ -504,6 +521,9 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
_ntfs_bad_inode(inode);
}
+ if (IS_ERR(inode) && name)
+ ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR);
+
return inode;
}
@@ -535,17 +555,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
clear_buffer_new(bh);
clear_buffer_uptodate(bh);
- /* Direct write uses 'create=0'. */
- if (!create && vbo >= ni->i_valid) {
- /* Out of valid. */
- return 0;
- }
-
- if (vbo >= inode->i_size) {
- /* Out of size. */
- return 0;
- }
-
if (is_resident(ni)) {
ni_lock(ni);
err = attr_data_read_resident(ni, page);
@@ -561,7 +570,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
off = vbo & sbi->cluster_mask;
new = false;
- err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL,
+ create && sbi->cluster_size > PAGE_SIZE);
if (err)
goto out;
@@ -579,11 +589,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
WARN_ON(1);
}
- if (new) {
+ if (new)
set_buffer_new(bh);
- if ((len << cluster_bits) > block_size)
- ntfs_sparse_cluster(inode, page, vcn, len);
- }
lbo = ((u64)lcn << cluster_bits) + off;
@@ -611,7 +618,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
}
} else if (vbo >= valid) {
/* Read out of valid data. */
- /* Should never be here 'cause already checked. */
clear_buffer_mapped(bh);
} else if (vbo + bytes <= valid) {
/* Normal read. */
@@ -953,6 +959,11 @@ int ntfs_write_end(struct file *file, struct address_space *mapping,
dirty = true;
}
+ if (pos + err > inode->i_size) {
+ inode->i_size = pos + err;
+ dirty = true;
+ }
+
if (dirty)
mark_inode_dirty(inode);
}
@@ -1162,6 +1173,18 @@ out:
return ERR_PTR(err);
}
+/*
+ * ntfs_create_inode
+ *
+ * Helper function for:
+ * - ntfs_create
+ * - ntfs_mknod
+ * - ntfs_symlink
+ * - ntfs_mkdir
+ * - ntfs_atomic_open
+ *
+ * NOTE: If fnd != NULL (ntfs_atomic_open), then @dir is locked.
+ */
struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const struct cpu_str *uni, umode_t mode,
@@ -1191,7 +1214,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct REPARSE_DATA_BUFFER *rp = NULL;
bool rp_inserted = false;
- ni_lock_dir(dir_ni);
+ if (!fnd)
+ ni_lock_dir(dir_ni);
dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
if (!dir_root) {
@@ -1254,6 +1278,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
fa = FILE_ATTRIBUTE_ARCHIVE;
}
+ /* If option "hide_dot_files" then set hidden attribute for dot files. */
+ if (sbi->options->hide_dot_files && name->name[0] == '.')
+ fa |= FILE_ATTRIBUTE_HIDDEN;
+
if (!(mode & 0222))
fa |= FILE_ATTRIBUTE_READONLY;
@@ -1339,6 +1367,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
mi_get_ref(&ni->mi, &new_de->ref);
fname = (struct ATTR_FILE_NAME *)(new_de + 1);
+
+ if (sbi->options->windows_names &&
+ !valid_windows_name(sbi, (struct le_str *)&fname->name_len)) {
+ err = -EINVAL;
+ goto out4;
+ }
+
mi_get_ref(&dir_ni->mi, &fname->home);
fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time =
fname->dup.a_time = std5->cr_time;
@@ -1502,8 +1537,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
cpu_to_le64(ntfs_up_cluster(sbi, nsize));
err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
- clst, NULL, 0, &alen, 0,
- NULL);
+ clst, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out5;
@@ -1550,7 +1585,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
goto out6;
/* Unlock parent directory before ntfs_init_acl. */
- ni_unlock(dir_ni);
+ if (!fnd)
+ ni_unlock(dir_ni);
inode->i_generation = le16_to_cpu(rec->seq);
@@ -1610,7 +1646,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
out7:
/* Undo 'indx_insert_entry'. */
- ni_lock_dir(dir_ni);
+ if (!fnd)
+ ni_lock_dir(dir_ni);
indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
le16_to_cpu(new_de->key_size), sbi);
/* ni_unlock(dir_ni); will be called later. */
@@ -1619,10 +1656,8 @@ out6:
ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
out5:
- if (S_ISDIR(mode) || run_is_empty(&ni->file.run))
- goto out4;
-
- run_deallocate(sbi, &ni->file.run, false);
+ if (!S_ISDIR(mode))
+ run_deallocate(sbi, &ni->file.run, false);
out4:
clear_rec_inuse(rec);
@@ -1638,7 +1673,8 @@ out2:
out1:
if (err) {
- ni_unlock(dir_ni);
+ if (!fnd)
+ ni_unlock(dir_ni);
return ERR_PTR(err);
}
@@ -1746,7 +1782,103 @@ void ntfs_evict_inode(struct inode *inode)
ni_clear(ntfs_i(inode));
}
-static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
+/*
+ * ntfs_translate_junction
+ *
+ * Translate a Windows junction target to the Linux equivalent.
+ * On junctions, targets are always absolute (they include the drive
+ * letter). We have no way of knowing if the target is for the current
+ * mounted device or not so we just assume it is.
+ */
+static int ntfs_translate_junction(const struct super_block *sb,
+ const struct dentry *link_de, char *target,
+ int target_len, int target_max)
+{
+ int tl_len, err = target_len;
+ char *link_path_buffer = NULL, *link_path;
+ char *translated = NULL;
+ char *target_start;
+ int copy_len;
+
+ link_path_buffer = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!link_path_buffer) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* Get the link path relative to the mount point. */
+ link_path = dentry_path_raw(link_de, link_path_buffer, PATH_MAX);
+ if (IS_ERR(link_path)) {
+ ntfs_err(sb, "Error getting link path");
+ err = -EINVAL;
+ goto out;
+ }
+
+ translated = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!translated) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Make translated path a relative path to mount point */
+ strcpy(translated, "./");
+ ++link_path; /* Skip leading / */
+ for (tl_len = sizeof("./") - 1; *link_path; ++link_path) {
+ if (*link_path == '/') {
+ if (PATH_MAX - tl_len < sizeof("../")) {
+ ntfs_err(sb,
+ "Link path %s has too many components",
+ link_path);
+ err = -EINVAL;
+ goto out;
+ }
+ strcpy(translated + tl_len, "../");
+ tl_len += sizeof("../") - 1;
+ }
+ }
+
+ /* Skip drive letter */
+ target_start = target;
+ while (*target_start && *target_start != ':')
+ ++target_start;
+
+ if (!*target_start) {
+ ntfs_err(sb, "Link target (%s) missing drive separator",
+ target);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Skip the drive separator and the leading /, if present. */
+ target_start += 1 + (target_start[1] == '/');
+ copy_len = target_len - (target_start - target);
+
+ if (PATH_MAX - tl_len <= copy_len) {
+ ntfs_err(sb, "Link target %s too large for buffer (%d <= %d)",
+ target_start, PATH_MAX - tl_len, copy_len);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* The translated path has a trailing / and target_start does not. */
+ strcpy(translated + tl_len, target_start);
+ tl_len += copy_len;
+ if (target_max <= tl_len) {
+ ntfs_err(sb, "Target path %s too large for buffer (%d <= %d)",
+ translated, target_max, tl_len);
+ err = -EINVAL;
+ goto out;
+ }
+ strcpy(target, translated);
+ err = tl_len;
+
+out:
+ kfree(link_path_buffer);
+ kfree(translated);
+ return err;
+}
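
A worked example of the translation (hypothetical paths, with '\' separators assumed already converted to '/'): for a junction at /dir1/dir2/link targeting C:/foo/bar, the loop emits one "../" per '/' remaining in "dir1/dir2/link", producing "./../../"; the drive letter and separator are then skipped, so the target written back is "./../../foo/bar", i.e. foo/bar resolved from the mount point.
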
+
+static noinline int ntfs_readlink_hlp(const struct dentry *link_de,
+ struct inode *inode, char *buffer,
int buflen)
{
int i, err = -EINVAL;
@@ -1889,6 +2021,11 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
/* Always set last zero. */
buffer[err] = 0;
+
+ /* If this is a junction, translate the link target. */
+ if (rp->ReparseTag == IO_REPARSE_TAG_MOUNT_POINT)
+ err = ntfs_translate_junction(sb, link_de, buffer, err, buflen);
+
out:
kfree(to_free);
return err;
@@ -1907,7 +2044,7 @@ static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
if (!ret)
return ERR_PTR(-ENOMEM);
- err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE);
+ err = ntfs_readlink_hlp(de, inode, ret, PAGE_SIZE);
if (err < 0) {
kfree(ret);
return ERR_PTR(err);
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index bc22cc321a74..c8db35e2ae17 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -7,6 +7,8 @@
#include <linux/fs.h>
#include <linux/nls.h>
+#include <linux/ctype.h>
+#include <linux/posix_acl.h>
#include "debug.h"
#include "ntfs.h"
@@ -303,6 +305,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
ni_lock_dir(dir_ni);
ni_lock(ni);
+ if (dir_ni != new_dir_ni)
+ ni_lock_dir2(new_dir_ni);
is_bad = false;
err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
@@ -326,6 +330,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
ntfs_sync_inode(inode);
}
+ if (dir_ni != new_dir_ni)
+ ni_unlock(new_dir_ni);
ni_unlock(ni);
ni_unlock(dir_ni);
out:
@@ -333,6 +339,104 @@ out:
return err;
}
+/*
+ * ntfs_atomic_open
+ *
+ * inode_operations::atomic_open
+ */
+static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, u32 flags, umode_t mode)
+{
+ int err;
+ struct inode *inode;
+ struct ntfs_fnd *fnd = NULL;
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct dentry *d = NULL;
+ struct cpu_str *uni = __getname();
+ bool locked = false;
+
+ if (!uni)
+ return -ENOMEM;
+
+ err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
+ dentry->d_name.len, uni, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (err < 0)
+ goto out;
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ if (IS_POSIXACL(dir)) {
+ /*
+ * Load in cache current acl to avoid ni_lock(dir):
+ * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
+ * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
+ */
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ goto out;
+ }
+ posix_acl_release(p);
+ }
+#endif
+
+ if (d_in_lookup(dentry)) {
+ ni_lock_dir(ni);
+ locked = true;
+ fnd = fnd_get();
+ if (!fnd) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ d = d_splice_alias(dir_search_u(dir, uni, fnd), dentry);
+ if (IS_ERR(d)) {
+ err = PTR_ERR(d);
+ d = NULL;
+ goto out2;
+ }
+
+ if (d)
+ dentry = d;
+ }
+
+ if (!(flags & O_CREAT) || d_really_is_positive(dentry)) {
+ err = finish_no_open(file, d);
+ goto out2;
+ }
+
+ file->f_mode |= FMODE_CREATED;
+
+ /*
+ * fnd contains tree's path to insert to.
+ * If fnd is not NULL then dir is locked.
+ */
+
+ /*
+ * Unfortunately there is no way to obtain the correct 'struct nameidata *nd'
+ * or 'struct user_namespace *mnt_userns' here.
+ * See atomic_open in fs/namei.c.
+ * This is why xfstest/633 fails.
+ * It looks like ntfs_atomic_open must accept 'struct user_namespace *mnt_userns' as an argument.
+ */
+
+ inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0,
+ NULL, 0, fnd);
+ err = IS_ERR(inode) ? PTR_ERR(inode)
+ : finish_open(file, dentry, ntfs_file_open);
+ dput(d);
+
+out2:
+ fnd_put(fnd);
+out1:
+ if (locked)
+ ni_unlock(ni);
+out:
+ __putname(uni);
+ return err;
+}
+
struct dentry *ntfs3_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
@@ -355,6 +459,133 @@ struct dentry *ntfs3_get_parent(struct dentry *child)
return ERR_PTR(-ENOENT);
}
+/*
+ * dentry_operations::d_hash
+ */
+static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name)
+{
+ struct ntfs_sb_info *sbi;
+ const char *n = name->name;
+ unsigned int len = name->len;
+ unsigned long hash;
+ struct cpu_str *uni;
+ unsigned int c;
+ int err;
+
+ /* First try fast implementation. */
+ hash = init_name_hash(dentry);
+
+ for (;;) {
+ if (!len--) {
+ name->hash = end_name_hash(hash);
+ return 0;
+ }
+
+ c = *n++;
+ if (c >= 0x80)
+ break;
+
+ hash = partial_name_hash(toupper(c), hash);
+ }
+
+ /*
+ * Try the slow way with the current upcase table.
+ */
+ uni = __getname();
+ if (!uni)
+ return -ENOMEM;
+
+ sbi = dentry->d_sb->s_fs_info;
+
+ err = ntfs_nls_to_utf16(sbi, name->name, name->len, uni, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (err < 0)
+ goto out;
+
+ if (!err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ hash = ntfs_names_hash(uni->name, uni->len, sbi->upcase,
+ init_name_hash(dentry));
+ name->hash = end_name_hash(hash);
+ err = 0;
+
+out:
+ __putname(uni);
+ return err;
+}
+
+/*
+ * dentry_operations::d_compare
+ */
+static int ntfs_d_compare(const struct dentry *dentry, unsigned int len1,
+ const char *str, const struct qstr *name)
+{
+ struct ntfs_sb_info *sbi;
+ int ret;
+ const char *n1 = str;
+ const char *n2 = name->name;
+ unsigned int len2 = name->len;
+ unsigned int lm = min(len1, len2);
+ unsigned char c1, c2;
+ struct cpu_str *uni1;
+ struct le_str *uni2;
+
+ /* First try fast implementation. */
+ for (;;) {
+ if (!lm--)
+ return len1 != len2;
+
+ if ((c1 = *n1++) == (c2 = *n2++))
+ continue;
+
+ if (c1 >= 0x80 || c2 >= 0x80)
+ break;
+
+ if (toupper(c1) != toupper(c2))
+ return 1;
+ }
+
+ /*
+ * Try the slow way with the current upcase table.
+ */
+ sbi = dentry->d_sb->s_fs_info;
+ uni1 = __getname();
+ if (!uni1)
+ return -ENOMEM;
+
+ ret = ntfs_nls_to_utf16(sbi, str, len1, uni1, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Use the second half of the 4K __getname() buffer for the second name. */
+ uni2 = Add2Ptr(uni1, 2048);
+
+ ret = ntfs_nls_to_utf16(sbi, name->name, name->len,
+ (struct cpu_str *)uni2, NTFS_NAME_LEN,
+ UTF16_LITTLE_ENDIAN);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = !ntfs_cmp_names_cpu(uni1, uni2, sbi->upcase, false) ? 0 : 1;
+
+out:
+ __putname(uni1);
+ return ret;
+}
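
Illustrative behaviour of the two comparison paths above (samples, not from the patch):

    /*
     * "Readme.TXT" vs "readme.txt" -> match via the ASCII toupper() fast path.
     * Names containing bytes >= 0x80 -> both sides are converted to UTF-16
     *     and compared case-insensitively with the volume's upcase table.
     */
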
+
// clang-format off
const struct inode_operations ntfs_dir_inode_operations = {
.lookup = ntfs_lookup,
@@ -367,11 +598,12 @@ const struct inode_operations ntfs_dir_inode_operations = {
.mknod = ntfs_mknod,
.rename = ntfs_rename,
.permission = ntfs_permission,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
+ .atomic_open = ntfs_atomic_open,
.fiemap = ntfs_fiemap,
};
@@ -379,7 +611,13 @@ const struct inode_operations ntfs_special_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
- .get_acl = ntfs_get_acl,
+ .get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
+
+const struct dentry_operations ntfs_dentry_ops = {
+ .d_hash = ntfs_d_hash,
+ .d_compare = ntfs_d_compare,
+};
+
// clang-format on
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 9cc396b117bf..86ea1826d099 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -84,7 +84,6 @@ typedef u32 CLST;
#define COMPRESSION_UNIT 4
#define COMPRESS_MAX_CLUSTER 0x1000
-#define MFT_INCREASE_CHUNK 1024
enum RECORD_NUM {
MFT_REC_MFT = 0,
@@ -715,12 +714,13 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
{
u32 de_off = le32_to_cpu(hdr->de_off);
u32 used = le32_to_cpu(hdr->used);
- struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+ struct NTFS_DE *e;
u16 esize;
- if (de_off >= used || de_off >= le32_to_cpu(hdr->total))
+ if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used)
return NULL;
+ e = Add2Ptr(hdr, de_off);
esize = le16_to_cpu(e->size);
if (esize < sizeof(struct NTFS_DE) || de_off + esize > used)
return NULL;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 2c791222c4e2..0e051c5595a2 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -97,9 +97,12 @@ struct ntfs_mount_options {
unsigned sparse : 1; /* Create sparse files. */
unsigned showmeta : 1; /* Show meta files. */
unsigned nohidden : 1; /* Do not show hidden files. */
+ unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
+ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
unsigned force : 1; /* RW mount dirty volume. */
unsigned noacsrules : 1; /* Exclude acs rules. */
unsigned prealloc : 1; /* Preallocate space when file is growing. */
+ unsigned nocase : 1; /* Case insensitive. */
};
/* Special value to unpack and deallocate. */
@@ -124,6 +127,7 @@ struct ntfs_buffers {
enum ALLOCATE_OPT {
ALLOCATE_DEF = 0, // Allocate all clusters.
ALLOCATE_MFT = 1, // Allocate for MFT.
+ ALLOCATE_ZERO = 2, // Zero out newly allocated clusters.
};
enum bitmap_mutex_classes {
@@ -195,6 +199,8 @@ struct ntfs_index {
/* Minimum MFT zone. */
#define NTFS_MIN_MFT_ZONE 100
+/* Step to increase the MFT. */
+#define NTFS_MFT_INCREASE_STEP 1024
/* Ntfs file system in-core superblock data. */
struct ntfs_sb_info {
@@ -330,6 +336,7 @@ enum ntfs_inode_mutex_lock_class {
NTFS_INODE_MUTEX_REPARSE,
NTFS_INODE_MUTEX_NORMAL,
NTFS_INODE_MUTEX_PARENT,
+ NTFS_INODE_MUTEX_PARENT2,
};
/*
@@ -412,7 +419,7 @@ enum REPARSE_SIGN {
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn);
+ CLST *new_lcn, CLST *new_len);
int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
u64 new_size, struct runs_tree *run,
@@ -422,7 +429,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
u64 new_size, const u64 *new_valid, bool keep_prealloc,
struct ATTRIB **ret);
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new);
+ CLST *len, bool *new, bool zero);
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page);
int attr_data_write_resident(struct ntfs_inode *ni, struct page *page);
int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
@@ -469,9 +476,9 @@ static inline size_t al_aligned(size_t size)
}
/* Globals from bitfunc.c */
-bool are_bits_clear(const ulong *map, size_t bit, size_t nbits);
-bool are_bits_set(const ulong *map, size_t bit, size_t nbits);
-size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits);
+bool are_bits_clear(const void *map, size_t bit, size_t nbits);
+bool are_bits_set(const void *map, size_t bit, size_t nbits);
+size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits);
/* Globals from dir.c */
int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
@@ -487,8 +494,6 @@ extern const struct file_operations ntfs_dir_operations;
/* Globals from file.c */
int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len);
int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int ntfs_file_open(struct inode *inode, struct file *file);
@@ -582,11 +587,10 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
bool simple);
int ntfs_extend_init(struct ntfs_sb_info *sbi);
int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi);
-const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE Type);
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
CLST *new_lcn, CLST *new_len,
enum ALLOCATE_OPT opt);
+bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen);
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
struct ntfs_inode *ni, struct mft_inode **mi);
void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
@@ -643,6 +647,7 @@ int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
const struct MFT_REF *ref);
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim);
int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim);
+bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name);
/* Globals from index.c */
int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit);
@@ -720,6 +725,7 @@ struct dentry *ntfs3_get_parent(struct dentry *child);
extern const struct inode_operations ntfs_dir_inode_operations;
extern const struct inode_operations ntfs_special_inode_operations;
+extern const struct dentry_operations ntfs_dentry_ops;
/* Globals from record.c */
int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi);
@@ -793,12 +799,12 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
u32 run_buf_size, CLST *packed_vcns);
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size);
+ int run_buf_size);
#ifdef NTFS3_CHECK_FREE_CLST
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size);
+ int run_buf_size);
#else
#define run_unpack_ex run_unpack
#endif
@@ -822,6 +828,8 @@ static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd)
int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits);
int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits,
+ size_t *done);
bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
@@ -834,16 +842,22 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits);
void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len);
int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range);
+void ntfs_bitmap_set_le(void *map, unsigned int start, int len);
+void ntfs_bitmap_clear_le(void *map, unsigned int start, int len);
+unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits);
+
/* Globals from upcase.c */
int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
const u16 *upcase, bool bothcase);
int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
const u16 *upcase, bool bothcase);
+unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
+ unsigned long hash);
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct inode *dir);
@@ -852,7 +866,7 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
#define ntfs_set_acl NULL
#endif
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode);
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry);
int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
int mask);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -1113,6 +1127,11 @@ static inline void ni_lock_dir(struct ntfs_inode *ni)
mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT);
}
+static inline void ni_lock_dir2(struct ntfs_inode *ni)
+{
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT2);
+}
+
static inline void ni_unlock(struct ntfs_inode *ni)
{
mutex_unlock(&ni->ni_lock);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 7d2fac5ee215..defce6a5c8e1 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -220,6 +220,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return NULL;
}
+ if (off + asize < off) {
+ /* Overflow check. */
+ return NULL;
+ }
+
attr = Add2Ptr(attr, asize);
off += asize;
}
@@ -260,6 +265,10 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (t16 + t32 > asize)
return NULL;
+ t32 = sizeof(short) * attr->name_len;
+ if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
+ return NULL;
+
return attr;
}
@@ -537,6 +546,10 @@ bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes)
return true;
}
+/*
+ * Pack runs into the MFT record.
+ * If it fails, the record is not changed.
+ */
int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
struct runs_tree *run, CLST len)
{
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index aaaa0d3d35a2..a5af71cd8d14 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -919,12 +919,15 @@ out:
*/
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size)
+ int run_buf_size)
{
u64 prev_lcn, vcn64, lcn, next_vcn;
const u8 *run_last, *run_0;
bool is_mft = ino == MFT_REC_MFT;
+ if (run_buf_size < 0)
+ return -EINVAL;
+
/* Check for empty. */
if (evcn + 1 == svcn)
return 0;
@@ -1046,7 +1049,7 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
*/
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size)
+ int run_buf_size)
{
int ret, err;
CLST next_vcn, lcn, len;
@@ -1093,25 +1096,8 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
if (down_write_trylock(&wnd->rw_lock)) {
/* Mark all zero bits as used in range [lcn, lcn+len). */
- CLST i, lcn_f = 0, len_f = 0;
-
- err = 0;
- for (i = 0; i < len; i++) {
- if (wnd_is_free(wnd, lcn + i, 1)) {
- if (!len_f)
- lcn_f = lcn + i;
- len_f += 1;
- } else if (len_f) {
- err = wnd_set_used(wnd, lcn_f, len_f);
- len_f = 0;
- if (err)
- break;
- }
- }
-
- if (len_f)
- err = wnd_set_used(wnd, lcn_f, len_f);
-
+ size_t done;
+ err = wnd_set_used_safe(wnd, lcn, len, &done);
up_write(&wnd->rw_lock);
if (err)
return err;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 47012c9bf505..ef4ea3f21905 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -21,6 +21,30 @@
* https://docs.microsoft.com/en-us/windows/wsl/file-permissions
* It stores uid/gid/mode/dev in xattr
*
+ * ntfs allows up to 2^64 clusters per volume.
+ * It means that a 64-bit lcn is needed to address any ntfs volume.
+ * The ntfs.sys implementation uses only a 32-bit lcn.
+ * By default, ntfs3 uses a 32-bit lcn too.
+ * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses 64 bits per lcn.
+ *
+ *
+ * ntfs limits, cluster size is 4K (2^12)
+ * -----------------------------------------------------------------------------
+ * | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk |
+ * -----------------------------------------------------------------------------
+ * | < 16T, 2^44 | < 2^32 | yes | yes | yes | yes | yes |
+ * | > 16T, 2^44 | > 2^32 | no | no | yes | yes | yes |
+ * -----------------------------------------------------------------------------
+ *
+ * To mount large volumes as ntfs, one should use a large cluster size (up to 2M).
+ * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P
+ *
+ * ntfs limits, cluster size is 2M (2^21)
+ * -----------------------------------------------------------------------------
+ * | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk |
+ * -----------------------------------------------------------------------------
+ * | < 8P, 2^54 | < 2^32 | yes | yes | yes | yes | yes |
+ * | > 8P, 2^54 | > 2^32 | no | no | yes | yes | yes |
+ * -----------------------------------------------------------------------------
+ *
*/
#include <linux/blkdev.h>
@@ -223,11 +247,14 @@ enum Opt {
Opt_force,
Opt_sparse,
Opt_nohidden,
+ Opt_hide_dot_files,
+ Opt_windows_names,
Opt_showmeta,
Opt_acl,
Opt_iocharset,
Opt_prealloc,
Opt_noacsrules,
+ Opt_nocase,
Opt_err,
};
@@ -242,10 +269,13 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = {
fsparam_flag_no("force", Opt_force),
fsparam_flag_no("sparse", Opt_sparse),
fsparam_flag_no("hidden", Opt_nohidden),
+ fsparam_flag_no("hide_dot_files", Opt_hide_dot_files),
+ fsparam_flag_no("windows_names", Opt_windows_names),
fsparam_flag_no("acl", Opt_acl),
fsparam_flag_no("showmeta", Opt_showmeta),
fsparam_flag_no("prealloc", Opt_prealloc),
fsparam_flag_no("acsrules", Opt_noacsrules),
+ fsparam_flag_no("nocase", Opt_nocase),
fsparam_string("iocharset", Opt_iocharset),
{}
};
@@ -330,6 +360,12 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_nohidden:
opts->nohidden = result.negated ? 1 : 0;
break;
+ case Opt_hide_dot_files:
+ opts->hide_dot_files = result.negated ? 0 : 1;
+ break;
+ case Opt_windows_names:
+ opts->windows_names = result.negated ? 0 : 1;
+ break;
case Opt_acl:
if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
@@ -354,6 +390,9 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_noacsrules:
opts->noacsrules = result.negated ? 1 : 0;
break;
+ case Opt_nocase:
+ opts->nocase = result.negated ? 1 : 0;
+ break;
default:
/* Should not be here unless we forget to add a case. */
return -EINVAL;
@@ -406,27 +445,18 @@ static struct inode *ntfs_alloc_inode(struct super_block *sb)
return NULL;
memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode));
-
mutex_init(&ni->ni_lock);
-
return &ni->vfs_inode;
}
-static void ntfs_i_callback(struct rcu_head *head)
+static void ntfs_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
struct ntfs_inode *ni = ntfs_i(inode);
mutex_destroy(&ni->ni_lock);
-
kmem_cache_free(ntfs_inode_cachep, ni);
}
-static void ntfs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, ntfs_i_callback);
-}
-
static void init_once(void *foo)
{
struct ntfs_inode *ni = foo;
@@ -519,9 +549,9 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",gid=%u",
from_kgid_munged(user_ns, opts->fs_gid));
if (opts->fmask)
- seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
+ seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
if (opts->dmask)
- seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
+ seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff);
if (opts->nls)
seq_printf(m, ",iocharset=%s", opts->nls->charset);
else
@@ -536,6 +566,10 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",showmeta");
if (opts->nohidden)
seq_puts(m, ",nohidden");
+ if (opts->windows_names)
+ seq_puts(m, ",windows_names");
+ if (opts->hide_dot_files)
+ seq_puts(m, ",hide_dot_files");
if (opts->force)
seq_puts(m, ",force");
if (opts->noacsrules)
@@ -592,7 +626,7 @@ static int ntfs_sync_fs(struct super_block *sb, int wait)
static const struct super_operations ntfs_sops = {
.alloc_inode = ntfs_alloc_inode,
- .destroy_inode = ntfs_destroy_inode,
+ .free_inode = ntfs_free_inode,
.evict_inode = ntfs_evict_inode,
.put_super = ntfs_put_super,
.statfs = ntfs_statfs,
@@ -672,7 +706,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
if (boot->sectors_per_clusters <= 0x80)
return boot->sectors_per_clusters;
if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
- return 1U << (0 - boot->sectors_per_clusters);
+ return 1U << -(s8)boot->sectors_per_clusters;
return -EINVAL;
}
@@ -789,7 +823,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
: (u32)boot->record_size
<< sbi->cluster_bits;
- if (record_size > MAXIMUM_BYTES_PER_MFT)
+ if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
goto out;
sbi->record_bits = blksize_bits(record_size);
@@ -896,7 +930,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
struct block_device *bdev = sb->s_bdev;
struct inode *inode;
struct ntfs_inode *ni;
- size_t i, tt;
+ size_t i, tt, bad_len, bad_frags;
CLST vcn, lcn, len;
struct ATTRIB *attr;
const struct VOLUME_INFO *info;
@@ -916,6 +950,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ntfs_export_ops;
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
+ sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL;
sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
if (IS_ERR(sbi->options->nls)) {
@@ -1065,30 +1100,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sbi->mft.ni = ni;
- /* Load $BadClus. */
- ref.low = cpu_to_le32(MFT_REC_BADCLUST);
- ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
- inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
- if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $BadClus.");
- err = PTR_ERR(inode);
- goto out;
- }
-
- ni = ntfs_i(inode);
-
- for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
- if (lcn == SPARSE_LCN)
- continue;
-
- if (!sbi->bad_clusters)
- ntfs_notice(sb, "Volume contains bad blocks");
-
- sbi->bad_clusters += len;
- }
-
- iput(inode);
-
/* Load $Bitmap. */
ref.low = cpu_to_le32(MFT_REC_BITMAP);
ref.seq = cpu_to_le16(MFT_REC_BITMAP);
@@ -1126,6 +1137,44 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
goto out;
+ /* Load $BadClus. */
+ ref.low = cpu_to_le32(MFT_REC_BADCLUST);
+ ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
+ inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $BadClus (%d).", err);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+ bad_len = bad_frags = 0;
+ for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
+ if (lcn == SPARSE_LCN)
+ continue;
+
+ bad_len += len;
+ bad_frags += 1;
+ if (sb_rdonly(sb))
+ continue;
+
+ if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) {
+ /* Bad blocks marked as free in bitmap. */
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ }
+ }
+ if (bad_len) {
+ /*
+ * Notice about bad blocks.
+ * In normal cases these blocks are marked as used in the bitmap,
+ * so we never allocate space from them.
+ */
+ ntfs_notice(sb,
+ "Volume contains %zu bad blocks in %zu fragments.",
+ bad_len, bad_frags);
+ }
+ iput(inode);
+
/* Load $AttrDef. */
ref.low = cpu_to_le32(MFT_REC_ATTR);
ref.seq = cpu_to_le16(MFT_REC_ATTR);
@@ -1141,7 +1190,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto put_inode_out;
}
bytes = inode->i_size;
- sbi->def_table = t = kmalloc(bytes, GFP_NOFS);
+ sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
if (!t) {
err = -ENOMEM;
goto put_inode_out;
@@ -1260,9 +1309,9 @@ load_root:
ref.low = cpu_to_le32(MFT_REC_ROOT);
ref.seq = cpu_to_le16(MFT_REC_ROOT);
inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
- if (IS_ERR(inode)) {
+ if (IS_ERR(inode) || !inode->i_op) {
ntfs_err(sb, "Failed to load root.");
- err = PTR_ERR(inode);
+ err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL;
goto out;
}
@@ -1281,6 +1330,7 @@ out:
* Free resources here.
* ntfs_fs_free will be called with fc->s_fs_info = NULL
*/
+ put_mount_options(sbi->options);
put_ntfs(sbi);
sb->s_fs_info = NULL;
@@ -1488,11 +1538,8 @@ out1:
static void __exit exit_ntfs_fs(void)
{
- if (ntfs_inode_cachep) {
- rcu_barrier();
- kmem_cache_destroy(ntfs_inode_cachep);
- }
-
+ rcu_barrier();
+ kmem_cache_destroy(ntfs_inode_cachep);
unregister_filesystem(&ntfs_fs_type);
ntfs3_exit_bitmap();
}
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
index b5e8256fd710..7681eefacb4b 100644
--- a/fs/ntfs3/upcase.c
+++ b/fs/ntfs3/upcase.c
@@ -102,3 +102,15 @@ case_insentive:
diff2 = l1 - l2;
return diff2 ? diff2 : diff1;
}
+
+/* Helper function for ntfs_d_hash. */
+unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
+ unsigned long hash)
+{
+ while (len--) {
+ unsigned int c = upcase_unicode_char(upcase, *name++);
+ hash = partial_name_hash(c, hash);
+ }
+
+ return hash;
+}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 7de8718c68a9..616df209feea 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -15,9 +15,10 @@
#include "ntfs_fs.h"
// clang-format off
-#define SYSTEM_DOS_ATTRIB "system.dos_attrib"
-#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib"
-#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
+#define SYSTEM_DOS_ATTRIB "system.dos_attrib"
+#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib"
+#define SYSTEM_NTFS_ATTRIB_BE "system.ntfs_attrib_be"
+#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
// clang-format on
static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
@@ -42,28 +43,26 @@ static inline size_t packed_ea_size(const struct EA_FULL *ea)
* Assume there is at least one xattr in the list.
*/
static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes,
- const char *name, u8 name_len, u32 *off)
+ const char *name, u8 name_len, u32 *off, u32 *ea_sz)
{
- *off = 0;
+ u32 ea_size;
- if (!ea_all || !bytes)
+ *off = 0;
+ if (!ea_all)
return false;
- for (;;) {
+ for (; *off < bytes; *off += ea_size) {
const struct EA_FULL *ea = Add2Ptr(ea_all, *off);
- u32 next_off = *off + unpacked_ea_size(ea);
-
- if (next_off > bytes)
- return false;
-
+ ea_size = unpacked_ea_size(ea);
if (ea->name_len == name_len &&
- !memcmp(ea->name, name, name_len))
+ !memcmp(ea->name, name, name_len)) {
+ if (ea_sz)
+ *ea_sz = ea_size;
return true;
-
- *off = next_off;
- if (next_off >= bytes)
- return false;
+ }
}
+
+ return false;
}
/*
@@ -74,12 +73,12 @@ static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes,
static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
size_t add_bytes, const struct EA_INFO **info)
{
- int err;
+ int err = -EINVAL;
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *attr_info, *attr_ea;
void *ea_p;
- u32 size;
+ u32 size, off, ea_size;
static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA));
@@ -96,24 +95,31 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
*info = resident_data_ex(attr_info, sizeof(struct EA_INFO));
if (!*info)
- return -EINVAL;
+ goto out;
/* Check Ea limit. */
size = le32_to_cpu((*info)->size);
- if (size > sbi->ea_max_size)
- return -EFBIG;
+ if (size > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
- if (attr_size(attr_ea) > sbi->ea_max_size)
- return -EFBIG;
+ if (attr_size(attr_ea) > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ if (!size) {
+ /* EA info persists, but the xattr is empty. Looks like an EA problem. */
+ goto out;
+ }
/* Allocate memory for packed Ea. */
- ea_p = kmalloc(size + add_bytes, GFP_NOFS);
+ ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS);
if (!ea_p)
return -ENOMEM;
- if (!size) {
- /* EA info persists, but xattr is empty. Looks like EA problem. */
- } else if (attr_ea->non_res) {
+ if (attr_ea->non_res) {
struct runs_tree run;
run_init(&run);
@@ -124,24 +130,52 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
run_close(&run);
if (err)
- goto out;
+ goto out1;
} else {
void *p = resident_data_ex(attr_ea, size);
- if (!p) {
- err = -EINVAL;
- goto out;
- }
+ if (!p)
+ goto out1;
memcpy(ea_p, p, size);
}
memset(Add2Ptr(ea_p, size), 0, add_bytes);
+
+ /* Check all attributes for consistency. */
+ for (off = 0; off < size; off += ea_size) {
+ const struct EA_FULL *ef = Add2Ptr(ea_p, off);
+ u32 bytes = size - off;
+
+ /* Check if we can use the field ef->size. */
+ if (bytes < sizeof(ef->size))
+ goto out1;
+
+ if (ef->size) {
+ ea_size = le32_to_cpu(ef->size);
+ if (ea_size > bytes)
+ goto out1;
+ continue;
+ }
+
+ /* Check if we can use fields ef->name_len and ef->elength. */
+ if (bytes < offsetof(struct EA_FULL, name))
+ goto out1;
+
+ ea_size = ALIGN(struct_size(ef, name,
+ 1 + ef->name_len +
+ le16_to_cpu(ef->elength)),
+ 4);
+ if (ea_size > bytes)
+ goto out1;
+ }
+
*ea = ea_p;
return 0;
-out:
+out1:
kfree(ea_p);
- *ea = NULL;
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
return err;
}
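
As a worked example of the size computation in the consistency loop above (assuming the usual EA_FULL layout with an 8-byte fixed header before 'name'): an EA named "user" (name_len == 4) carrying a 10-byte value (elength == 10) unpacks to ALIGN(8 + 1 + 4 + 10, 4) == ALIGN(23, 4) == 24 bytes, so the following record starts on the required 4-byte boundary.
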
@@ -163,6 +197,7 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
const struct EA_FULL *ea;
u32 off, size;
int err;
+ int ea_size;
size_t ret;
err = ntfs_read_ea(ni, &ea_all, 0, &info);
@@ -175,8 +210,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
size = le32_to_cpu(info->size);
/* Enumerate all xattrs. */
- for (ret = 0, off = 0; off < size; off += unpacked_ea_size(ea)) {
+ for (ret = 0, off = 0; off < size; off += ea_size) {
ea = Add2Ptr(ea_all, off);
+ ea_size = unpacked_ea_size(ea);
if (buffer) {
if (ret + ea->name_len + 1 > bytes_per_buffer) {
@@ -227,7 +263,8 @@ static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len,
goto out;
/* Enumerate all xattrs. */
- if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off)) {
+ if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off,
+ NULL)) {
err = -ENODATA;
goto out;
}
@@ -269,7 +306,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
struct EA_FULL *new_ea;
struct EA_FULL *ea_all = NULL;
size_t add, new_pack;
- u32 off, size;
+ u32 off, size, ea_sz;
__le16 size_pack;
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
@@ -304,9 +341,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_pack = ea_info.size_pack;
}
- if (info && find_ea(ea_all, size, name, name_len, &off)) {
+ if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) {
struct EA_FULL *ea;
- size_t ea_sz;
if (flags & XATTR_CREATE) {
err = -EEXIST;
@@ -329,8 +365,6 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
if (ea->flags & FILE_NEED_EA)
le16_add_cpu(&ea_info.count, -1);
- ea_sz = unpacked_ea_size(ea);
-
le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea));
memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz);
@@ -604,10 +638,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
err = 0; /* Removing a non-existent xattr. */
if (!err) {
set_cached_acl(inode, type, acl);
- if (inode->i_mode != mode) {
- inode->i_mode = mode;
- mark_inode_dirty(inode);
- }
+ inode->i_mode = mode;
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
}
out:
@@ -619,10 +652,10 @@ out:
/*
* ntfs_set_acl - inode_operations::set_acl
*/
-int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
- return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
+ return ntfs_set_acl_ex(mnt_userns, d_inode(dentry), acl, type, false);
}
/*
@@ -664,8 +697,9 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
/*
* ntfs_acl_chmod - Helper for ntfs3_setattr().
*/
-int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
if (!(sb->s_flags & SB_POSIXACL))
@@ -674,7 +708,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- return posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ return posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
}
/*
@@ -720,11 +754,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
{
int err;
struct ntfs_inode *ni = ntfs_i(inode);
- size_t name_len = strlen(name);
/* Dispatch request. */
- if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
/* system.dos_attrib */
if (!buffer) {
err = sizeof(u8);
@@ -737,8 +769,8 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB) ||
+ !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) {
/* system.ntfs_attrib */
if (!buffer) {
err = sizeof(u32);
@@ -747,12 +779,13 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
} else {
err = sizeof(u32);
*(u32 *)buffer = le32_to_cpu(ni->std_fa);
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
+ *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer);
}
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
- !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ if (!strcmp(name, SYSTEM_NTFS_SECURITY)) {
/* system.ntfs_security */
struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL;
size_t sd_size = 0;
@@ -792,7 +825,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
}
/* Deal with NTFS extended attribute. */
- err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
+ err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL);
out:
return err;
@@ -809,23 +842,24 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler,
{
int err = -EINVAL;
struct ntfs_inode *ni = ntfs_i(inode);
- size_t name_len = strlen(name);
enum FILE_ATTRIBUTE new_fa;
/* Dispatch request. */
- if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
if (sizeof(u8) != size)
goto out;
new_fa = cpu_to_le32(*(u8 *)value);
goto set_new_fa;
}
- if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB) ||
+ !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) {
if (size != sizeof(u32))
goto out;
- new_fa = cpu_to_le32(*(u32 *)value);
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
+ new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value));
+ else
+ new_fa = cpu_to_le32(*(u32 *)value);
if (S_ISREG(inode->i_mode)) {
/* Process compressed/sparsed in special way. */
@@ -860,8 +894,7 @@ set_new_fa:
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
- !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ if (!strcmp(name, SYSTEM_NTFS_SECURITY)) {
/* system.ntfs_security*/
__le32 security_id;
bool inserted;
@@ -904,7 +937,7 @@ set_new_fa:
}
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0);
out:
inode->i_ctime = current_time(inode);
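The new system.ntfs_attrib_be name exposes the same 32-bit attribute word as system.ntfs_attrib, byte-swapped for the caller. A minimal sketch of the conversion both hunks perform; the helper and its is_be flag are illustrative stand-ins for the strcmp() dispatch above:

#include <linux/types.h>
#include <asm/byteorder.h>

/* Hypothetical helper; is_be mirrors the strcmp(name, ..._BE) check. */
static __le32 ntfs_attrib_from_user(const void *value, bool is_be)
{
	u32 v = *(const u32 *)value;

	if (is_be)	/* system.ntfs_attrib_be carries a big-endian word */
		v = be32_to_cpu((__force __be32)v);
	return cpu_to_le32(v);	/* std_fa is kept little-endian on disk */
}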
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 23a72a423955..9f19cf9a5a9f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -260,12 +260,13 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct buffer_head *bh = NULL;
int status, had_lock;
struct ocfs2_lock_holder oh;
+ struct inode *inode = d_inode(dentry);
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
if (had_lock < 0)
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 95a57c888ab6..a897c4e41b26 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -17,7 +17,7 @@ struct ocfs2_acl_entry {
};
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu);
-int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
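The same inode-to-dentry migration of inode_operations::set_acl recurs below for orangefs and overlayfs: the method now receives the dentry, and implementations fetch the inode via d_inode() where they still need it. A hedged sketch of the adapted shape (all names illustrative):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* stand-in for a filesystem's internal helper */
static int ex_set_acl_ex(struct user_namespace *mnt_userns,
			 struct inode *inode, struct posix_acl *acl, int type);

static int ex_set_acl(struct user_namespace *mnt_userns,
		      struct dentry *dentry, struct posix_acl *acl, int type)
{
	/* the inode is one d_inode() away when still needed */
	return ex_set_acl_ex(mnt_userns, d_inode(dentry), acl, type);
}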
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b13d344d40b6..60b97c92e2b2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -335,7 +335,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
/* negotiate timeout must be less than write timeout. */
schedule_delayed_work(&reg->hr_nego_timeout_work,
msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
- memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
+ bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES);
}
static void o2hb_disarm_timeout(struct o2hb_region *reg)
@@ -375,7 +375,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
if (reg->hr_last_hb_status)
return;
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
/* lowest node as master node to make negotiate decision. */
master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES);
@@ -386,8 +386,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
config_item_name(&reg->hr_item), reg->hr_bdev);
set_bit(master_node, reg->hr_nego_node_bitmap);
}
- if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
- sizeof(reg->hr_nego_node_bitmap))) {
+ if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap,
+ O2NM_MAX_NODES)) {
/* check negotiate bitmap every second to do timeout
* approve decision.
*/
@@ -856,8 +856,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* live nodes heartbeat on it. In other words, the region has been
* added to all nodes.
*/
- if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
- sizeof(o2hb_live_node_bitmap)))
+ if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+ O2NM_MAX_NODES))
goto unlock;
printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n",
@@ -1087,7 +1087,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* If a node is not configured but is in the livemap, we still need
* to read the slot so as to be able to remove it from the livemap.
*/
- o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES);
i = -1;
while ((i = find_next_bit(live_node_bitmap,
O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
@@ -1437,11 +1437,11 @@ void o2hb_init(void)
for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
INIT_LIST_HEAD(&o2hb_live_slots[i]);
- memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
- memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
- memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
- memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
- memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
+ bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES);
+ bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS);
+ bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS);
o2hb_dependent_users = 0;
@@ -1450,23 +1450,21 @@ void o2hb_init(void)
/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
- unsigned bytes)
+ unsigned int bits)
{
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memcpy(map, &o2hb_live_node_bitmap, bytes);
+ bitmap_copy(map, o2hb_live_node_bitmap, bits);
}
/*
* get a map of all nodes that are heartbeating in any regions
*/
-void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
+void o2hb_fill_node_map(unsigned long *map, unsigned int bits)
{
/* callers want to serialize this map and callbacks so that they
* can trust that they don't miss nodes coming to the party */
down_read(&o2hb_callback_sem);
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(map, bytes);
+ o2hb_fill_node_map_from_callback(map, bits);
spin_unlock(&o2hb_live_lock);
up_read(&o2hb_callback_sem);
}
@@ -2460,7 +2458,7 @@ int o2hb_check_node_heartbeating_no_sem(u8 node_num)
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
spin_lock(&o2hb_live_lock);
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
@@ -2477,7 +2475,7 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+ o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 1d4100abf6f8..8ef8c1b9eeb7 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -59,7 +59,7 @@ int o2hb_register_callback(const char *region_uuid,
void o2hb_unregister_callback(const char *region_uuid,
struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
- unsigned bytes);
+ unsigned int bits);
void o2hb_exit(void);
void o2hb_init(void);
int o2hb_check_node_heartbeating_no_sem(u8 node_num);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7524994e3199..35c05c18de59 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -438,7 +438,7 @@ static int o2net_fill_bitmap(char *buf, int len)
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
int i = -1, out = 0;
- o2net_fill_node_map(map, sizeof(map));
+ o2net_fill_node_map(map, O2NM_MAX_NODES);
while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 27fee68f860a..2f61d39e4e50 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -54,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
return -EINVAL;
read_lock(&cluster->cl_nodes_lock);
- memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
+ bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES);
read_unlock(&cluster->cl_nodes_lock);
return 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index f660c0dbdb63..a07b24d170f2 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -900,7 +900,7 @@ static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
struct kvec vec = { .iov_len = len, .iov_base = data, };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
- iov_iter_kvec(&msg.msg_iter, READ, &vec, 1, len);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len);
return sock_recvmsg(sock, &msg, MSG_DONTWAIT);
}
@@ -990,14 +990,12 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
}
/* Get a map of all nodes to which this node is currently connected to */
-void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+void o2net_fill_node_map(unsigned long *map, unsigned int bits)
{
struct o2net_sock_container *sc;
int node, ret;
- BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
-
- memset(map, 0, bytes);
+ bitmap_zero(map, bits);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
continue;
@@ -1604,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
+ sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
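READ here was the old iov_iter direction flag; it was renamed to ITER_DEST (and WRITE to ITER_SOURCE) so the constant says what the iterator's buffers are instead of overloading the block-layer rw values. A small sketch, names illustrative:

#include <linux/uio.h>
#include <linux/socket.h>

static void ex_prep_recv(struct msghdr *msg, struct kvec *vec, size_t len)
{
	/* a receive fills the buffer, so the iterator is the destination */
	iov_iter_kvec(&msg->msg_iter, ITER_DEST, vec, 1, len);
	/* a send would pass ITER_SOURCE instead */
}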
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fd2022712167..20f790a47484 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1094,7 +1094,7 @@ static inline enum dlm_status dlm_err_to_dlm_status(int err)
static inline void dlm_node_iter_init(unsigned long *map,
struct dlm_node_iter *iter)
{
- memcpy(iter->node_map, map, sizeof(iter->node_map));
+ bitmap_copy(iter->node_map, map, O2NM_MAX_NODES);
iter->curnode = -1;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c4eccd499db8..5c04dde99981 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1576,8 +1576,8 @@ static int dlm_should_restart_join(struct dlm_ctxt *dlm,
spin_lock(&dlm->spinlock);
/* For now, we restart the process if the node maps have
* changed at all */
- ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
- sizeof(dlm->live_nodes_map));
+ ret = !bitmap_equal(ctxt->live_map, dlm->live_nodes_map,
+ O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
if (ret)
@@ -1604,13 +1604,11 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
/* group sem locking should work for us here -- we're already
* registered for heartbeat events so filling this should be
* atomic wrt getting those handlers called. */
- o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
+ o2hb_fill_node_map(dlm->live_nodes_map, O2NM_MAX_NODES);
spin_lock(&dlm->spinlock);
- memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
-
+ bitmap_copy(ctxt->live_map, dlm->live_nodes_map, O2NM_MAX_NODES);
__dlm_set_joining_node(dlm, dlm->node_num);
-
spin_unlock(&dlm->spinlock);
node = -1;
@@ -1643,8 +1641,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
* yes_resp_map. Copy that into our domain map and send a join
* assert message to clean up everyone elses state. */
spin_lock(&dlm->spinlock);
- memcpy(dlm->domain_map, ctxt->yes_resp_map,
- sizeof(ctxt->yes_resp_map));
+ bitmap_copy(dlm->domain_map, ctxt->yes_resp_map, O2NM_MAX_NODES);
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
@@ -2009,9 +2006,9 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
dlm->recovery_map, &(dlm->recovery_map[0]));
- memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
- memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
- memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
+ bitmap_zero(dlm->recovery_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->live_nodes_map, O2NM_MAX_NODES);
+ bitmap_zero(dlm->domain_map, O2NM_MAX_NODES);
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 227da5b1b6ab..d610da8e2f24 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -258,12 +258,12 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
@@ -290,8 +290,8 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
- memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
- memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
+ bitmap_copy(mle->node_map, dlm->domain_map, O2NM_MAX_NODES);
+ bitmap_copy(mle->vote_map, dlm->domain_map, O2NM_MAX_NODES);
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
@@ -572,7 +572,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
- memset(res->refmap, 0, sizeof(res->refmap));
+ bitmap_zero(res->refmap, O2NM_MAX_NODES);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
@@ -1036,10 +1036,10 @@ recheck:
spin_lock(&mle->spinlock);
m = mle->master;
- map_changed = (memcmp(mle->vote_map, mle->node_map,
- sizeof(mle->vote_map)) != 0);
- voting_done = (memcmp(mle->vote_map, mle->response_map,
- sizeof(mle->vote_map)) == 0);
+ map_changed = !bitmap_equal(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
+ voting_done = bitmap_equal(mle->vote_map, mle->response_map,
+ O2NM_MAX_NODES);
/* restart if we hit any errors */
if (map_changed) {
@@ -1277,11 +1277,11 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* now blank out everything, as if we had never
* contacted anyone */
- memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
- memset(mle->response_map, 0, sizeof(mle->response_map));
+ bitmap_zero(mle->maybe_map, O2NM_MAX_NODES);
+ bitmap_zero(mle->response_map, O2NM_MAX_NODES);
/* reset the vote_map to the current node_map */
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
+ bitmap_copy(mle->vote_map, mle->node_map,
+ O2NM_MAX_NODES);
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
@@ -2094,7 +2094,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
- memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
+ bitmap_copy(nodemap, dlm->domain_map, O2NM_MAX_NODES);
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
@@ -3447,7 +3447,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
ret = 0;
}
- memset(iter.node_map, 0, sizeof(iter.node_map));
+ bitmap_zero(iter.node_map, O2NM_MAX_NODES);
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 52ad342fec3e..50da8af988c1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -733,7 +733,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_reco_node_data *ndata;
spin_lock(&dlm->spinlock);
- memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
+ bitmap_copy(dlm->reco.node_map, dlm->domain_map, O2NM_MAX_NODES);
/* nodes can only be removed (by dying) after dropping
* this lock, and death will be trapped later, so this should do */
spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9c67edd215d5..5c60b6bc85bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1991,7 +1991,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
}
- if (file && should_remove_suid(file->f_path.dentry)) {
+ if (file && setattr_should_drop_suidgid(&init_user_ns, file_inode(file))) {
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
@@ -2279,7 +2279,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* inode. There's also the dinode i_size state which
* can be lost via setattr during extending writes (we
* set inode->i_size at the end of a write. */
- if (should_remove_suid(dentry)) {
+ if (setattr_should_drop_suidgid(&init_user_ns, inode)) {
if (meta_level == 0) {
ocfs2_inode_unlock_for_extent_tree(inode,
&di_bh,
@@ -2712,7 +2712,7 @@ const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
@@ -2722,7 +2722,7 @@ const struct inode_operations ocfs2_special_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
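should_remove_suid() took only the dentry; setattr_should_drop_suidgid() additionally takes the mount's user namespace so the SGID decision can consult group membership on idmapped mounts. A rough approximation of what the helper decides, assuming the declarations are visible as they are for in-tree fs code (a sketch, not the exact fs/attr.c body):

#include <linux/fs.h>
#include <linux/capability.h>

static int ex_should_drop_suidgid(struct user_namespace *mnt_userns,
				  struct inode *inode)
{
	umode_t mode = inode->i_mode;
	int kill = 0;

	if (mode & S_ISUID)	/* setuid is always stripped on write */
		kill = ATTR_KILL_SUID;
	/* setgid is stripped only when it grants execute rights */
	kill |= setattr_should_drop_sgid(mnt_userns, inode);

	if (kill && !capable(CAP_FSETID) && S_ISREG(mode))
		return kill;
	return 0;
}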
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 126671e6caed..3fb98b4569a2 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -157,7 +157,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
replay_map->rm_state = REPLAY_DONE;
}
-static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 969d0aa28718..41c382f68529 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -150,6 +150,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+void ocfs2_free_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 05f32989bad6..a8fd51afb794 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2915,7 +2915,7 @@ const struct inode_operations ocfs2_dir_iops = {
.permission = ocfs2_permission,
.listxattr = ocfs2_listxattr,
.fiemap = ocfs2_fiemap,
- .get_acl = ocfs2_iop_get_acl,
+ .get_inode_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
.fileattr_get = ocfs2_fileattr_get,
.fileattr_set = ocfs2_fileattr_set,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 740b64238312..a503c553bab2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -560,8 +560,7 @@ static inline unsigned int ocfs2_read_links_count(struct ocfs2_dinode *di)
u32 nlink = le16_to_cpu(di->i_links_count);
u32 hi = le16_to_cpu(di->i_links_count_hi);
- if (di->i_dyn_features & cpu_to_le16(OCFS2_INDEXED_DIR_FL))
- nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
+ nlink |= (hi << OCFS2_LINKS_HI_SHIFT);
return nlink;
}
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 88f75f7f02d7..c973c03f6fd8 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -273,17 +273,17 @@ static int o2cb_cluster_check(void)
*/
#define O2CB_MAP_STABILIZE_COUNT 60
for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
- o2hb_fill_node_map(hbmap, sizeof(hbmap));
+ o2hb_fill_node_map(hbmap, O2NM_MAX_NODES);
if (!test_bit(node_num, hbmap)) {
printk(KERN_ERR "o2cb: %s heartbeat has not been "
"started.\n", (o2hb_global_heartbeat_active() ?
"Global" : "Local"));
return -EINVAL;
}
- o2net_fill_node_map(netmap, sizeof(netmap));
+ o2net_fill_node_map(netmap, O2NM_MAX_NODES);
/* Force set the current node to allow easy compare */
set_bit(node_num, netmap);
- if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+ if (bitmap_equal(hbmap, netmap, O2NM_MAX_NODES))
return 0;
if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 317126261523..a8d5ca98fa57 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -669,6 +669,8 @@ static struct ctl_table_header *ocfs2_table_header;
static int __init ocfs2_stack_glue_init(void)
{
+ int ret;
+
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table);
@@ -678,7 +680,11 @@ static int __init ocfs2_stack_glue_init(void)
return -ENOMEM; /* or something. */
}
- return ocfs2_sysfs_init();
+ ret = ocfs2_sysfs_init();
+ if (ret)
+ unregister_sysctl_table(ocfs2_table_header);
+
+ return ret;
}
static void __exit ocfs2_stack_glue_exit(void)
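The fix above adds the missing unwind: once register_sysctl() has succeeded, a failure in the following init step must unregister the table again or it leaks. The pattern in miniature (table and second step are hypothetical):

#include <linux/sysctl.h>

static struct ctl_table_header *ex_table_header;
extern struct ctl_table ex_table[];	/* hypothetical table */
int ex_sysfs_init(void);		/* hypothetical later init step */

static int __init ex_init(void)
{
	int ret;

	ex_table_header = register_sysctl("fs/example", ex_table);
	if (!ex_table_header)
		return -ENOMEM;

	ret = ex_sysfs_init();
	if (ret)	/* unwind the registration that already succeeded */
		unregister_sysctl_table(ex_table_header);
	return ret;
}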
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 42c993e53924..0b0e6a132101 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1159,6 +1159,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
out_dismount:
atomic_set(&osb->vol_state, VOLUME_DISABLED);
wake_up(&osb->osb_mount_event);
+ ocfs2_free_replay_slots(osb);
ocfs2_dismount_volume(sb, 1);
goto out;
@@ -1822,12 +1823,14 @@ static int ocfs2_mount_volume(struct super_block *sb)
status = ocfs2_truncate_log_init(osb);
if (status < 0) {
mlog_errno(status);
- goto out_system_inodes;
+ goto out_check_volume;
}
ocfs2_super_unlock(osb, 1);
return 0;
+out_check_volume:
+ ocfs2_free_replay_slots(osb);
out_system_inodes:
if (osb->local_alloc_state == OCFS2_LA_ENABLED)
ocfs2_shutdown_local_alloc(osb);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index fa7fe2393ff6..3a5b4b88a583 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac)
mpage_readahead(rac, omfs_get_block);
}
-static int omfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, omfs_get_block, wbc);
-}
-
static int
omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
@@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = omfs_read_folio,
.readahead = omfs_readahead,
- .writepage = omfs_writepage,
.writepages = omfs_writepages,
.write_begin = omfs_write_begin,
.write_end = generic_write_end,
.bmap = omfs_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
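With ->writepage gone, writeback enters omfs only through ->writepages; and since folio migration used to fall back on ->writepage, it is now opted into explicitly. A sketch of the resulting aops wiring, assuming an mpage-based writepages (names illustrative):

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/buffer_head.h>

static int ex_get_block(struct inode *inode, sector_t block,
			struct buffer_head *bh, int create);	/* hypothetical */

static int ex_writepages(struct address_space *mapping,
			 struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, ex_get_block);
}

static const struct address_space_operations ex_aops = {
	.writepages	= ex_writepages,
	/* migration no longer falls back on ->writepage; opt in */
	.migrate_folio	= buffer_migrate_folio,
};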
diff --git a/fs/open.c b/fs/open.c
index a81319b6177f..82c1a28b3308 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -54,7 +54,7 @@ int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Remove suid, sgid, and file capabilities on truncate too */
- ret = dentry_needs_remove_privs(dentry);
+ ret = dentry_needs_remove_privs(mnt_userns, dentry);
if (ret < 0)
return ret;
if (ret)
@@ -188,7 +188,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
sb_start_write(inode->i_sb);
- error = security_path_truncate(&f.file->f_path);
+ error = security_file_truncate(f.file);
if (!error)
error = do_truncate(file_mnt_user_ns(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
@@ -723,10 +723,10 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- if (!S_ISDIR(inode->i_mode))
- newattrs.ia_valid |=
- ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
+ if (!S_ISDIR(inode->i_mode))
+ newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
+ setattr_should_drop_sgid(mnt_userns, inode);
/* Continue to send actual fs values, not the mount values. */
error = security_path_chown(
path,
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 605e5a3506ec..c5da2091cefb 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -64,8 +64,7 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
-static int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl,
- int type)
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error = 0;
void *value = NULL;
@@ -119,12 +118,13 @@ out:
return error;
}
-int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int orangefs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
struct iattr iattr;
int rc;
+ struct inode *inode = d_inode(dentry);
memset(&iattr, 0, sizeof iattr);
@@ -153,46 +153,7 @@ int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
rc = __orangefs_set_acl(inode, acl, type);
if (!rc && (iattr.ia_valid == ATTR_MODE))
- rc = __orangefs_setattr(inode, &iattr);
+ rc = __orangefs_setattr_mode(dentry, &iattr);
return rc;
}
-
-int orangefs_init_acl(struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- umode_t mode = inode->i_mode;
- struct iattr iattr;
- int error = 0;
-
- error = posix_acl_create(dir, &mode, &default_acl, &acl);
- if (error)
- return error;
-
- if (default_acl) {
- error = __orangefs_set_acl(inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- } else {
- inode->i_default_acl = NULL;
- }
-
- if (acl) {
- if (!error)
- error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- } else {
- inode->i_acl = NULL;
- }
-
- /* If mode of the inode was changed, then do a forcible ->setattr */
- if (mode != inode->i_mode) {
- memset(&iattr, 0, sizeof iattr);
- inode->i_mode = mode;
- iattr.ia_mode = mode;
- iattr.ia_valid |= ATTR_MODE;
- __orangefs_setattr(inode, &iattr);
- }
-
- return error;
-}
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 732661aa2680..167fa43b24f9 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -273,7 +273,6 @@ out:
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): PUT buffer_index %d\n",
__func__, handle, buffer_index);
- buffer_index = -1;
}
op_release(new_op);
return ret;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 7a8c0c6e698d..4df560894386 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -53,7 +53,7 @@ static int orangefs_writepage_locked(struct page *page,
bv.bv_len = wlen;
bv.bv_offset = off % PAGE_SIZE;
WARN_ON(wlen == 0);
- iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
+ iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
@@ -112,7 +112,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
else
ow->bv[i].bv_offset = 0;
}
- iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -270,7 +270,7 @@ static void orangefs_readahead(struct readahead_control *rac)
offset = readahead_pos(rac);
i_pages = &rac->mapping->i_pages;
- iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
+ iov_iter_xarray(&iter, ITER_DEST, i_pages, offset, readahead_length(rac));
/* read in the pages. */
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
@@ -303,7 +303,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
bv.bv_page = &folio->page;
bv.bv_len = folio_size(folio);
bv.bv_offset = 0;
- iov_iter_bvec(&iter, READ, &bv, 1, folio_size(folio));
+ iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio));
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
folio_size(folio), inode->i_size, NULL, NULL, file);
@@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t total_count = 0;
ssize_t ret = -EINVAL;
- int i = 0;
gossip_debug(GOSSIP_FILE_DEBUG,
"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
@@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
while (iov_iter_count(iter)) {
size_t each_count = iov_iter_count(iter);
size_t amt_complete;
- i++;
/* how much to transfer in this loop iteration */
if (each_count > orangefs_bufmap_size_query())
@@ -828,15 +826,23 @@ again:
spin_unlock(&inode->i_lock);
mark_inode_dirty(inode);
- if (iattr->ia_valid & ATTR_MODE)
- /* change mod on a file that has ACLs */
- ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
-
ret = 0;
out:
return ret;
}
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret;
+ struct inode *inode = d_inode(dentry);
+
+ ret = __orangefs_setattr(inode, iattr);
+ /* change mode on a file that has ACLs */
+ if (!ret && (iattr->ia_valid & ATTR_MODE))
+ ret = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
+ return ret;
+}
+
/*
* Change attributes of an object referenced by dentry.
*/
@@ -849,7 +855,7 @@ int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ret = setattr_prepare(&init_user_ns, dentry, iattr);
if (ret)
goto out;
- ret = __orangefs_setattr(d_inode(dentry), iattr);
+ ret = __orangefs_setattr_mode(dentry, iattr);
sync_inode_metadata(d_inode(dentry), 1);
out:
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n",
@@ -967,7 +973,7 @@ static int orangefs_fileattr_set(struct user_namespace *mnt_userns,
/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
.getattr = orangefs_getattr,
@@ -1097,8 +1103,9 @@ struct inode *orangefs_iget(struct super_block *sb,
* Allocate an inode for a newly created file and insert it into the inode hash.
*/
struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
- int mode, dev_t dev, struct orangefs_object_kref *ref)
+ umode_t mode, dev_t dev, struct orangefs_object_kref *ref)
{
+ struct posix_acl *acl = NULL, *default_acl = NULL;
unsigned long hash = orangefs_handle_hash(ref);
struct inode *inode;
int error;
@@ -1115,6 +1122,10 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
if (!inode)
return ERR_PTR(-ENOMEM);
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ goto out_iput;
+
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
@@ -1125,6 +1136,19 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_init_iops(inode);
inode->i_rdev = dev;
+ if (default_acl) {
+ error = __orangefs_set_acl(inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_iput;
+ }
+
+ if (acl) {
+ error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_iput;
+ }
+
error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
if (error < 0)
goto out_iput;
@@ -1132,10 +1156,22 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
gossip_debug(GOSSIP_INODE_DEBUG,
"Initializing ACL's for inode %pU\n",
get_khandle_from_ino(inode));
- orangefs_init_acl(inode, dir);
+ if (mode != inode->i_mode) {
+ struct iattr iattr = {
+ .ia_mode = mode,
+ .ia_valid = ATTR_MODE,
+ };
+ inode->i_mode = mode;
+ __orangefs_setattr(inode, &iattr);
+ __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ }
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return inode;
out_iput:
iput(inode);
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
return ERR_PTR(error);
}
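orangefs_init_acl() is folded into orangefs_new_inode() above; the underlying lifecycle is the standard posix_acl_create() dance. A hedged sketch of that flow with illustrative names (ex_apply_acl stands in for __orangefs_set_acl):

#include <linux/fs.h>
#include <linux/posix_acl.h>

static int ex_apply_acl(struct inode *inode, struct posix_acl *acl, int type);	/* stand-in */

static int ex_init_acl(struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	umode_t mode = inode->i_mode;
	int err;

	/* derive the effective mode and any inherited ACLs from @dir */
	err = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (err)
		return err;

	if (default_acl)
		err = ex_apply_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	if (!err && acl)
		err = ex_apply_acl(inode, acl, ACL_TYPE_ACCESS);

	posix_acl_release(default_acl);	/* both releases are NULL-safe */
	posix_acl_release(acl);
	inode->i_mode = mode;		/* umask/ACL may have masked bits */
	return err;
}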
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 600e8eee541f..75c1a3dcf68c 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -430,7 +430,7 @@ static int orangefs_rename(struct user_namespace *mnt_userns,
/* ORANGEFS implementation of VFS inode operations for directories */
const struct inode_operations orangefs_dir_inode_operations = {
.lookup = orangefs_lookup,
- .get_acl = orangefs_get_acl,
+ .get_inode_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.create = orangefs_create,
.unlink = orangefs_unlink,
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 29eaa4544372..1b508f543384 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask)
*/
static void orangefs_kernel_debug_init(void)
{
- int rc = -ENOMEM;
- char *k_buffer = NULL;
+ static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!k_buffer)
- goto out;
-
if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(k_buffer, kernel_debug_string);
strcat(k_buffer, "\n");
@@ -213,15 +208,14 @@ static void orangefs_kernel_debug_init(void)
debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
&kernel_debug_fops);
-
-out:
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
}
void orangefs_debugfs_cleanup(void)
{
debugfs_remove_recursive(debug_dir);
+ kfree(debug_help_string);
+ debug_help_string = NULL;
}
/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
@@ -297,18 +291,13 @@ static int help_show(struct seq_file *m, void *v)
/*
* initialize the client-debug file.
*/
-static int orangefs_client_debug_init(void)
+static void orangefs_client_debug_init(void)
{
- int rc = -ENOMEM;
- char *c_buffer = NULL;
+ static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!c_buffer)
- goto out;
-
if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(c_buffer, client_debug_string);
strcat(c_buffer, "\n");
@@ -322,13 +311,6 @@ static int orangefs_client_debug_init(void)
debug_dir,
c_buffer,
&kernel_debug_fops);
-
- rc = 0;
-
-out:
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
- return rc;
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -671,6 +653,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
strlcat(debug_help_string, new, string_size);
mutex_unlock(&orangefs_help_file_lock);
+ kfree(new);
}
rc = 0;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index b5940ec1836a..6e0cc01b3a14 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -103,13 +103,13 @@ enum orangefs_vfs_op_states {
#define ORANGEFS_CACHE_CREATE_FLAGS 0
#endif
-extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
extern const struct xattr_handler *orangefs_xattr_handlers[];
extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
extern int orangefs_set_acl(struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl,
+ struct dentry *dentry, struct posix_acl *acl,
int type);
+int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
/*
* orangefs data structures
@@ -356,11 +356,12 @@ void fsid_key_table_finalize(void);
vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
struct inode *orangefs_new_inode(struct super_block *sb,
struct inode *dir,
- int mode,
+ umode_t mode,
dev_t dev,
struct orangefs_object_kref *ref);
int __orangefs_setattr(struct inode *, struct iattr *);
+int __orangefs_setattr_mode(struct dentry *dentry, struct iattr *iattr);
int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *);
int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index cd7297815f91..5ab741c60b7e 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -141,7 +141,7 @@ static int __init orangefs_init(void)
gossip_err("%s: could not initialize device subsystem %d!\n",
__func__,
ret);
- goto cleanup_device;
+ goto cleanup_sysfs;
}
ret = register_filesystem(&orangefs_fs_type);
@@ -152,11 +152,11 @@ static int __init orangefs_init(void)
goto out;
}
- orangefs_sysfs_exit();
-
-cleanup_device:
orangefs_dev_cleanup();
+cleanup_sysfs:
+ orangefs_sysfs_exit();
+
sysfs_init_failed:
orangefs_debugfs_cleanup();
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index de80b62553bb..be4ba03a01a0 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(orangefs_default);
+static struct kobject *orangefs_obj;
+
+static void orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(orangefs_obj);
+ orangefs_obj = NULL;
+}
+
static struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = orangefs_default_groups,
+ .release = orangefs_obj_release,
};
static struct orangefs_attribute acache_hard_limit_attribute =
@@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(acache_orangefs_default);
+static struct kobject *acache_orangefs_obj;
+
+static void acache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(acache_orangefs_obj);
+ acache_orangefs_obj = NULL;
+}
+
static struct kobj_type acache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = acache_orangefs_default_groups,
+ .release = acache_orangefs_obj_release,
};
static struct orangefs_attribute capcache_hard_limit_attribute =
@@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(capcache_orangefs_default);
+static struct kobject *capcache_orangefs_obj;
+
+static void capcache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(capcache_orangefs_obj);
+ capcache_orangefs_obj = NULL;
+}
+
static struct kobj_type capcache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = capcache_orangefs_default_groups,
+ .release = capcache_orangefs_obj_release,
};
static struct orangefs_attribute ccache_hard_limit_attribute =
@@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ccache_orangefs_default);
+static struct kobject *ccache_orangefs_obj;
+
+static void ccache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ccache_orangefs_obj);
+ ccache_orangefs_obj = NULL;
+}
+
static struct kobj_type ccache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ccache_orangefs_default_groups,
+ .release = ccache_orangefs_obj_release,
};
static struct orangefs_attribute ncache_hard_limit_attribute =
@@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ncache_orangefs_default);
+static struct kobject *ncache_orangefs_obj;
+
+static void ncache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ncache_orangefs_obj);
+ ncache_orangefs_obj = NULL;
+}
+
static struct kobj_type ncache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ncache_orangefs_default_groups,
+ .release = ncache_orangefs_obj_release,
};
static struct orangefs_attribute pc_acache_attribute =
@@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(pc_orangefs_default);
+static struct kobject *pc_orangefs_obj;
+
+static void pc_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(pc_orangefs_obj);
+ pc_orangefs_obj = NULL;
+}
+
static struct kobj_type pc_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = pc_orangefs_default_groups,
+ .release = pc_orangefs_obj_release,
};
static struct orangefs_attribute stats_reads_attribute =
@@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(stats_orangefs_default);
+static struct kobject *stats_orangefs_obj;
+
+static void stats_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(stats_orangefs_obj);
+ stats_orangefs_obj = NULL;
+}
+
static struct kobj_type stats_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = stats_orangefs_default_groups,
+ .release = stats_orangefs_obj_release,
};
-static struct kobject *orangefs_obj;
-static struct kobject *acache_orangefs_obj;
-static struct kobject *capcache_orangefs_obj;
-static struct kobject *ccache_orangefs_obj;
-static struct kobject *ncache_orangefs_obj;
-static struct kobject *pc_orangefs_obj;
-static struct kobject *stats_orangefs_obj;
-
int orangefs_sysfs_init(void)
{
int rc = -EINVAL;
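Each of these hunks fixes the same leak: a dynamically allocated kobject must be freed from its ktype's ->release() callback, which runs only once the last reference is dropped (e.g. via kobject_put()); freeing it anywhere else races with concurrent sysfs users. The pattern in isolation:

#include <linux/kobject.h>
#include <linux/slab.h>

static struct kobject *ex_obj;

static void ex_obj_release(struct kobject *kobj)
{
	kfree(kobj);		/* kobj is ex_obj; last reference is gone */
	ex_obj = NULL;
}

static struct kobj_type ex_ktype = {
	.release = ex_obj_release,
};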
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index dd188c7996b3..6708e54b0e30 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -96,7 +96,7 @@ config OVERLAY_FS_XINO_AUTO
depends on 64BIT
help
If this config option is enabled then overlay filesystems will use
- unused high bits in undelying filesystem inode numbers to map all
+ unused high bits in underlying filesystem inode numbers to map all
inodes to a unified address space. The mapped 64bit inode numbers
might not be compatible with applications that expect 32bit inodes.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index f436d8847f08..6e4e65ee050d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -44,6 +44,35 @@ static bool ovl_must_copy_xattr(const char *name)
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
}
+static int ovl_copy_acl(struct ovl_fs *ofs, const struct path *path,
+ struct dentry *dentry, const char *acl_name)
+{
+ int err;
+ struct posix_acl *clone, *real_acl = NULL;
+
+ real_acl = ovl_get_acl_path(path, acl_name, false);
+ if (!real_acl)
+ return 0;
+
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ if (err == -ENODATA || err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return -ENOMEM;
+
+ err = ovl_do_set_acl(ofs, dentry, acl_name, clone);
+
+ /* release cloned acl */
+ posix_acl_release(clone);
+ return err;
+}
+
int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct dentry *new)
{
struct dentry *old = oldpath->dentry;
@@ -93,6 +122,15 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de
error = 0;
continue; /* Discard */
}
+
+ if (is_posix_acl_xattr(name)) {
+ error = ovl_copy_acl(OVL_FS(sb), oldpath, new, name);
+ if (!error)
+ continue;
+ /* POSIX ACLs must be copied. */
+ break;
+ }
+
retry:
size = ovl_do_getxattr(oldpath, name, value, value_size);
if (size == -ERANGE)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 6b03457f72bb..f61e37f4c8ff 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -435,28 +435,12 @@ out:
}
static int ovl_set_upper_acl(struct ovl_fs *ofs, struct dentry *upperdentry,
- const char *name, const struct posix_acl *acl)
+ const char *acl_name, struct posix_acl *acl)
{
- void *buffer;
- size_t size;
- int err;
-
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
return 0;
- size = posix_acl_xattr_size(acl->a_count);
- buffer = kmalloc(size, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
-
- err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- if (err < 0)
- goto out_free;
-
- err = ovl_do_setxattr(ofs, upperdentry, name, buffer, size, XATTR_CREATE);
-out_free:
- kfree(buffer);
- return err;
+ return ovl_do_set_acl(ofs, upperdentry, acl_name, acl);
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
@@ -592,28 +576,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
goto out_revert_creds;
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (override_cred) {
+ if (!attr->hardlink) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_revert_creds;
+ /*
+ * In the creation cases (create, mkdir, mknod, symlink),
+ * ovl should transfer current's fs{u,g}id to the underlying
+ * fs, because the underlying fs wants to initialize the new
+ * inode's owner from current's fs{u,g}id. In this case the
+ * @inode is a new inode whose i_{u,g}id was initialized in
+ * inode_init_owner() from current's fs{u,g}id, so use the
+ * inode's i_{u,g}id to override the cred's fs{u,g}id.
+ *
+ * In the hardlink case, however, ovl_link() does not create
+ * a new inode, so just use the ovl mounter's fs{u,g}id.
+ */
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
- if (!attr->hardlink) {
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
- goto out_revert_creds;
- }
+ err = security_dentry_create_files_as(dentry,
+ attr->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
}
put_cred(override_creds(override_cred));
put_cred(override_cred);
-
- if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr);
- else
- err = ovl_create_over_whiteout(dentry, inode, attr);
}
+
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, attr);
+
out_revert_creds:
revert_creds(old_cred);
return err;
@@ -1311,7 +1309,9 @@ const struct inode_operations ovl_dir_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fileattr_get = ovl_fileattr_get,
.fileattr_set = ovl_fileattr_set,
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index e065a5b9a442..a25bb3453dde 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -339,7 +339,7 @@ out_iput:
return dentry;
}
-/* Get the upper or lower dentry in stach whose on layer @idx */
+/* Get the upper or lower dentry in stack which is on layer @idx */
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -463,7 +463,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
/* Get connected upper overlay dir from index */
if (index) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
dput(index);
if (IS_ERR_OR_NULL(upper))
@@ -739,7 +739,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
/* Then try to get a connected upper dir by index */
if (index && d_is_dir(index)) {
- struct dentry *upper = ovl_index_upper(ofs, index);
+ struct dentry *upper = ovl_index_upper(ofs, index, true);
err = PTR_ERR(upper);
if (IS_ERR_OR_NULL(upper))
@@ -796,7 +796,7 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
return ERR_PTR(-ENOMEM);
/* Copy unaligned inner fh into aligned buffer */
- memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET);
+ memcpy(fh->buf, fid, buflen - OVL_FH_WIRE_OFFSET);
return fh;
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a1a22f58ba18..c9d0c362c7ef 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -34,7 +34,7 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
-/* No atime modificaton nor notify on underlying */
+/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
static struct file *ovl_open_realfile(const struct file *file,
@@ -96,6 +96,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
spin_lock(&file->f_lock);
file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags;
+ file->f_iocb_flags = iocb_flags(file);
spin_unlock(&file->f_lock);
return 0;
@@ -517,9 +518,16 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
const struct cred *old_cred;
int ret;
+ inode_lock(inode);
+ /* Update mode */
+ ovl_copyattr(inode);
+ ret = file_remove_privs(file);
+ if (ret)
+ goto out_unlock;
+
ret = ovl_real_fdget(file, &real);
if (ret)
- return ret;
+ goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
ret = vfs_fallocate(real.file, mode, offset, len);
@@ -530,6 +538,9 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
fdput(real);
+out_unlock:
+ inode_unlock(inode);
+
return ret;
}
@@ -567,14 +578,23 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
const struct cred *old_cred;
loff_t ret;
+ inode_lock(inode_out);
+ if (op != OVL_DEDUPE) {
+ /* Update mode */
+ ovl_copyattr(inode_out);
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = ovl_real_fdget(file_out, &real_out);
if (ret)
- return ret;
+ goto out_unlock;
ret = ovl_real_fdget(file_in, &real_in);
if (ret) {
fdput(real_out);
- return ret;
+ goto out_unlock;
}
old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
@@ -603,6 +623,9 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
fdput(real_in);
fdput(real_out);
+out_unlock:
+ inode_unlock(inode_out);
+
return ret;
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 9e61511de7a7..ee6dfa577c93 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -14,6 +14,8 @@
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
@@ -460,7 +462,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
* of the POSIX ACLs retrieved from the lower layer to this function to not
* alter the POSIX ACLs for the underlying filesystem.
*/
-static void ovl_idmap_posix_acl(struct inode *realinode,
+static void ovl_idmap_posix_acl(const struct inode *realinode,
struct user_namespace *mnt_userns,
struct posix_acl *acl)
{
@@ -485,6 +487,64 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
}
/*
+ * The @noperm argument is used to skip permission checking and is a temporary
+ * measure. Quoting Miklos from an earlier discussion:
+ *
+ * > So there are two paths to getting an acl:
+ * > 1) permission checking and 2) retrieving the value via getxattr(2).
+ * > This is a similar situation as reading a symlink vs. following it.
+ * > When following a symlink overlayfs always reads the link on the
+ * > underlying fs just as if it was a readlink(2) call, calling
+ * > security_inode_readlink() instead of security_inode_follow_link().
+ * > This is logical: we are reading the link from the underlying storage,
+ * > and following it on overlayfs.
+ * >
+ * > Applying the same logic to acl: we do need to call the
+ * > security_inode_getxattr() on the underlying fs, even if just want to
+ * > check permissions on overlay. This is currently not done, which is an
+ * > inconsistency.
+ * >
+ * > Maybe adding the check to ovl_get_acl() is the right way to go, but
+ * > I'm a little afraid of a performance regression. Will look into that.
+ *
+ * Until we have made a decision allow this helper to take the @noperm
+ * argument. We should hopefully be able to remove it soon.
+ */
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm)
+{
+ struct posix_acl *real_acl, *clone;
+ struct user_namespace *mnt_userns;
+ struct inode *realinode = d_inode(path->dentry);
+
+ mnt_userns = mnt_user_ns(path->mnt);
+
+ if (noperm)
+ real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
+ else
+ real_acl = vfs_get_acl(mnt_userns, path->dentry, acl_name);
+ if (IS_ERR_OR_NULL(real_acl))
+ return real_acl;
+
+ if (!is_idmapped_mnt(path->mnt))
+ return real_acl;
+
+ /*
+ * We cannot alter the ACLs returned from the relevant layer as that
+ * would alter the cached values filesystem wide for the lower
+ * filesystem. Instead we can clone the ACLs and then apply the
+ * relevant idmapping of the layer.
+ */
+ clone = posix_acl_clone(real_acl, GFP_KERNEL);
+ posix_acl_release(real_acl); /* release original acl */
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ ovl_idmap_posix_acl(realinode, mnt_userns, clone);
+ return clone;
+}
+
+/*
* When the relevant layer is an idmapped mount we need to take the idmapping
* of the layer into account and translate any ACL_{GROUP,USER} values
* according to the idmapped mount.
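The clone-then-idmap rule above is worth isolating: cached ACLs are shared filesystem-wide, so the per-mount idmapping must only ever be applied to a private copy, never to the cached object. A minimal sketch (the remapping body is elided; ovl_idmap_posix_acl() above is the real thing):

#include <linux/posix_acl.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct posix_acl *ex_idmapped_acl(struct posix_acl *real_acl,
					 struct user_namespace *mnt_userns)
{
	struct posix_acl *clone;

	clone = posix_acl_clone(real_acl, GFP_KERNEL);
	posix_acl_release(real_acl);	/* drop the shared reference */
	if (!clone)
		return ERR_PTR(-ENOMEM);

	/* remap ACL_USER/ACL_GROUP uids/gids through mnt_userns here */
	return clone;
}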
@@ -495,10 +555,12 @@ static void ovl_idmap_posix_acl(struct inode *realinode,
*
* This is obviously only relevant when idmapped layers are used.
*/
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm)
{
struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl, *clone;
+ struct posix_acl *acl;
struct path realpath;
if (!IS_POSIXACL(realinode))
@@ -512,40 +574,115 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
}
if (rcu) {
+ /*
+ * If the layer is idmapped, drop out of RCU path walk
+ * so we can clone the ACLs.
+ */
+ if (is_idmapped_mnt(realpath.mnt))
+ return ERR_PTR(-ECHILD);
+
acl = get_cached_acl_rcu(realinode, type);
} else {
const struct cred *old_cred;
old_cred = ovl_override_creds(inode->i_sb);
- acl = get_acl(realinode, type);
+ acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
revert_creds(old_cred);
}
- /*
- * If there are no POSIX ACLs, or we encountered an error,
- * or the layer isn't idmapped we don't need to do anything.
- */
- if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
- return acl;
+
+ return acl;
+}
+
+static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct path realpath;
+ const char *acl_name;
+ const struct cred *old_cred;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct dentry *upperdentry = ovl_dentry_upper(dentry);
+ struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
+
+ err = ovl_want_write(dentry);
+ if (err)
+ return err;
/*
- * We only get here if the layer is idmapped. So drop out of RCU path
- * walk so we can clone the ACLs. There's no need to release the ACLs
- * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ * If ACL is to be removed from a lower file, check if it exists in
+ * the first place before copying it up.
*/
- if (rcu)
- return ERR_PTR(-ECHILD);
+ acl_name = posix_acl_xattr_name(type);
+ if (!acl && !upperdentry) {
+ struct posix_acl *real_acl;
- clone = posix_acl_clone(acl, GFP_KERNEL);
- if (!clone)
- clone = ERR_PTR(-ENOMEM);
+ ovl_path_lower(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ real_acl = vfs_get_acl(mnt_user_ns(realpath.mnt), realdentry,
+ acl_name);
+ revert_creds(old_cred);
+ if (IS_ERR(real_acl)) {
+ err = PTR_ERR(real_acl);
+ goto out_drop_write;
+ }
+ posix_acl_release(real_acl);
+ }
+
+ if (!upperdentry) {
+ err = ovl_copy_up(dentry);
+ if (err)
+ goto out_drop_write;
+
+ realdentry = ovl_dentry_upper(dentry);
+ }
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (acl)
+ err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
else
- ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone);
+ err = ovl_do_remove_acl(ofs, realdentry, acl_name);
+ revert_creds(old_cred);
+
+ /* copy c/mtime */
+ ovl_copyattr(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+ return err;
+}
+
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type)
+{
+ int err;
+ struct inode *inode = d_inode(dentry);
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode);
+
+ if (!IS_POSIXACL(d_inode(workdir)))
+ return -EOPNOTSUPP;
+ if (!realinode->i_op->set_acl)
+ return -EOPNOTSUPP;
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ if (!inode_owner_or_capable(&init_user_ns, inode))
+ return -EPERM;
+
/*
- * Since we're not in RCU path walk we always need to release the
- * original ACLs.
+ * Check if sgid bit needs to be cleared (actual setacl operation will
+ * be done with mounter's capabilities and so that won't do it for us).
*/
- posix_acl_release(acl);
- return clone;
+ if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
+ !in_group_p(inode->i_gid) &&
+ !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
+ struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
+
+ err = ovl_setattr(&init_user_ns, dentry, &iattr);
+ if (err)
+ return err;
+ }
+
+ return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif
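
The new set_acl path is reachable from userspace through the usual libacl wrappers. A minimal sketch, assuming a hypothetical overlay mount with a lower-only file at /mnt/ovl/file, so that acl_set_file() exercises the copy-up branch in ovl_set_or_remove_acl() above (build with -lacl):

/* Hedged sketch: set and re-read an access ACL on an overlayfs file.
 * The path is an assumption, not taken from the patch.
 */
#include <stdio.h>
#include <sys/acl.h>

int main(void)
{
	const char *path = "/mnt/ovl/file";	/* hypothetical overlay file */
	/* short text form; the mask entry is required with a named user */
	acl_t acl = acl_from_text("u::rw-,g::r--,o::r--,u:1000:rw-,m::rw-");
	if (!acl) {
		perror("acl_from_text");
		return 1;
	}
	/* On a lower-only file this triggers copy-up before the ACL is set. */
	if (acl_set_file(path, ACL_TYPE_ACCESS, acl)) {
		perror("acl_set_file");
		acl_free(acl);
		return 1;
	}
	acl_free(acl);

	acl = acl_get_file(path, ACL_TYPE_ACCESS);
	if (acl) {
		char *text = acl_to_text(acl, NULL);
		if (text) {
			printf("%s", text);
			acl_free(text);
		}
		acl_free(acl);
	}
	return 0;
}

acl_set_file() issues setxattr() on system.posix_acl_access, which with this series is routed through vfs_set_acl() into the ->set_acl() inode operation, i.e. ovl_set_acl() above, instead of a per-filesystem xattr handler.
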
@@ -721,7 +858,9 @@ static const struct inode_operations ovl_file_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
.fiemap = ovl_fiemap,
.fileattr_get = ovl_fileattr_get,
@@ -741,7 +880,9 @@ static const struct inode_operations ovl_special_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
+ .get_inode_acl = ovl_get_inode_acl,
.get_acl = ovl_get_acl,
+ .set_acl = ovl_set_acl,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 0fd1d5fdfc72..46753134533a 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -487,7 +487,8 @@ fail:
}
/* Get upper dentry from index */
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected)
{
struct ovl_fh *fh;
struct dentry *upper;
@@ -499,7 +500,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index)
if (IS_ERR_OR_NULL(fh))
return ERR_CAST(fh);
- upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), true);
+ upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected);
kfree(fh);
if (IS_ERR_OR_NULL(upper))
@@ -572,7 +573,7 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
* directly from the index dentry, but for dir index we first need to
* decode the upper directory.
*/
- upper = ovl_index_upper(ofs, index);
+ upper = ovl_index_upper(ofs, index, false);
if (IS_ERR_OR_NULL(upper)) {
err = PTR_ERR(upper);
/*
@@ -1085,6 +1086,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.mnt = ovl_upper_mnt(ofs),
};
+ /*
+ * It's safe to assign upperredirect here: the previous
+ * assignment happens only if upperdentry is non-NULL, and
+ * this one only if upperdentry is NULL.
+ */
upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
if (IS_ERR(upperredirect)) {
err = PTR_ERR(upperredirect);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index eee8f08d32b6..1df7f850ff3b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -8,6 +8,8 @@
#include <linux/uuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "ovl_entry.h"
#undef pr_fmt
@@ -108,7 +110,7 @@ struct ovl_fh {
u8 padding[3]; /* make sure fb.fid is 32bit aligned */
union {
struct ovl_fb fb;
- u8 buf[0];
+ DECLARE_FLEX_ARRAY(u8, buf);
};
} __packed;
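
The buf[0] to DECLARE_FLEX_ARRAY() conversion is needed because a C99 flexible array member may not be a struct's only member or sit directly inside a union; the kernel macro wraps the array in an anonymous struct with an empty companion field. A standalone sketch of the same idiom, with illustrative names rather than the macro's exact expansion:

/* Sketch of the idiom behind DECLARE_FLEX_ARRAY(): the flexible array
 * is nested in an anonymous struct so it is no longer the union's
 * direct member. The empty struct is a GNU C extension, as used by
 * the kernel macro itself.
 */
#include <stdint.h>

struct fb {
	uint8_t version;
	uint8_t type;
};

struct fh {
	uint8_t padding[3];
	union {
		struct fb fb;
		struct {
			struct { } __empty_buf;
			uint8_t buf[];	/* legal here, unlike u8 buf[] in the union */
		};
	};
} __attribute__((packed));
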
@@ -278,6 +280,18 @@ static inline int ovl_removexattr(struct ovl_fs *ofs, struct dentry *dentry,
return ovl_do_removexattr(ofs, dentry, ovl_xattr(ofs, ox));
}
+static inline int ovl_do_set_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *acl)
+{
+ return vfs_set_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name, acl);
+}
+
+static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
+ const char *acl_name)
+{
+ return vfs_remove_acl(ovl_upper_mnt_userns(ofs), dentry, acl_name);
+}
+
static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
struct dentry *olddentry, struct inode *newdir,
struct dentry *newdentry, unsigned int flags)
@@ -401,7 +415,7 @@ const char *ovl_dentry_get_redirect(struct dentry *dentry);
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
void ovl_dir_modified(struct dentry *dentry, bool impurity);
-u64 ovl_dentry_version_get(struct dentry *dentry);
+u64 ovl_inode_version_get(struct inode *inode);
bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
@@ -525,7 +539,8 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry,
enum ovl_xattr ox, struct dentry *real, bool is_upper,
bool set);
-struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
+struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index,
+ bool connected);
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct qstr *name);
@@ -570,9 +585,9 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs);
* lower dir was removed under it and possibly before it was rotated from upper
* to lower layer.
*/
-static inline bool ovl_dir_is_real(struct dentry *dir)
+static inline bool ovl_dir_is_real(struct inode *dir)
{
- return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
+ return !ovl_test_flag(OVL_WHITEOUTS, dir);
}
/* inode.c */
@@ -594,9 +609,33 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
#ifdef CONFIG_FS_POSIX_ACL
-struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *do_ovl_get_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
+ bool rcu, bool noperm);
+static inline struct posix_acl *ovl_get_inode_acl(struct inode *inode, int type,
+ bool rcu)
+{
+ return do_ovl_get_acl(&init_user_ns, inode, type, rcu, true);
+}
+static inline struct posix_acl *ovl_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ return do_ovl_get_acl(mnt_userns, d_inode(dentry), type, false, false);
+}
+int ovl_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct posix_acl *acl, int type);
+struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name, bool noperm);
#else
-#define ovl_get_acl NULL
+#define ovl_get_inode_acl NULL
+#define ovl_get_acl NULL
+#define ovl_set_acl NULL
+static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
+ const char *acl_name,
+ bool noperm)
+{
+ return NULL;
+}
#endif
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 2b210640036c..8cd2b9947de1 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -235,15 +235,15 @@ void ovl_dir_cache_free(struct inode *inode)
}
}
-static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
+static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
- if (ovl_dir_cache(d_inode(dentry)) == cache)
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ if (ovl_dir_cache(inode) == cache)
+ ovl_set_dir_cache(inode, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
@@ -323,15 +323,15 @@ static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
- struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = file_inode(file);
bool is_real;
- if (cache && ovl_dentry_version_get(dentry) != cache->version) {
- ovl_cache_put(od, dentry);
+ if (cache && ovl_inode_version_get(inode) != cache->version) {
+ ovl_cache_put(od, inode);
od->cache = NULL;
od->cursor = NULL;
}
- is_real = ovl_dir_is_real(dentry);
+ is_real = ovl_dir_is_real(inode);
if (od->is_real != is_real) {
/* is_real can only become false when dir is copied up */
if (WARN_ON(is_real))
@@ -394,9 +394,10 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
+ struct inode *inode = d_inode(dentry);
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version) {
WARN_ON(!cache->refcount);
cache->refcount++;
return cache;
@@ -418,8 +419,8 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
return ERR_PTR(res);
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -596,16 +597,17 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
{
int res;
struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_dir_cache *cache;
- cache = ovl_dir_cache(d_inode(dentry));
- if (cache && ovl_dentry_version_get(dentry) == cache->version)
+ cache = ovl_dir_cache(inode);
+ if (cache && ovl_inode_version_get(inode) == cache->version)
return cache;
/* Impure cache is not refcounted, free it here */
- ovl_dir_cache_free(d_inode(dentry));
- ovl_set_dir_cache(d_inode(dentry), NULL);
+ ovl_dir_cache_free(inode);
+ ovl_set_dir_cache(inode, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
@@ -627,13 +629,13 @@ static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path)
OVL_XATTR_IMPURE);
ovl_drop_write(dentry);
}
- ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
+ ovl_clear_flag(OVL_IMPURE, inode);
kfree(cache);
return NULL;
}
- cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(d_inode(dentry), cache);
+ cache->version = ovl_inode_version_get(inode);
+ ovl_set_dir_cache(inode, cache);
return cache;
}
@@ -675,7 +677,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
static bool ovl_is_impure_dir(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
- struct inode *dir = d_inode(file->f_path.dentry);
+ struct inode *dir = file_inode(file);
/*
* Only upper dir can be impure, but if we are in the middle of
@@ -893,7 +895,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
struct file *realfile;
int err;
- err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb));
+ err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb));
if (err <= 0)
return err;
@@ -913,7 +915,7 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
if (od->cache) {
inode_lock(inode);
- ovl_cache_put(od, file->f_path.dentry);
+ ovl_cache_put(od, inode);
inode_unlock(inode);
}
fput(od->realfile);
@@ -942,7 +944,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file)
return PTR_ERR(realfile);
}
od->realfile = realfile;
- od->is_real = ovl_dir_is_real(file->f_path.dentry);
+ od->is_real = ovl_dir_is_real(inode);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
@@ -1071,14 +1073,10 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
int err;
struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
bool incompat = false;
@@ -1159,14 +1157,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
- struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
- .ctx.actor = ovl_fill_merge,
- .dentry = NULL,
+ .ctx.actor = ovl_fill_plain,
.list = &list,
- .root = &root,
- .is_lowest = false,
};
err = ovl_dir_read(&path, &rdd);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a29a8afe9b26..85b891152a2c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -139,11 +139,16 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
struct ovl_entry *oe = dentry->d_fsdata;
+ struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
int ret = 1;
- upper = ovl_dentry_upper(dentry);
+ /* Careful in RCU mode */
+ if (!inode)
+ return -ECHILD;
+
+ upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -813,13 +818,11 @@ retry:
* allowed as upper are limited to "normal" ones, where checking
* for the above two errors is sufficient.
*/
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_DEFAULT);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_DEFAULT);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
- err = ovl_do_removexattr(ofs, work,
- XATTR_NAME_POSIX_ACL_ACCESS);
+ err = ovl_do_remove_acl(ofs, work, XATTR_NAME_POSIX_ACL_ACCESS);
if (err && err != -ENODATA && err != -EOPNOTSUPP)
goto out_dput;
@@ -1001,83 +1004,6 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
-static int __maybe_unused
-ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *buffer, size_t size)
-{
- return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
-}
-
-static int __maybe_unused
-ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
-{
- struct dentry *workdir = ovl_workdir(dentry);
- struct inode *realinode = ovl_inode_real(inode);
- struct posix_acl *acl = NULL;
- int err;
-
- /* Check that everything is OK before copy-up */
- if (value) {
- /* The above comment can be understood in two ways:
- *
- * 1. We just want to check whether the basic POSIX ACL format
- * is ok. For example, if the header is correct and the size
- * is sane.
- * 2. We want to know whether the ACL_{GROUP,USER} entries can
- * be mapped according to the underlying filesystem.
- *
- * Currently, we only check 1. If we wanted to check 2. we
- * would need to pass the mnt_userns and the fs_userns of the
- * underlying filesystem. But frankly, I think checking 1. is
- * enough to start the copy-up.
- */
- acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- err = -EOPNOTSUPP;
- if (!IS_POSIXACL(d_inode(workdir)))
- goto out_acl_release;
- if (!realinode->i_op->set_acl)
- goto out_acl_release;
- if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
- err = acl ? -EACCES : 0;
- goto out_acl_release;
- }
- err = -EPERM;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- goto out_acl_release;
-
- posix_acl_release(acl);
-
- /*
- * Check if sgid bit needs to be cleared (actual setacl operation will
- * be done with mounter's capabilities and so that won't do it for us).
- */
- if (unlikely(inode->i_mode & S_ISGID) &&
- handler->flags == ACL_TYPE_ACCESS &&
- !in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) {
- struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };
-
- err = ovl_setattr(&init_user_ns, dentry, &iattr);
- if (err)
- return err;
- }
-
- err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
- return err;
-
-out_acl_release:
- posix_acl_release(acl);
- return err;
-}
-
static int ovl_own_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -1110,22 +1036,6 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler,
return ovl_xattr_set(dentry, inode, name, value, size, flags);
}
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
-static const struct xattr_handler __maybe_unused
-ovl_posix_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = ovl_posix_acl_xattr_get,
- .set = ovl_posix_acl_xattr_set,
-};
-
static const struct xattr_handler ovl_own_trusted_xattr_handler = {
.prefix = OVL_XATTR_TRUSTED_PREFIX,
.get = ovl_own_xattr_get,
@@ -1146,8 +1056,8 @@ static const struct xattr_handler ovl_other_xattr_handler = {
static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_trusted_xattr_handler,
&ovl_other_xattr_handler,
@@ -1156,8 +1066,8 @@ static const struct xattr_handler *ovl_trusted_xattr_handlers[] = {
static const struct xattr_handler *ovl_user_xattr_handlers[] = {
#ifdef CONFIG_FS_POSIX_ACL
- &ovl_posix_acl_access_xattr_handler,
- &ovl_posix_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&ovl_own_user_xattr_handler,
&ovl_other_xattr_handler,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 81a57a8d80d9..bde291623c8c 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -463,7 +463,7 @@ static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
* which have been copied up and have origins), so only need to note
* changes to impure entries.
*/
- if (!ovl_dir_is_real(dentry) || impurity)
+ if (!ovl_dir_is_real(inode) || impurity)
OVL_I(inode)->version++;
}
@@ -475,10 +475,8 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity)
ovl_dir_version_inc(dentry, impurity);
}
-u64 ovl_dentry_version_get(struct dentry *dentry)
+u64 ovl_inode_version_get(struct inode *inode)
{
- struct inode *inode = d_inode(dentry);
-
WARN_ON(!inode_is_locked(inode));
return OVL_I(inode)->version;
}
@@ -1104,13 +1102,18 @@ void ovl_copyattr(struct inode *inode)
struct path realpath;
struct inode *realinode;
struct user_namespace *real_mnt_userns;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
ovl_i_path_real(inode, &realpath);
realinode = d_inode(realpath.dentry);
real_mnt_userns = mnt_user_ns(realpath.mnt);
- inode->i_uid = i_uid_into_mnt(real_mnt_userns, realinode);
- inode->i_gid = i_gid_into_mnt(real_mnt_userns, realinode);
+ vfsuid = i_uid_into_vfsuid(real_mnt_userns, realinode);
+ vfsgid = i_gid_into_vfsgid(real_mnt_userns, realinode);
+
+ inode->i_uid = vfsuid_into_kuid(vfsuid);
+ inode->i_gid = vfsgid_into_kgid(vfsgid);
inode->i_mode = realinode->i_mode;
inode->i_atime = realinode->i_atime;
inode->i_mtime = realinode->i_mtime;
diff --git a/fs/pnode.c b/fs/pnode.c
index 1106137c747a..468e4e65a615 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -244,7 +244,7 @@ static int propagate_one(struct mount *m)
}
do {
struct mount *parent = last_source->mnt_parent;
- if (last_source == first_source)
+ if (peers(last_source, first_source))
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 74dc0f571dc9..d7bc81fc0840 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -25,6 +25,11 @@
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
#include <linux/iversion.h>
+#include <linux/security.h>
+#include <linux/evm.h>
+#include <linux/fsnotify.h>
+
+#include "internal.h"
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
@@ -64,7 +69,7 @@ struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
if (acl == ACL_DONT_CACHE) {
struct posix_acl *ret;
- ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU);
+ ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);
if (!IS_ERR(ret))
acl = ret;
}
@@ -106,15 +111,17 @@ void forget_all_cached_acls(struct inode *inode)
}
EXPORT_SYMBOL(forget_all_cached_acls);
-struct posix_acl *get_acl(struct inode *inode, int type)
+static struct posix_acl *__get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct inode *inode,
+ int type)
{
- void *sentinel;
+ struct posix_acl *sentinel;
struct posix_acl **p;
struct posix_acl *acl;
/*
* The sentinel is used to detect when another operation like
- * set_cached_acl() or forget_cached_acl() races with get_acl().
+ * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
* It is guaranteed that is_uncached_acl(sentinel) is true.
*/
@@ -133,25 +140,27 @@ struct posix_acl *get_acl(struct inode *inode, int type)
* current value of the ACL will not be ACL_NOT_CACHED and so our own
* sentinel will not be set; another task will update the cache. We
* could wait for that other task to complete its job, but it's easier
- * to just call ->get_acl to fetch the ACL ourself. (This is going to
- * be an unlikely race.)
+ * to just call ->get_inode_acl to fetch the ACL ourself. (This is
+ * going to be an unlikely race.)
*/
cmpxchg(p, ACL_NOT_CACHED, sentinel);
/*
- * Normally, the ACL returned by ->get_acl will be cached.
+ * Normally, the ACL returned by ->get{_inode}_acl will be cached.
* A filesystem can prevent that by calling
- * forget_cached_acl(inode, type) in ->get_acl.
+ * forget_cached_acl(inode, type) in ->get{_inode}_acl.
*
- * If the filesystem doesn't have a get_acl() function at all, we'll
- * just create the negative cache entry.
+ * If the filesystem doesn't have a get{_inode}_acl() function at all,
+ * we'll just create the negative cache entry.
*/
- if (!inode->i_op->get_acl) {
+ if (dentry && inode->i_op->get_acl) {
+ acl = inode->i_op->get_acl(mnt_userns, dentry, type);
+ } else if (inode->i_op->get_inode_acl) {
+ acl = inode->i_op->get_inode_acl(inode, type, false);
+ } else {
set_cached_acl(inode, type, NULL);
return NULL;
}
- acl = inode->i_op->get_acl(inode, type, false);
-
if (IS_ERR(acl)) {
/*
* Remove our sentinel so that we don't block future attempts
@@ -169,7 +178,12 @@ struct posix_acl *get_acl(struct inode *inode, int type)
posix_acl_release(acl);
return acl;
}
-EXPORT_SYMBOL(get_acl);
+
+struct posix_acl *get_inode_acl(struct inode *inode, int type)
+{
+ return __get_acl(&init_user_ns, NULL, inode, type);
+}
+EXPORT_SYMBOL(get_inode_acl);
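
The sentinel dance in __get_acl() is easiest to see in isolation. Below is a hedged standalone model using C11 atomics in place of cmpxchg(); it ignores reference counting, error paths, and the foreign-sentinel check the kernel does via is_uncached_acl(), and fetch_slow() merely stands in for the ->get_inode_acl() call:

/* Model: install a unique sentinel with compare-and-swap, fetch the
 * value, then publish it only if our sentinel is still in place, i.e.
 * no set_cached_acl()/forget_cached_acl() raced with us.
 */
#include <stdatomic.h>

#define NOT_CACHED ((void *)-1)

static void *fetch_slow(void)
{
	static int acl_object;		/* stands in for the real fetch */
	return &acl_object;
}

void *cached_get(_Atomic(void *) *slot)
{
	void *sentinel = &sentinel;	/* unique address per caller */
	void *cur = atomic_load(slot);

	if (cur != NOT_CACHED)
		return cur;		/* cache hit */

	/* Claim the slot; losing this race is fine, we fetch anyway. */
	atomic_compare_exchange_strong(slot, &cur, sentinel);

	void *val = fetch_slow();

	/*
	 * Publish only if our sentinel survived; if another update
	 * raced with us, leave its value alone.
	 */
	cur = sentinel;
	atomic_compare_exchange_strong(slot, &cur, val);
	return val;
}
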
/*
* Init a fresh posix_acl
@@ -578,19 +592,20 @@ EXPORT_SYMBOL(__posix_acl_chmod);
* posix_acl_chmod - chmod a posix acl
*
* @mnt_userns: user namespace of the mount @inode was found from
- * @inode: inode to check permissions on
+ * @dentry: dentry to check permissions on
* @mode: the new mode of @inode
*
- * If the inode has been found through an idmapped mount the user namespace of
+ * If the dentry has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
* take care to map the inode according to @mnt_userns before checking
* permissions. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode, simply pass init_user_ns.
*/
int
- posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode,
+ posix_acl_chmod(struct user_namespace *mnt_userns, struct dentry *dentry,
umode_t mode)
{
+ struct inode *inode = d_inode(dentry);
struct posix_acl *acl;
int ret = 0;
@@ -599,7 +614,7 @@ int
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- acl = get_acl(inode, ACL_TYPE_ACCESS);
+ acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR_OR_NULL(acl)) {
if (acl == ERR_PTR(-EOPNOTSUPP))
return 0;
@@ -609,7 +624,7 @@ int
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
if (ret)
return ret;
- ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS);
+ ret = inode->i_op->set_acl(mnt_userns, dentry, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
return ret;
}
@@ -629,7 +644,7 @@ posix_acl_create(struct inode *dir, umode_t *mode,
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
return 0;
- p = get_acl(dir, ACL_TYPE_DEFAULT);
+ p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
*mode &= ~current_umask();
return 0;
@@ -732,118 +747,32 @@ static int posix_acl_fix_xattr_common(const void *value, size_t size)
return count;
}
-void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
- const struct inode *inode,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- struct user_namespace *fs_userns = i_user_ns(inode);
- int count;
- vfsuid_t vfsuid;
- vfsgid_t vfsgid;
- kuid_t uid;
- kgid_t gid;
-
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- return;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count <= 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch (le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsuid = make_vfsuid(mnt_userns, fs_userns, uid);
- entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
- vfsuid_into_kuid(vfsuid)));
- break;
- case ACL_GROUP:
- gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
- vfsgid = make_vfsgid(mnt_userns, fs_userns, gid);
- entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
- vfsgid_into_kgid(vfsgid)));
- break;
- default:
- break;
- }
- }
-}
-
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- void *value, size_t size)
-{
- struct posix_acl_xattr_header *header = value;
- struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
- int count;
- kuid_t uid;
- kgid_t gid;
-
- count = posix_acl_fix_xattr_common(value, size);
- if (count <= 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch(le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kuid(to, uid));
- break;
- case ACL_GROUP:
- gid = make_kgid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kgid(to, gid));
- break;
- default:
- break;
- }
- }
-}
-
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
-}
-
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
-}
-
/**
- * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the
- * provided callbacks to map ACL_{GROUP,USER} entries into the
- * appropriate format
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @userns: the filesystem's idmapping
* @value: the uapi representation of POSIX ACLs
* @size: the size of @value
- * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries
- * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries
*
- * The make_posix_acl() helper is an abstraction to translate from uapi format
- * into the VFS format allowing the caller to specific callbacks to map
- * ACL_{GROUP,USER} entries into the expected format. This is used in
- * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code
- * duplication.
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the acl inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into idmapping in @userns.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did, calling it from the get acl inode operation would return POSIX
+ * ACLs mapped according to an idmapped mount which would mean that the value
+ * couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS -
+ * userspace boundary before reporting them to the user.
*
* Return: Allocated struct posix_acl on success, NULL for a valid header but
* without actual POSIX ACL entries, or ERR_PTR() encoded error code.
*/
-static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns, const void *value, size_t size,
- kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *,
- const struct posix_acl_xattr_entry *),
- kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *,
- const struct posix_acl_xattr_entry *))
+struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
+ const void *value, size_t size)
{
const struct posix_acl_xattr_header *header = value;
const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -874,12 +803,14 @@ static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
break;
case ACL_USER:
- acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry);
+ acl_e->e_uid = make_kuid(userns,
+ le32_to_cpu(entry->e_id));
if (!uid_valid(acl_e->e_uid))
goto fail;
break;
case ACL_GROUP:
- acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry);
+ acl_e->e_gid = make_kgid(userns,
+ le32_to_cpu(entry->e_id));
if (!gid_valid(acl_e->e_gid))
goto fail;
break;
@@ -894,181 +825,6 @@ fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
-
-/**
- * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and
- * filesystem idmapping
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_USER entry in POSIX ACL uapi format
- *
- * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id
- * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through
- * KUIDT_INIT() and then map it according to the idmapped mount. The resulting
- * kuid_t is the value which the filesystem can map up into a raw backing store
- * id in the filesystem's idmapping.
- *
- * This is used in vfs_set_acl_prepare() to generate the proper VFS
- * representation of POSIX ACLs with ACL_USER entries during setxattr().
- *
- * Return: A kuid in @fs_userns for the uid stored in @e.
- */
-static inline kuid_t
-vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id));
- return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
-}
-
-/**
- * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and
- * filesystem idmapping
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_GROUP entry in POSIX ACL uapi format
- *
- * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id
- * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through
- * KGIDT_INIT() and then map it according to the idmapped mount. The resulting
- * kgid_t is the value which the filesystem can map up into a raw backing store
- * id in the filesystem's idmapping.
- *
- * This is used in vfs_set_acl_prepare() to generate the proper VFS
- * representation of POSIX ACLs with ACL_GROUP entries during setxattr().
- *
- * Return: A kgid in @fs_userns for the gid stored in @e.
- */
-static inline kgid_t
-vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id));
- return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
-}
-
-/**
- * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking
- * mount and filesystem idmappings into account
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- * @value: the uapi representation of POSIX ACLs
- * @size: the size of @void
- *
- * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be
- * mapped according to the relevant mount- and filesystem idmapping. It is
- * important that the ACL_{GROUP,USER} entries in struct posix_acl will be
- * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem
- * idmapping. This is crucial since the resulting struct posix_acl might be
- * cached filesystem wide. The vfs_set_acl_prepare() function will take care to
- * perform all necessary idmappings.
- *
- * Note, that since basically forever the {g,u}id values encoded as
- * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain
- * values that have been mapped according to the caller's idmapping. In other
- * words, POSIX ACLs passed in uapi format as @value during setxattr() contain
- * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have
- * been stored as k{g,u}id_t.
- *
- * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by
- * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t
- * through from_vfs{g,u}id() to account for any idmapped mounts. The
- * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the
- * correct k{g,u}id_t.
- *
- * The filesystem will then receive the POSIX ACLs ready to be cached
- * filesystem wide and ready to be written to the backing store taking the
- * filesystem's idmapping into account.
- *
- * Return: Allocated struct posix_acl on success, NULL for a valid header but
- * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
- */
-struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const void *value, size_t size)
-{
- return make_posix_acl(mnt_userns, fs_userns, value, size,
- vfs_set_acl_prepare_kuid,
- vfs_set_acl_prepare_kgid);
-}
-EXPORT_SYMBOL(vfs_set_acl_prepare);
-
-/**
- * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping
- * @mnt_userns: unused
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_USER entry in POSIX ACL uapi format
- *
- * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping.
- * This is used in posix_acl_from_xattr() to generate the proper VFS
- * representation of POSIX ACLs with ACL_USER entries.
- *
- * Return: A kuid in @fs_userns for the uid stored in @e.
- */
-static inline kuid_t
-posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- return make_kuid(fs_userns, le32_to_cpu(e->e_id));
-}
-
-/**
- * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping
- * @mnt_userns: unused
- * @fs_userns: the filesystem's idmapping
- * @e: a ACL_GROUP entry in POSIX ACL uapi format
- *
- * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping.
- * This is used in posix_acl_from_xattr() to generate the proper VFS
- * representation of POSIX ACLs with ACL_GROUP entries.
- *
- * Return: A kgid in @fs_userns for the gid stored in @e.
- */
-static inline kgid_t
-posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- const struct posix_acl_xattr_entry *e)
-{
- return make_kgid(fs_userns, le32_to_cpu(e->e_id));
-}
-
-/**
- * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
- * @fs_userns: the filesystem's idmapping
- * @value: the uapi representation of POSIX ACLs
- * @size: the size of @void
- *
- * Filesystems that store POSIX ACLs in the unaltered uapi format should use
- * posix_acl_from_xattr() when reading them from the backing store and
- * converting them into the struct posix_acl VFS format. The helper is
- * specifically intended to be called from the ->get_acl() inode operation.
- *
- * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
- * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The
- * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the
- * correct k{g,u}id_t. The returned struct posix_acl can be cached.
- *
- * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
- * If it did it calling is from the ->get_acl() inode operation would return
- * POSIX ACLs mapped according to an idmapped mount which would mean that the
- * value couldn't be cached for the filesystem. Idmapped mounts are taken into
- * account on the fly during permission checking or right at the VFS -
- * userspace boundary before reporting them to the user.
- *
- * Return: Allocated struct posix_acl on success, NULL for a valid header but
- * without actual POSIX ACL entries, or ERR_PTR() encoded error code.
- */
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *fs_userns,
- const void *value, size_t size)
-{
- return make_posix_acl(&init_user_ns, fs_userns, value, size,
- posix_acl_from_xattr_kuid,
- posix_acl_from_xattr_kgid);
-}
EXPORT_SYMBOL (posix_acl_from_xattr);
/*
@@ -1113,35 +869,76 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
}
EXPORT_SYMBOL (posix_acl_to_xattr);
-static int
-posix_acl_xattr_get(const struct xattr_handler *handler,
- struct dentry *unused, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- struct posix_acl *acl;
- int error;
+/**
+ * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
+ * @idmap: idmap of the mount
+ * @inode: inode the posix acls are set on
+ * @acl: the posix acls as represented by the vfs
+ * @buffer: the buffer into which to convert @acl
+ * @size: size of @buffer
+ *
+ * This converts @acl from the VFS representation in the filesystem idmapping
+ * to the uapi form reportable to userspace. Mount and caller idmappings
+ * are handled appropriately.
+ *
+ * Return: On success, the size of the stored uapi posix acls, on error a
+ * negative errno.
+ */
+static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
+ struct inode *inode,
+ const struct posix_acl *acl, void *buffer,
+ size_t size)
- if (!IS_POSIXACL(inode))
- return -EOPNOTSUPP;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
+{
+ struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_entry *ext_entry;
+ struct user_namespace *fs_userns, *caller_userns;
+ struct user_namespace *mnt_userns;
+ ssize_t real_size, n;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
- acl = get_acl(inode, handler->flags);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
+ real_size = posix_acl_xattr_size(acl->a_count);
+ if (!buffer)
+ return real_size;
+ if (real_size > size)
+ return -ERANGE;
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
+ ext_entry = (void *)(ext_acl + 1);
+ ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
- return error;
+ fs_userns = i_user_ns(inode);
+ caller_userns = current_user_ns();
+ mnt_userns = mnt_idmap_owner(idmap);
+ for (n = 0; n < acl->a_count; n++, ext_entry++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, fs_userns, acl_e->e_uid);
+ ext_entry->e_id = cpu_to_le32(from_kuid(
+ caller_userns, vfsuid_into_kuid(vfsuid)));
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, fs_userns, acl_e->e_gid);
+ ext_entry->e_id = cpu_to_le32(from_kgid(
+ caller_userns, vfsgid_into_kgid(vfsgid)));
+ break;
+ default:
+ ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
+ }
+ return real_size;
}
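
What vfs_posix_acl_to_xattr() emits is the uapi wire format: a 4-byte little-endian version header followed by fixed-size {tag, perm, id} entries. A hedged userspace decoder for the raw xattr; struct layouts follow the uapi posix_acl_xattr definitions, and the sketch assumes a little-endian host since the fields are __le16/__le32 on the wire:

/* Sketch: dump the raw system.posix_acl_access xattr in the format
 * produced above. Error handling is minimal.
 */
#include <stdio.h>
#include <stdint.h>
#include <sys/xattr.h>

struct hdr { uint32_t a_version; };			/* __le32 on the wire */
struct ent { uint16_t e_tag, e_perm; uint32_t e_id; };	/* __le16/__le32 */

int main(int argc, char **argv)
{
	unsigned char buf[4096];

	if (argc < 2)
		return 1;
	ssize_t n = getxattr(argv[1], "system.posix_acl_access",
			     buf, sizeof(buf));
	if (n < (ssize_t)sizeof(struct hdr)) {
		perror("getxattr");
		return 1;
	}

	struct hdr *h = (struct hdr *)buf;
	struct ent *e = (struct ent *)(h + 1);
	size_t count = (n - sizeof(*h)) / sizeof(*e);

	printf("version %u, %zu entries\n", h->a_version, count);
	for (size_t i = 0; i < count; i++)
		printf("  tag 0x%02x perm 0%o id %d\n",
		       e[i].e_tag, e[i].e_perm, (int)e[i].e_id);
	return 0;
}

Owner/group/other/mask entries carry ACL_UNDEFINED_ID (-1) in e_id; only ACL_USER and ACL_GROUP entries carry the translated ids produced by the make_vfsuid()/make_vfsgid() mapping above.
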
int
-set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
+set_posix_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
int type, struct posix_acl *acl)
{
+ struct inode *inode = d_inode(dentry);
+
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
@@ -1157,40 +954,10 @@ set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
- return inode->i_op->set_acl(mnt_userns, inode, acl, type);
+ return inode->i_op->set_acl(mnt_userns, dentry, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);
-static int
-posix_acl_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *unused, struct inode *inode,
- const char *name, const void *value, size_t size,
- int flags)
-{
- struct posix_acl *acl = NULL;
- int ret;
-
- if (value) {
- /*
- * By the time we end up here the {g,u}ids stored in
- * ACL_{GROUP,USER} have already been mapped according to the
- * caller's idmapping. The vfs_set_acl_prepare() helper will
- * recover them and take idmapped mounts into account. The
- * filesystem will receive the POSIX ACLs in the correct
- * format ready to be cached or written to the backing store
- * taking the filesystem idmapping into account.
- */
- acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
- value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- ret = set_posix_acl(mnt_userns, inode, handler->flags, acl);
- posix_acl_release(acl);
- return ret;
-}
-
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
@@ -1201,8 +968,6 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
@@ -1210,15 +975,14 @@ const struct xattr_handler posix_acl_default_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
- .get = posix_acl_xattr_get,
- .set = posix_acl_xattr_set,
};
EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
-int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int simple_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error;
+ struct inode *inode = d_inode(dentry);
if (type == ACL_TYPE_ACCESS) {
error = posix_acl_update_mode(mnt_userns, inode,
@@ -1252,3 +1016,252 @@ int simple_acl_create(struct inode *dir, struct inode *inode)
posix_acl_release(acl);
return 0;
}
+
+static int vfs_set_acl_idmapped_mnt(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ struct posix_acl *acl)
+{
+ for (int n = 0; n < acl->a_count; n++) {
+ struct posix_acl_entry *acl_e = &acl->a_entries[n];
+
+ switch (acl_e->e_tag) {
+ case ACL_USER:
+ acl_e->e_uid = from_vfsuid(mnt_userns, fs_userns,
+ VFSUIDT_INIT(acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ acl_e->e_gid = from_vfsgid(mnt_userns, fs_userns,
+ VFSGIDT_INIT(acl_e->e_gid));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * vfs_set_acl - set posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to set the posix acls
+ * @acl_name: the name of the posix acl
+ * @kacl: the posix acls in the appropriate VFS format
+ *
+ * This function sets @kacl. The caller must call posix_acl_release() on @kacl
+ * afterwards.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name, struct posix_acl *kacl)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+ if (kacl) {
+ /*
+ * If we're on an idmapped mount translate from mount specific
+ * vfs{g,u}id_t into global filesystem k{g,u}id_t.
+ * Afterwards we can cache the POSIX ACLs filesystem wide and -
+ * if this is a filesystem with a backing store - ultimately
+ * translate them to backing store values.
+ */
+ error = vfs_set_acl_idmapped_mnt(mnt_userns, i_user_ns(inode), kacl);
+ if (error)
+ return error;
+ }
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us; otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_set_acl(mnt_userns, dentry, acl_name, kacl);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, kacl);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_set_acl(dentry, acl_name, kacl);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_set_acl);
+
+/**
+ * vfs_get_acl - get posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function retrieves the POSIX ACLs from the filesystem. The caller must
+ * call posix_acl_release() on the returned ACLs.
+ *
+ * Return: On success POSIX ACLs in VFS format, on error negative errno.
+ */
+struct posix_acl *vfs_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, const char *acl_name)
+{
+ struct inode *inode = d_inode(dentry);
+ struct posix_acl *acl;
+ int acl_type, error;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * The VFS has no restrictions on reading POSIX ACLs so calling
+ * something like xattr_permission() isn't needed. Only LSMs get a say.
+ */
+ error = security_inode_get_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ return ERR_PTR(error);
+
+ if (!IS_POSIXACL(inode))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (S_ISLNK(inode->i_mode))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ acl = __get_acl(mnt_userns, dentry, inode, acl_type);
+ if (IS_ERR(acl))
+ return acl;
+ if (!acl)
+ return ERR_PTR(-ENODATA);
+
+ return acl;
+}
+EXPORT_SYMBOL_GPL(vfs_get_acl);
+
+/**
+ * vfs_remove_acl - remove posix acls
+ * @mnt_userns: user namespace of the mount
+ * @dentry: the dentry based on which to retrieve the posix acls
+ * @acl_name: the name of the posix acl
+ *
+ * This function removes posix acls.
+ *
+ * Return: On success 0, on error negative errno.
+ */
+int vfs_remove_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
+ const char *acl_name)
+{
+ int acl_type;
+ int error;
+ struct inode *inode = d_inode(dentry);
+ struct inode *delegated_inode = NULL;
+
+ acl_type = posix_acl_type(acl_name);
+ if (acl_type < 0)
+ return -EINVAL;
+
+retry_deleg:
+ inode_lock(inode);
+
+ /*
+ * We only care about restrictions the inode struct itself places upon
+ * us; otherwise POSIX ACLs aren't subject to any VFS restrictions.
+ */
+ error = may_write_xattr(mnt_userns, inode);
+ if (error)
+ goto out_inode_unlock;
+
+ error = security_inode_remove_acl(mnt_userns, dentry, acl_name);
+ if (error)
+ goto out_inode_unlock;
+
+ error = try_break_deleg(inode, &delegated_inode);
+ if (error)
+ goto out_inode_unlock;
+
+ if (inode->i_opflags & IOP_XATTR)
+ error = set_posix_acl(mnt_userns, dentry, acl_type, NULL);
+ else if (unlikely(is_bad_inode(inode)))
+ error = -EIO;
+ else
+ error = -EOPNOTSUPP;
+ if (!error) {
+ fsnotify_xattr(dentry);
+ evm_inode_post_remove_acl(mnt_userns, dentry, acl_name);
+ }
+
+out_inode_unlock:
+ inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_remove_acl);
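
vfs_remove_acl() is what a default-ACL deletion from userspace ultimately reaches; libacl's wrapper keeps the sketch short (build with -lacl, directory path is an assumption):

/* Sketch: drop a directory's default ACL, which lands in
 * vfs_remove_acl() with ACL_TYPE_DEFAULT.
 */
#include <stdio.h>
#include <sys/acl.h>

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <dir>\n", argv[0]);
		return 1;
	}
	if (acl_delete_def_file(argv[1])) {
		perror("acl_delete_def_file");
		return 1;
	}
	return 0;
}
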
+
+int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, const void *kvalue, size_t size)
+{
+ int error;
+ struct posix_acl *acl = NULL;
+
+ if (size) {
+ /*
+ * Note that posix_acl_from_xattr() uses GFP_NOFS even though it
+ * probably doesn't need to here.
+ */
+ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+
+ error = vfs_set_acl(mnt_idmap_owner(idmap), dentry, acl_name, acl);
+ posix_acl_release(acl);
+ return error;
+}
+
+ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
+ const char *acl_name, void *kvalue, size_t size)
+{
+ ssize_t error;
+ struct posix_acl *acl;
+
+ acl = vfs_get_acl(mnt_idmap_owner(idmap), dentry, acl_name);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
+ acl, kvalue, size);
+ posix_acl_release(acl);
+ return error;
+}
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index fa762c5fbcb2..91fe1597af7b 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -3,6 +3,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include "internal.h"
static int cmdline_proc_show(struct seq_file *m, void *v)
{
@@ -13,7 +14,10 @@ static int cmdline_proc_show(struct seq_file *m, void *v)
static int __init proc_cmdline_init(void)
{
- proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show);
+ pde->size = saved_command_line_len + 1;
return 0;
}
fs_initcall(proc_cmdline_init);
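
With pde->size populated from saved_command_line_len, stat(2) on /proc/cmdline stops reporting 0. A quick hedged check (the size includes the trailing newline byte, hence the +1 above):

/* Sketch: compare the reported size of /proc/cmdline with what a
 * read actually returns.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	char buf[8192];

	if (stat("/proc/cmdline", &st))
		return 1;

	int fd = open("/proc/cmdline", O_RDONLY);
	if (fd < 0)
		return 1;
	ssize_t n = read(fd, buf, sizeof(buf));
	close(fd);

	printf("st_size=%lld read=%zd\n", (long long)st.st_size, n);
	return 0;
}
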
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index dfe6ce3505ce..e0758fe7936d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -33,7 +33,16 @@ static int show_console_dev(struct seq_file *m, void *v)
if (con->device) {
const struct tty_driver *driver;
int index;
+
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
+ console_lock();
driver = con->device(con, &index);
+ console_unlock();
+
if (driver) {
dev = MKDEV(driver->major, driver->minor_start);
dev += index;
@@ -63,7 +72,12 @@ static void *c_start(struct seq_file *m, loff_t *pos)
struct console *con;
loff_t off = 0;
- console_lock();
+ /*
+ * Hold the console_list_lock to guarantee safe traversal of the
+ * console list. SRCU cannot be used because there is no
+ * place to store the SRCU cookie.
+ */
+ console_list_lock();
for_each_console(con)
if (off++ == *pos)
break;
@@ -74,13 +88,14 @@ static void *c_start(struct seq_file *m, loff_t *pos)
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
struct console *con = v;
+
++*pos;
- return con->next;
+ return hlist_entry_safe(con->node.next, struct console, node);
}
static void c_stop(struct seq_file *m, void *v)
{
- console_unlock();
+ console_list_unlock();
}
static const struct seq_operations consoles_op = {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 913bef0d2a36..fc46d6fe080c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -7,6 +7,7 @@
#include <linux/namei.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
+#include <linux/bitmap.h>
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
@@ -279,6 +280,30 @@ out:
return 0;
}
+static int proc_readfd_count(struct inode *inode, loff_t *count)
+{
+ struct task_struct *p = get_proc_task(inode);
+ struct fdtable *fdt;
+
+ if (!p)
+ return -ENOENT;
+
+ task_lock(p);
+ if (p->files) {
+ rcu_read_lock();
+
+ fdt = files_fdtable(p->files);
+ *count = bitmap_weight(fdt->open_fds, fdt->max_fds);
+
+ rcu_read_unlock();
+ }
+ task_unlock(p);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
static int proc_readfd(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
@@ -319,9 +344,29 @@ int proc_fd_permission(struct user_namespace *mnt_userns,
return rv;
}
+static int proc_fd_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ int rv = 0;
+
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ /* If it's a directory, put the number of open fds there */
+ if (S_ISDIR(inode->i_mode)) {
+ rv = proc_readfd_count(inode, &stat->size);
+ if (rv < 0)
+ return rv;
+ }
+
+ return rv;
+}
+
const struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
.permission = proc_fd_permission,
+ .getattr = proc_fd_getattr,
.setattr = proc_setattr,
};
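
The user-visible effect: st_size of a task's /proc/<pid>/fd directory now reports the open descriptor count instead of 0. A hedged demo:

/* Sketch: /proc/self/fd should report the number of open descriptors,
 * the three std streams plus whatever we open here.
 */
#include <stdio.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	int fd = open("/dev/null", O_RDONLY);
	if (fd < 0)
		return 1;

	if (stat("/proc/self/fd", &st))
		return 1;

	/* Expect 4 on a kernel with this patch: 0, 1, 2 and fd. */
	printf("open fds: %lld\n", (long long)st.st_size);
	return 0;
}
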
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index dff921f7ca33..71157ee35c1a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -18,7 +18,6 @@
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
-#include <linux/notifier.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/printk.h>
@@ -541,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
fallthrough;
case KCORE_VMEMMAP:
case KCORE_TEXT:
- if (kern_addr_valid(start)) {
- /*
- * Using bounce buffer to bypass the
- * hardened user copy kernel text checks.
- */
- if (copy_from_kernel_nofault(buf, (void *)start,
- tsz)) {
- if (clear_user(buffer, tsz)) {
- ret = -EFAULT;
- goto out;
- }
- } else {
- if (copy_to_user(buffer, buf, tsz)) {
- ret = -EFAULT;
- goto out;
- }
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
+ if (clear_user(buffer, tsz)) {
+ ret = -EFAULT;
+ goto out;
}
} else {
- if (clear_user(buffer, tsz)) {
+ if (copy_to_user(buffer, buf, tsz)) {
ret = -EFAULT;
goto out;
}
@@ -638,10 +629,6 @@ static int __meminit kcore_callback(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block kcore_callback_nb __meminitdata = {
- .notifier_call = kcore_callback,
- .priority = 0,
-};
static struct kcore_list kcore_vmalloc;
@@ -694,7 +681,7 @@ static int __init proc_kcore_init(void)
add_modules_range();
/* Store direct-map area from physical memory map */
kcore_update_ram();
- register_hotmemory_notifier(&kcore_callback_nb);
+ hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI);
return 0;
}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f2273b164535..6249c347809a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -219,8 +219,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af0..e35a0398db63 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
+ struct anon_vma_name *anon_name = NULL;
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
vm_flags_t flags = vma->vm_flags;
@@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
+ if (mm)
+ anon_name = anon_vma_name(vma);
/*
* Print the dentry name for named mappings, and a
@@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
*/
if (file) {
seq_pad(m, ' ');
- seq_file_path(m, file, "\n");
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name)
+ seq_printf(m, "[anon_shmem:%s]", anon_name->name);
+ else
+ seq_file_path(m, file, "\n");
goto done;
}
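
The name comes from prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...), which also applies to MAP_SHARED|MAP_ANONYMOUS mappings, the shmem-backed case this hunk covers. A hedged sketch; the constants are copied from the uapi header in case libc lacks them, and CONFIG_ANON_VMA_NAME must be enabled:

/* Sketch: name an anonymous shared mapping and observe the
 * "[anon_shmem:demo]" tag in /proc/self/maps on a patched kernel.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA		0x53564d41
#define PR_SET_VMA_ANON_NAME	0
#endif

int main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"demo"))
		perror("prctl");

	return system("grep demo /proc/self/maps");
}
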
@@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
- struct anon_vma_name *anon_name;
-
if (!mm) {
name = "[vdso]";
goto done;
@@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name->name);
@@ -667,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_RAND_READ)] = "rr",
[ilog2(VM_DONTCOPY)] = "dc",
[ilog2(VM_DONTEXPAND)] = "de",
+ [ilog2(VM_LOCKONFAULT)] = "lf",
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index f2aa86c421f2..09a81e4b1273 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -199,7 +199,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos, false);
}
@@ -212,7 +212,7 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos,
cc_platform_has(CC_ATTR_MEM_ENCRYPT));
@@ -437,7 +437,7 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
offset = (loff_t) index << PAGE_SHIFT;
kvec.iov_base = page_address(page);
kvec.iov_len = PAGE_SIZE;
- iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, PAGE_SIZE);
rc = __read_vmcore(&iter, &offset);
if (rc < 0) {
@@ -1567,6 +1567,7 @@ static int __init vmcore_init(void)
return rc;
rc = parse_crash_elf_headers();
if (rc) {
+ elfcorehdr_free(elfcorehdr_addr);
pr_warn("Kdump: vmcore not initialized\n");
return rc;
}
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8adabde685f1..c49d554cc9ae 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -126,6 +126,7 @@ config PSTORE_CONSOLE
config PSTORE_PMSG
bool "Log user space messages"
depends on PSTORE
+ select RT_MUTEXES
help
When the option is enabled, pstore will export a character
interface /dev/pmsg0 to log user space messages. On reboot
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0c034ea39954..cbc0b468c1ab 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -89,6 +89,11 @@ static char *compress =
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
+/* How much of the kernel log to snapshot */
+unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, ulong, 0444);
+MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
+
/* Compression parameters */
static struct crypto_comp *tfm;
@@ -100,9 +105,6 @@ struct pstore_zbackend {
static char *big_oops_buf;
static size_t big_oops_buf_sz;
-/* How much of the console log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-
void pstore_set_kmsg_bytes(int bytes)
{
kmsg_bytes = bytes;
@@ -391,6 +393,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
const char *why;
unsigned int part = 1;
unsigned long flags = 0;
+ int saved_ret = 0;
int ret;
why = kmsg_dump_reason_str(reason);
@@ -461,12 +464,21 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (ret == 0 && reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
pstore_timer_kick();
+ } else {
+ /* Preserve only the first non-zero returned value. */
+ if (!saved_ret)
+ saved_ret = ret;
}
total += record.size;
part++;
}
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+ if (saved_ret) {
+ pr_err_once("backend (%s) writing error (%d)\n", psinfo->name,
+ saved_ret);
+ }
}
static struct kmsg_dumper pstore_dumper = {
@@ -562,8 +574,9 @@ out:
int pstore_register(struct pstore_info *psi)
{
if (backend && strcmp(backend, psi->name)) {
- pr_warn("ignoring unexpected backend '%s'\n", psi->name);
- return -EPERM;
+ pr_warn("backend '%s' already in use: ignoring '%s'\n",
+ backend, psi->name);
+ return -EBUSY;
}
/* Sanity check flags. */
@@ -662,6 +675,8 @@ void pstore_unregister(struct pstore_info *psi)
psinfo = NULL;
kfree(backend);
backend = NULL;
+
+ pr_info("Unregistered %s as persistent store backend\n", psi->name);
mutex_unlock(&psinfo_lock);
}
EXPORT_SYMBOL_GPL(pstore_unregister);
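
pstore_register() now rejects a second backend with -EBUSY instead of -EPERM and names the backend already bound, while pstore_unregister() logs the teardown. An illustrative registration call site (my_psinfo and its init function are hypothetical; most struct pstore_info fields are omitted for brevity):

	static struct pstore_info my_psinfo = {
		.owner	= THIS_MODULE,
		.name	= "mybackend",
		.flags	= PSTORE_FLAGS_DMESG,
		/* .read/.write/.buf/.bufsize etc. omitted for brevity */
	};

	static int __init my_backend_init(void)
	{
		int err = pstore_register(&my_psinfo);

		if (err == -EBUSY)	/* was -EPERM before this change */
			pr_info("another pstore backend is already registered\n");
		return err;
	}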
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index d8542ec2f38c..ab82e5f05346 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -7,9 +7,10 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
+#include <linux/rtmutex.h>
#include "internal.h"
-static DEFINE_MUTEX(pmsg_lock);
+static DEFINE_RT_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
@@ -28,9 +29,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (!access_ok(buf, count))
return -EFAULT;
- mutex_lock(&pmsg_lock);
+ rt_mutex_lock(&pmsg_lock);
ret = psinfo->write_user(&record, buf);
- mutex_unlock(&pmsg_lock);
+ rt_mutex_unlock(&pmsg_lock);
return ret ? ret : count;
}
@@ -46,7 +47,7 @@ static int pmsg_major;
#undef pr_fmt
#define pr_fmt(fmt) PMSG_NAME ": " fmt
-static char *pmsg_devnode(struct device *dev, umode_t *mode)
+static char *pmsg_devnode(const struct device *dev, umode_t *mode)
{
if (mode)
*mode = 0220;
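
The pmsg path above switches its lock to an rt_mutex so that a blocked writer lends its priority to the current lock owner (relevant under PREEMPT_RT). The usage pattern is otherwise unchanged; a minimal sketch with a hypothetical lock name:

	#include <linux/rtmutex.h>

	static DEFINE_RT_MUTEX(my_lock);

	static void my_critical_section(void)
	{
		rt_mutex_lock(&my_lock);	/* may sleep; waiters boost the owner */
		/* ... work serialized against other writers ... */
		rt_mutex_unlock(&my_lock);
	}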
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fefe3d391d3a..ade66dbe5f39 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -18,10 +18,11 @@
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/compiler.h>
-#include <linux/pstore_ram.h>
#include <linux/of.h>
#include <linux/of_address.h>
+
#include "internal.h"
+#include "ram_internal.h"
#define RAMOOPS_KERNMSG_HDR "===="
#define MIN_MEM_SIZE 4096UL
@@ -451,20 +452,28 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
+ /* Free pmsg PRZ */
+ persistent_ram_free(&cxt->mprz);
+
+ /* Free console PRZ */
+ persistent_ram_free(&cxt->cprz);
+
/* Free dump PRZs */
if (cxt->dprzs) {
for (i = 0; i < cxt->max_dump_cnt; i++)
- persistent_ram_free(cxt->dprzs[i]);
+ persistent_ram_free(&cxt->dprzs[i]);
kfree(cxt->dprzs);
+ cxt->dprzs = NULL;
cxt->max_dump_cnt = 0;
}
/* Free ftrace PRZs */
if (cxt->fprzs) {
for (i = 0; i < cxt->max_ftrace_cnt; i++)
- persistent_ram_free(cxt->fprzs[i]);
+ persistent_ram_free(&cxt->fprzs[i]);
kfree(cxt->fprzs);
+ cxt->fprzs = NULL;
cxt->max_ftrace_cnt = 0;
}
}
@@ -548,9 +557,10 @@ static int ramoops_init_przs(const char *name,
while (i > 0) {
i--;
- persistent_ram_free(prz_ar[i]);
+ persistent_ram_free(&prz_ar[i]);
}
kfree(prz_ar);
+ prz_ar = NULL;
goto fail;
}
*paddr += zone_sz;
@@ -670,7 +680,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
field = value; \
}
- parse_u32("mem-type", pdata->record_size, pdata->mem_type);
+ parse_u32("mem-type", pdata->mem_type, pdata->mem_type);
parse_u32("record-size", pdata->record_size, 0);
parse_u32("console-size", pdata->console_size, 0);
parse_u32("ftrace-size", pdata->ftrace_size, 0);
@@ -735,6 +745,7 @@ static int ramoops_probe(struct platform_device *pdev)
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -742,6 +753,7 @@ static int ramoops_probe(struct platform_device *pdev)
!pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -772,12 +784,17 @@ static int ramoops_probe(struct platform_device *pdev)
dump_mem_sz, cxt->record_size,
&cxt->max_dump_cnt, 0, 0);
if (err)
- goto fail_out;
+ goto fail_init;
err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr,
cxt->console_size, 0);
if (err)
- goto fail_init_cprz;
+ goto fail_init;
+
+ err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
+ cxt->pmsg_size, 0);
+ if (err)
+ goto fail_init;
cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? nr_cpu_ids
@@ -788,12 +805,7 @@ static int ramoops_probe(struct platform_device *pdev)
(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)
? PRZ_FLAG_NO_LOCK : 0);
if (err)
- goto fail_init_fprz;
-
- err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr,
- cxt->pmsg_size, 0);
- if (err)
- goto fail_init_mprz;
+ goto fail_init;
cxt->pstore.data = cxt;
/*
@@ -857,11 +869,7 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
-fail_init_mprz:
-fail_init_fprz:
- persistent_ram_free(cxt->cprz);
-fail_init_cprz:
+fail_init:
ramoops_free_przs(cxt);
fail_out:
return err;
@@ -876,8 +884,6 @@ static int ramoops_remove(struct platform_device *pdev)
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
- persistent_ram_free(cxt->mprz);
- persistent_ram_free(cxt->cprz);
ramoops_free_przs(cxt);
return 0;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index a89e33719fcf..966191d3a5ba 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -13,13 +13,14 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/memblock.h>
-#include <linux/pstore_ram.h>
#include <linux/rslib.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <asm/page.h>
+#include "ram_internal.h"
+
/**
* struct persistent_ram_buffer - persistent circular RAM buffer
*
@@ -439,7 +440,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
phys_addr_t addr = page_start + i * PAGE_SIZE;
pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
}
- vaddr = vmap(pages, page_count, VM_MAP, prot);
+ /*
+ * VM_IOREMAP used here to bypass this region during vread()
+ * and kmap_atomic() (i.e. kcore) to avoid __va() failures.
+ */
+ vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
kfree(pages);
/*
@@ -543,8 +548,14 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
return 0;
}
-void persistent_ram_free(struct persistent_ram_zone *prz)
+void persistent_ram_free(struct persistent_ram_zone **_prz)
{
+ struct persistent_ram_zone *prz;
+
+ if (!_prz)
+ return;
+
+ prz = *_prz;
if (!prz)
return;
@@ -568,6 +579,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
persistent_ram_free_old(prz);
kfree(prz->label);
kfree(prz);
+ *_prz = NULL;
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
@@ -604,6 +616,6 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
return prz;
err:
- persistent_ram_free(prz);
+ persistent_ram_free(&prz);
return ERR_PTR(ret);
}
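
persistent_ram_free() now takes a pointer-to-pointer so it can clear the caller's handle, which is what lets ramoops_free_przs() and the consolidated error path call it unconditionally without double-freeing. The idiom in isolation (my_obj/my_free are hypothetical):

	#include <linux/slab.h>

	struct my_obj { int x; };

	static void my_free(struct my_obj **objp)
	{
		if (!objp || !*objp)
			return;
		kfree(*objp);
		*objp = NULL;	/* a repeated call is now a harmless no-op */
	}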
diff --git a/fs/pstore/ram_internal.h b/fs/pstore/ram_internal.h
new file mode 100644
index 000000000000..5f694698351f
--- /dev/null
+++ b/fs/pstore/ram_internal.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2010 Marco Stornelli <marco.stornelli@gmail.com>
+ * Copyright (C) 2011 Kees Cook <keescook@chromium.org>
+ * Copyright (C) 2011 Google, Inc.
+ */
+
+#include <linux/pstore_ram.h>
+
+/*
+ * Choose whether access to the RAM zone requires locking or not. If a zone
+ * can be written to from different CPUs like with ftrace for example, then
+ * PRZ_FLAG_NO_LOCK is used. For all other cases, locking is required.
+ */
+#define PRZ_FLAG_NO_LOCK BIT(0)
+/*
+ * If a PRZ should only have a single-boot lifetime, this marks it as
+ * getting wiped after its contents get copied out after boot.
+ */
+#define PRZ_FLAG_ZAP_OLD BIT(1)
+
+/**
+ * struct persistent_ram_zone - Details of a persistent RAM zone (PRZ)
+ * used as a pstore backend
+ *
+ * @paddr: physical address of the mapped RAM area
+ * @size: size of mapping
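+ * @vaddr: virtual address of the mapping of @paddr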
+ * @label: unique name of this PRZ
+ * @type: frontend type for this PRZ
+ * @flags: holds PRZ_FLAG_* bits
+ *
+ * @buffer_lock:
+ * locks access to @buffer "size" bytes and "start" offset
+ * @buffer:
+ * pointer to actual RAM area managed by this PRZ
+ * @buffer_size:
+ * bytes in @buffer->data (not including any trailing ECC bytes)
+ *
+ * @par_buffer:
+ * pointer into @buffer->data containing ECC bytes for @buffer->data
+ * @par_header:
+ * pointer into @buffer->data containing ECC bytes for @buffer header
+ * (i.e. all fields up to @data)
+ * @rs_decoder:
+ * RSLIB instance for doing ECC calculations
+ * @corrected_bytes:
+ * ECC corrected bytes accounting since boot
+ * @bad_blocks:
+ * ECC uncorrectable bytes accounting since boot
+ * @ecc_info:
+ * ECC configuration details
+ *
+ * @old_log:
+ * saved copy of @buffer->data prior to most recent wipe
+ * @old_log_size:
+ * bytes contained in @old_log
+ *
+ */
+struct persistent_ram_zone {
+ phys_addr_t paddr;
+ size_t size;
+ void *vaddr;
+ char *label;
+ enum pstore_type_id type;
+ u32 flags;
+
+ raw_spinlock_t buffer_lock;
+ struct persistent_ram_buffer *buffer;
+ size_t buffer_size;
+
+ char *par_buffer;
+ char *par_header;
+ struct rs_control *rs_decoder;
+ int corrected_bytes;
+ int bad_blocks;
+ struct persistent_ram_ecc_info ecc_info;
+
+ char *old_log;
+ size_t old_log_size;
+};
+
+struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
+ u32 sig, struct persistent_ram_ecc_info *ecc_info,
+ unsigned int memtype, u32 flags, char *label);
+void persistent_ram_free(struct persistent_ram_zone **_prz);
+void persistent_ram_zap(struct persistent_ram_zone *prz);
+
+int persistent_ram_write(struct persistent_ram_zone *prz, const void *s,
+ unsigned int count);
+int persistent_ram_write_user(struct persistent_ram_zone *prz,
+ const void __user *s, unsigned int count);
+
+void persistent_ram_save_old(struct persistent_ram_zone *prz);
+size_t persistent_ram_old_size(struct persistent_ram_zone *prz);
+void *persistent_ram_old(struct persistent_ram_zone *prz);
+void persistent_ram_free_old(struct persistent_ram_zone *prz);
+ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
+ char *str, size_t len);
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 017d0d4ad329..2770746bb7aa 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -761,7 +761,7 @@ static inline int notrace psz_kmsg_write_record(struct psz_context *cxt,
/* avoid destroying old data, allocate a new one */
len = zone->buffer_size + sizeof(*zone->buffer);
zone->oldbuf = zone->buffer;
- zone->buffer = kzalloc(len, GFP_KERNEL);
+ zone->buffer = kzalloc(len, GFP_ATOMIC);
if (!zone->buffer) {
zone->buffer = zone->oldbuf;
return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0427b44bfee5..f27faf5db554 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2324,6 +2324,8 @@ static int vfs_setup_quota_inode(struct inode *inode, int type)
struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
+ if (is_bad_inode(inode))
+ return -EUCLEAN;
if (!S_ISREG(inode->i_mode))
return -EACCES;
if (IS_RDONLY(inode))
diff --git a/fs/read_write.c b/fs/read_write.c
index 24b9668d6377..7a2ff6157eda 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -384,7 +384,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, READ, buf, len);
+ iov_iter_ubuf(&iter, ITER_DEST, buf, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -424,7 +424,7 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos ? *pos : 0;
- iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+ iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
ret = file->f_op->read_iter(&kiocb, &iter);
if (ret > 0) {
if (pos)
@@ -486,7 +486,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
- iov_iter_ubuf(&iter, WRITE, (void __user *)buf, len);
+ iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
ret = call_write_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
@@ -533,7 +533,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
.iov_len = min_t(size_t, count, MAX_RW_COUNT),
};
struct iov_iter iter;
- iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+ iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
return __kernel_write_iter(file, &iter, pos);
}
/*
@@ -911,7 +911,7 @@ static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
ret = do_iter_read(file, &iter, pos, flags);
kfree(iov);
@@ -928,7 +928,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
struct iov_iter iter;
ssize_t ret;
- ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
if (ret >= 0) {
file_start_write(file);
ret = do_iter_write(file, &iter, pos, flags);
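
These are mechanical conversions from the old READ/WRITE constants to the dedicated iterator directions: ITER_DEST marks an iterator that data is copied into, ITER_SOURCE one that data is copied out of. A sketch showing the two directions side by side (iter_direction_example is a hypothetical helper):

	#include <linux/uio.h>

	static void iter_direction_example(void *buf, size_t len)
	{
		struct kvec kvec = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, len);	  /* data lands in buf */
		iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, len); /* data is taken from buf */
	}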
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index d9052b8ce6dd..29c503a06db4 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -49,9 +49,9 @@ static inline int reiserfs_acl_count(size_t size)
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu);
-int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+int reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
-int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_acl_chmod(struct dentry *dentry);
int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
struct inode *inode);
@@ -63,7 +63,7 @@ int reiserfs_cache_default_acl(struct inode *dir);
#define reiserfs_get_acl NULL
#define reiserfs_set_acl NULL
-static inline int reiserfs_acl_chmod(struct inode *inode)
+static inline int reiserfs_acl_chmod(struct dentry *dentry)
{
return 0;
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 6e228bfbe7ef..467d13da198f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -256,7 +256,7 @@ const struct inode_operations reiserfs_file_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
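
Alongside the dentry-based set_acl/acl_chmod conversion, the ->get_acl inode operation used for permission checking is renamed to ->get_inode_acl. A sketch of the resulting wiring for a filesystem (my_* names are hypothetical; the prototypes follow the new signatures):

	static struct posix_acl *my_get_acl(struct inode *inode, int type, bool rcu);
	static int my_set_acl(struct user_namespace *mnt_userns,
			      struct dentry *dentry, struct posix_acl *acl, int type);

	static const struct inode_operations my_file_iops = {
		.get_inode_acl	= my_get_acl,
		.set_acl	= my_set_acl,
	};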
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index b9580a6515ee..c7d1fa526dea 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3404,7 +3404,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (!error && reiserfs_posixacl(inode->i_sb)) {
if (attr->ia_valid & ATTR_MODE)
- error = reiserfs_acl_chmod(inode);
+ error = reiserfs_acl_chmod(dentry);
}
out:
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 3d7a35d6a18b..0b8aa99749f1 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -696,6 +696,7 @@ static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir,
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -779,6 +780,7 @@ static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -878,6 +880,7 @@ static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -1194,6 +1197,7 @@ static int reiserfs_symlink(struct user_namespace *mnt_userns,
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -1659,7 +1663,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
.fileattr_get = reiserfs_fileattr_get,
.fileattr_set = reiserfs_fileattr_set,
@@ -1683,6 +1687,6 @@ const struct inode_operations reiserfs_special_inode_operations = {
.setattr = reiserfs_setattr,
.listxattr = reiserfs_listxattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
+ .get_inode_acl = reiserfs_get_acl,
.set_acl = reiserfs_set_acl,
};
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index d6fcddc46f5b..93fe414fed18 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -18,7 +18,7 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
int
-reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+reiserfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
int error, error2;
@@ -26,6 +26,7 @@ reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
size_t jcreate_blocks;
int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
int update_mode = 0;
+ struct inode *inode = d_inode(dentry);
umode_t mode = inode->i_mode;
/*
@@ -371,7 +372,7 @@ int reiserfs_cache_default_acl(struct inode *inode)
if (IS_PRIVATE(inode))
return 0;
- acl = get_acl(inode, ACL_TYPE_DEFAULT);
+ acl = get_inode_acl(inode, ACL_TYPE_DEFAULT);
if (acl && !IS_ERR(acl)) {
int size = reiserfs_acl_size(acl->a_count);
@@ -396,13 +397,15 @@ int reiserfs_cache_default_acl(struct inode *inode)
/*
* Called under i_mutex
*/
-int reiserfs_acl_chmod(struct inode *inode)
+int reiserfs_acl_chmod(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
+
if (IS_PRIVATE(inode))
return 0;
if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
!reiserfs_posixacl(inode->i_sb))
return 0;
- return posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ return posix_acl_chmod(&init_user_ns, dentry, inode->i_mode);
}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 8965c8e5e172..857a65b05726 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -50,6 +50,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
int error;
sec->name = NULL;
+ sec->value = NULL;
/* Don't add selinux attributes on xattrs - they'll never get used */
if (IS_PRIVATE(dir))
@@ -95,7 +96,6 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
void reiserfs_security_free(struct reiserfs_security_handle *sec)
{
- kfree(sec->name);
kfree(sec->value);
sec->name = NULL;
sec->value = NULL;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 654912d06862..41f60477bb41 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -304,7 +304,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
/* Check that we don't violate system file offset limits. */
ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
remap_flags);
- if (ret)
+ if (ret || *len == 0)
return ret;
/* Wait for the completion of any pending IOs on both files */
@@ -328,9 +328,6 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- if (*len == 0)
- return 0;
-
if (!IS_DAX(inode_in))
ret = vfs_dedupe_file_range_compare(file_in, pos_in,
file_out, pos_out, *len, &is_same);
@@ -348,7 +345,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
remap_flags);
- if (ret)
+ if (ret || *len == 0)
return ret;
/* If can't alter the file contents, we're done. */
@@ -429,7 +426,7 @@ static bool allow_file_dedupe(struct file *file)
return true;
if (file->f_mode & FMODE_WRITE)
return true;
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
+ if (vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode), current_fsuid()))
return true;
if (!inode_permission(mnt_userns, inode, MAY_WRITE))
return true;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 9456a2032224..f5fdaf3b1572 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -156,7 +156,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
ssize_t ret;
init_sync_kiocb(&kiocb, file);
- iov_iter_init(&iter, READ, &iov, 1, size);
+ iov_iter_init(&iter, ITER_DEST, &iov, 1, size);
kiocb.ki_pos = *ppos;
ret = seq_read_iter(&kiocb, &iter);
diff --git a/fs/splice.c b/fs/splice.c
index 0878b852b355..5969b7a1d353 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -303,7 +303,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct kiocb kiocb;
int ret;
- iov_iter_pipe(&to, READ, pipe, len);
+ iov_iter_pipe(&to, ITER_DEST, pipe, len);
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -682,7 +682,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
n++;
}
- iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left);
+ iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
ret = vfs_iter_write(out, &from, &sd.pos, 0);
if (ret <= 0)
break;
@@ -1263,9 +1263,9 @@ static int vmsplice_type(struct fd f, int *type)
if (!f.file)
return -EBADF;
if (f.file->f_mode & FMODE_WRITE) {
- *type = WRITE;
+ *type = ITER_SOURCE;
} else if (f.file->f_mode & FMODE_READ) {
- *type = READ;
+ *type = ITER_DEST;
} else {
fdput(f);
return -EBADF;
@@ -1314,7 +1314,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
if (!iov_iter_count(&iter))
error = 0;
- else if (iov_iter_rw(&iter) == WRITE)
+ else if (type == ITER_SOURCE)
error = vmsplice_to_pipe(f.file, &iter, flags);
else
error = vmsplice_to_user(f.file, &iter, flags);
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 916e78fabcaa..60fc98bdf421 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -54,9 +54,35 @@ config SQUASHFS_FILE_DIRECT
endchoice
+config SQUASHFS_DECOMP_SINGLE
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+ depends on SQUASHFS
+ def_bool n
+
+config SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ bool "Select the parallel decompression mode during mount"
+ depends on SQUASHFS
+ default n
+ select SQUASHFS_DECOMP_SINGLE
+ select SQUASHFS_DECOMP_MULTI
+ select SQUASHFS_DECOMP_MULTI_PERCPU
+ select SQUASHFS_MOUNT_DECOMP_THREADS
+ help
+ Compile all parallel decompression modes and specify the
+ decompression mode by setting "threads=" during mount.
+ The default decompression mode is single-threaded (SQUASHFS_DECOMP_SINGLE).
+
choice
- prompt "Decompressor parallelisation options"
+ prompt "Select decompression parallel mode at compile time"
depends on SQUASHFS
+ depends on !SQUASHFS_CHOICE_DECOMP_BY_MOUNT
help
Squashfs now supports three parallelisation options for
decompression. Each one exhibits various trade-offs between
@@ -64,15 +90,17 @@ choice
If in doubt, select "Single threaded compression"
-config SQUASHFS_DECOMP_SINGLE
+config SQUASHFS_COMPILE_DECOMP_SINGLE
bool "Single threaded compression"
+ select SQUASHFS_DECOMP_SINGLE
help
Traditionally Squashfs has used single-threaded decompression.
Only one block (data or metadata) can be decompressed at any
one time. This limits CPU and memory usage to a minimum.
-config SQUASHFS_DECOMP_MULTI
+config SQUASHFS_COMPILE_DECOMP_MULTI
bool "Use multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -85,8 +113,9 @@ config SQUASHFS_DECOMP_MULTI
decompressors per core. It dynamically allocates decompressors
on a demand basis.
-config SQUASHFS_DECOMP_MULTI_PERCPU
+config SQUASHFS_COMPILE_DECOMP_MULTI_PERCPU
bool "Use percpu multiple decompressors for parallel I/O"
+ select SQUASHFS_DECOMP_MULTI_PERCPU
help
By default Squashfs uses a single decompressor but it gives
poor performance on parallel I/O workloads when using multiple CPU
@@ -95,9 +124,21 @@ config SQUASHFS_DECOMP_MULTI_PERCPU
This decompressor implementation uses a maximum of one
decompressor per core. It uses percpu variables to ensure
decompression is load-balanced across the cores.
-
endchoice
+config SQUASHFS_MOUNT_DECOMP_THREADS
+ bool "Add the mount parameter 'threads=' for squashfs"
+ depends on SQUASHFS
+ depends on SQUASHFS_DECOMP_MULTI
+ default n
+ help
+ Use the "threads=" mount option to set the decompression parallel
+ mode and/or the number of threads.
+ If SQUASHFS_CHOICE_DECOMP_BY_MOUNT=y:
+     threads=<single|multi|percpu|1|2|3|...>
+ otherwise:
+     threads=<2|3|...>
+ The upper limit is num_online_cpus() * 2.
+
config SQUASHFS_XATTR
bool "Squashfs XATTR support"
depends on SQUASHFS
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 833aca92301f..bed3bb8b27fa 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -216,7 +216,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
res = -EIO;
goto out_free_bio;
}
- res = squashfs_decompress(msblk, bio, offset, length, output);
+ res = msblk->thread_ops->decompress(msblk, bio, offset, length, output);
} else {
res = copy_bio_to_actor(bio, output, offset, length);
}
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index d57bef91ab08..8893cb9b4198 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -134,7 +134,7 @@ void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
if (IS_ERR(comp_opts))
return comp_opts;
- stream = squashfs_decompressor_create(msblk, comp_opts);
+ stream = msblk->thread_ops->create(msblk, comp_opts);
if (IS_ERR(stream))
kfree(comp_opts);
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
index db9f12a3ea05..416c53eedbd1 100644
--- a/fs/squashfs/decompressor_multi.c
+++ b/fs/squashfs/decompressor_multi.c
@@ -29,12 +29,11 @@
#define MAX_DECOMPRESSOR (num_online_cpus() * 2)
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return MAX_DECOMPRESSOR;
}
-
struct squashfs_stream {
void *comp_opts;
struct list_head strm_list;
@@ -59,7 +58,7 @@ static void put_decomp_stream(struct decomp_stream *decomp_strm,
wake_up(&stream->wait);
}
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -103,7 +102,7 @@ out:
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
if (stream) {
@@ -145,7 +144,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
* If there is no available decomp and already full,
* let's wait for releasing decomp from other users.
*/
- if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+ if (stream->avail_decomp >= msblk->max_thread_num)
goto wait;
/* Let's allocate new decomp */
@@ -161,7 +160,7 @@ static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
}
stream->avail_decomp++;
- WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+ WARN_ON(stream->avail_decomp > msblk->max_thread_num);
mutex_unlock(&stream->mutex);
break;
@@ -180,7 +179,7 @@ wait:
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -195,3 +194,10 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
msblk->decompressor->name);
return res;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
index b881b9283b7f..1dfadf76ed9a 100644
--- a/fs/squashfs/decompressor_multi_percpu.c
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -25,7 +25,7 @@ struct squashfs_stream {
local_lock_t lock;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -59,7 +59,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream __percpu *percpu =
(struct squashfs_stream __percpu *) msblk->stream;
@@ -75,19 +75,21 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length, struct squashfs_page_actor *output)
{
struct squashfs_stream *stream;
+ struct squashfs_stream __percpu *percpu =
+ (struct squashfs_stream __percpu *) msblk->stream;
int res;
- local_lock(&msblk->stream->lock);
- stream = this_cpu_ptr(msblk->stream);
+ local_lock(&percpu->lock);
+ stream = this_cpu_ptr(percpu);
res = msblk->decompressor->decompress(msblk, stream->stream, bio,
offset, length, output);
- local_unlock(&msblk->stream->lock);
+ local_unlock(&percpu->lock);
if (res < 0)
ERROR("%s decompression failed, data probably corrupt\n",
@@ -96,7 +98,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return num_possible_cpus();
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
index 4eb3d083d45e..6f161887710b 100644
--- a/fs/squashfs/decompressor_single.c
+++ b/fs/squashfs/decompressor_single.c
@@ -24,7 +24,7 @@ struct squashfs_stream {
struct mutex mutex;
};
-void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+static void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
void *comp_opts)
{
struct squashfs_stream *stream;
@@ -49,7 +49,7 @@ out:
return ERR_PTR(err);
}
-void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+static void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
{
struct squashfs_stream *stream = msblk->stream;
@@ -59,7 +59,7 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
}
}
-int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
+static int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
int offset, int length,
struct squashfs_page_actor *output)
{
@@ -78,7 +78,14 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
return res;
}
-int squashfs_max_decompressors(void)
+static int squashfs_max_decompressors(void)
{
return 1;
}
+
+const struct squashfs_decompressor_thread_ops squashfs_decompressor_single = {
+ .create = squashfs_decompressor_create,
+ .destroy = squashfs_decompressor_destroy,
+ .decompress = squashfs_decompress,
+ .max_decompressors = squashfs_max_decompressors,
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 9783e01c8100..a6164fdf9435 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -38,11 +38,24 @@ extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
/* decompressor_xxx.c */
-extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
-extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
-extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *,
- int, int, struct squashfs_page_actor *);
-extern int squashfs_max_decompressors(void);
+
+struct squashfs_decompressor_thread_ops {
+ void * (*create)(struct squashfs_sb_info *msblk, void *comp_opts);
+ void (*destroy)(struct squashfs_sb_info *msblk);
+ int (*decompress)(struct squashfs_sb_info *msblk, struct bio *bio,
+ int offset, int length, struct squashfs_page_actor *output);
+ int (*max_decompressors)(void);
+};
+
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_single;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_multi;
+#endif
+#ifdef CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU
+extern const struct squashfs_decompressor_thread_ops squashfs_decompressor_percpu;
+#endif
/* export.c */
extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
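
Each decompressor implementation now exports a single ops table, and call sites (block.c, decompressor.c, super.c) dispatch through msblk->thread_ops instead of the old global functions. The shape of such a table for a hypothetical extra mode:

	static void *my_create(struct squashfs_sb_info *msblk, void *comp_opts);
	static void my_destroy(struct squashfs_sb_info *msblk);
	static int my_decompress(struct squashfs_sb_info *msblk, struct bio *bio,
				 int offset, int length,
				 struct squashfs_page_actor *output);
	static int my_max_decompressors(void);

	static const struct squashfs_decompressor_thread_ops my_mode_ops = {
		.create			= my_create,
		.destroy		= my_destroy,
		.decompress		= my_decompress,
		.max_decompressors	= my_max_decompressors,
	};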
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 1e90c2575f9b..659082e9e51d 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -53,7 +53,7 @@ struct squashfs_sb_info {
__le64 *xattr_id_table;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
- struct squashfs_stream *stream;
+ void *stream;
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
@@ -66,5 +66,7 @@ struct squashfs_sb_info {
int xattr_ids;
unsigned int ids;
bool panic_on_errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int max_thread_num;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 32565dafa7f3..e090fae48e68 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -47,10 +47,13 @@ enum Opt_errors {
enum squashfs_param {
Opt_errors,
+ Opt_threads,
};
struct squashfs_mount_opts {
enum Opt_errors errors;
+ const struct squashfs_decompressor_thread_ops *thread_ops;
+ int thread_num;
};
static const struct constant_table squashfs_param_errors[] = {
@@ -61,9 +64,66 @@ static const struct constant_table squashfs_param_errors[] = {
static const struct fs_parameter_spec squashfs_fs_parameters[] = {
fsparam_enum("errors", Opt_errors, squashfs_param_errors),
+ fsparam_string("threads", Opt_threads),
{}
};
+
+static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (strcmp(str, "single") == 0) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ return 0;
+ }
+ if (strcmp(str, "multi") == 0) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ return 0;
+ }
+ if (strcmp(str, "percpu") == 0) {
+ opts->thread_ops = &squashfs_decompressor_percpu;
+ return 0;
+ }
+#endif
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts)
+{
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ int ret;
+ unsigned long num;
+
+ ret = kstrtoul(str, 0, &num);
+ if (ret != 0)
+ return -EINVAL;
+ if (num > 1) {
+ opts->thread_ops = &squashfs_decompressor_multi;
+ if (num > opts->thread_ops->max_decompressors())
+ return -EINVAL;
+ opts->thread_num = (int)num;
+ return 0;
+ }
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ if (num == 1) {
+ opts->thread_ops = &squashfs_decompressor_single;
+ opts->thread_num = 1;
+ return 0;
+ }
+#endif
+#endif /* CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */
+ return -EINVAL;
+}
+
+static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts)
+{
+ int ret = squashfs_parse_param_threads_str(str, opts);
+
+ if (ret == 0)
+ return ret;
+ return squashfs_parse_param_threads_num(str, opts);
+}
+
static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct squashfs_mount_opts *opts = fc->fs_private;
@@ -78,6 +138,10 @@ static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *para
case Opt_errors:
opts->errors = result.uint_32;
break;
+ case Opt_threads:
+ if (squashfs_parse_param_threads(param->string, opts) != 0)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -133,6 +197,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
}
msblk = sb->s_fs_info;
+ msblk->thread_ops = opts->thread_ops;
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
@@ -168,6 +233,12 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto failed_mount;
}
+ if (opts->thread_num == 0) {
+ msblk->max_thread_num = msblk->thread_ops->max_decompressors();
+ } else {
+ msblk->max_thread_num = opts->thread_num;
+ }
+
/* Check the MAJOR & MINOR versions and lookup compression type */
msblk->decompressor = supported_squashfs_filesystem(
fc,
@@ -252,7 +323,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
/* Allocate read_page block */
msblk->read_page = squashfs_cache_init("data",
- squashfs_max_decompressors(), msblk->block_size);
+ msblk->max_thread_num, msblk->block_size);
if (msblk->read_page == NULL) {
errorf(fc, "Failed to allocate read_page block");
goto failed_mount;
@@ -383,7 +454,7 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
- squashfs_decompressor_destroy(msblk);
+ msblk->thread_ops->destroy(msblk);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
@@ -435,6 +506,19 @@ static int squashfs_show_options(struct seq_file *s, struct dentry *root)
else
seq_puts(s, ",errors=continue");
+#ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT
+ if (msblk->thread_ops == &squashfs_decompressor_single) {
+ seq_puts(s, ",threads=single");
+ return 0;
+ }
+ if (msblk->thread_ops == &squashfs_decompressor_percpu) {
+ seq_puts(s, ",threads=percpu");
+ return 0;
+ }
+#endif
+#ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS
+ seq_printf(s, ",threads=%d", msblk->max_thread_num);
+#endif
return 0;
}
@@ -446,6 +530,16 @@ static int squashfs_init_fs_context(struct fs_context *fc)
if (!opts)
return -ENOMEM;
+#ifdef CONFIG_SQUASHFS_DECOMP_SINGLE
+ opts->thread_ops = &squashfs_decompressor_single;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI)
+ opts->thread_ops = &squashfs_decompressor_multi;
+#elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU)
+ opts->thread_ops = &squashfs_decompressor_percpu;
+#else
+#error "fail: unknown squashfs decompression thread mode?"
+#endif
+ opts->thread_num = 0;
fc->fs_private = opts;
fc->ops = &squashfs_context_ops;
return 0;
@@ -478,7 +572,7 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
- squashfs_decompressor_destroy(sbi);
+ sbi->thread_ops->destroy(sbi);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
@@ -568,7 +662,7 @@ static struct file_system_type squashfs_fs_type = {
.init_fs_context = squashfs_init_fs_context,
.parameters = squashfs_fs_parameters,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("squashfs");
diff --git a/fs/stat.c b/fs/stat.c
index ef50573c72a2..d6cc74ca8486 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -44,12 +44,15 @@
void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode,
struct kstat *stat)
{
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+
stat->dev = inode->i_sb->s_dev;
stat->ino = inode->i_ino;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
- stat->uid = i_uid_into_mnt(mnt_userns, inode);
- stat->gid = i_gid_into_mnt(mnt_userns, inode);
+ stat->uid = vfsuid_into_kuid(vfsuid);
+ stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
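
generic_fillattr() now goes through the vfsuid/vfsgid types, which carry the idmapped-mount view of ownership and are only converted to k[ug]id_t at the boundary. A small sketch of the pattern (my_owner_kuid is a hypothetical helper):

	#include <linux/cred.h>
	#include <linux/fs.h>
	#include <linux/mnt_idmapping.h>

	static kuid_t my_owner_kuid(struct user_namespace *mnt_userns,
				    struct inode *inode)
	{
		vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);

		if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
			/* the caller owns the inode in this mount's view */
		}
		return vfsuid_into_kuid(vfsuid);
	}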
diff --git a/fs/super.c b/fs/super.c
index 8d39e4f11cfa..12c08cb20405 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1112,55 +1112,14 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-/**
- * vfs_get_super - Get a superblock with a search key set in s_fs_info.
- * @fc: The filesystem context holding the parameters
- * @keying: How to distinguish superblocks
- * @fill_super: Helper to initialise a new superblock
- *
- * Search for a superblock and create a new one if not found. The search
- * criterion is controlled by @keying. If the search fails, a new superblock
- * is created and @fill_super() is called to initialise it.
- *
- * @keying can take one of a number of values:
- *
- * (1) vfs_get_single_super - Only one superblock of this type may exist on the
- * system. This is typically used for special system filesystems.
- *
- * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have
- * distinct keys (where the key is in s_fs_info). Searching for the same
- * key again will turn up the superblock for that key.
- *
- * (3) vfs_get_independent_super - Multiple superblocks may exist and are
- * unkeyed. Each call will get a new superblock.
- *
- * A permissions check is made by sget_fc() unless we're getting a superblock
- * for a kernel-internal mount or a submount.
- */
-int vfs_get_super(struct fs_context *fc,
- enum vfs_get_super_keying keying,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
+static int vfs_get_super(struct fs_context *fc, bool reconf,
+ int (*test)(struct super_block *, struct fs_context *),
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc))
{
- int (*test)(struct super_block *, struct fs_context *);
struct super_block *sb;
int err;
- switch (keying) {
- case vfs_get_single_super:
- case vfs_get_single_reconf_super:
- test = test_single_super;
- break;
- case vfs_get_keyed_super:
- test = test_keyed_super;
- break;
- case vfs_get_independent_super:
- test = NULL;
- break;
- default:
- BUG();
- }
-
sb = sget_fc(fc, test, set_anon_super_fc);
if (IS_ERR(sb))
return PTR_ERR(sb);
@@ -1174,7 +1133,7 @@ int vfs_get_super(struct fs_context *fc,
fc->root = dget(sb->s_root);
} else {
fc->root = dget(sb->s_root);
- if (keying == vfs_get_single_reconf_super) {
+ if (reconf) {
err = reconfigure_super(fc);
if (err < 0) {
dput(fc->root);
@@ -1190,13 +1149,12 @@ error:
deactivate_locked_super(sb);
return err;
}
-EXPORT_SYMBOL(vfs_get_super);
int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_independent_super, fill_super);
+ return vfs_get_super(fc, false, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1204,7 +1162,7 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_super, fill_super);
+ return vfs_get_super(fc, false, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
@@ -1212,7 +1170,7 @@ int get_tree_single_reconf(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super);
+ return vfs_get_super(fc, true, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single_reconf);
@@ -1222,7 +1180,7 @@ int get_tree_keyed(struct fs_context *fc,
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, vfs_get_keyed_super, fill_super);
+ return vfs_get_super(fc, false, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
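
With the keying enum gone, vfs_get_super() becomes private and takes the comparison function and a reconfigure flag directly; filesystems are expected to keep using the exported get_tree_*() wrappers. An illustrative fs_context hook (my_* names are hypothetical):

	static int my_fill_super(struct super_block *sb, struct fs_context *fc)
	{
		/* set sb->s_op, allocate the root inode and dentry, ... */
		return 0;
	}

	static int my_get_tree(struct fs_context *fc)
	{
		return get_tree_nodev(fc, my_fill_super);
	}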
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index d4ec9bb97de9..3b8567564e7e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -438,7 +438,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
res += blocks;
direct = 1;
}
- return blocks;
+ return res;
}
int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 3f128b9fdfbb..9c9d3f0e36a4 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2467,7 +2467,7 @@ error_dump:
static inline int chance(unsigned int n, unsigned int out_of)
{
- return !!(prandom_u32_max(out_of) + 1 <= n);
+ return !!(get_random_u32_below(out_of) + 1 <= n);
}
@@ -2485,13 +2485,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
if (chance(1, 2)) {
d->pc_delay = 1;
/* Fail within 1 minute */
- delay = prandom_u32_max(60000);
+ delay = get_random_u32_below(60000);
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
ubifs_warn(c, "failing after %lums", delay);
} else {
d->pc_delay = 2;
- delay = prandom_u32_max(10000);
+ delay = get_random_u32_below(10000);
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
ubifs_warn(c, "failing after %lu calls", delay);
@@ -2571,7 +2571,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = prandom_u32_max(len);
+ from = get_random_u32_below(len);
/* Corruption span max to end of write unit */
to = min(len, ALIGN(from + 1, c->max_write_size));
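
The prandom_u32_max() to get_random_u32_below() renames keep the same contract: a uniformly distributed value in [0, n). In isolation (my_pick_slot is hypothetical):

	#include <linux/random.h>

	static u32 my_pick_slot(u32 nr_slots)
	{
		return get_random_u32_below(nr_slots);	/* 0 .. nr_slots - 1 */
	}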
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index cfbc31f709f4..c4d079328b92 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1970,28 +1970,28 @@ static int dbg_populate_lsave(struct ubifs_info *c)
if (!dbg_is_chk_gen(c))
return 0;
- if (prandom_u32_max(4))
+ if (get_random_u32_below(4))
return 0;
for (i = 0; i < c->lsave_cnt; i++)
c->lsave[i] = c->main_first;
list_for_each_entry(lprops, &c->empty_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->freeable_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
list_for_each_entry(lprops, &c->frdi_idx_list, list)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = lprops->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = lprops->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_DIRTY - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
heap = &c->lpt_heap[LPROPS_FREE - 1];
for (i = 0; i < heap->cnt; i++)
- c->lsave[prandom_u32_max(c->lsave_cnt)] = heap->arr[i]->lnum;
+ c->lsave[get_random_u32_below(c->lsave_cnt)] = heap->arr[i]->lnum;
return 1;
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 01362ad5f804..a55e04822d16 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -700,7 +700,7 @@ static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
c->ilebs[c->ileb_cnt++] = lnum;
dbg_cmt("LEB %d", lnum);
}
- if (dbg_is_chk_index(c) && !prandom_u32_max(8))
+ if (dbg_is_chk_index(c) && !get_random_u32_below(8))
return -ENOSPC;
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index dce6ae9ae306..34e416327dd4 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -182,11 +182,6 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int udf_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, udf_get_block, wbc);
-}
-
static int udf_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -239,12 +234,12 @@ const struct address_space_operations udf_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = udf_read_folio,
.readahead = udf_readahead,
- .writepage = udf_writepage,
.writepages = udf_writepages,
.write_begin = udf_write_begin,
.write_end = generic_write_end,
.direct_IO = udf_direct_IO,
.bmap = udf_bmap,
+ .migrate_folio = buffer_migrate_folio,
};
/*
@@ -439,6 +434,12 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
+ /*
+ * Block beyond EOF and prealloc extents? Just discard preallocation
+ * as it is not useful and complicates things.
+ */
+ if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents)
+ udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
phys = inode_getblk(inode, block, &err, &new);
if (!phys)
@@ -488,8 +489,6 @@ static int udf_do_extend_file(struct inode *inode,
uint32_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
- struct kernel_lb_addr prealloc_loc = {};
- uint32_t prealloc_len = 0;
struct udf_inode_info *iinfo;
int err;
@@ -510,19 +509,6 @@ static int udf_do_extend_file(struct inode *inode,
~(sb->s_blocksize - 1);
}
- /* Last extent are just preallocated blocks? */
- if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
- EXT_NOT_RECORDED_ALLOCATED) {
- /* Save the extent so that we can reattach it to the end */
- prealloc_loc = last_ext->extLocation;
- prealloc_len = last_ext->extLength;
- /* Mark the extent as a hole */
- last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
- last_ext->extLocation.logicalBlockNum = 0;
- last_ext->extLocation.partitionReferenceNum = 0;
- }
-
/* Can we merge with the previous extent? */
if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
EXT_NOT_RECORDED_NOT_ALLOCATED) {
@@ -550,7 +536,7 @@ static int udf_do_extend_file(struct inode *inode,
* more extents, we may need to enter possible following
* empty indirect extent.
*/
- if (new_block_bytes || prealloc_len)
+ if (new_block_bytes)
udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
@@ -584,17 +570,6 @@ static int udf_do_extend_file(struct inode *inode,
}
out:
- /* Do we have some preallocated blocks saved? */
- if (prealloc_len) {
- err = udf_add_aext(inode, last_pos, &prealloc_loc,
- prealloc_len, 1);
- if (err)
- return err;
- last_ext->extLocation = prealloc_loc;
- last_ext->extLength = prealloc_len;
- count++;
- }
-
/* last_pos should point to the last written extent... */
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
last_pos->offset -= sizeof(struct short_ad);
@@ -610,13 +585,17 @@ out:
static void udf_do_extend_final_block(struct inode *inode,
struct extent_position *last_pos,
struct kernel_long_ad *last_ext,
- uint32_t final_block_len)
+ uint32_t new_elen)
{
- struct super_block *sb = inode->i_sb;
uint32_t added_bytes;
- added_bytes = final_block_len -
- (last_ext->extLength & (sb->s_blocksize - 1));
+ /*
+ * Extent already large enough? It may be already rounded up to block
+ * size...
+ */
+ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK))
+ return;
+ added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
last_ext->extLength += added_bytes;
UDF_I(inode)->i_lenExtents += added_bytes;
@@ -633,12 +612,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
int8_t etype;
struct super_block *sb = inode->i_sb;
sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
- unsigned long partial_final_block;
+ loff_t new_elen;
int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
struct kernel_long_ad extent;
int err = 0;
- int within_final_block;
+ bool within_last_ext;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -647,8 +626,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
else
BUG();
+ /*
+ * When creating hole in file, just don't bother with preserving
+ * preallocation. It likely won't be very useful anyway.
+ */
+ udf_discard_prealloc(inode);
+
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
- within_final_block = (etype != -1);
+ within_last_ext = (etype != -1);
+ /* We don't expect extents past EOF... */
+ WARN_ON_ONCE(within_last_ext &&
+ elen > ((loff_t)offset + 1) << inode->i_blkbits);
if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
(epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
@@ -664,19 +652,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
extent.extLength |= etype << 30;
}
- partial_final_block = newsize & (sb->s_blocksize - 1);
+ new_elen = ((loff_t)offset << inode->i_blkbits) |
+ (newsize & (sb->s_blocksize - 1));
/* File has extent covering the new size (could happen when extending
* inside a block)?
*/
- if (within_final_block) {
+ if (within_last_ext) {
/* Extending file within the last file block */
- udf_do_extend_final_block(inode, &epos, &extent,
- partial_final_block);
+ udf_do_extend_final_block(inode, &epos, &extent, new_elen);
} else {
- loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
- partial_final_block;
- err = udf_do_extend_file(inode, &epos, &extent, add);
+ err = udf_do_extend_file(inode, &epos, &extent, new_elen);
}
if (err < 0)
@@ -698,7 +684,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
struct kernel_lb_addr eloc, tmpeloc;
int c = 1;
loff_t lbcount = 0, b_off = 0;
- udf_pblk_t newblocknum, newblock;
+ udf_pblk_t newblocknum, newblock = 0;
sector_t offset = 0;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -777,10 +763,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goto out_free;
}
- /* Are we beyond EOF? */
+ /* Are we beyond EOF and preallocated extent? */
if (etype == -1) {
int ret;
loff_t hole_len;
+
isBeyondEOF = true;
if (count) {
if (c)
@@ -800,7 +787,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
if (ret < 0) {
*err = ret;
- newblock = 0;
goto out_free;
}
c = 0;
@@ -865,7 +851,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goal, err);
if (!newblocknum) {
*err = -ENOSPC;
- newblock = 0;
goto out_free;
}
if (isBeyondEOF)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ae7bc13a5298..7c95c549dd64 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1091,8 +1091,9 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
return -EINVAL;
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
- if (IS_ERR(ofi)) {
- retval = PTR_ERR(ofi);
+ if (!ofi || IS_ERR(ofi)) {
+ if (IS_ERR(ofi))
+ retval = PTR_ERR(ofi);
goto end_rename;
}
@@ -1101,8 +1102,7 @@ static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
brelse(ofibh.sbh);
tloc = lelb_to_cpu(ocfi.icb.extLocation);
- if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
- != old_inode->i_ino)
+ if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino)
goto end_rename;
nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
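
The rename fix above handles a lookup that can legitimately return NULL
(entry not found) as well as an ERR_PTR, and moves both checks next to
the call instead of testing !ofi after the buffer heads were already in
use. As a hedged sketch, the same logic could lean on the kernel's
IS_ERR_OR_NULL() helper; the patch spells the two tests out, which keeps
the retval handling explicit:

	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
	if (IS_ERR_OR_NULL(ofi)) {
		if (IS_ERR(ofi))
			retval = PTR_ERR(ofi);
		/* a NULL result keeps the previously set retval */
		goto end_rename;
	}
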
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4042d9739fb7..06eda8177b5f 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -162,7 +162,7 @@ static void udf_free_in_core_inode(struct inode *inode)
static void init_once(void *foo)
{
- struct udf_inode_info *ei = (struct udf_inode_info *)foo;
+ struct udf_inode_info *ei = foo;
ei->i_data = NULL;
inode_init_once(&ei->vfs_inode);
@@ -820,7 +820,7 @@ static int udf_find_fileset(struct super_block *sb,
struct kernel_lb_addr *fileset,
struct kernel_lb_addr *root)
{
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
uint16_t ident;
int ret;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 532cda99644e..036ebd892b85 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode)
void udf_discard_prealloc(struct inode *inode)
{
- struct extent_position epos = { NULL, 0, {0, 0} };
+ struct extent_position epos = {};
+ struct extent_position prev_epos = {};
struct kernel_lb_addr eloc;
uint32_t elen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
- int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int bsize = 1 << inode->i_blkbits;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
- inode->i_size == iinfo->i_lenExtents)
+ ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize))
return;
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(struct short_ad);
- else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(struct long_ad);
- else
- adsize = 0;
-
epos.block = iinfo->i_location;
/* Find the last extent in the file */
- while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
- etype = netype;
+ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) {
+ brelse(prev_epos.bh);
+ prev_epos = epos;
+ if (prev_epos.bh)
+ get_bh(prev_epos.bh);
+
+ etype = udf_next_aext(inode, &epos, &eloc, &elen, 1);
lbcount += elen;
}
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
- epos.offset -= adsize;
lbcount -= elen;
- extent_trunc(inode, &epos, &eloc, etype, elen, 0);
- if (!epos.bh) {
- iinfo->i_lenAlloc =
- epos.offset -
- udf_file_entry_alloc_offset(inode);
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)(epos.bh->b_data);
- aed->lengthAllocDescs =
- cpu_to_le32(epos.offset -
- sizeof(struct allocExtDesc));
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos.bh->b_data, epos.offset);
- else
- udf_update_tag(epos.bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos.bh, inode);
- }
+ udf_delete_aext(inode, prev_epos);
+ udf_free_blocks(inode->i_sb, inode, &eloc, 0,
+ DIV_ROUND_UP(elen, 1 << inode->i_blkbits));
}
/* This inode entry is in-memory only and thus we don't have to mark
* the inode dirty */
iinfo->i_lenExtents = lbcount;
brelse(epos.bh);
+ brelse(prev_epos.bh);
}
static void udf_update_alloc_ext_desc(struct inode *inode,
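
udf_discard_prealloc() now walks the extent list with two cursors so the
final extent can be deleted via the position that precedes it; the extra
get_bh() keeps prev_epos.bh alive after the main cursor advances. The
DIV_ROUND_UP() in the free path converts the extent's byte length into
whole backing blocks. A standalone rendering of that rounding, assuming
nothing beyond standard C:

#include <assert.h>
#include <stdint.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	uint32_t bsize = 2048;

	/* A one-byte extent still occupies a full block... */
	assert(DIV_ROUND_UP(1u, bsize) == 1);
	/* ...while an exact multiple does not round up. */
	assert(DIV_ROUND_UP(2 * bsize, bsize) == 2);
	return 0;
}
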
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 4fa620543d30..291b56dd011e 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -6,7 +6,11 @@
#include <linux/bitops.h>
#include <linux/magic.h>
-#define UDF_MAX_READ_VERSION 0x0250
+/*
+ * Even UDF 2.6 media should have version <= 0x250 but apparently there are
+ * some broken filesystems with version set to 0x260. Accommodate those.
+ */
+#define UDF_MAX_READ_VERSION 0x0260
#define UDF_MAX_WRITE_VERSION 0x0201
#define UDF_FLAG_USE_EXTENDED_FE 0
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 98ac37e34e3d..cc694846617a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+
+ vma->vm_flags = flags;
+ /*
+ * For shared mappings, we want to enable writenotify while
+ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+ */
+ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+ vma_set_page_prot(vma);
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma,
+ vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
mmap_write_unlock(mm);
@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
}
@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
@@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
}
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
mmap_write_unlock(mm);
@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
skip:
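
The userfaultfd_set_vm_flags() helper introduced above keys off a flag
transition rather than a flag value: (old ^ new) & VM_UFFD_WP is nonzero
exactly when the write-protect bit differs between the two masks, which
is the only case where vm_page_prot must be recomputed. A self-contained
sketch of the idiom (the flag value is an assumption for illustration
only):

#include <assert.h>
#include <stdbool.h>

#define VM_UFFD_WP 0x1000UL	/* illustrative value, not the kernel's */

static bool uffd_wp_changed(unsigned long old_flags, unsigned long new_flags)
{
	/* Nonzero iff the VM_UFFD_WP bit differs between the two masks. */
	return (old_flags ^ new_flags) & VM_UFFD_WP;
}

int main(void)
{
	assert(uffd_wp_changed(0, VM_UFFD_WP));
	assert(!uffd_wp_changed(VM_UFFD_WP, VM_UFFD_WP | 0x2UL));
	return 0;
}
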
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index dbe1ce5b450a..c7fcb855e068 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -32,6 +32,11 @@ struct fsverity_hash_alg {
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
mempool_t req_pool; /* mempool with a preallocated hash request */
+ /*
+ * The HASH_ALGO_* constant for this algorithm. This is different from
+ * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme.
+ */
+ enum hash_algo algo_id;
};
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 71d0fccb6d4c..6f8170cf4ae7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -16,11 +16,13 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
.name = "sha256",
.digest_size = SHA256_DIGEST_SIZE,
.block_size = SHA256_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA256,
},
[FS_VERITY_HASH_ALG_SHA512] = {
.name = "sha512",
.digest_size = SHA512_DIGEST_SIZE,
.block_size = SHA512_BLOCK_SIZE,
+ .algo_id = HASH_ALGO_SHA512,
},
};
@@ -324,5 +326,9 @@ void __init fsverity_check_hash_algs(void)
*/
BUG_ON(!is_power_of_2(alg->digest_size));
BUG_ON(!is_power_of_2(alg->block_size));
+
+ /* Verify that there is a valid mapping to HASH_ALGO_*. */
+ BUG_ON(alg->algo_id == 0);
+ BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]);
}
}
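
Storing the HASH_ALGO_* constant alongside each algorithm lets
fsverity_get_digest() read a precomputed member instead of doing a
runtime match_string() lookup, with the mapping validated once at boot.
A userspace sketch of the designated-initializer-plus-check pattern (the
enum values are assumptions, not the kernel's numbering):

#include <assert.h>
#include <stddef.h>

enum hash_algo { HASH_ALGO_UNSET, HASH_ALGO_SHA256, HASH_ALGO_SHA512 };

struct halg {
	const char *name;
	enum hash_algo algo_id;
};

static const struct halg algs[] = {
	[1] = { .name = "sha256", .algo_id = HASH_ALGO_SHA256 },
	[2] = { .name = "sha512", .algo_id = HASH_ALGO_SHA512 },
};

int main(void)
{
	/*
	 * Designated initializers zero any gaps, so an algo_id of zero
	 * betrays a missing mapping, the same trick as the BUG_ON above.
	 */
	for (size_t i = 1; i < sizeof(algs) / sizeof(algs[0]); i++)
		assert(algs[i].algo_id != HASH_ALGO_UNSET);
	return 0;
}
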
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index e99c00350c28..5c79ea1b2468 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
* @alg: (out) pointer to the hash algorithm enumeration
*
* Return the file hash algorithm and digest of an fsverity protected file.
- * Assumption: before calling fsverity_get_digest(), the file must have been
- * opened.
+ * Assumption: before calling this, the file must have been opened.
*
* Return: 0 on success, -errno on failure
*/
@@ -76,27 +75,13 @@ int fsverity_get_digest(struct inode *inode,
{
const struct fsverity_info *vi;
const struct fsverity_hash_alg *hash_alg;
- int i;
vi = fsverity_get_info(inode);
if (!vi)
return -ENODATA; /* not a verity file */
hash_alg = vi->tree_params.hash_alg;
- memset(digest, 0, FS_VERITY_MAX_DIGEST_SIZE);
-
- /* convert the verity hash algorithm name to a hash_algo_name enum */
- i = match_string(hash_algo_name, HASH_ALGO__LAST, hash_alg->name);
- if (i < 0)
- return -EINVAL;
- *alg = i;
-
- if (WARN_ON_ONCE(hash_alg->digest_size != hash_digest_size[*alg]))
- return -EINVAL;
memcpy(digest, vi->file_digest, hash_alg->digest_size);
-
- pr_debug("file digest %s:%*phN\n", hash_algo_name[*alg],
- hash_digest_size[*alg], digest);
-
+ *alg = hash_alg->algo_id;
return 0;
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index bde8c9b7d25f..961ba248021f 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -200,9 +200,8 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page);
* @bio: the bio to verify
*
* Verify a set of pages that have just been read from a verity file. The pages
- * must be pagecache pages that are still locked and not yet uptodate. Pages
- * that fail verification are set to the Error state. Verification is skipped
- * for pages already in the Error state, e.g. due to fscrypt decryption failure.
+ * must be pagecache pages that are still locked and not yet uptodate. If a
+ * page fails verification, then bio->bi_status is set to an error status.
*
* This is a helper function for use by the ->readahead() method of filesystems
* that issue bios to read data directly into the page cache. Filesystems that
@@ -244,9 +243,10 @@ void fsverity_verify_bio(struct bio *bio)
unsigned long level0_ra_pages =
min(max_ra_pages, params->level0_blocks - level0_index);
- if (!PageError(page) &&
- !verify_page(inode, vi, req, page, level0_ra_pages))
- SetPageError(page);
+ if (!verify_page(inode, vi, req, page, level0_ra_pages)) {
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
}
fsverity_free_hash_request(params->hash_alg, req);
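
The loop change above replaces per-page error flags with a single
verdict for the whole bio: on the first block that fails verification,
bi_status is set and the loop stops, so the read path sees one I/O
error. A minimal userspace sketch of that fail-fast aggregation, with a
placeholder predicate standing in for the real Merkle-tree check:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

static bool verify_page(const void *page)
{
	return page != NULL;	/* placeholder check for the sketch */
}

static int verify_batch(const void **pages, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		/* First failure fails the whole batch, like bi_status. */
		if (!verify_page(pages[i]))
			return -EIO;
	}
	return 0;
}

int main(void)
{
	const void *ok[] = { "a", "b" };
	const void *bad[] = { "a", NULL };

	return (verify_batch(ok, 2) == 0 &&
		verify_batch(bad, 2) == -EIO) ? 0 : 1;
}
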
diff --git a/fs/xattr.c b/fs/xattr.c
index 61107b6bbed2..adab9a70b536 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -80,6 +80,31 @@ xattr_resolve_name(struct inode *inode, const char **name)
return ERR_PTR(-EOPNOTSUPP);
}
+/**
+ * may_write_xattr - check whether inode allows writing xattr
+ * @mnt_userns: User namespace of the mount the inode was found from
+ * @inode: the inode on which to set an xattr
+ *
+ * Check whether the inode allows writing xattrs. Specifically, we can never
+ * set or remove an extended attribute on a read-only filesystem or on an
+ * immutable / append-only inode.
+ *
+ * We also need to ensure that the inode has a mapping in the mount to
+ * not risk writing back invalid i_{g,u}id values.
+ *
+ * Return: On success zero is returned. On error a negative errno is returned.
+ */
+int may_write_xattr(struct user_namespace *mnt_userns, struct inode *inode)
+{
+ if (IS_IMMUTABLE(inode))
+ return -EPERM;
+ if (IS_APPEND(inode))
+ return -EPERM;
+ if (HAS_UNMAPPED_ID(mnt_userns, inode))
+ return -EPERM;
+ return 0;
+}
+
/*
* Check permissions for extended attribute access. This is a bit complicated
* because different namespaces have very different rules.
@@ -88,20 +113,12 @@ static int
xattr_permission(struct user_namespace *mnt_userns, struct inode *inode,
const char *name, int mask)
{
- /*
- * We can never set or remove an extended attribute on a read-only
- * filesystem or on an immutable / append-only inode.
- */
if (mask & MAY_WRITE) {
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- /*
- * Updating an xattr will likely cause i_uid and i_gid
- * to be writen back improperly if their true value is
- * unknown to the vfs.
- */
- if (HAS_UNMAPPED_ID(mnt_userns, inode))
- return -EPERM;
+ int ret;
+
+ ret = may_write_xattr(mnt_userns, inode);
+ if (ret)
+ return ret;
}
/*
@@ -172,6 +189,9 @@ __vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -282,12 +302,6 @@ out:
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
-static inline bool is_posix_acl_xattr(const char *name)
-{
- return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
- (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
-}
-
int
vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
const char *name, const void *value, size_t size, int flags)
@@ -354,11 +368,12 @@ out_noalloc:
* vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
*
* Allocate memory, if not already allocated, or re-allocate correct size,
- * before retrieving the extended attribute.
+ * before retrieving the extended attribute. The xattr value buffer should
+ * always be freed by the caller, even on error.
*
* Returns the result of alloc, if failed, or the getxattr operation.
*/
-ssize_t
+int
vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry,
const char *name, char **xattr_value, size_t xattr_size,
gfp_t flags)
@@ -399,6 +414,9 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
{
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -437,10 +455,7 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
nolsm:
- error = __vfs_getxattr(dentry, inode, name, value, size);
- if (error > 0 && is_posix_acl_xattr(name))
- posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
- return error;
+ return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
@@ -471,6 +486,9 @@ __vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct inode *inode = d_inode(dentry);
const struct xattr_handler *handler;
+ if (is_posix_acl_xattr(name))
+ return -EOPNOTSUPP;
+
handler = xattr_resolve_name(inode, &name);
if (IS_ERR(handler))
return PTR_ERR(handler);
@@ -580,23 +598,19 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-static void setxattr_convert(struct user_namespace *mnt_userns,
- struct dentry *d, struct xattr_ctx *ctx)
-{
- if (ctx->size && is_posix_acl_xattr(ctx->kname->name))
- posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
-}
-
-int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct xattr_ctx *ctx)
{
- setxattr_convert(mnt_userns, dentry, ctx);
- return vfs_setxattr(mnt_userns, dentry, ctx->kname->name,
+ if (is_posix_acl_xattr(ctx->kname->name))
+ return do_set_acl(idmap, dentry, ctx->kname->name,
+ ctx->kvalue, ctx->size);
+
+ return vfs_setxattr(mnt_idmap_owner(idmap), dentry, ctx->kname->name,
ctx->kvalue, ctx->size, ctx->flags);
}
static long
-setxattr(struct user_namespace *mnt_userns, struct dentry *d,
+setxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, const void __user *value, size_t size,
int flags)
{
@@ -614,7 +628,7 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error)
return error;
- error = do_setxattr(mnt_userns, d, &ctx);
+ error = do_setxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -633,7 +647,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
+ error = setxattr(mnt_idmap(path.mnt), path.dentry, name,
value, size, flags);
mnt_drop_write(path.mnt);
}
@@ -670,7 +684,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = setxattr(file_mnt_user_ns(f.file),
+ error = setxattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name,
value, size, flags);
mnt_drop_write_file(f.file);
@@ -683,7 +697,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
* Extended attribute GET operations
*/
ssize_t
-do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
struct xattr_ctx *ctx)
{
ssize_t error;
@@ -697,10 +711,12 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
return -ENOMEM;
}
- error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(ctx->kname->name))
+ error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ else
+ error = vfs_getxattr(mnt_idmap_owner(idmap), d, kname,
+ ctx->kvalue, ctx->size);
if (error > 0) {
- if (is_posix_acl_xattr(kname))
- posix_acl_fix_xattr_to_user(ctx->kvalue, error);
if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
@@ -713,7 +729,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
}
static ssize_t
-getxattr(struct user_namespace *mnt_userns, struct dentry *d,
+getxattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name, void __user *value, size_t size)
{
ssize_t error;
@@ -732,7 +748,7 @@ getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- error = do_getxattr(mnt_userns, d, &ctx);
+ error = do_getxattr(idmap, d, &ctx);
kvfree(ctx.kvalue);
return error;
@@ -748,7 +764,7 @@ retry:
error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
- error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size);
+ error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -778,7 +794,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
if (!f.file)
return error;
audit_file(f.file);
- error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry,
+ error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
name, value, size);
fdput(f);
return error;
@@ -863,7 +879,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
* Extended attribute REMOVE operations
*/
static long
-removexattr(struct user_namespace *mnt_userns, struct dentry *d,
+removexattr(struct mnt_idmap *idmap, struct dentry *d,
const char __user *name)
{
int error;
@@ -875,7 +891,10 @@ removexattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error < 0)
return error;
- return vfs_removexattr(mnt_userns, d, kname);
+ if (is_posix_acl_xattr(kname))
+ return vfs_remove_acl(mnt_idmap_owner(idmap), d, kname);
+
+ return vfs_removexattr(mnt_idmap_owner(idmap), d, kname);
}
static int path_removexattr(const char __user *pathname,
@@ -889,7 +908,7 @@ retry:
return error;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_user_ns(path.mnt), path.dentry, name);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -922,7 +941,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
audit_file(f.file);
error = mnt_want_write_file(f.file);
if (!error) {
- error = removexattr(file_mnt_user_ns(f.file),
+ error = removexattr(file_mnt_idmap(f.file),
f.file->f_path.dentry, name);
mnt_drop_write_file(f.file);
}
@@ -992,8 +1011,29 @@ const char *xattr_full_name(const struct xattr_handler *handler,
}
EXPORT_SYMBOL(xattr_full_name);
-/*
- * Allocate new xattr and copy in the value; but leave the name to callers.
+/**
+ * free_simple_xattr - free an xattr object
+ * @xattr: the xattr object
+ *
+ * Free the xattr object. Can handle @xattr being NULL.
+ */
+static inline void free_simple_xattr(struct simple_xattr *xattr)
+{
+ if (xattr)
+ kfree(xattr->name);
+ kvfree(xattr);
+}
+
+/**
+ * simple_xattr_alloc - allocate new xattr object
+ * @value: value of the xattr object
+ * @size: size of @value
+ *
+ * Allocate a new xattr object and initialize respective members. The caller is
+ * responsible for handling the name of the xattr.
+ *
+ * Return: On success a new xattr object is returned. On failure NULL is
+ * returned.
*/
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
@@ -1014,20 +1054,69 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
return new_xattr;
}
-/*
- * xattr GET operation for in-memory/pseudo filesystems
+/**
+ * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry
+ * @key: xattr name
+ * @node: current node
+ *
+ * Compare the xattr name with the xattr name attached to @node in the rbtree.
+ *
+ * Return: Negative value if continuing left, positive if continuing right, 0
+ * if the xattr attached to @node matches @key.
+ */
+static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node)
+{
+ const char *xattr_name = key;
+ const struct simple_xattr *xattr;
+
+ xattr = rb_entry(node, struct simple_xattr, rb_node);
+ return strcmp(xattr->name, xattr_name);
+}
+
+/**
+ * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes
+ * @new_node: new node
+ * @node: current node
+ *
+ * Compare the xattr attached to @new_node with the xattr attached to @node.
+ *
+ * Return: Negative value if continuing left, positive if continuing right, 0
+ * if the xattr attached to @new_node matches the xattr attached to @node.
+ */
+static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node,
+ const struct rb_node *node)
+{
+ struct simple_xattr *xattr;
+
+ xattr = rb_entry(new_node, struct simple_xattr, rb_node);
+ return rbtree_simple_xattr_cmp(xattr->name, node);
+}
+
+/**
+ * simple_xattr_get - get an xattr object
+ * @xattrs: the header of the xattr object
+ * @name: the name of the xattr to retrieve
+ * @buffer: the buffer to store the value into
+ * @size: the size of @buffer
+ *
+ * Try to find and retrieve the xattr object associated with @name.
+ * If @buffer is provided, store the value of @xattr in @buffer;
+ * otherwise just return the length. The size of @buffer is limited
+ * to XATTR_SIZE_MAX, which currently is 65536 bytes.
+ *
+ * Return: On success the length of the xattr value is returned. On error a
+ * negative error code is returned.
*/
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
void *buffer, size_t size)
{
- struct simple_xattr *xattr;
+ struct simple_xattr *xattr = NULL;
+ struct rb_node *rbp;
int ret = -ENODATA;
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
- if (strcmp(name, xattr->name))
- continue;
-
+ read_lock(&xattrs->lock);
+ rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp);
+ if (rbp) {
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
ret = xattr->size;
if (buffer) {
if (size < xattr->size)
@@ -1035,34 +1124,44 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
else
memcpy(buffer, xattr->value, xattr->size);
}
- break;
}
- spin_unlock(&xattrs->lock);
+ read_unlock(&xattrs->lock);
return ret;
}
/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the extended attribute
- * @value: value of the xattr. If %NULL, will remove the attribute.
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- * @removed_size: returns size of the removed xattr, -1 if none removed
+ * simple_xattr_set - set an xattr object
+ * @xattrs: the header of the xattr object
+ * @name: the name of the xattr to set
+ * @value: the value to store along the xattr
+ * @size: the size of @value
+ * @flags: the flags determining how to set the xattr
+ * @removed_size: the size of the removed xattr
+ *
+ * Set a new xattr object.
+ * If @value is passed, a new xattr object will be allocated. If
+ * XATTR_REPLACE is specified in @flags, a matching xattr object for @name
+ * must already exist; if it does, it will be replaced with the new xattr
+ * object, and if it doesn't, we fail. If XATTR_CREATE is specified and a
+ * matching xattr already exists, we fail; if it doesn't, we create a new
+ * xattr. If @flags is zero, we simply insert the new xattr, replacing any
+ * existing one.
*
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
+ * If @value is empty and a matching xattr object is found, we delete it
+ * if XATTR_REPLACE is specified in @flags or @flags is zero.
*
- * Returns 0 on success, -errno on failure.
+ * If @value is empty and no matching xattr object for @name is found, we
+ * do nothing if XATTR_CREATE is specified in @flags or @flags is zero.
+ * For XATTR_REPLACE we fail as mentioned above.
+ *
+ * Return: On success zero is returned. On error a negative error code is
+ * returned.
*/
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
const void *value, size_t size, int flags,
ssize_t *removed_size)
{
- struct simple_xattr *xattr;
- struct simple_xattr *new_xattr = NULL;
- int err = 0;
+ struct simple_xattr *xattr = NULL, *new_xattr = NULL;
+ struct rb_node *parent = NULL, **rbp;
+ int err = 0, ret;
if (removed_size)
*removed_size = -1;
@@ -1075,42 +1174,68 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
new_xattr->name = kstrdup(name, GFP_KERNEL);
if (!new_xattr->name) {
- kvfree(new_xattr);
+ free_simple_xattr(new_xattr);
return -ENOMEM;
}
}
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
- if (!strcmp(name, xattr->name)) {
- if (flags & XATTR_CREATE) {
- xattr = new_xattr;
- err = -EEXIST;
- } else if (new_xattr) {
- list_replace(&xattr->list, &new_xattr->list);
- if (removed_size)
- *removed_size = xattr->size;
- } else {
- list_del(&xattr->list);
- if (removed_size)
- *removed_size = xattr->size;
- }
- goto out;
- }
- }
- if (flags & XATTR_REPLACE) {
- xattr = new_xattr;
- err = -ENODATA;
- } else {
- list_add(&new_xattr->list, &xattrs->head);
- xattr = NULL;
+ write_lock(&xattrs->lock);
+ rbp = &xattrs->rb_root.rb_node;
+ while (*rbp) {
+ parent = *rbp;
+ ret = rbtree_simple_xattr_cmp(name, *rbp);
+ if (ret < 0)
+ rbp = &(*rbp)->rb_left;
+ else if (ret > 0)
+ rbp = &(*rbp)->rb_right;
+ else
+ xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
+ if (xattr)
+ break;
}
-out:
- spin_unlock(&xattrs->lock);
+
if (xattr) {
- kfree(xattr->name);
- kvfree(xattr);
+ /* Fail if XATTR_CREATE is requested and the xattr exists. */
+ if (flags & XATTR_CREATE) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ if (new_xattr)
+ rb_replace_node(&xattr->rb_node, &new_xattr->rb_node,
+ &xattrs->rb_root);
+ else
+ rb_erase(&xattr->rb_node, &xattrs->rb_root);
+ if (!err && removed_size)
+ *removed_size = xattr->size;
+ } else {
+ /* Fail if XATTR_REPLACE is requested but no xattr is found. */
+ if (flags & XATTR_REPLACE) {
+ err = -ENODATA;
+ goto out_unlock;
+ }
+
+ /*
+ * If XATTR_CREATE or no flags are specified together with a
+ * new value, simply insert it.
+ */
+ if (new_xattr) {
+ rb_link_node(&new_xattr->rb_node, parent, rbp);
+ rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root);
+ }
+
+ /*
+ * If XATTR_CREATE or no flags are specified and neither an
+ * old nor a new xattr exists, then we don't need to do anything.
+ */
}
+
+out_unlock:
+ write_unlock(&xattrs->lock);
+ if (err)
+ free_simple_xattr(new_xattr);
+ else
+ free_simple_xattr(xattr);
return err;
}
@@ -1134,14 +1259,31 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size,
return 0;
}
-/*
- * xattr LIST operation for in-memory/pseudo filesystems
+/**
+ * simple_xattr_list - list all xattr objects
+ * @inode: inode from which to get the xattrs
+ * @xattrs: the header of the xattr object
+ * @buffer: the buffer to store all xattrs into
+ * @size: the size of @buffer
+ *
+ * List all xattrs associated with @inode. If @buffer is NULL we return
+ * the required size of the buffer. If @buffer is provided we store the
+ * xattr values into it, provided it is big enough.
+ *
+ * Note, the number of xattr names that can be listed with listxattr(2) is
+ * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed
+ * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names
+ * are found it will return -E2BIG.
+ *
+ * Return: On success the required size or the size of the copied xattrs is
+ * returned. On error a negative error code is returned.
*/
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
+ bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;
+ struct rb_node *rbp;
ssize_t remaining_size = size;
int err = 0;
@@ -1162,8 +1304,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
}
#endif
- spin_lock(&xattrs->lock);
- list_for_each_entry(xattr, &xattrs->head, list) {
+ read_lock(&xattrs->lock);
+ for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
+
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
@@ -1172,18 +1316,76 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
if (err)
break;
}
- spin_unlock(&xattrs->lock);
+ read_unlock(&xattrs->lock);
return err ? err : size - remaining_size;
}
-/*
- * Adds an extended attribute to the list
+/**
+ * rbtree_simple_xattr_less - compare two xattr rbtree nodes
+ * @new_node: new node
+ * @node: current node
+ *
+ * Compare the xattr attached to @new_node with the xattr attached to @node.
+ * Note that this function technically tolerates duplicate entries.
+ *
+ * Return: True if @new_node sorts before @node in the rbtree.
+ */
+static bool rbtree_simple_xattr_less(struct rb_node *new_node,
+ const struct rb_node *node)
+{
+ return rbtree_simple_xattr_node_cmp(new_node, node) < 0;
+}
+
+/**
+ * simple_xattr_add - add xattr objects
+ * @xattrs: the header of the xattr object
+ * @new_xattr: the xattr object to add
+ *
+ * Add an xattr object to @xattrs. This assumes no replacement or removal
+ * of matching xattrs is wanted. Should only be called during inode
+ * initialization when a few distinct initial xattrs are supposed to be set.
+ */
+void simple_xattr_add(struct simple_xattrs *xattrs,
+ struct simple_xattr *new_xattr)
+{
+ write_lock(&xattrs->lock);
+ rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less);
+ write_unlock(&xattrs->lock);
+}
+
+/**
+ * simple_xattrs_init - initialize new xattr header
+ * @xattrs: header to initialize
+ *
+ * Initialize the relevant fields of an xattr header.
*/
-void simple_xattr_list_add(struct simple_xattrs *xattrs,
- struct simple_xattr *new_xattr)
+void simple_xattrs_init(struct simple_xattrs *xattrs)
{
- spin_lock(&xattrs->lock);
- list_add(&new_xattr->list, &xattrs->head);
- spin_unlock(&xattrs->lock);
+ xattrs->rb_root = RB_ROOT;
+ rwlock_init(&xattrs->lock);
+}
+
+/**
+ * simple_xattrs_free - free xattrs
+ * @xattrs: xattr header whose xattrs to destroy
+ *
+ * Destroy all xattrs in @xattrs. When this is called, no one can hold a
+ * reference to any of the xattrs anymore.
+ */
+void simple_xattrs_free(struct simple_xattrs *xattrs)
+{
+ struct rb_node *rbp;
+
+ rbp = rb_first(&xattrs->rb_root);
+ while (rbp) {
+ struct simple_xattr *xattr;
+ struct rb_node *rbp_next;
+
+ rbp_next = rb_next(rbp);
+ xattr = rb_entry(rbp, struct simple_xattr, rb_node);
+ rb_erase(&xattr->rb_node, &xattrs->rb_root);
+ free_simple_xattr(xattr);
+ rbp = rbp_next;
+ }
}
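
Both the rb_find() lookup in simple_xattr_get() and the manual descent
in simple_xattr_set() use the same comparator, so the tree stays
internally consistent regardless of how strcmp() is oriented. The
contract is the usual one: a negative result continues left, a positive
result continues right. A plain-BST userspace sketch of that descent
(structure names are illustrative):

#include <stdio.h>
#include <string.h>

struct xnode {
	const char *name;
	struct xnode *left, *right;
};

/* The same descent rb_find() performs, on a plain BST for the sketch. */
static struct xnode *xattr_find(struct xnode *n, const char *key)
{
	while (n) {
		int c = strcmp(n->name, key);	/* node vs. key, as above */

		if (c < 0)
			n = n->left;
		else if (c > 0)
			n = n->right;
		else
			return n;
	}
	return NULL;
}

int main(void)
{
	/* With this comparator orientation, greater names hang left. */
	struct xnode c = { "user.c", NULL, NULL };
	struct xnode a = { "user.a", NULL, NULL };
	struct xnode root = { "user.b", &c, &a };

	printf("%s\n", xattr_find(&root, "user.c") ? "found" : "missing");
	return 0;
}
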
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index de79f5d07f65..989cf341779b 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1516,7 +1516,7 @@ xfs_alloc_ag_vextent_lastblock(
#ifdef DEBUG
/* Randomly don't execute the first algorithm. */
- if (prandom_u32_max(2))
+ if (get_random_u32_below(2))
return 0;
#endif
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 49d0d4ea63fc..0d56a8d862e8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata(
* the busy list.
*/
bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
+ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) {
bma->datatype |= XFS_ALLOC_USERDATA;
if (bma->offset == 0)
bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 4c16c8c31fcb..35f574421670 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4666,7 +4666,12 @@ xfs_btree_space_to_height(
const unsigned int *limits,
unsigned long long leaf_blocks)
{
- unsigned long long node_blocks = limits[1];
+ /*
+ * The root btree block can have fewer than minrecs pointers in it
+ * because the tree might not be big enough to require that amount of
+ * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
+ */
+ unsigned long long node_blocks = 2;
unsigned long long blocks_left = leaf_blocks - 1;
unsigned int height = 1;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index eef27858a013..29c4b4ccb909 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -556,7 +556,6 @@ xfs_btree_islastblock(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 5362908164b0..01a9e86b3037 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -40,13 +40,12 @@
#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
#define XFS_ERRTAG_BMAP_FINISH_ONE 26
#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+
/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
+ * Drop-writes support removed because write error handling cannot trash
+ * pre-existing delalloc extents in any useful way anymore. We retain the
+ * definition so that we can reject it as an invalid value in
+ * xfs_errortag_valid().
*/
#define XFS_ERRTAG_DROP_WRITES 28
#define XFS_ERRTAG_LOG_BAD_CRC 29
@@ -62,7 +61,9 @@
#define XFS_ERRTAG_LARP 39
#define XFS_ERRTAG_DA_LEAF_SPLIT 40
#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
-#define XFS_ERRTAG_MAX 42
+#define XFS_ERRTAG_WB_DELAY_MS 42
+#define XFS_ERRTAG_WRITE_DELAY_MS 43
+#define XFS_ERRTAG_MAX 44
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -95,7 +96,6 @@
#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
#define XFS_RANDOM_BMAP_FINISH_ONE 1
#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_DROP_WRITES 1
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
@@ -109,5 +109,7 @@
#define XFS_RANDOM_LARP 1
#define XFS_RANDOM_DA_LEAF_SPLIT 1
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
+#define XFS_RANDOM_WB_DELAY_MS 3000
+#define XFS_RANDOM_WRITE_DELAY_MS 3000
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 94db50eb706a..5118dedf9267 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -636,7 +636,7 @@ xfs_ialloc_ag_alloc(
/* randomly do sparse inode allocations */
if (xfs_has_sparseinodes(tp->t_mountp) &&
igeo->ialloc_min_blks < igeo->ialloc_blks)
- do_sparse = prandom_u32_max(2);
+ do_sparse = get_random_u32_below(2);
#endif
/*
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 3f34bafe18dd..6f7ed9288fe4 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -815,11 +815,136 @@ out_error:
/* Is this extent valid? */
static inline bool
xfs_refc_valid(
- struct xfs_refcount_irec *rc)
+ const struct xfs_refcount_irec *rc)
{
return rc->rc_startblock != NULLAGBLOCK;
}
+static inline xfs_nlink_t
+xfs_refc_merge_refcount(
+ const struct xfs_refcount_irec *irec,
+ enum xfs_refc_adjust_op adjust)
+{
+ /* Once a record hits MAXREFCOUNT, it is pinned there forever */
+ if (irec->rc_refcount == MAXREFCOUNT)
+ return MAXREFCOUNT;
+ return irec->rc_refcount + adjust;
+}
+
+static inline bool
+xfs_refc_want_merge_center(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ bool cleft_is_cright,
+ enum xfs_refc_adjust_op adjust,
+ unsigned long long *ulenp)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * To merge with a center record, both shoulder records must be
+ * adjacent to the record we want to adjust. This is only true if
+ * find_left and find_right made all four records valid.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(right) ||
+ !xfs_refc_valid(cleft) || !xfs_refc_valid(cright))
+ return false;
+
+ /* There must only be one record for the entire range. */
+ if (!cleft_is_cright)
+ return false;
+
+ /* The shoulder record refcounts must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length,
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount + right->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ *ulenp = ulen;
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_left(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a left merge, the left shoulder record must be adjacent to the
+ * start of the range. If this is true, find_left made left and cleft
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
+ return false;
+
+ /* Left shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length,
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_right(
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = right->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a right merge, the right shoulder record must be adjacent to the
+ * end of the range. If this is true, find_right made cright and right
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(right) || !xfs_refc_valid(cright))
+ return false;
+
+ /* Right shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cright, adjust);
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length,
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cright->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
/*
* Try to merge with any extents on the boundaries of the adjustment range.
*/
@@ -861,23 +986,15 @@ xfs_refcount_merge_extents(
(cleft.rc_blockcount == cright.rc_blockcount);
/* Try to merge left, cleft, and right. cleft must == cright. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
- right.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
- xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- right.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal,
+ adjust, &ulen)) {
*shape_changed = true;
return xfs_refcount_merge_center_extents(cur, &left, &cleft,
&right, ulen, aglen);
}
/* Try to merge left and cleft. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_left(&left, &cleft, adjust)) {
*shape_changed = true;
error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
agbno, aglen);
@@ -893,10 +1010,7 @@ xfs_refcount_merge_extents(
}
/* Try to merge cright and right. */
- ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
- if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
- right.rc_refcount == cright.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_right(&cright, &right, adjust)) {
*shape_changed = true;
return xfs_refcount_merge_right_extent(cur, &right, &cright,
aglen);
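
Two defensive patterns recur in the merge predicates above: the
refcount adjustment saturates at MAXREFCOUNT, and candidate lengths are
summed in an unsigned long long before comparing against MAXREFCEXTLEN
so that u32 addition cannot wrap. A compact userspace sketch (both
constants are assumed stand-ins for the xfs_format.h definitions):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAXREFCOUNT	UINT32_MAX		/* assumed stand-in */
#define MAXREFCEXTLEN	((1ULL << 21) - 1)	/* assumed stand-in */

static uint32_t merge_refcount(uint32_t rc, int adjust)
{
	/* Once a record pins at the maximum it stays there forever. */
	if (rc == MAXREFCOUNT)
		return MAXREFCOUNT;
	return rc + adjust;
}

static bool merged_len_ok(uint32_t a, uint32_t b)
{
	/* Widen before adding so near-UINT32_MAX lengths cannot wrap. */
	unsigned long long ulen = (unsigned long long)a + b;

	return ulen < MAXREFCEXTLEN;
}

int main(void)
{
	assert(merge_refcount(MAXREFCOUNT, 1) == MAXREFCOUNT);
	assert(!merged_len_ok(UINT32_MAX, UINT32_MAX));
	return 0;
}
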
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a20cade590e9..1eeecf2eb2a7 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -972,7 +972,9 @@ xfs_log_sb(
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_ifree = min_t(uint64_t,
+ percpu_counter_sum(&mp->m_ifree),
+ mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
}
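
With lazy superblock counters, the summed per-CPU values are only
loosely synchronized, so a transiently stale m_ifree can exceed
m_icount; clamping at log time keeps the ondisk superblock
self-consistent. min_t() is the kernel's type-explicit minimum; a sketch
of the clamp with illustrative counter values:

#include <assert.h>
#include <stdint.h>

#define min_t(type, x, y) ((type)(x) < (type)(y) ? (type)(x) : (type)(y))

int main(void)
{
	uint64_t icount = 100;	/* illustrative: total inode count */
	uint64_t ifree = 105;	/* stale sum, momentarily too large */

	assert(min_t(uint64_t, ifree, icount) == 100);
	return 0;
}
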
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index b7b838bd4ba4..4dd52b15f09c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -609,9 +609,16 @@ out:
/* AGFL */
struct xchk_agfl_info {
- unsigned int sz_entries;
+ /* Number of AGFL entries that the AGF claims are in use. */
+ unsigned int agflcount;
+
+ /* Number of AGFL entries that we found. */
unsigned int nr_entries;
+
+ /* Buffer to hold AGFL entries for extent checking. */
xfs_agblock_t *entries;
+
+ struct xfs_buf *agfl_bp;
struct xfs_scrub *sc;
};
@@ -641,10 +648,10 @@ xchk_agfl_block(
struct xfs_scrub *sc = sai->sc;
if (xfs_verify_agbno(sc->sa.pag, agbno) &&
- sai->nr_entries < sai->sz_entries)
+ sai->nr_entries < sai->agflcount)
sai->entries[sai->nr_entries++] = agbno;
else
- xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
+ xchk_block_set_corrupt(sc, sai->agfl_bp);
xchk_agfl_block_xref(sc, agbno);
@@ -696,19 +703,26 @@ int
xchk_agfl(
struct xfs_scrub *sc)
{
- struct xchk_agfl_info sai;
+ struct xchk_agfl_info sai = {
+ .sc = sc,
+ };
struct xfs_agf *agf;
xfs_agnumber_t agno = sc->sm->sm_agno;
- unsigned int agflcount;
unsigned int i;
int error;
+ /* Lock the AGF and AGI so that nobody can touch this AG. */
error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
- goto out;
+ return error;
if (!sc->sa.agf_bp)
return -EFSCORRUPTED;
- xchk_buffer_recheck(sc, sc->sa.agfl_bp);
+
+ /* Try to read the AGFL, and verify its structure if we get it. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp);
+ if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ return error;
+ xchk_buffer_recheck(sc, sai.agfl_bp);
xchk_agfl_xref(sc);
@@ -717,24 +731,21 @@ xchk_agfl(
/* Allocate buffer to ensure uniqueness of AGFL entries. */
agf = sc->sa.agf_bp->b_addr;
- agflcount = be32_to_cpu(agf->agf_flcount);
- if (agflcount > xfs_agfl_size(sc->mp)) {
+ sai.agflcount = be32_to_cpu(agf->agf_flcount);
+ if (sai.agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out;
}
- memset(&sai, 0, sizeof(sai));
- sai.sc = sc;
- sai.sz_entries = agflcount;
- sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
- KM_MAYFAIL);
+ sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t),
+ XCHK_GFP_FLAGS);
if (!sai.entries) {
error = -ENOMEM;
goto out;
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
- sc->sa.agfl_bp, xchk_agfl_block, &sai);
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp,
+ xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
goto out_free;
@@ -742,7 +753,7 @@ xchk_agfl(
if (error)
goto out_free;
- if (agflcount != sai.nr_entries) {
+ if (sai.agflcount != sai.nr_entries) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out_free;
}
@@ -758,7 +769,7 @@ xchk_agfl(
}
out_free:
- kmem_free(sai.entries);
+ kvfree(sai.entries);
out:
return error;
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 1b0b4e243f77..d75d82151eeb 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -442,12 +442,18 @@ out_revert:
/* AGFL */
struct xrep_agfl {
+ /* Bitmap of alleged AGFL blocks that we're not going to add. */
+ struct xbitmap crossed;
+
/* Bitmap of other OWN_AG metadata blocks. */
struct xbitmap agmetablocks;
/* Bitmap of free space. */
struct xbitmap *freesp;
+ /* rmapbt cursor for finding crosslinked blocks */
+ struct xfs_btree_cur *rmap_cur;
+
struct xfs_scrub *sc;
};
@@ -477,6 +483,41 @@ xrep_agfl_walk_rmap(
return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
+/* Strike out the blocks that are cross-linked according to the rmapbt. */
+STATIC int
+xrep_agfl_check_extent(
+ struct xrep_agfl *ra,
+ uint64_t start,
+ uint64_t len)
+{
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t last_agbno = agbno + len - 1;
+ int error;
+
+ ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
+
+ while (agbno <= last_agbno) {
+ bool other_owners;
+
+ error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
+ &XFS_RMAP_OINFO_AG, &other_owners);
+ if (error)
+ return error;
+
+ if (other_owners) {
+ error = xbitmap_set(&ra->crossed, agbno, 1);
+ if (error)
+ return error;
+ }
+
+ if (xchk_should_terminate(ra->sc, &error))
+ return error;
+ agbno++;
+ }
+
+ return 0;
+}
+
/*
* Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
* which blocks belong to the AGFL.
@@ -496,44 +537,58 @@ xrep_agfl_collect_blocks(
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
+ struct xbitmap_range *br, *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
xbitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
-
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/*
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
- xbitmap_destroy(&ra.agmetablocks);
if (error)
- return error;
+ goto out_bmp;
+
+ /* Strike out the blocks that are cross-linked. */
+ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
+ error = xrep_agfl_check_extent(&ra, br->start, br->len);
+ if (error)
+ break;
+ }
+ xfs_btree_del_cursor(ra.rmap_cur, error);
+ if (error)
+ goto out_bmp;
+ error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ if (error)
+ goto out_bmp;
/*
* Calculate the new AGFL size. If we found more blocks than fit in
@@ -541,11 +596,10 @@ xrep_agfl_collect_blocks(
*/
*flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
- return 0;
-err:
+out_bmp:
+ xbitmap_destroy(&ra.crossed);
xbitmap_destroy(&ra.agmetablocks);
- xfs_btree_del_cursor(cur, error);
return error;
}
@@ -631,7 +685,7 @@ xrep_agfl_init_header(
if (br->len)
break;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
}
/* Write new AGFL to disk. */
@@ -697,7 +751,6 @@ xrep_agfl(
* freespace overflow to the freespace btrees.
*/
sc->sa.agf_bp = agf_bp;
- sc->sa.agfl_bp = agfl_bp;
error = xrep_roll_ag_trans(sc);
if (error)
goto err;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index b6f0c9f3f124..31529b9bf389 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -49,7 +49,7 @@ xchk_setup_xattr_buf(
if (ab) {
if (sz <= ab->sz)
return 0;
- kmem_free(ab);
+ kvfree(ab);
sc->buf = NULL;
}
@@ -79,7 +79,8 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX,
+ XCHK_GFP_FLAGS);
if (error)
return error;
}
@@ -138,8 +139,7 @@ xchk_xattr_listent(
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -324,8 +324,7 @@ xchk_xattr_block(
return 0;
/* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
return -EDEADLOCK;
if (error)
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index b89bf9de9b1c..a255f09e9f0a 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "scrub/scrub.h"
#include "scrub/bitmap.h"
/*
@@ -25,7 +26,7 @@ xbitmap_set(
{
struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
+ bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS);
if (!bmr)
return -ENOMEM;
@@ -47,7 +48,7 @@ xbitmap_destroy(
for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
- kmem_free(bmr);
+ kfree(bmr);
}
}
@@ -174,15 +175,15 @@ xbitmap_disunion(
/* Total overlap, just delete ex. */
lp = lp->next;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
break;
case 0:
/*
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xbitmap_range),
- KM_MAYFAIL);
+ new_br = kmalloc(sizeof(struct xbitmap_range),
+ XCHK_GFP_FLAGS);
if (!new_br) {
error = -ENOMEM;
goto out;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index f0b9cb6506fd..d50d0eab196a 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -90,6 +90,7 @@ out:
struct xchk_bmap_info {
struct xfs_scrub *sc;
+ struct xfs_iext_cursor icur;
xfs_fileoff_t lastoff;
bool is_rt;
bool is_shared;
@@ -146,6 +147,48 @@ xchk_bmap_get_rmap(
return has_rmap;
}
+static inline bool
+xchk_bmap_has_prev(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
+ return false;
+ if (got.br_startoff + got.br_blockcount != irec->br_startoff)
+ return false;
+ if (got.br_startblock + got.br_blockcount != irec->br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
+static inline bool
+xchk_bmap_has_next(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
+ return false;
+ if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
+ return false;
+ if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
/* Make sure that we have rmapbt records for this extent. */
STATIC void
xchk_bmap_xref_rmap(
@@ -214,6 +257,34 @@ xchk_bmap_xref_rmap(
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+
+ /*
+ * If the rmap starts before this bmbt record, make sure there's a bmbt
+ * record for the previous offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
+ !xchk_bmap_has_prev(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
+
+ /*
+ * If the rmap ends after this bmbt record, make sure there's a bmbt
+ * record for the next offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (info->whichfork != XFS_COW_FORK &&
+ rmap_end > agbno + irec->br_blockcount &&
+ !xchk_bmap_has_next(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
}
/* Cross-reference a single rtdev extent record. */
@@ -264,6 +335,8 @@ xchk_bmap_iextent_xref(
case XFS_COW_FORK:
xchk_xref_is_cow_staging(info->sc, agbno,
irec->br_blockcount);
+ xchk_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
break;
}
@@ -297,14 +370,13 @@ xchk_bmap_dirattr_extent(
}
/* Scrub a single extent record. */
-STATIC int
+STATIC void
xchk_bmap_iextent(
struct xfs_inode *ip,
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- int error = 0;
/*
* Check for out-of-order extents. This record could have come
@@ -325,14 +397,6 @@ xchk_bmap_iextent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- /*
- * Check for delalloc extents. We never iterate the ones in the
- * in-core extent scan, and we should never see these in the bmbt.
- */
- if (isnullstartblock(irec->br_startblock))
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
-
/* Make sure the extent points to a valid place. */
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -353,15 +417,12 @@ xchk_bmap_iextent(
irec->br_startoff);
if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return 0;
+ return;
if (info->is_rt)
xchk_bmap_rt_iextent_xref(ip, info, irec);
else
xchk_bmap_iextent_xref(ip, info, irec);
-
- info->lastoff = irec->br_startoff + irec->br_blockcount;
- return error;
}
/* Scrub a bmbt record. */
@@ -599,14 +660,41 @@ xchk_bmap_check_rmaps(
for_each_perag(sc->mp, agno, pag) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
- if (error)
- break;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- break;
+ if (error ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xfs_perag_put(pag);
+ return error;
+ }
}
- if (pag)
- xfs_perag_put(pag);
- return error;
+
+ return 0;
+}
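
Design note: for_each_perag() holds a reference on the current pag for the
duration of each pass, so bailing out of the loop body early leaves that
reference live. That is why both early-exit paths above call xfs_perag_put()
by hand before returning, replacing the old break-then-put-outside-the-loop
pattern.
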
+
+/* Scrub a delalloc reservation from the incore extent map tree. */
+STATIC void
+xchk_bmap_iextent_delalloc(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
}
/*
@@ -626,7 +714,6 @@ xchk_bmap(
struct xfs_inode *ip = sc->ip;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
- struct xfs_iext_cursor icur;
int error = 0;
/* Non-existent forks can be ignored. */
@@ -661,6 +748,8 @@ xchk_bmap(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
/* No mappings to check. */
+ if (whichfork == XFS_COW_FORK)
+ xchk_fblock_set_corrupt(sc, whichfork, 0);
goto out;
case XFS_DINODE_FMT_EXTENTS:
break;
@@ -690,20 +779,22 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
ifp = xfs_ifork_ptr(ip, whichfork);
- for_each_xfs_iext(ifp, &icur, &irec) {
+ for_each_xfs_iext(ifp, &info.icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
goto out;
- if (isnullstartblock(irec.br_startblock))
- continue;
+
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
goto out;
}
- error = xchk_bmap_iextent(ip, &info, &irec);
- if (error)
- goto out;
+
+ if (isnullstartblock(irec.br_startblock))
+ xchk_bmap_iextent_delalloc(ip, &info, &irec);
+ else
+ xchk_bmap_iextent(ip, &info, &irec);
+ info.lastoff = irec.br_startoff + irec.br_blockcount;
}
error = xchk_bmap_check_rmaps(sc, whichfork);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 2f4519590dc1..0fd36d5b4646 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -408,7 +408,6 @@ xchk_btree_check_owner(
struct xfs_buf *bp)
{
struct xfs_btree_cur *cur = bs->cur;
- struct check_owner *co;
/*
* In theory, xfs_btree_get_block should only give us a null buffer
@@ -431,10 +430,13 @@ xchk_btree_check_owner(
* later scanning.
*/
if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
- co = kmem_alloc(sizeof(struct check_owner),
- KM_MAYFAIL);
+ struct check_owner *co;
+
+ co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
if (!co)
return -ENOMEM;
+
+ INIT_LIST_HEAD(&co->list);
co->level = level;
co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check);
@@ -649,7 +651,7 @@ xchk_btree(
xchk_btree_set_corrupt(sc, cur, 0);
return 0;
}
- bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
if (!bs)
return -ENOMEM;
bs->cur = cur;
@@ -740,9 +742,9 @@ out:
error = xchk_btree_check_block_owner(bs, co->level,
co->daddr);
list_del(&co->list);
- kmem_free(co);
+ kfree(co);
}
- kmem_free(bs);
+ kfree(bs);
return error;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 9bbbf20f401b..613260b04a3d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -424,10 +424,6 @@ xchk_ag_read_headers(
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
return error;
- error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- return error;
-
return 0;
}
@@ -515,10 +511,6 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
- if (sa->agfl_bp) {
- xfs_trans_brelse(sc->tp, sa->agfl_bp);
- sa->agfl_bp = NULL;
- }
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -789,6 +781,33 @@ xchk_buffer_recheck(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}
+static inline int
+xchk_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ int error;
+
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ sc->sm->sm_type = smtype;
+ return error;
+}
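
Design note: the nested scrubbers dispatch on sm_type (xchk_bmap_data(), for
instance, derives which fork to walk from it), so this helper stashes the
caller's type, installs the sub-scrub's type for the duration of the call, and
restores the original before returning so that later reporting is attributed
correctly.
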
+
/*
* Scrub the attr/data forks of a metadata inode. The metadata inode must be
* pointed to by sc->ip and the ILOCK must be held.
@@ -797,13 +816,17 @@ int
xchk_metadata_inode_forks(
struct xfs_scrub *sc)
{
- __u32 smtype;
bool shared;
int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ /* Check the inode record. */
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
/* Metadata inodes don't live on the rt device. */
if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -823,10 +846,7 @@ xchk_metadata_inode_forks(
}
/* Invoke the data fork scrubber. */
- smtype = sc->sm->sm_type;
- sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
- error = xchk_bmap_data(sc);
- sc->sm->sm_type = smtype;
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
@@ -841,7 +861,7 @@ xchk_metadata_inode_forks(
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
- return error;
+ return 0;
}
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 454145db10e7..b73648d81d23 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -25,7 +25,7 @@ xchk_should_terminate(
if (fatal_signal_pending(current)) {
if (*error == 0)
- *error = -EAGAIN;
+ *error = -EINTR;
return true;
}
return false;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 84fe3d33d699..d17cee177085 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -486,7 +486,7 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS);
if (!ds)
return -ENOMEM;
ds->dargs.dp = sc->ip;
@@ -591,6 +591,6 @@ out:
out_state:
xfs_da_state_free(ds->state);
- kmem_free(ds);
+ kfree(ds);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 5c87800ab223..d1b0f23c2c59 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -666,7 +666,12 @@ xchk_directory_blocks(
struct xfs_scrub *sc)
{
struct xfs_bmbt_irec got;
- struct xfs_da_args args;
+ struct xfs_da_args args = {
+ .dp = sc->ip,
+ .whichfork = XFS_DATA_FORK,
+ .geo = sc->mp->m_dir_geo,
+ .trans = sc->tp,
+ };
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
@@ -689,9 +694,6 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- args.dp = sc->ip;
- args.geo = mp->m_dir_geo;
- args.trans = sc->tp;
error = xfs_dir2_isblock(&args, &is_block);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 6a6f8fe7f87c..4777e7b89fdc 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -14,6 +14,8 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -43,6 +45,16 @@
* our tolerance for mismatch between expected and actual counter values.
*/
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -116,10 +128,11 @@ xchk_setup_fscounters(
struct xchk_fscounters *fsc;
int error;
- sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
fsc = sc->buf;
+ fsc->sc = sc;
xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
@@ -138,6 +151,18 @@ xchk_setup_fscounters(
return xchk_trans_alloc(sc, 0);
}
+/*
+ * Part 1: Collecting filesystem summary counts. For each AG, we add its
+ * summary counts (total inodes, free inodes, free data blocks) to an incore
+ * copy of the overall filesystem summary counts.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error.
+ */
+
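
As a toy userspace model of the hazard just described (the flag bits and the
errno value are made up for illustration, and the helper below only loosely
mirrors xchk_*_process_error): once such a helper absorbs an errno into an
output flag, the collection phase can return 0, so only an explicitly set
INCOMPLETE flag tells part 2 that the counts are partial.

#include <stdbool.h>
#include <stdio.h>

#define OFLAG_CORRUPT		(1 << 0)
#define OFLAG_INCOMPLETE	(1 << 1)
#define TOY_EFSCORRUPTED	117	/* illustrative value only */

/* absorb "corruption" errnos into a state flag, like xchk_*_process_error */
static bool process_error(unsigned int *oflags, int *error)
{
	if (*error == -TOY_EFSCORRUPTED) {
		*oflags |= OFLAG_CORRUPT;
		*error = 0;		/* errno consumed... */
		return false;		/* ...but the walk still stopped */
	}
	return *error == 0;
}

int main(void)
{
	unsigned int oflags = 0;
	int error = -TOY_EFSCORRUPTED;

	if (!process_error(&oflags, &error)) {
		/* counts are partial: without this, part 2 would compare
		 * them against the real counters and report corruption */
		oflags |= OFLAG_INCOMPLETE;
	}
	printf("oflags=%#x error=%d\n", oflags, error);	/* oflags=0x3 error=0 */
	return 0;
}
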
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
@@ -225,8 +250,10 @@ retry:
}
if (pag)
xfs_perag_put(pag);
- if (error)
+ if (error) {
+ xchk_set_incomplete(sc);
return error;
+ }
/*
* The global incore space reservation is taken from the incore
@@ -267,6 +294,64 @@ retry:
return 0;
}
+#ifdef CONFIG_XFS_RT
+STATIC int
+xchk_fscount_add_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xchk_fscounters *fsc = priv;
+ int error = 0;
+
+ fsc->frextents += rec->ar_extcount;
+
+ xchk_should_terminate(fsc->sc, &error);
+ return error;
+}
+
+/* Calculate the number of free realtime extents from the realtime bitmap. */
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ fsc->frextents = 0;
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ if (error) {
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
+#else
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ fsc->frextents = 0;
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Part 2: Comparing filesystem summary counters. All we have to do here is
+ * sum the percpu counters and compare them to what we've observed.
+ */
+
/*
* Is the @counter reasonably close to the @expected value?
*
@@ -333,16 +418,17 @@ xchk_fscounters(
{
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
- int64_t icount, ifree, fdblocks;
+ int64_t icount, ifree, fdblocks, frextents;
int error;
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0)
+ if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
xchk_set_corrupt(sc);
/* See if icount is obviously wrong. */
@@ -353,6 +439,10 @@ xchk_fscounters(
if (fdblocks > mp->m_sb.sb_dblocks)
xchk_set_corrupt(sc);
+ /* See if frextents is obviously wrong. */
+ if (frextents > mp->m_sb.sb_rextents)
+ xchk_set_corrupt(sc);
+
/*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
@@ -367,6 +457,13 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
+ /* Count the free extents counter for rt volumes. */
+ error = xchk_fscount_count_frextents(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
/* Compare the in-core counters with whatever we counted. */
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
xchk_set_corrupt(sc);
@@ -378,5 +475,9 @@ xchk_fscounters(
fsc->fdblocks))
xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ fsc->frextents))
+ xchk_set_corrupt(sc);
+
return 0;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 51820b40ab1c..7a2f38e5202c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -365,7 +365,7 @@ xchk_dinode(
* pagecache can't cache all the blocks in this file due to
* overly large offsets, flag the inode for admin review.
*/
- if (isize >= mp->m_super->s_maxbytes)
+ if (isize > mp->m_super->s_maxbytes)
xchk_ino_set_warning(sc, ino);
/* di_nblocks */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 21b4c9006859..9eeac8565394 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -14,6 +14,7 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -84,7 +85,7 @@ xchk_quota_item(
int error = 0;
if (xchk_should_terminate(sc, &error))
- return -ECANCELED;
+ return error;
/*
* Except for the root dquot, the actual dquot we got must either have
@@ -189,11 +190,12 @@ xchk_quota_data_fork(
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
+
/*
- * delalloc extents or blocks mapped above the highest
+ * delalloc/unwritten extents or blocks mapped above the highest
* quota id shouldn't happen.
*/
- if (isnullstartblock(irec.br_startblock) ||
+ if (!xfs_bmap_is_written_extent(&irec) ||
irec.br_startoff > max_dqid_off ||
irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index a26ee0f24ef2..d9c1b3cea4a5 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check(
* is healthy each rmap_irec we see will be in agbno order
* so we don't need insertion sort here.
*/
- frag = kmem_alloc(sizeof(struct xchk_refcnt_frag),
- KM_MAYFAIL);
+ frag = kmalloc(sizeof(struct xchk_refcnt_frag),
+ XCHK_GFP_FLAGS);
if (!frag)
return -ENOMEM;
memcpy(&frag->rm, rec, sizeof(frag->rm));
@@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments(
continue;
}
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
nr++;
}
@@ -257,11 +257,11 @@ done:
/* Delete fragments and work list. */
list_for_each_entry_safe(frag, n, &worklist, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
@@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap(
out_free:
list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c18bd039fce9..4b92f9253ccd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -61,7 +61,6 @@ xrep_attempt(
sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
case -EDEADLOCK:
- case -EAGAIN:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
@@ -70,10 +69,15 @@ xrep_attempt(
/*
* We tried harder but still couldn't grab all the resources
* we needed to fix it. The corruption has not been fixed,
- * so report back to userspace.
+ * so exit to userspace with the scan's output flags unchanged.
*/
- return -EFSCORRUPTED;
+ return 0;
default:
+ /*
+ * EAGAIN tells the caller to re-scrub, so we cannot return
+ * that here.
+ */
+ ASSERT(error != -EAGAIN);
return error;
}
}
@@ -121,32 +125,40 @@ xrep_roll_ag_trans(
{
int error;
- /* Keep the AG header buffers locked so we can keep going. */
- if (sc->sa.agi_bp)
+ /*
+ * Keep the AG header buffers locked while we roll the transaction.
+ * Ensure that both AG buffers are dirty and held when we roll the
+ * transaction so that they move forward in the log without losing the
+ * bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
- if (sc->sa.agf_bp)
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+ }
/*
- * Roll the transaction. We still own the buffer and the buffer lock
- * regardless of whether or not the roll succeeds. If the roll fails,
- * the buffers will be released during teardown on our way out of the
- * kernel. If it succeeds, we join them to the new transaction and
- * move on.
+ * Roll the transaction. We still hold the AG header buffers locked
+ * regardless of whether or not that succeeds. On failure, the buffers
+ * will be released during teardown on our way out of the kernel. If
+ * successful, join the buffers to the new transaction and move on.
*/
error = xfs_trans_roll(&sc->tp);
if (error)
return error;
- /* Join AG headers to the new transaction. */
+ /* Join the AG headers to the new transaction. */
if (sc->sa.agi_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
if (sc->sa.agf_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
}
@@ -498,6 +510,7 @@ xrep_put_freelist(
struct xfs_scrub *sc,
xfs_agblock_t agbno)
{
+ struct xfs_buf *agfl_bp;
int error;
/* Make sure there's space on the freelist. */
@@ -516,8 +529,12 @@ xrep_put_freelist(
return error;
/* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- sc->sa.agfl_bp, agbno, 0);
+ agfl_bp, agbno, 0);
if (error)
return error;
xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 2e8e400f10a9..07a7a75f987f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -174,7 +174,7 @@ xchk_teardown(
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
if (sc->buf) {
- kmem_free(sc->buf);
+ kvfree(sc->buf);
sc->buf = NULL;
}
return error;
@@ -467,7 +467,7 @@ xfs_scrub_metadata(
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
- sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
goto out;
@@ -557,7 +557,7 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
- kmem_free(sc);
+ kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 3de5287e98d8..b4d391b4c938 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,15 @@
struct xfs_scrub;
+/*
+ * Standard flags for allocating memory within scrub. NOFS context is
+ * configured by the process allocation scope. Scrub and repair must be able
+ * to back out gracefully if there isn't enough memory. Force-cast to avoid
+ * complaints from static checkers.
+ */
+#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
+ __GFP_RETRY_MAYFAIL))
+
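
Design note: __GFP_RETRY_MAYFAIL asks the allocator to try hard before giving
up, while __GFP_NOWARN keeps the occasional expected failure out of dmesg;
both fit scrub's contract of backing out cleanly with -ENOMEM. NOFS
protection, when required, comes from a surrounding memalloc_nofs_save()/
memalloc_nofs_restore() scope (the xfs_icache.c hunk further down shows the
scoped pattern) rather than from passing GFP_NOFS at each call site.
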
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
@@ -39,7 +48,6 @@ struct xchk_ag {
/* AG btree roots */
struct xfs_buf *agf_bp;
- struct xfs_buf *agfl_bp;
struct xfs_buf *agi_bp;
/* AG btrees */
@@ -161,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
-struct xchk_fscounters {
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- unsigned long long icount_min;
- unsigned long long icount_max;
-};
-
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 75311f8daeeb..c1c99ffe7408 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -21,7 +21,7 @@ xchk_setup_symlink(
struct xfs_scrub *sc)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b744c62052b6..a05f44eb8178 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -242,12 +242,13 @@ xfs_acl_set_mode(
}
int
-xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
umode_t mode;
bool set_mode = false;
int error = 0;
+ struct inode *inode = d_inode(dentry);
if (!acl)
goto set_acl;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 263404d0bfda..dcd176149c7a 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -11,7 +11,7 @@ struct posix_acl;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu);
-extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+extern int xfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5d1a995b15f8..41734202796f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -17,6 +17,8 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -114,9 +116,8 @@ xfs_end_ioend(
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- xfs_bmap_punch_delalloc_range(ip,
- XFS_B_TO_FSBT(mp, offset),
- XFS_B_TO_FSB(mp, size));
+ xfs_bmap_punch_delalloc_range(ip, offset,
+ offset + size);
}
goto done;
}
@@ -218,11 +219,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
+ trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
return false;
+ }
if (xfs_inode_has_cow_data(ip) &&
- XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
+ trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
return false;
+ }
return true;
}
@@ -286,6 +293,8 @@ xfs_map_blocks(
if (xfs_is_shutdown(mp))
return -EIO;
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
/*
* COW fork blocks can overlap data fork blocks even if the blocks
* aren't shared. COW I/O always takes precedent, so we must always
@@ -373,7 +382,7 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
@@ -455,12 +464,8 @@ xfs_discard_folio(
struct folio *folio,
loff_t pos)
{
- struct inode *inode = folio->mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- size_t offset = offset_in_folio(folio, pos);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
int error;
if (xfs_is_shutdown(mp))
@@ -470,8 +475,9 @@ xfs_discard_folio(
"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
folio, ip->i_ino, pos);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_folio(inode, folio) - pageoff_fsb);
+ error = xfs_bmap_punch_delalloc_range(ip, pos,
+ round_up(pos, folio_size(folio)));
+
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 04d0c2bff67c..867645b74d88 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -590,11 +590,13 @@ out_unlock_iolock:
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
- xfs_fileoff_t length)
+ xfs_off_t start_byte,
+ xfs_off_t end_byte)
{
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = &ip->i_df;
- xfs_fileoff_t end_fsb = start_fsb + length;
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range(
while (got.br_startoff + got.br_blockcount > start_fsb) {
del = got;
- xfs_trim_extent(&del, start_fsb, length);
+ xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
/*
* A delete can push the cursor forward. Step back to the
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 24b37d211f1d..6888078f5c31 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
- xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+ xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dde346450952..54c774af6e1c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,6 +1945,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
+ invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 522d450a94b1..df7322ed73fa 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1018,6 +1018,8 @@ xfs_buf_item_relse(
trace_xfs_buf_item_relse(bp, _RET_IP_);
ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ if (atomic_read(&bip->bli_refcount))
+ return;
bp->b_log_item = NULL;
xfs_buf_rele(bp);
xfs_buf_item_free(bip);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index c6b2aabd6f18..ae082808cfed 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REFCOUNT_FINISH_ONE,
XFS_RANDOM_BMAP_FINISH_ONE,
XFS_RANDOM_AG_RESV_CRITICAL,
- XFS_RANDOM_DROP_WRITES,
+ 0, /* XFS_RANDOM_DROP_WRITES has been removed */
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
@@ -60,6 +60,8 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_LARP,
XFS_RANDOM_DA_LEAF_SPLIT,
XFS_RANDOM_ATTR_LEAF_TO_NODE,
+ XFS_RANDOM_WB_DELAY_MS,
+ XFS_RANDOM_WRITE_DELAY_MS,
};
struct xfs_errortag_attr {
@@ -162,7 +164,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA
XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
@@ -176,6 +177,8 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
+XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
+XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -206,7 +209,6 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
@@ -220,6 +222,8 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(larp),
XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
+ XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
+ XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
@@ -256,6 +260,32 @@ xfs_errortag_del(
kmem_free(mp->m_errortag);
}
+static bool
+xfs_errortag_valid(
+ unsigned int error_tag)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return false;
+
+ /* Error out removed injection types */
+ if (error_tag == XFS_ERRTAG_DROP_WRITES)
+ return false;
+ return true;
+}
+
+bool
+xfs_errortag_enabled(
+ struct xfs_mount *mp,
+ unsigned int tag)
+{
+ if (!mp->m_errortag)
+ return false;
+ if (!xfs_errortag_valid(tag))
+ return false;
+
+ return mp->m_errortag[tag] != 0;
+}
+
bool
xfs_errortag_test(
struct xfs_mount *mp,
@@ -277,9 +307,11 @@ xfs_errortag_test(
if (!mp->m_errortag)
return false;
- ASSERT(error_tag < XFS_ERRTAG_MAX);
+ if (!xfs_errortag_valid(error_tag))
+ return false;
+
randfactor = mp->m_errortag[error_tag];
- if (!randfactor || prandom_u32_max(randfactor))
+ if (!randfactor || get_random_u32_below(randfactor))
return false;
xfs_warn_ratelimited(mp,
@@ -293,7 +325,7 @@ xfs_errortag_get(
struct xfs_mount *mp,
unsigned int error_tag)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return mp->m_errortag[error_tag];
@@ -305,7 +337,7 @@ xfs_errortag_set(
unsigned int error_tag,
unsigned int tag_value)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
mp->m_errortag[error_tag] = tag_value;
@@ -319,7 +351,7 @@ xfs_errortag_add(
{
BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return xfs_errortag_set(mp, error_tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5191e9145e55..dbe6c37dc697 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
const char *file, int line, unsigned int error_tag);
#define XFS_TEST_ERROR(expr, mp, tag) \
((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
+#define XFS_ERRORTAG_DELAY(mp, tag) \
+ do { \
+ might_sleep(); \
+ if (!xfs_errortag_enabled((mp), (tag))) \
+ break; \
+ xfs_warn_ratelimited((mp), \
+"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \
+ (mp)->m_errortag[(tag)], __FILE__, __LINE__, \
+ (mp)->m_super->s_id); \
+ mdelay((mp)->m_errortag[(tag)]); \
+ } while (0)
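
Usage note: for the delay tags the stored per-tag value is the delay itself in
milliseconds (mdelay() consumes it directly), not the random-factor
denominator that ordinary error tags use. So, assuming the usual errortag
sysfs layout, writing e.g. 500 to /sys/fs/xfs/<dev>/errortag/wb_delay_ms
should stall each writeback mapping attempt by half a second, widening the
stale-iomap race windows these patches are probing.
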
extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
@@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ad22a003f959..f3d328e4a440 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -236,6 +236,7 @@ xfs_extent_busy_update_extent(
*
*/
busyp->bno = fend;
+ busyp->length = bend - fend;
} else if (bbno < fbno) {
/*
* Case 8:
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e462d39c840e..595a5bcf46b9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1325,7 +1325,7 @@ __xfs_filemap_fault(
if (write_fault) {
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
- &xfs_buffered_write_iomap_ops);
+ &xfs_page_mkwrite_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
ret = filemap_fault(vmf);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index d8337274c74d..88a88506ffff 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
struct xfs_mount *mp = tp->t_mountp;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
/*
* Set up query parameters to return free rtextents covering the range
@@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
if (error)
goto err;
err:
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
return error;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index eae7427062cf..ddeaccc04aec 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -342,6 +342,9 @@ xfs_iget_recycle(
trace_xfs_iget_recycle(ip);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return -EAGAIN;
+
/*
* We need to make it look like the inode is being reclaimed to prevent
* the actual reclaim workers from stomping over us while we recycle
@@ -355,6 +358,7 @@ xfs_iget_recycle(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -518,6 +522,8 @@ xfs_iget_cache_hit(
if (ip->i_flags & XFS_IRECLAIMABLE) {
/* Drops i_flags_lock and RCU read lock. */
error = xfs_iget_recycle(pag, ip);
+ if (error == -EAGAIN)
+ goto out_skip;
if (error)
return error;
} else {
@@ -1847,12 +1853,20 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
+ unsigned int nofs_flag;
WRITE_ONCE(gc->items, 0);
if (!node)
return;
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
ip = llist_entry(node, struct xfs_inode, i_gclist);
trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
@@ -1861,6 +1875,8 @@ xfs_inodegc_worker(
xfs_iflags_set(ip, XFS_INACTIVATING);
xfs_inodegc_inactivate(ip);
}
+
+ memalloc_nofs_restore(nofs_flag);
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index aa303be11576..d354ea2b74f9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2479,7 +2479,7 @@ xfs_remove(
error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
tp->t_mountp->m_sb.sb_rootino, 0);
if (error)
- return error;
+ goto out_trans_cancel;
}
} else {
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f783e979629..736510bc241b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -754,7 +754,7 @@ xfs_bulkstat_fmt(
static int
xfs_bulk_ireq_setup(
struct xfs_mount *mp,
- struct xfs_bulk_ireq *hdr,
+ const struct xfs_bulk_ireq *hdr,
struct xfs_ibulk *breq,
void __user *ubuffer)
{
@@ -780,7 +780,7 @@ xfs_bulk_ireq_setup(
switch (hdr->ino) {
case XFS_BULK_IREQ_SPECIAL_ROOT:
- hdr->ino = mp->m_sb.sb_rootino;
+ breq->startino = mp->m_sb.sb_rootino;
break;
default:
return -EINVAL;
@@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags(
if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
- /* Don't allow us to set DAX mode for a reflinked file for now. */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
- return -EINVAL;
-
/* diflags2 only valid for v3 inodes. */
i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (i_flags2 && !xfs_has_v3inodes(mp))
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 07da03976ec1..fc1946f80a4a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -48,13 +48,53 @@ xfs_alert_fsblock_zero(
return -EFSCORRUPTED;
}
+u64
+xfs_iomap_inode_sequence(
+ struct xfs_inode *ip,
+ u16 iomap_flags)
+{
+ u64 cookie = 0;
+
+ if (iomap_flags & IOMAP_F_XATTR)
+ return READ_ONCE(ip->i_af.if_seq);
+ if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
+ cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
+ return cookie | READ_ONCE(ip->i_df.if_seq);
+}
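
A minimal userspace model of this cookie packing (function and variable names
here are illustrative; the kernel reads the live fork sequence counters with
READ_ONCE()): the COW fork sequence occupies the upper 32 bits and the data
fork sequence the lower 32, so bumping either fork changes the cookie.

#include <stdint.h>
#include <stdio.h>

/* illustrative model of the xfs_iomap_inode_sequence() packing */
static uint64_t make_cookie(uint32_t cow_seq, uint32_t data_seq)
{
	return (uint64_t)cow_seq << 32 | data_seq;
}

int main(void)
{
	uint64_t at_mapping = make_cookie(7, 42);

	/* a racing write bumps the data fork sequence to 43; revalidation
	 * sees the mismatch with a single 64-bit compare and forces a remap */
	printf("stale: %d\n", at_mapping != make_cookie(7, 43)); /* prints 1 */
	return 0;
}
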
+
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_iomap_valid(
+ struct inode *inode,
+ const struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (iomap->validity_cookie !=
+ xfs_iomap_inode_sequence(ip, iomap->flags)) {
+ trace_xfs_iomap_invalid(ip, iomap);
+ return false;
+ }
+
+ XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS);
+ return true;
+}
+
+static const struct iomap_page_ops xfs_iomap_page_ops = {
+ .iomap_valid = xfs_iomap_valid,
+};
+
int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
unsigned int mapping_flags,
- u16 iomap_flags)
+ u16 iomap_flags,
+ u64 sequence_cookie)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -91,6 +131,9 @@ xfs_bmbt_to_iomap(
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
+
+ iomap->validity_cookie = sequence_cookie;
+ iomap->page_ops = &xfs_iomap_page_ops;
return 0;
}
@@ -195,7 +238,8 @@ xfs_iomap_write_direct(
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
unsigned int flags,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ u64 *seq)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -285,6 +329,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
+ *seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -743,6 +788,7 @@ xfs_direct_write_iomap_begin(
bool shared = false;
u16 iomap_flags = 0;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -811,9 +857,10 @@ xfs_direct_write_iomap_begin(
goto out_unlock;
}
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
allocate_blocks:
error = -EAGAIN;
@@ -839,24 +886,26 @@ allocate_blocks:
xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- flags, &imap);
+ flags, &imap, &seq);
if (error)
return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- iomap_flags | IOMAP_F_NEW);
+ iomap_flags | IOMAP_F_NEW, seq);
out_found_cow:
- xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ seq = xfs_iomap_inode_sequence(ip, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, lockmode);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
if (lockmode)
@@ -915,6 +964,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -926,6 +976,10 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1029,10 +1083,6 @@ xfs_buffered_write_iomap_begin(
allocfork = XFS_COW_FORK;
}
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_unlock;
-
if (eof && offset + count > XFS_ISIZE(ip)) {
/*
* Determine the initial size of the preallocation.
@@ -1094,26 +1144,31 @@ retry:
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
found_cow:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1121,6 +1176,16 @@ out_unlock:
}
static int
+xfs_buffered_write_delalloc_punch(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length)
+{
+ return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
+ offset + length);
+}
+
+static int
xfs_buffered_write_iomap_end(
struct inode *inode,
loff_t offset,
@@ -1129,56 +1194,17 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error = 0;
-
- if (iomap->type != IOMAP_DELALLOC)
- return 0;
-
- /*
- * Behave as if the write failed if drop writes is enabled. Set the NEW
- * flag to force delalloc cleanup.
- */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
- iomap->flags |= IOMAP_F_NEW;
- written = 0;
- }
- /*
- * start_fsb refers to the first unused block after a short write. If
- * nothing was written, round offset down to point at the first block in
- * the range.
- */
- if (unlikely(!written))
- start_fsb = XFS_B_TO_FSBT(mp, offset);
- else
- start_fsb = XFS_B_TO_FSB(mp, offset + written);
- end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+ int error;
- /*
- * Trim delalloc blocks if they were allocated by this write and we
- * didn't manage to write the whole range.
- *
- * We don't need to care about racing delalloc as we hold i_mutex
- * across the reserve/allocate/unreserve calls. If there are delalloc
- * blocks in the range, they are ours.
- */
- if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
- XFS_FSB_TO_B(mp, end_fsb) - 1);
-
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error && !xfs_is_shutdown(mp)) {
- xfs_alert(mp, "%s: unable to clean up ino %lld",
- __func__, ip->i_ino);
- return error;
- }
+ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+ length, written, &xfs_buffered_write_delalloc_punch);
+ if (error && !xfs_is_shutdown(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
+ __func__, XFS_I(inode)->i_ino);
+ return error;
}
-
return 0;
}
@@ -1187,6 +1213,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = {
.iomap_end = xfs_buffered_write_iomap_end,
};
+/*
+ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
+ * that it allocated to be revoked. Hence we do not need an .iomap_end method
+ * for this operation.
+ */
+const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+};
+
static int
xfs_read_iomap_begin(
struct inode *inode,
@@ -1204,6 +1239,7 @@ xfs_read_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
@@ -1215,15 +1251,16 @@ xfs_read_iomap_begin(
return error;
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
- if (!error && (flags & IOMAP_REPORT))
+ if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
xfs_iunlock(ip, lockmode);
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- shared ? IOMAP_F_SHARED : 0);
+ shared ? IOMAP_F_SHARED : 0, seq);
}
const struct iomap_ops xfs_read_iomap_ops = {
@@ -1248,6 +1285,7 @@ xfs_seek_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
int error = 0;
unsigned lockmode;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1282,8 +1320,9 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1304,8 +1343,9 @@ xfs_seek_iomap_begin(
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
done:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1331,6 +1371,7 @@ xfs_xattr_iomap_begin(
struct xfs_bmbt_irec imap;
int nimaps = 1, error = 0;
unsigned lockmode;
+ int seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1347,12 +1388,14 @@ xfs_xattr_iomap_begin(
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
+
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
xfs_iunlock(ip, lockmode);
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
@@ -1370,7 +1413,7 @@ xfs_zero_range(
if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
}
@@ -1385,7 +1428,7 @@ xfs_truncate_page(
if (IS_DAX(inode))
return dax_truncate_page(inode, pos, did_zero,
- &xfs_direct_write_iomap_ops);
+ &xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c782e8c0479c..4da13440bae9 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,14 +13,15 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
- struct xfs_bmbt_irec *imap);
+ struct xfs_bmbt_irec *imap, u64 *sequence);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
+u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags);
int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
- u16 iomap_flags);
+ u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
@@ -47,6 +48,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2e10e1c66ad6..515318dfbc38 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -651,6 +651,7 @@ xfs_vn_change_ok(
static int
xfs_setattr_nonsize(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -757,7 +758,7 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if (mask & ATTR_MODE) {
- error = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+ error = posix_acl_chmod(mnt_userns, dentry, inode->i_mode);
if (error)
return error;
}
@@ -779,6 +780,7 @@ out_dqrele:
STATIC int
xfs_setattr_size(
struct user_namespace *mnt_userns,
+ struct dentry *dentry,
struct xfs_inode *ip,
struct iattr *iattr)
{
@@ -810,7 +812,7 @@ xfs_setattr_size(
* Use the regular setattr path to update the timestamps.
*/
iattr->ia_valid &= ~ATTR_SIZE;
- return xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ return xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
/*
@@ -987,7 +989,7 @@ xfs_vn_setattr_size(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (error)
return error;
- return xfs_setattr_size(mnt_userns, ip, iattr);
+ return xfs_setattr_size(mnt_userns, dentry, ip, iattr);
}
STATIC int
@@ -1019,7 +1021,7 @@ xfs_vn_setattr(
error = xfs_vn_change_ok(mnt_userns, dentry, iattr);
if (!error)
- error = xfs_setattr_nonsize(mnt_userns, ip, iattr);
+ error = xfs_setattr_nonsize(mnt_userns, dentry, ip, iattr);
}
return error;
@@ -1101,7 +1103,7 @@ xfs_vn_tmpfile(
}
static const struct inode_operations xfs_inode_operations = {
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1128,7 +1130,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1155,7 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
- .get_acl = xfs_get_acl,
+ .get_inode_acl = xfs_get_acl,
.set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
@@ -1185,10 +1187,6 @@ xfs_inode_supports_dax(
if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
- /* Only supported on non-reflinked files. */
- if (xfs_is_reflink_inode(ip))
- return false;
-
/* Block size must match page size */
if (mp->m_sb.sb_blocksize != PAGE_SIZE)
return false;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f02a0dd522b3..fc61cc024023 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -644,12 +644,14 @@ xfs_log_mount(
int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
- xfs_notice(mp, "Mounting V%d Filesystem",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+ xfs_notice(mp, "Mounting V%d Filesystem %pU",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
} else {
xfs_notice(mp,
-"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
ASSERT(xfs_is_readonly(mp));
}
@@ -887,6 +889,23 @@ xlog_force_iclog(
}
/*
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
+ */
+static void
+xlog_wait_iclog_completion(struct xlog *log)
+{
+ int i;
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
+ iclog = iclog->ic_next;
+ }
+}
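
A userspace sketch of the lock-cycling idiom this helper relies on (POSIX
semaphores stand in for ic_sema, the buffer count is arbitrary, and it should
build with -pthread): acquiring and immediately releasing each slot proves
that any in-flight holder has finished, while leaving nothing held afterwards.

#include <semaphore.h>
#include <stdio.h>

#define NBUFS 4

static sem_t bufsem[NBUFS];	/* one per log buffer in this toy model */

/* pure barrier: block on each semaphore in turn, keep nothing held */
static void wait_all_complete(void)
{
	for (int i = 0; i < NBUFS; i++) {
		sem_wait(&bufsem[i]);	/* blocks while "IO" holds the slot */
		sem_post(&bufsem[i]);
	}
}

int main(void)
{
	for (int i = 0; i < NBUFS; i++)
		sem_init(&bufsem[i], 0, 1);	/* 1 == idle */

	wait_all_complete();	/* returns only once every slot was idle */
	puts("all buffers drained");
	return 0;
}
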
+
+/*
* Wait for the iclog and all prior iclogs to be written disk as required by the
* log force state machine. Waiting on ic_force_wait ensures iclog completions
* have been ordered and callbacks run before we are woken here, hence
@@ -1111,6 +1130,14 @@ xfs_log_unmount(
{
xfs_log_clean(mp);
+ /*
+ * If shutdown has come from iclog IO context, the log
+ * cleaning will have been skipped and so we need to wait
+ * for the iclog to complete shutdown processing before we
+ * tear anything down.
+ */
+ xlog_wait_iclog_completion(mp->m_log);
+
xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -2114,17 +2141,6 @@ xlog_dealloc_log(
int i;
/*
- * Cycle all the iclogbuf locks to make sure all log IO completion
- * is done before we tear down these buffers.
- */
- iclog = log->l_iclog;
- for (i = 0; i < log->l_iclog_bufs; i++) {
- down(&iclog->ic_sema);
- up(&iclog->ic_sema);
- iclog = iclog->ic_next;
- }
-
- /*
* Destroy the CIL after waiting for iclog IO completion because an
* iclog EIO error will try to shut down the log, which accesses the
* CIL to wake up the waiters.
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8bb3c2e847e..fb87ffb48f7f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -538,6 +538,20 @@ xfs_check_summary_counts(
return 0;
}
+static void
+xfs_unmount_check(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return;
+
+ if (percpu_counter_sum(&mp->m_ifree) >
+ percpu_counter_sum(&mp->m_icount)) {
+ xfs_alert(mp, "ifree/icount mismatch at unmount");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
/*
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
* internal inode structures can be sitting in the CIL and AIL at this point,
@@ -1077,6 +1091,7 @@ xfs_unmountfs(
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
+ xfs_unmount_check(mp);
xfs_log_unmount(mp);
xfs_da_unmount(mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 37a24f0f7cd4..38d23f0e703a 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -125,6 +125,7 @@ xfs_fs_map_blocks(
int nimaps = 1;
uint lock_flags;
int error = 0;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -176,6 +177,7 @@ xfs_fs_map_blocks(
lock_flags = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, bmapi_flags);
+ seq = xfs_iomap_inode_sequence(ip, 0);
ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
@@ -189,7 +191,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, 0, &imap);
+ end_fsb - offset_fsb, 0, &imap, &seq);
if (error)
goto out_unlock;
@@ -209,7 +211,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 18bb4ec4d7c9..e2c542f6dcd4 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -68,7 +68,7 @@ restart:
while (1) {
struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
- int error = 0;
+ int error;
int i;
mutex_lock(&qi->qi_tree_lock);
@@ -423,6 +423,14 @@ xfs_qm_dquot_isolate(
goto out_miss_busy;
/*
+ * If something else is freeing this dquot and hasn't yet removed it
+ * from the LRU, leave it for the freeing task to complete the freeing
+ * process rather than risk it being free from under us here.
+ */
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
+ goto out_miss_unlock;
+
+ /*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
@@ -441,10 +449,8 @@ xfs_qm_dquot_isolate(
* skip it so there is time for the IO to complete before we try to
* reclaim it again on the next LRU pass.
*/
- if (!xfs_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- goto out_miss_busy;
- }
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_miss_unlock;
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
@@ -478,6 +484,8 @@ xfs_qm_dquot_isolate(
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
+out_miss_unlock:
+ xfs_dqunlock(dqp);
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
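Read together, the reworked miss paths in xfs_qm_dquot_isolate() come out roughly as below. This is a condensed sketch of the post-patch flow, not a verbatim excerpt; the LRU_SKIP return is assumed from the pre-existing out_miss_busy path:

	if (!xfs_dqlock_nowait(dqp))
		goto out_miss_busy;

	/* Someone else is freeing this dquot; don't race with them. */
	if (dqp->q_flags & XFS_DQFLAG_FREEING)
		goto out_miss_unlock;

	/* Flush lock held: IO in flight, retry on a later LRU pass. */
	if (!xfs_dqflock_nowait(dqp))
		goto out_miss_unlock;

	/* ... dirty dquot flushing and the reclaim proper elided ... */

out_miss_unlock:
	xfs_dqunlock(dqp);
out_miss_busy:
	trace_xfs_dqreclaim_busy(dqp);
	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
	return LRU_SKIP;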
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 93bdd25680bc..5535778a98f9 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -416,8 +416,6 @@ xfs_reflink_fill_cow_hole(
goto convert;
}
- ASSERT(cmap->br_startoff > imap->br_startoff);
-
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
@@ -1693,8 +1691,12 @@ xfs_reflink_unshare(
inode_dio_wait(inode);
- error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ if (IS_DAX(inode))
+ error = dax_file_unshare(inode, offset, len,
+ &xfs_dax_write_iomap_ops);
+ else
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 292d5e54a92c..16534e9873f6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
&val);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
if (error)
return error;
@@ -1326,6 +1326,41 @@ xfs_rtalloc_reinit_frextents(
}
/*
+ * Read in the bmbt of an rt metadata inode so that we never have to load it
+ * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
+ * an empty transaction to avoid deadlocking on loops in the bmbt.
+ */
+static inline int
+xfs_rtmount_iread_extents(
+ struct xfs_inode *ip,
+ unsigned int lock_class)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
+
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
*/
@@ -1342,14 +1377,27 @@ xfs_rtmount_inodes(
return error;
ASSERT(mp->m_rbmip != NULL);
+ error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP);
+ if (error)
+ goto out_rele_bitmap;
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
- if (error) {
- xfs_irele(mp->m_rbmip);
- return error;
- }
+ if (error)
+ goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
+
+ error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM);
+ if (error)
+ goto out_rele_summary;
+
xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
return 0;
+
+out_rele_summary:
+ xfs_irele(mp->m_rsumip);
+out_rele_bitmap:
+ xfs_irele(mp->m_rbmip);
+ return error;
}
void
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ee4b429a2f2c..0c4b73e9b29d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1110,7 +1110,7 @@ xfs_fs_put_super(
if (!sb->s_fs_info)
return;
- xfs_notice(mp, "Unmounting Filesystem");
+ xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index d269ef57ff01..8a5dc1538aa8 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -34,6 +34,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_error.h"
+#include <linux/iomap.h>
+#include "xfs_iomap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 372d871bccc5..421d1e504ac4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3352,6 +3352,92 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
TP_ARGS(ip, irec))
+/* inode iomap invalidation events */
+DECLARE_EVENT_CLASS(xfs_wb_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork),
+ TP_ARGS(ip, iomap, wpcseq, whichfork),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(u32, wpcseq)
+ __field(u32, forkseq)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->wpcseq = wpcseq;
+ __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->wpcseq,
+ __entry->forkseq)
+);
+#define DEFINE_WB_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_wb_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \
+ TP_ARGS(ip, iomap, wpcseq, whichfork))
+DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid);
+DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid);
+
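A plausible call-site sketch for these writeback events, assuming the checks live in the xfs_aops.c mapping-revalidation path; the helper shape and the XFS_WPC() accessor are assumptions, not part of this diff:

/*
 * Plausible sketch only: compare the fork sequence sampled when the
 * cached writeback mapping was built against the current fork sequence,
 * firing the matching tracepoint and forcing a new mapping on mismatch.
 */
static bool
xfs_imap_valid(
	struct iomap_writepage_ctx *wpc,
	struct xfs_inode	*ip)
{
	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
		return false;
	}
	if (xfs_inode_has_cow_data(ip) &&
	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
		return false;
	}
	return true;
}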
+DECLARE_EVENT_CLASS(xfs_iomap_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap),
+ TP_ARGS(ip, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u64, validity_cookie)
+ __field(u64, inodeseq)
+ __field(u16, type)
+ __field(u16, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->validity_cookie = iomap->validity_cookie;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->validity_cookie,
+ __entry->inodeseq)
+);
+#define DEFINE_IOMAP_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \
+ TP_ARGS(ip, iomap))
+DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid);
+
/* refcount/reflink tracepoint definitions */
/* reflink tracepoints */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f51df7d94ef7..7d4109af193e 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -422,7 +422,7 @@ xfsaild_push(
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
- xfs_lsn_t target;
+ xfs_lsn_t target = NULLCOMMITLSN;
long tout;
int stuck = 0;
int flushing = 0;
@@ -472,6 +472,8 @@ xfsaild_push(
XFS_STATS_INC(mp, xs_push_ail);
+ ASSERT(target != NULLCOMMITLSN);
+
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
int lock_result;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c325a28b89a8..10aa1fd39d2b 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -210,7 +210,7 @@ __xfs_xattr_put_listent(
return;
}
offset = context->buffer + context->count;
- strncpy(offset, prefix, prefix_len);
+ memcpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
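The strncpy-to-memcpy switch is safe because prefix_len is exactly the prefix's length: with n == strlen(src), strncpy neither NUL-terminates nor pads, so it degenerates to memcpy. A standalone illustration (hypothetical helper, not from the patch):

#include <string.h>

/* With n == strlen(src), strncpy(dst, src, n) and memcpy(dst, src, n)
 * write exactly the same n bytes and neither NUL-terminates dst; memcpy
 * states that intent plainly and avoids stringop-truncation warnings. */
static char *copy_xattr_prefix(char *dst, const char *prefix)
{
	size_t n = strlen(prefix);

	memcpy(dst, prefix, n);		/* no NUL: caller appends the name */
	return dst + n;
}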
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 2c53fbb8d918..a9c5c3f720ad 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -442,6 +442,10 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(inode, zone,
false, false);
}
+ } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
+ data_size > isize) {
+ /* Do not expose garbage data */
+ data_size = isize;
}
/*
@@ -805,6 +809,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
ret = submit_bio_wait(bio);
+ /*
+ * If the file zone was written underneath the file system, the zone
+ * write pointer may not be where we expect it to be, but the zone
+ * append write can still succeed, so explicitly check that the data was
+ * written where we intended it to be, that is, at zi->i_wpoffset.
+ */
+ if (!ret) {
+ sector_t wpsector =
+ zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
+
+ if (bio->bi_iter.bi_sector != wpsector) {
+ zonefs_warn(inode->i_sb,
+ "Corrupted write pointer %llu for zone at %llu\n",
+ wpsector, zi->i_zsector);
+ ret = -EIO;
+ }
+ }
+
zonefs_file_write_dio_end_io(iocb, size, ret, 0);
trace_zonefs_file_dio_append(inode, size, ret);
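A worked example of the expected-sector computation above, with illustrative values only:

/*
 * Illustrative values, not from the patch: a zone whose first sector is
 * zi->i_zsector = 524288, with zi->i_wpoffset = 1048576 bytes (1 MiB)
 * already accounted for. With 512-byte sectors (SECTOR_SHIFT == 9):
 *
 *	wpsector = 524288 + (1048576 >> 9) = 524288 + 2048 = 526336
 *
 * A zone append completing at any sector other than 526336 means some
 * other agent moved the zone write pointer, so the IO fails with -EIO
 * rather than exposing a file whose data is not where the inode size
 * says it is.
 */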