summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c4
-rw-r--r--fs/9p/v9fs.c50
-rw-r--r--fs/9p/v9fs.h3
-rw-r--r--fs/9p/v9fs_vfs.h4
-rw-r--r--fs/9p/vfs_addr.c87
-rw-r--r--fs/9p/vfs_dentry.c4
-rw-r--r--fs/9p/vfs_dir.c17
-rw-r--r--fs/9p/vfs_file.c326
-rw-r--r--fs/9p/vfs_inode.c160
-rw-r--r--fs/9p/vfs_inode_dotl.c58
-rw-r--r--fs/9p/vfs_super.c16
-rw-r--r--fs/9p/xattr.c80
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/dir_fplus.c1
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c2
-rw-r--r--fs/adfs/super.c22
-rw-r--r--fs/affs/affs.h30
-rw-r--r--fs/affs/amigaffs.c12
-rw-r--r--fs/affs/file.c15
-rw-r--r--fs/affs/inode.c36
-rw-r--r--fs/affs/namei.c16
-rw-r--r--fs/affs/super.c43
-rw-r--r--fs/affs/symlink.c4
-rw-r--r--fs/afs/dir.c42
-rw-r--r--fs/afs/file.c2
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/misc.c16
-rw-r--r--fs/afs/mntpt.c8
-rw-r--r--fs/afs/rxrpc.c7
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c278
-rw-r--r--fs/autofs4/autofs_i.h9
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/autofs4/inode.c6
-rw-r--r--fs/autofs4/root.c18
-rw-r--r--fs/autofs4/symlink.c5
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/befs.h24
-rw-r--r--fs/befs/btree.c6
-rw-r--r--fs/befs/datastream.c4
-rw-r--r--fs/befs/io.c2
-rw-r--r--fs/befs/linuxvfs.c69
-rw-r--r--fs/befs/super.c4
-rw-r--r--fs/bfs/dir.c12
-rw-r--r--fs/bfs/file.c2
-rw-r--r--fs/bfs/inode.c1
-rw-r--r--fs/binfmt_elf.c35
-rw-r--r--fs/binfmt_misc.c46
-rw-r--r--fs/block_dev.c52
-rw-r--r--fs/btrfs/async-thread.c5
-rw-r--r--fs/btrfs/async-thread.h4
-rw-r--r--fs/btrfs/backref.c80
-rw-r--r--fs/btrfs/btrfs_inode.h16
-rw-r--r--fs/btrfs/check-integrity.c9
-rw-r--r--fs/btrfs/compression.c4
-rw-r--r--fs/btrfs/compression.h4
-rw-r--r--fs/btrfs/ctree.c78
-rw-r--r--fs/btrfs/ctree.h75
-rw-r--r--fs/btrfs/delayed-inode.c11
-rw-r--r--fs/btrfs/delayed-ref.c394
-rw-r--r--fs/btrfs/delayed-ref.h39
-rw-r--r--fs/btrfs/dev-replace.c13
-rw-r--r--fs/btrfs/disk-io.c679
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/export.c6
-rw-r--r--fs/btrfs/extent-tree.c913
-rw-r--r--fs/btrfs/extent-tree.h0
-rw-r--r--fs/btrfs/extent_io.c89
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file-item.c6
-rw-r--r--fs/btrfs/file.c108
-rw-r--r--fs/btrfs/free-space-cache.c357
-rw-r--r--fs/btrfs/free-space-cache.h9
-rw-r--r--fs/btrfs/inode-map.c19
-rw-r--r--fs/btrfs/inode.c343
-rw-r--r--fs/btrfs/ioctl.c361
-rw-r--r--fs/btrfs/lzo.c2
-rw-r--r--fs/btrfs/math.h6
-rw-r--r--fs/btrfs/ordered-data.c56
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/props.c2
-rw-r--r--fs/btrfs/qgroup.c1406
-rw-r--r--fs/btrfs/qgroup.h64
-rw-r--r--fs/btrfs/raid56.c16
-rw-r--r--fs/btrfs/relocation.c32
-rw-r--r--fs/btrfs/scrub.c82
-rw-r--r--fs/btrfs/send.c230
-rw-r--r--fs/btrfs/super.c406
-rw-r--r--fs/btrfs/sysfs.c150
-rw-r--r--fs/btrfs/sysfs.h30
-rw-r--r--fs/btrfs/tests/qgroup-tests.c113
-rw-r--r--fs/btrfs/transaction.c140
-rw-r--r--fs/btrfs/transaction.h36
-rw-r--r--fs/btrfs/tree-defrag.c3
-rw-r--r--fs/btrfs/tree-log.c628
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/ulist.c47
-rw-r--r--fs/btrfs/ulist.h1
-rw-r--r--fs/btrfs/volumes.c410
-rw-r--r--fs/btrfs/volumes.h14
-rw-r--r--fs/btrfs/xattr.c69
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/buffer.c73
-rw-r--r--fs/cachefiles/bind.c10
-rw-r--r--fs/cachefiles/interface.c6
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/namei.c155
-rw-r--r--fs/cachefiles/rdwr.c14
-rw-r--r--fs/cachefiles/security.c6
-rw-r--r--fs/cachefiles/xattr.c22
-rw-r--r--fs/ceph/acl.c4
-rw-r--r--fs/ceph/addr.c347
-rw-r--r--fs/ceph/caps.c877
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c463
-rw-r--r--fs/ceph/export.c28
-rw-r--r--fs/ceph/file.c94
-rw-r--r--fs/ceph/inode.c216
-rw-r--r--fs/ceph/locks.c2
-rw-r--r--fs/ceph/mds_client.c508
-rw-r--r--fs/ceph/mds_client.h23
-rw-r--r--fs/ceph/snap.c173
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c85
-rw-r--r--fs/ceph/super.h128
-rw-r--r--fs/ceph/xattr.c104
-rw-r--r--fs/cifs/Kconfig9
-rw-r--r--fs/cifs/cifs_dfs_ref.c5
-rw-r--r--fs/cifs/cifs_unicode.c182
-rw-r--r--fs/cifs/cifsfs.c16
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h13
-rw-r--r--fs/cifs/cifspdu.h12
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c32
-rw-r--r--fs/cifs/connect.c22
-rw-r--r--fs/cifs/dir.c11
-rw-r--r--fs/cifs/file.c148
-rw-r--r--fs/cifs/inode.c63
-rw-r--r--fs/cifs/ioctl.c27
-rw-r--r--fs/cifs/link.c43
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c6
-rw-r--r--fs/cifs/smb1ops.c5
-rw-r--r--fs/cifs/smb2file.c4
-rw-r--r--fs/cifs/smb2misc.c4
-rw-r--r--fs/cifs/smb2ops.c190
-rw-r--r--fs/cifs/smb2pdu.c69
-rw-r--r--fs/cifs/smb2pdu.h81
-rw-r--r--fs/cifs/smbfsctl.h3
-rw-r--r--fs/cifs/xattr.c22
-rw-r--r--fs/coda/cache.c4
-rw-r--r--fs/coda/coda_linux.h2
-rw-r--r--fs/coda/dir.c22
-rw-r--r--fs/coda/file.c38
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/coda/pioctl.c2
-rw-r--r--fs/coda/upcall.c4
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/dir.c70
-rw-r--r--fs/configfs/file.c4
-rw-r--r--fs/configfs/inode.c16
-rw-r--r--fs/configfs/item.c7
-rw-r--r--fs/configfs/mount.c12
-rw-r--r--fs/configfs/symlink.c31
-rw-r--r--fs/coredump.c23
-rw-r--r--fs/dax.c96
-rw-r--r--fs/dcache.c109
-rw-r--r--fs/debugfs/file.c12
-rw-r--r--fs/debugfs/inode.c87
-rw-r--r--fs/devpts/inode.c47
-rw-r--r--fs/direct-io.c51
-rw-r--r--fs/dlm/lowcomms.c16
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/ecryptfs/dentry.c6
-rw-r--r--fs/ecryptfs/file.c12
-rw-r--r--fs/ecryptfs/inode.c175
-rw-r--r--fs/ecryptfs/kthread.c2
-rw-r--r--fs/ecryptfs/main.c6
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/efivarfs/inode.c4
-rw-r--r--fs/efivarfs/super.c4
-rw-r--r--fs/efs/namei.c4
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/exec.c91
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/dir.c10
-rw-r--r--fs/exofs/exofs.h4
-rw-r--r--fs/exofs/file.c2
-rw-r--r--fs/exofs/inode.c15
-rw-r--r--fs/exofs/namei.c15
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/exofs/symlink.c55
-rw-r--r--fs/ext2/dir.c7
-rw-r--r--fs/ext2/ext2.h1
-rw-r--r--fs/ext2/file.c25
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c21
-rw-r--r--fs/ext2/namei.c27
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext2/symlink.c10
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext2/xattr_security.c4
-rw-r--r--fs/ext2/xattr_trusted.c4
-rw-r--r--fs/ext2/xattr_user.c4
-rw-r--r--fs/ext3/file.c2
-rw-r--r--fs/ext3/ialloc.c2
-rw-r--r--fs/ext3/inode.c19
-rw-r--r--fs/ext3/namei.c37
-rw-r--r--fs/ext3/super.c10
-rw-r--r--fs/ext3/symlink.c10
-rw-r--r--fs/ext3/xattr.c13
-rw-r--r--fs/ext3/xattr_security.c4
-rw-r--r--fs/ext3/xattr_trusted.c4
-rw-r--r--fs/ext3/xattr_user.c4
-rw-r--r--fs/ext4/Kconfig23
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c5
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/bitmap.c1
-rw-r--r--fs/ext4/block_validity.c1
-rw-r--r--fs/ext4/crypto.c475
-rw-r--r--fs/ext4/crypto_fname.c469
-rw-r--r--fs/ext4/crypto_key.c260
-rw-r--r--fs/ext4/crypto_policy.c215
-rw-r--r--fs/ext4/dir.c84
-rw-r--r--fs/ext4/ext4.h279
-rw-r--r--fs/ext4/ext4_crypto.h159
-rw-r--r--fs/ext4/ext4_jbd2.c6
-rw-r--r--fs/ext4/extents.c450
-rw-r--r--fs/ext4/extents_status.c10
-rw-r--r--fs/ext4/file.c94
-rw-r--r--fs/ext4/fsync.c3
-rw-r--r--fs/ext4/hash.c1
-rw-r--r--fs/ext4/ialloc.c25
-rw-r--r--fs/ext4/indirect.c37
-rw-r--r--fs/ext4/inline.c47
-rw-r--r--fs/ext4/inode.c313
-rw-r--r--fs/ext4/ioctl.c92
-rw-r--r--fs/ext4/mballoc.c60
-rw-r--r--fs/ext4/migrate.c19
-rw-r--r--fs/ext4/move_extent.c19
-rw-r--r--fs/ext4/namei.c780
-rw-r--r--fs/ext4/page-io.c49
-rw-r--r--fs/ext4/readpage.c328
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c119
-rw-r--r--fs/ext4/symlink.c83
-rw-r--r--fs/ext4/xattr.c14
-rw-r--r--fs/ext4/xattr.h3
-rw-r--r--fs/ext4/xattr_security.c4
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/f2fs/Kconfig21
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/acl.c58
-rw-r--r--fs/f2fs/checkpoint.c92
-rw-r--r--fs/f2fs/crypto.c491
-rw-r--r--fs/f2fs/crypto_fname.c440
-rw-r--r--fs/f2fs/crypto_key.c255
-rw-r--r--fs/f2fs/crypto_policy.c209
-rw-r--r--fs/f2fs/data.c1270
-rw-r--r--fs/f2fs/debug.c33
-rw-r--r--fs/f2fs/dir.c277
-rw-r--r--fs/f2fs/f2fs.h495
-rw-r--r--fs/f2fs/f2fs_crypto.h151
-rw-r--r--fs/f2fs/file.c585
-rw-r--r--fs/f2fs/gc.c135
-rw-r--r--fs/f2fs/hash.c3
-rw-r--r--fs/f2fs/inline.c112
-rw-r--r--fs/f2fs/inode.c34
-rw-r--r--fs/f2fs/namei.c445
-rw-r--r--fs/f2fs/node.c70
-rw-r--r--fs/f2fs/node.h23
-rw-r--r--fs/f2fs/recovery.c98
-rw-r--r--fs/f2fs/segment.c266
-rw-r--r--fs/f2fs/segment.h7
-rw-r--r--fs/f2fs/super.c215
-rw-r--r--fs/f2fs/trace.c6
-rw-r--r--fs/f2fs/trace.h4
-rw-r--r--fs/f2fs/xattr.c17
-rw-r--r--fs/f2fs/xattr.h4
-rw-r--r--fs/fat/cache.c2
-rw-r--r--fs/fat/dir.c4
-rw-r--r--fs/fat/fat.h5
-rw-r--r--fs/fat/fatent.c3
-rw-r--r--fs/fat/file.c11
-rw-r--r--fs/fat/inode.c24
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fat/namei_msdos.c10
-rw-r--r--fs/fat/namei_vfat.c16
-rw-r--r--fs/fat/nfs.c4
-rw-r--r--fs/fhandle.c5
-rw-r--r--fs/file.c78
-rw-r--r--fs/file_table.c29
-rw-r--r--fs/freevxfs/vxfs_extern.h3
-rw-r--r--fs/freevxfs/vxfs_immed.c34
-rw-r--r--fs/freevxfs/vxfs_inode.c7
-rw-r--r--fs/freevxfs/vxfs_lookup.c7
-rw-r--r--fs/fs-writeback.c1168
-rw-r--r--fs/fs_pin.c4
-rw-r--r--fs/fscache/cookie.c8
-rw-r--r--fs/fscache/internal.h12
-rw-r--r--fs/fscache/object.c69
-rw-r--r--fs/fscache/operation.c254
-rw-r--r--fs/fscache/page.c86
-rw-r--r--fs/fscache/stats.c14
-rw-r--r--fs/fuse/control.c6
-rw-r--r--fs/fuse/cuse.c42
-rw-r--r--fs/fuse/dev.c873
-rw-r--r--fs/fuse/dir.c82
-rw-r--r--fs/fuse/file.c185
-rw-r--r--fs/fuse/fuse_i.h168
-rw-r--r--fs/fuse/inode.c99
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/aops.c36
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/dentry.c12
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c112
-rw-r--r--fs/gfs2/glock.c50
-rw-r--r--fs/gfs2/glops.c20
-rw-r--r--fs/gfs2/incore.h6
-rw-r--r--fs/gfs2/inode.c271
-rw-r--r--fs/gfs2/ops_fstype.c9
-rw-r--r--fs/gfs2/quota.c294
-rw-r--r--fs/gfs2/quota.h8
-rw-r--r--fs/gfs2/rgrp.c68
-rw-r--r--fs/gfs2/rgrp.h4
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/sys.c66
-rw-r--r--fs/gfs2/xattr.c8
-rw-r--r--fs/hfs/attr.c6
-rw-r--r--fs/hfs/dir.c12
-rw-r--r--fs/hfs/hfs_fs.h2
-rw-r--r--fs/hfs/inode.c14
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfs/sysdep.c2
-rw-r--r--fs/hfsplus/bfind.c4
-rw-r--r--fs/hfsplus/catalog.c3
-rw-r--r--fs/hfsplus/dir.c16
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c21
-rw-r--r--fs/hfsplus/ioctl.c14
-rw-r--r--fs/hfsplus/super.c1
-rw-r--r--fs/hfsplus/xattr.c90
-rw-r--r--fs/hfsplus/xattr.h22
-rw-r--r--fs/hfsplus/xattr_security.c38
-rw-r--r--fs/hfsplus/xattr_trusted.c37
-rw-r--r--fs/hfsplus/xattr_user.c35
-rw-r--r--fs/hostfs/hostfs.h6
-rw-r--r--fs/hostfs/hostfs_kern.c131
-rw-r--r--fs/hostfs/hostfs_user.c29
-rw-r--r--fs/hpfs/alloc.c95
-rw-r--r--fs/hpfs/dir.c1
-rw-r--r--fs/hpfs/file.c3
-rw-r--r--fs/hpfs/hpfs_fn.h6
-rw-r--r--fs/hpfs/inode.c2
-rw-r--r--fs/hpfs/namei.c8
-rw-r--r--fs/hpfs/super.c47
-rw-r--r--fs/hppfs/Makefile6
-rw-r--r--fs/hppfs/hppfs.c766
-rw-r--r--fs/hugetlbfs/inode.c190
-rw-r--r--fs/inode.c116
-rw-r--r--fs/internal.h1
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/jbd2/checkpoint.c7
-rw-r--r--fs/jbd2/journal.c67
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/revoke.c33
-rw-r--r--fs/jbd2/transaction.c295
-rw-r--r--fs/jffs2/dir.c41
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/fs.c10
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/readinode.c27
-rw-r--r--fs/jffs2/security.c4
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jffs2/symlink.c45
-rw-r--r--fs/jffs2/xattr.c5
-rw-r--r--fs/jffs2/xattr_trusted.c4
-rw-r--r--fs/jffs2/xattr_user.c4
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c17
-rw-r--r--fs/jfs/ioctl.c3
-rw-r--r--fs/jfs/jfs_incore.h2
-rw-r--r--fs/jfs/jfs_metapage.c31
-rw-r--r--fs/jfs/jfs_metapage.h1
-rw-r--r--fs/jfs/namei.c50
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/symlink.c10
-rw-r--r--fs/jfs/xattr.c12
-rw-r--r--fs/kernfs/dir.c49
-rw-r--r--fs/kernfs/file.c1
-rw-r--r--fs/kernfs/inode.c10
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/kernfs/symlink.c25
-rw-r--r--fs/libfs.c149
-rw-r--r--fs/lockd/svcsubs.c2
-rw-r--r--fs/locks.c132
-rw-r--r--fs/logfs/dir.c15
-rw-r--r--fs/logfs/file.c4
-rw-r--r--fs/minix/dir.c9
-rw-r--r--fs/minix/file.c4
-rw-r--r--fs/minix/inode.c6
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/minix/namei.c10
-rw-r--r--fs/mount.h4
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namei.c1558
-rw-r--r--fs/namespace.c244
-rw-r--r--fs/ncpfs/dir.c50
-rw-r--r--fs/ncpfs/file.c88
-rw-r--r--fs/ncpfs/inode.c6
-rw-r--r--fs/ncpfs/ioctl.c8
-rw-r--r--fs/ncpfs/ncplib_kernel.c8
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/ncpfs/symlink.c2
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c1
-rw-r--r--fs/nfs/blocklayout/dev.c2
-rw-r--r--fs/nfs/callback.c12
-rw-r--r--fs/nfs/callback_proc.c38
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c43
-rw-r--r--fs/nfs/delegation.c4
-rw-r--r--fs/nfs/dir.c66
-rw-r--r--fs/nfs/direct.c93
-rw-r--r--fs/nfs/file.c36
-rw-r--r--fs/nfs/filelayout/filelayout.c11
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c492
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h33
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c9
-rw-r--r--fs/nfs/getroot.c4
-rw-r--r--fs/nfs/inode.c81
-rw-r--r--fs/nfs/internal.h23
-rw-r--r--fs/nfs/namespace.c10
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c12
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs42.h9
-rw-r--r--fs/nfs/nfs42proc.c137
-rw-r--r--fs/nfs/nfs42xdr.c126
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c30
-rw-r--r--fs/nfs/nfs4getroot.c7
-rw-r--r--fs/nfs/nfs4idmap.c (renamed from fs/nfs/idmap.c)9
-rw-r--r--fs/nfs/nfs4idmap.h68
-rw-r--r--fs/nfs/nfs4namespace.c4
-rw-r--r--fs/nfs/nfs4proc.c357
-rw-r--r--fs/nfs/nfs4state.c39
-rw-r--r--fs/nfs/nfs4super.c7
-rw-r--r--fs/nfs/nfs4sysctl.c2
-rw-r--r--fs/nfs/nfs4trace.h4
-rw-r--r--fs/nfs/nfs4xdr.c37
-rw-r--r--fs/nfs/nfstrace.c3
-rw-r--r--fs/nfs/objlayout/objio_osd.c4
-rw-r--r--fs/nfs/pagelist.c19
-rw-r--r--fs/nfs/pnfs.c233
-rw-r--r--fs/nfs/pnfs.h41
-rw-r--r--fs/nfs/pnfs_dev.c21
-rw-r--r--fs/nfs/pnfs_nfs.c12
-rw-r--r--fs/nfs/proc.c4
-rw-r--r--fs/nfs/read.c10
-rw-r--r--fs/nfs/super.c12
-rw-r--r--fs/nfs/symlink.c21
-rw-r--r--fs/nfs/unlink.c20
-rw-r--r--fs/nfs/write.c58
-rw-r--r--fs/nfsd/Kconfig3
-rw-r--r--fs/nfsd/blocklayout.c11
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/nfs2acl.c8
-rw-r--r--fs/nfsd/nfs3acl.c8
-rw-r--r--fs/nfsd/nfs3proc.c6
-rw-r--r--fs/nfsd/nfs3xdr.c28
-rw-r--r--fs/nfsd/nfs4acl.c72
-rw-r--r--fs/nfsd/nfs4callback.c132
-rw-r--r--fs/nfsd/nfs4layouts.c1
-rw-r--r--fs/nfsd/nfs4proc.c59
-rw-r--r--fs/nfsd/nfs4recover.c22
-rw-r--r--fs/nfsd/nfs4state.c322
-rw-r--r--fs/nfsd/nfs4xdr.c96
-rw-r--r--fs/nfsd/nfsctl.c16
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c20
-rw-r--r--fs/nfsd/nfsfh.h6
-rw-r--r--fs/nfsd/nfsproc.c56
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/state.h26
-rw-r--r--fs/nfsd/vfs.c190
-rw-r--r--fs/nfsd/vfs.h11
-rw-r--r--fs/nfsd/xdr4.h7
-rw-r--r--fs/nilfs2/alloc.c5
-rw-r--r--fs/nilfs2/bmap.c48
-rw-r--r--fs/nilfs2/bmap.h13
-rw-r--r--fs/nilfs2/btree.c65
-rw-r--r--fs/nilfs2/cpfile.c58
-rw-r--r--fs/nilfs2/dir.c7
-rw-r--r--fs/nilfs2/direct.c17
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/inode.c57
-rw-r--r--fs/nilfs2/ioctl.c1
-rw-r--r--fs/nilfs2/mdt.c54
-rw-r--r--fs/nilfs2/mdt.h10
-rw-r--r--fs/nilfs2/namei.c23
-rw-r--r--fs/nilfs2/page.c24
-rw-r--r--fs/nilfs2/segbuf.c12
-rw-r--r--fs/nilfs2/segment.c17
-rw-r--r--fs/nilfs2/super.c8
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/notify/mark.c30
-rw-r--r--fs/nsfs.c4
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c779
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ntfs/inode.h2
-rw-r--r--fs/ntfs/malloc.h7
-rw-r--r--fs/ntfs/namei.c6
-rw-r--r--fs/ocfs2/alloc.c83
-rw-r--r--fs/ocfs2/aops.c203
-rw-r--r--fs/ocfs2/aops.h9
-rw-r--r--fs/ocfs2/cluster/masklog.c34
-rw-r--r--fs/ocfs2/cluster/masklog.h47
-rw-r--r--fs/ocfs2/cluster/tcp.c2
-rw-r--r--fs/ocfs2/dcache.c14
-rw-r--r--fs/ocfs2/dir.c40
-rw-r--r--fs/ocfs2/dir.h2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c13
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmglue.c15
-rw-r--r--fs/ocfs2/export.c4
-rw-r--r--fs/ocfs2/file.c185
-rw-r--r--fs/ocfs2/inode.c6
-rw-r--r--fs/ocfs2/ioctl.c1
-rw-r--r--fs/ocfs2/journal.c76
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c53
-rw-r--r--fs/ocfs2/namei.h4
-rw-r--r--fs/ocfs2/ocfs2.h10
-rw-r--r--fs/ocfs2/refcounttree.c18
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/stack_user.c8
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/super.c37
-rw-r--r--fs/ocfs2/xattr.c32
-rw-r--r--fs/omfs/bitmap.c2
-rw-r--r--fs/omfs/dir.c10
-rw-r--r--fs/omfs/file.c4
-rw-r--r--fs/omfs/inode.c10
-rw-r--r--fs/open.c80
-rw-r--r--fs/overlayfs/copy_up.c3
-rw-r--r--fs/overlayfs/dir.c33
-rw-r--r--fs/overlayfs/inode.c60
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/readdir.c77
-rw-r--r--fs/overlayfs/super.c124
-rw-r--r--fs/pipe.c5
-rw-r--r--fs/pnode.c60
-rw-r--r--fs/pnode.h5
-rw-r--r--fs/posix_acl.c54
-rw-r--r--fs/proc/Kconfig10
-rw-r--r--fs/proc/array.c38
-rw-r--r--fs/proc/base.c339
-rw-r--r--fs/proc/fd.c33
-rw-r--r--fs/proc/generic.c27
-rw-r--r--fs/proc/inode.c15
-rw-r--r--fs/proc/internal.h6
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/namespaces.c8
-rw-r--r--fs/proc/nommu.c2
-rw-r--r--fs/proc/proc_net.c2
-rw-r--r--fs/proc/proc_sysctl.c49
-rw-r--r--fs/proc/root.c11
-rw-r--r--fs/proc/self.c26
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/proc/task_nommu.c2
-rw-r--r--fs/proc/thread_self.c24
-rw-r--r--fs/proc_namespace.c34
-rw-r--r--fs/pstore/inode.c25
-rw-r--r--fs/pstore/platform.c8
-rw-r--r--fs/pstore/ram.c53
-rw-r--r--fs/qnx6/dir.c5
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c161
-rw-r--r--fs/quota/quota.c217
-rw-r--r--fs/quota/quota_tree.c7
-rw-r--r--fs/quota/quota_v2.c12
-rw-r--r--fs/quota/quotaio_v2.h6
-rw-r--r--fs/ramfs/file-mmu.c2
-rw-r--r--fs/ramfs/file-nommu.c4
-rw-r--r--fs/read_write.c213
-rw-r--r--fs/reiserfs/dir.c4
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/inode.c12
-rw-r--r--fs/reiserfs/namei.c12
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/reiserfs/super.c10
-rw-r--r--fs/reiserfs/xattr.c126
-rw-r--r--fs/reiserfs/xattr.h2
-rw-r--r--fs/reiserfs/xattr_security.c10
-rw-r--r--fs/reiserfs/xattr_trusted.c10
-rw-r--r--fs/reiserfs/xattr_user.c4
-rw-r--r--fs/romfs/mmap-nommu.c1
-rw-r--r--fs/select.c6
-rw-r--r--fs/seq_file.c34
-rw-r--r--fs/signalfd.c5
-rw-r--r--fs/splice.c46
-rw-r--r--fs/squashfs/export.c2
-rw-r--r--fs/squashfs/squashfs_fs_i.h2
-rw-r--r--fs/squashfs/xattr.c8
-rw-r--r--fs/stat.c6
-rw-r--r--fs/super.c4
-rw-r--r--fs/sysfs/dir.c34
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysfs/group.c17
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysv/Makefile2
-rw-r--r--fs/sysv/dir.c9
-rw-r--r--fs/sysv/file.c4
-rw-r--r--fs/sysv/inode.c5
-rw-r--r--fs/sysv/itree.c2
-rw-r--r--fs/sysv/namei.c10
-rw-r--r--fs/sysv/symlink.c20
-rw-r--r--fs/sysv/sysv.h3
-rw-r--r--fs/tracefs/Makefile4
-rw-r--r--fs/tracefs/inode.c643
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/commit.c12
-rw-r--r--fs/ubifs/compress.c22
-rw-r--r--fs/ubifs/debug.c186
-rw-r--r--fs/ubifs/dir.c38
-rw-r--r--fs/ubifs/file.c33
-rw-r--r--fs/ubifs/io.c40
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c21
-rw-r--r--fs/ubifs/log.c4
-rw-r--r--fs/ubifs/lprops.c62
-rw-r--r--fs/ubifs/lpt.c59
-rw-r--r--fs/ubifs/lpt_commit.c34
-rw-r--r--fs/ubifs/master.c6
-rw-r--r--fs/ubifs/orphan.c26
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/replay.c34
-rw-r--r--fs/ubifs/sb.c30
-rw-r--r--fs/ubifs/scan.c24
-rw-r--r--fs/ubifs/super.c113
-rw-r--r--fs/ubifs/tnc.c20
-rw-r--r--fs/ubifs/tnc_commit.c12
-rw-r--r--fs/ubifs/tnc_misc.c24
-rw-r--r--fs/ubifs/ubifs.h40
-rw-r--r--fs/ubifs/xattr.c28
-rw-r--r--fs/udf/balloc.c20
-rw-r--r--fs/udf/dir.c3
-rw-r--r--fs/udf/directory.c1
-rw-r--r--fs/udf/file.c34
-rw-r--r--fs/udf/inode.c31
-rw-r--r--fs/udf/misc.c1
-rw-r--r--fs/udf/namei.c121
-rw-r--r--fs/udf/partition.c1
-rw-r--r--fs/udf/super.c27
-rw-r--r--fs/udf/symlink.c4
-rw-r--r--fs/udf/truncate.c1
-rw-r--r--fs/udf/udf_i.h2
-rw-r--r--fs/udf/unicode.c49
-rw-r--r--fs/ufs/balloc.c34
-rw-r--r--fs/ufs/dir.c21
-rw-r--r--fs/ufs/file.c2
-rw-r--r--fs/ufs/ialloc.c16
-rw-r--r--fs/ufs/inode.c10
-rw-r--r--fs/ufs/namei.c92
-rw-r--r--fs/ufs/super.c16
-rw-r--r--fs/ufs/symlink.c13
-rw-r--r--fs/ufs/truncate.c2
-rw-r--r--fs/ufs/ufs.h3
-rw-r--r--fs/xattr.c10
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c385
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h10
-rw-r--r--fs/xfs/libxfs/xfs_attr.c25
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c158
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h8
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c44
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c606
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h13
-rw-r--r--fs/xfs/libxfs/xfs_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c39
-rw-r--r--fs/xfs/libxfs/xfs_format.h127
-rw-r--r--fs/xfs/libxfs/xfs_fs.h1
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c593
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h15
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c93
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c54
-rw-r--r--fs/xfs/libxfs/xfs_shared.h6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h4
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/xfs_aops.c398
-rw-r--r--fs/xfs/xfs_aops.h7
-rw-r--r--fs/xfs/xfs_attr_inactive.c88
-rw-r--r--fs/xfs/xfs_attr_list.c9
-rw-r--r--fs/xfs/xfs_bmap_util.c253
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c6
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c8
-rw-r--r--fs/xfs/xfs_error.c6
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c340
-rw-r--r--fs/xfs/xfs_filestream.c7
-rw-r--r--fs/xfs/xfs_fsops.c30
-rw-r--r--fs/xfs/xfs_icache.c4
-rw-r--r--fs/xfs/xfs_inode.c746
-rw-r--r--fs/xfs/xfs_inode.h49
-rw-r--r--fs/xfs/xfs_ioctl.c39
-rw-r--r--fs/xfs/xfs_ioctl32.c12
-rw-r--r--fs/xfs/xfs_iomap.c21
-rw-r--r--fs/xfs/xfs_iops.c168
-rw-r--r--fs/xfs/xfs_iops.h2
-rw-r--r--fs/xfs/xfs_itable.c15
-rw-r--r--fs/xfs/xfs_linux.h23
-rw-r--r--fs/xfs/xfs_log.c51
-rw-r--r--fs/xfs/xfs_log.h13
-rw-r--r--fs/xfs/xfs_log_cil.c12
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c112
-rw-r--r--fs/xfs/xfs_mount.c932
-rw-r--r--fs/xfs/xfs_mount.h99
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_pnfs.c11
-rw-r--r--fs/xfs/xfs_pnfs.h5
-rw-r--r--fs/xfs/xfs_qm.c18
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c196
-rw-r--r--fs/xfs/xfs_quota.h1
-rw-r--r--fs/xfs/xfs_quotaops.c117
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_super.c159
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_symlink.c77
-rw-r--r--fs/xfs/xfs_trace.h76
-rw-r--r--fs/xfs/xfs_trans.c325
-rw-r--r--fs/xfs/xfs_trans.h7
-rw-r--r--fs/xfs/xfs_trans_ail.c6
-rw-r--r--fs/xfs/xfs_trans_dquot.c32
-rw-r--r--fs/xfs/xfs_trans_priv.h2
-rw-r--r--fs/xfs/xfs_xattr.c6
761 files changed, 32756 insertions, 18310 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 8482f2d11606..31c010372660 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -247,7 +247,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
return v9fs_remote_get_acl(dentry, name, buffer, size, type);
- acl = v9fs_get_cached_acl(dentry->d_inode, type);
+ acl = v9fs_get_cached_acl(d_inode(dentry), type);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -285,7 +285,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (strcmp(name, "") != 0)
return -EINVAL;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 620d93489539..8aa56bb6e861 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -320,31 +320,21 @@ fail_option_alloc:
struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
const char *dev_name, char *data)
{
- int retval = -EINVAL;
struct p9_fid *fid;
- int rc;
+ int rc = -ENOMEM;
v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
if (!v9ses->uname)
- return ERR_PTR(-ENOMEM);
+ goto err_names;
v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
- if (!v9ses->aname) {
- kfree(v9ses->uname);
- return ERR_PTR(-ENOMEM);
- }
+ if (!v9ses->aname)
+ goto err_names;
init_rwsem(&v9ses->rename_sem);
rc = bdi_setup_and_register(&v9ses->bdi, "9p");
- if (rc) {
- kfree(v9ses->aname);
- kfree(v9ses->uname);
- return ERR_PTR(rc);
- }
-
- spin_lock(&v9fs_sessionlist_lock);
- list_add(&v9ses->slist, &v9fs_sessionlist);
- spin_unlock(&v9fs_sessionlist_lock);
+ if (rc)
+ goto err_names;
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
@@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->clnt = p9_client_create(dev_name, data);
if (IS_ERR(v9ses->clnt)) {
- retval = PTR_ERR(v9ses->clnt);
- v9ses->clnt = NULL;
+ rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
- goto error;
+ goto err_bdi;
}
v9ses->flags = V9FS_ACCESS_USER;
@@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
}
rc = v9fs_parse_options(v9ses, data);
- if (rc < 0) {
- retval = rc;
- goto error;
- }
+ if (rc < 0)
+ goto err_clnt;
v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
@@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
v9ses->aname);
if (IS_ERR(fid)) {
- retval = PTR_ERR(fid);
- fid = NULL;
+ rc = PTR_ERR(fid);
p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
- goto error;
+ goto err_clnt;
}
if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
@@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
/* register the session for caching */
v9fs_cache_session_get_cookie(v9ses);
#endif
+ spin_lock(&v9fs_sessionlist_lock);
+ list_add(&v9ses->slist, &v9fs_sessionlist);
+ spin_unlock(&v9fs_sessionlist_lock);
return fid;
-error:
+err_clnt:
+ p9_client_destroy(v9ses->clnt);
+err_bdi:
bdi_destroy(&v9ses->bdi);
- return ERR_PTR(retval);
+err_names:
+ kfree(v9ses->uname);
+ kfree(v9ses->aname);
+ return ERR_PTR(rc);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 099c7712631c..0923f2cf3c80 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -78,7 +78,6 @@ enum p9_cache_modes {
* @cache: cache mode of type &p9_cache_modes
* @cachetag: the tag of the cache associated with this session
* @fscache: session cookie associated with FS-Cache
- * @options: copy of options string given by user
* @uname: string user name to mount hierarchy as
* @aname: mount specifier for remote hierarchy
* @maxdata: maximum data to be sent/recvd per protocol message
@@ -150,8 +149,6 @@ extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d);
extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry);
-extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
- void *p);
extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
struct p9_fid *fid,
struct super_block *sb, int new);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b83ebfbf3fdc..5a0db6dec8d1 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -68,14 +68,10 @@ int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
int v9fs_uflags2omode(int uflags, int extended);
-ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
-ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
void v9fs_blank_wstat(struct p9_wstat *wstat);
int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
int datasync);
-ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
- const char __user *, size_t, loff_t *, int);
int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
static inline void v9fs_invalidate_inode_attr(struct inode *inode)
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index eb14e055ea83..e9e04376c52c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,7 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -51,12 +51,11 @@
*/
static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
{
- int retval;
- loff_t offset;
- char *buffer;
- struct inode *inode;
+ struct inode *inode = page->mapping->host;
+ struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE};
+ struct iov_iter to;
+ int retval, err;
- inode = page->mapping->host;
p9_debug(P9_DEBUG_VFS, "\n");
BUG_ON(!PageLocked(page));
@@ -65,16 +64,16 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
if (retval == 0)
return retval;
- buffer = kmap(page);
- offset = page_offset(page);
+ iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE);
- retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
- if (retval < 0) {
+ retval = p9_client_read(fid, page_offset(page), &to, &err);
+ if (err) {
v9fs_uncache_page(inode, page);
+ retval = err;
goto done;
}
- memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
+ zero_user(page, retval, PAGE_SIZE - retval);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -82,7 +81,6 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
retval = 0;
done:
- kunmap(page);
unlock_page(page);
return retval;
}
@@ -161,41 +159,32 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset,
static int v9fs_vfs_writepage_locked(struct page *page)
{
- char *buffer;
- int retval, len;
- loff_t offset, size;
- mm_segment_t old_fs;
- struct v9fs_inode *v9inode;
struct inode *inode = page->mapping->host;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ loff_t size = i_size_read(inode);
+ struct iov_iter from;
+ struct bio_vec bvec;
+ int err, len;
- v9inode = V9FS_I(inode);
- size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
- set_page_writeback(page);
-
- buffer = kmap(page);
- offset = page_offset(page);
+ bvec.bv_page = page;
+ bvec.bv_offset = 0;
+ bvec.bv_len = len;
+ iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
- old_fs = get_fs();
- set_fs(get_ds());
/* We should have writeback_fid always set */
BUG_ON(!v9inode->writeback_fid);
- retval = v9fs_file_write_internal(inode,
- v9inode->writeback_fid,
- (__force const char __user *)buffer,
- len, &offset, 0);
- if (retval > 0)
- retval = 0;
+ set_page_writeback(page);
+
+ p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err);
- set_fs(old_fs);
- kunmap(page);
end_page_writeback(page);
- return retval;
+ return err;
}
static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -241,11 +230,8 @@ static int v9fs_launder_page(struct page *page)
/**
* v9fs_direct_IO - 9P address space operation for direct I/O
- * @rw: direction (read or write)
* @iocb: target I/O control block
- * @iov: array of vectors that define I/O buffer
* @pos: offset in file to begin the operation
- * @nr_segs: size of iovec array
*
* The presence of v9fs_direct_IO() in the address space ops vector
* allowes open() O_DIRECT flags which would have failed otherwise.
@@ -259,18 +245,23 @@ static int v9fs_launder_page(struct page *page)
*
*/
static ssize_t
-v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
- /*
- * FIXME
- * Now that we do caching with cache mode enabled, We need
- * to support direct IO
- */
- p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n",
- iocb->ki_filp,
- (long long)pos, iter->nr_segs);
-
- return -EINVAL;
+ struct file *file = iocb->ki_filp;
+ ssize_t n;
+ int err = 0;
+ if (iov_iter_rw(iter) == WRITE) {
+ n = p9_client_write(file->private_data, pos, iter, &err);
+ if (n) {
+ struct inode *inode = file_inode(file);
+ loff_t i_size = i_size_read(inode);
+ if (pos + n > i_size)
+ inode_add_bytes(inode, pos + n - i_size);
+ }
+ } else {
+ n = p9_client_read(file->private_data, pos, iter, &err);
+ }
+ return n ? n : err;
}
static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index a345b2d659cc..bd456c668d39 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,7 +53,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
dentry, dentry);
/* Don't cache negative dentries */
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
return 1;
return 0;
}
@@ -83,7 +83,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (!inode)
goto out_valid;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4f1151088ebe..5cc00e56206e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -33,6 +33,7 @@
#include <linux/inet.h>
#include <linux/idr.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -115,6 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
int buflen;
int reclen = 0;
struct p9_rdir *rdir;
+ struct kvec kvec;
p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
fid = file->private_data;
@@ -124,16 +126,23 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
rdir = v9fs_alloc_rdir_buf(file, buflen);
if (!rdir)
return -ENOMEM;
+ kvec.iov_base = rdir->buf;
+ kvec.iov_len = buflen;
while (1) {
if (rdir->tail == rdir->head) {
- err = v9fs_file_readn(file, rdir->buf, NULL,
- buflen, ctx->pos);
- if (err <= 0)
+ struct iov_iter to;
+ int n;
+ iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen);
+ n = p9_client_read(file->private_data, ctx->pos, &to,
+ &err);
+ if (err)
return err;
+ if (n == 0)
+ return 0;
rdir->head = 0;
- rdir->tail = err;
+ rdir->tail = n;
}
while (rdir->head < rdir->tail) {
p9stat_init(&st);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index b40133796b87..1ef16bd8280b 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -36,6 +36,8 @@
#include <linux/utsname.h>
#include <asm/uaccess.h>
#include <linux/idr.h>
+#include <linux/uio.h>
+#include <linux/slab.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -149,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
{
struct p9_flock flock;
struct p9_fid *fid;
- uint8_t status;
+ uint8_t status = P9_LOCK_ERROR;
int res = 0;
unsigned char fl_type;
@@ -194,7 +196,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
for (;;) {
res = p9_client_lock_dotl(fid, &flock, &status);
if (res < 0)
- break;
+ goto out_unlock;
if (status != P9_LOCK_BLOCKED)
break;
@@ -212,14 +214,16 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
case P9_LOCK_BLOCKED:
res = -EAGAIN;
break;
+ default:
+ WARN_ONCE(1, "unknown lock status code: %d\n", status);
+ /* fallthough */
case P9_LOCK_ERROR:
case P9_LOCK_GRACE:
res = -ENOLCK;
break;
- default:
- BUG();
}
+out_unlock:
/*
* incase server returned error for lock request, revert
* it locally
@@ -285,6 +289,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = glock.proc_id;
}
+ kfree(glock.client_id);
return res;
}
@@ -364,63 +369,6 @@ out_err:
}
/**
- * v9fs_fid_readn - read from a fid
- * @fid: fid to read
- * @data: data buffer to read data into
- * @udata: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
- *
- */
-ssize_t
-v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
- u64 offset)
-{
- int n, total, size;
-
- p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
- fid->fid, (long long unsigned)offset, count);
- n = 0;
- total = 0;
- size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
- do {
- n = p9_client_read(fid, data, udata, offset, count);
- if (n <= 0)
- break;
-
- if (data)
- data += n;
- if (udata)
- udata += n;
-
- offset += n;
- count -= n;
- total += n;
- } while (count > 0 && n == size);
-
- if (n < 0)
- total = n;
-
- return total;
-}
-
-/**
- * v9fs_file_readn - read from a file
- * @filp: file pointer to read
- * @data: data buffer to read data into
- * @udata: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
- *
- */
-ssize_t
-v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
- u64 offset)
-{
- return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
-}
-
-/**
* v9fs_file_read - read from a file
* @filp: file pointer to read
* @udata: user data buffer to read data into
@@ -430,69 +378,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
*/
static ssize_t
-v9fs_file_read(struct file *filp, char __user *udata, size_t count,
- loff_t * offset)
+v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- int ret;
- struct p9_fid *fid;
- size_t size;
-
- p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
- fid = filp->private_data;
+ struct p9_fid *fid = iocb->ki_filp->private_data;
+ int ret, err;
- size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
- if (count > size)
- ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
- else
- ret = p9_client_read(fid, NULL, udata, *offset, count);
+ p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
+ iov_iter_count(to), iocb->ki_pos);
- if (ret > 0)
- *offset += ret;
+ ret = p9_client_read(fid, iocb->ki_pos, to, &err);
+ if (!ret)
+ return err;
+ iocb->ki_pos += ret;
return ret;
}
-ssize_t
-v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
- const char __user *data, size_t count,
- loff_t *offset, int invalidate)
-{
- int n;
- loff_t i_size;
- size_t total = 0;
- loff_t origin = *offset;
- unsigned long pg_start, pg_end;
-
- p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
- data, (int)count, (int)*offset);
-
- do {
- n = p9_client_write(fid, NULL, data+total, origin+total, count);
- if (n <= 0)
- break;
- count -= n;
- total += n;
- } while (count > 0);
-
- if (invalidate && (total > 0)) {
- pg_start = origin >> PAGE_CACHE_SHIFT;
- pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- *offset += total;
- i_size = i_size_read(inode);
- if (*offset > i_size) {
- inode_add_bytes(inode, *offset - i_size);
- i_size_write(inode, *offset);
- }
- }
- if (n < 0)
- return n;
-
- return total;
-}
-
/**
* v9fs_file_write - write to a file
* @filp: file pointer to write
@@ -502,35 +403,39 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
*
*/
static ssize_t
-v9fs_file_write(struct file *filp, const char __user * data,
- size_t count, loff_t *offset)
+v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- ssize_t retval = 0;
- loff_t origin = *offset;
-
-
- retval = generic_write_checks(filp, &origin, &count, 0);
- if (retval)
- goto out;
+ struct file *file = iocb->ki_filp;
+ ssize_t retval;
+ loff_t origin;
+ int err = 0;
- retval = -EINVAL;
- if ((ssize_t) count < 0)
- goto out;
- retval = 0;
- if (!count)
- goto out;
+ retval = generic_write_checks(iocb, from);
+ if (retval <= 0)
+ return retval;
- retval = v9fs_file_write_internal(file_inode(filp),
- filp->private_data,
- data, count, &origin, 1);
- /* update offset on successful write */
- if (retval > 0)
- *offset = origin;
-out:
- return retval;
+ origin = iocb->ki_pos;
+ retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
+ if (retval > 0) {
+ struct inode *inode = file_inode(file);
+ loff_t i_size;
+ unsigned long pg_start, pg_end;
+ pg_start = origin >> PAGE_CACHE_SHIFT;
+ pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT;
+ if (inode->i_mapping && inode->i_mapping->nrpages)
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
+ iocb->ki_pos += retval;
+ i_size = i_size_read(inode);
+ if (iocb->ki_pos > i_size) {
+ inode_add_bytes(inode, iocb->ki_pos - i_size);
+ i_size_write(inode, iocb->ki_pos);
+ }
+ return retval;
+ }
+ return err;
}
-
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
int datasync)
{
@@ -657,44 +562,6 @@ out_unlock:
return VM_FAULT_NOPAGE;
}
-static ssize_t
-v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
- loff_t *offsetp)
-{
- loff_t size, offset;
- struct inode *inode;
- struct address_space *mapping;
-
- offset = *offsetp;
- mapping = filp->f_mapping;
- inode = mapping->host;
- if (!count)
- return 0;
- size = i_size_read(inode);
- if (offset < size)
- filemap_write_and_wait_range(mapping, offset,
- offset + count - 1);
-
- return v9fs_file_read(filp, udata, count, offsetp);
-}
-
-/**
- * v9fs_cached_file_read - read from a file
- * @filp: file pointer to read
- * @data: user data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
- *
- */
-static ssize_t
-v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
- loff_t *offset)
-{
- if (filp->f_flags & O_DIRECT)
- return v9fs_direct_read(filp, data, count, offset);
- return new_sync_read(filp, data, count, offset);
-}
-
/**
* v9fs_mmap_file_read - read from a file
* @filp: file pointer to read
@@ -704,84 +571,12 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
*
*/
static ssize_t
-v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count,
- loff_t *offset)
+v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
/* TODO: Check if there are dirty pages */
- return v9fs_file_read(filp, data, count, offset);
-}
-
-static ssize_t
-v9fs_direct_write(struct file *filp, const char __user * data,
- size_t count, loff_t *offsetp)
-{
- loff_t offset;
- ssize_t retval;
- struct inode *inode;
- struct address_space *mapping;
-
- offset = *offsetp;
- mapping = filp->f_mapping;
- inode = mapping->host;
- if (!count)
- return 0;
-
- mutex_lock(&inode->i_mutex);
- retval = filemap_write_and_wait_range(mapping, offset,
- offset + count - 1);
- if (retval)
- goto err_out;
- /*
- * After a write we want buffered reads to be sure to go to disk to get
- * the new data. We invalidate clean cached page from the region we're
- * about to write. We do this *before* the write so that if we fail
- * here we fall back to buffered write
- */
- if (mapping->nrpages) {
- pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
- pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
-
- retval = invalidate_inode_pages2_range(mapping,
- pg_start, pg_end);
- /*
- * If a page can not be invalidated, fall back
- * to buffered write.
- */
- if (retval) {
- if (retval == -EBUSY)
- goto buff_write;
- goto err_out;
- }
- }
- retval = v9fs_file_write(filp, data, count, offsetp);
-err_out:
- mutex_unlock(&inode->i_mutex);
- return retval;
-
-buff_write:
- mutex_unlock(&inode->i_mutex);
- return new_sync_write(filp, data, count, offsetp);
-}
-
-/**
- * v9fs_cached_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
- *
- */
-static ssize_t
-v9fs_cached_file_write(struct file *filp, const char __user * data,
- size_t count, loff_t *offset)
-{
-
- if (filp->f_flags & O_DIRECT)
- return v9fs_direct_write(filp, data, count, offset);
- return new_sync_write(filp, data, count, offset);
+ return v9fs_file_read_iter(iocb, to);
}
-
/**
* v9fs_mmap_file_write - write to a file
* @filp: file pointer to write
@@ -791,14 +586,13 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
*
*/
static ssize_t
-v9fs_mmap_file_write(struct file *filp, const char __user *data,
- size_t count, loff_t *offset)
+v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
/*
* TODO: invalidate mmaps on filp's inode between
* offset and offset+count
*/
- return v9fs_file_write(filp, data, count, offset);
+ return v9fs_file_write_iter(iocb, from);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
@@ -843,8 +637,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
const struct file_operations v9fs_cached_file_operations = {
.llseek = generic_file_llseek,
- .read = v9fs_cached_file_read,
- .write = v9fs_cached_file_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.open = v9fs_file_open,
@@ -856,8 +648,6 @@ const struct file_operations v9fs_cached_file_operations = {
const struct file_operations v9fs_cached_file_operations_dotl = {
.llseek = generic_file_llseek,
- .read = v9fs_cached_file_read,
- .write = v9fs_cached_file_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.open = v9fs_file_open,
@@ -870,8 +660,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = {
const struct file_operations v9fs_file_operations = {
.llseek = generic_file_llseek,
- .read = v9fs_file_read,
- .write = v9fs_file_write,
+ .read_iter = v9fs_file_read_iter,
+ .write_iter = v9fs_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
@@ -881,8 +671,8 @@ const struct file_operations v9fs_file_operations = {
const struct file_operations v9fs_file_operations_dotl = {
.llseek = generic_file_llseek,
- .read = v9fs_file_read,
- .write = v9fs_file_write,
+ .read_iter = v9fs_file_read_iter,
+ .write_iter = v9fs_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
@@ -893,8 +683,8 @@ const struct file_operations v9fs_file_operations_dotl = {
const struct file_operations v9fs_mmap_file_operations = {
.llseek = generic_file_llseek,
- .read = v9fs_mmap_file_read,
- .write = v9fs_mmap_file_write,
+ .read_iter = v9fs_mmap_file_read_iter,
+ .write_iter = v9fs_mmap_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
@@ -904,8 +694,8 @@ const struct file_operations v9fs_mmap_file_operations = {
const struct file_operations v9fs_mmap_file_operations_dotl = {
.llseek = generic_file_llseek,
- .read = v9fs_mmap_file_read,
- .write = v9fs_mmap_file_write,
+ .read_iter = v9fs_mmap_file_read_iter,
+ .write_iter = v9fs_mmap_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3662f1d1d9cf..b1dc51888048 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -540,8 +540,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
unlock_new_inode(inode);
return inode;
error:
- unlock_new_inode(inode);
- iput(inode);
+ iget_failed(inode);
return ERR_PTR(retval);
}
@@ -595,7 +594,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
dir, dentry, flags);
v9ses = v9fs_inode2v9ses(dir);
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
dfid = v9fs_fid_lookup(dentry->d_parent);
if (IS_ERR(dfid)) {
retval = PTR_ERR(dfid);
@@ -864,7 +863,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
/* Only creates */
- if (!(flags & O_CREAT) || dentry->d_inode)
+ if (!(flags & O_CREAT) || d_really_is_positive(dentry))
return finish_no_open(file, res);
err = 0;
@@ -881,7 +880,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
v9fs_invalidate_inode_attr(dir);
- v9inode = V9FS_I(dentry->d_inode);
+ v9inode = V9FS_I(d_inode(dentry));
mutex_lock(&v9inode->v_mutex);
if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
!v9inode->writeback_fid &&
@@ -908,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
file->private_data = fid;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
- v9fs_cache_inode_set_cookie(dentry->d_inode, file);
+ v9fs_cache_inode_set_cookie(d_inode(dentry), file);
*opened |= FILE_CREATED;
out:
@@ -969,8 +968,8 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
p9_debug(P9_DEBUG_VFS, "\n");
retval = 0;
- old_inode = old_dentry->d_inode;
- new_inode = new_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
+ new_inode = d_inode(new_dentry);
v9ses = v9fs_inode2v9ses(old_inode);
oldfid = v9fs_fid_lookup(old_dentry);
if (IS_ERR(oldfid))
@@ -1061,7 +1060,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
return 0;
}
fid = v9fs_fid_lookup(dentry);
@@ -1072,8 +1071,8 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb);
- generic_fillattr(dentry->d_inode, stat);
+ v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb);
+ generic_fillattr(d_inode(dentry), stat);
p9stat_free(st);
kfree(st);
@@ -1095,7 +1094,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
struct p9_wstat wstat;
p9_debug(P9_DEBUG_VFS, "\n");
- retval = inode_change_ok(dentry->d_inode, iattr);
+ retval = inode_change_ok(d_inode(dentry), iattr);
if (retval)
return retval;
@@ -1128,20 +1127,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
/* Write all dirty data */
if (d_is_reg(dentry))
- filemap_write_and_wait(dentry->d_inode->i_mapping);
+ filemap_write_and_wait(d_inode(dentry)->i_mapping);
retval = p9_client_wstat(fid, &wstat);
if (retval < 0)
return retval;
if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(dentry->d_inode))
- truncate_setsize(dentry->d_inode, iattr->ia_size);
+ iattr->ia_size != i_size_read(d_inode(dentry)))
+ truncate_setsize(d_inode(dentry), iattr->ia_size);
- v9fs_invalidate_inode_attr(dentry->d_inode);
+ v9fs_invalidate_inode_attr(d_inode(dentry));
- setattr_copy(dentry->d_inode, iattr);
- mark_inode_dirty(dentry->d_inode);
+ setattr_copy(d_inode(dentry), iattr);
+ mark_inode_dirty(d_inode(dentry));
return 0;
}
@@ -1224,100 +1223,43 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
}
/**
- * v9fs_readlink - read a symlink's location (internal version)
+ * v9fs_vfs_follow_link - follow a symlink path
* @dentry: dentry for symlink
- * @buffer: buffer to load symlink location into
- * @buflen: length of buffer
- *
+ * @cookie: place to pass the data to put_link()
*/
-static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
+static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
{
- int retval;
-
- struct v9fs_session_info *v9ses;
- struct p9_fid *fid;
+ struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
+ struct p9_fid *fid = v9fs_fid_lookup(dentry);
struct p9_wstat *st;
+ char *res;
+
+ p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
- p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
- retval = -EPERM;
- v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
- return PTR_ERR(fid);
+ return ERR_CAST(fid);
if (!v9fs_proto_dotu(v9ses))
- return -EBADF;
+ return ERR_PTR(-EBADF);
st = p9_client_stat(fid);
if (IS_ERR(st))
- return PTR_ERR(st);
+ return ERR_CAST(st);
if (!(st->mode & P9_DMSYMLINK)) {
- retval = -EINVAL;
- goto done;
+ p9stat_free(st);
+ kfree(st);
+ return ERR_PTR(-EINVAL);
}
+ res = st->extension;
+ st->extension = NULL;
+ if (strlen(res) >= PATH_MAX)
+ res[PATH_MAX - 1] = '\0';
- /* copy extension buffer into buffer */
- retval = min(strlen(st->extension)+1, (size_t)buflen);
- memcpy(buffer, st->extension, retval);
-
- p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
- dentry, st->extension, buflen, buffer);
-
-done:
p9stat_free(st);
kfree(st);
- return retval;
-}
-
-/**
- * v9fs_vfs_follow_link - follow a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- *
- */
-
-static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- int len = 0;
- char *link = __getname();
-
- p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
-
- if (!link)
- link = ERR_PTR(-ENOMEM);
- else {
- len = v9fs_readlink(dentry, link, PATH_MAX);
-
- if (len < 0) {
- __putname(link);
- link = ERR_PTR(len);
- } else
- link[min(len, PATH_MAX-1)] = 0;
- }
- nd_set_link(nd, link);
-
- return NULL;
-}
-
-/**
- * v9fs_vfs_put_link - release a symlink path
- * @dentry: dentry for symlink
- * @nd: nameidata
- * @p: unused
- *
- */
-
-void
-v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
- char *s = nd_get_link(nd);
-
- p9_debug(P9_DEBUG_VFS, " %pd %s\n",
- dentry, IS_ERR(s) ? "<error>" : s);
- if (!IS_ERR(s))
- __putname(s);
+ return *cookie = res;
}
/**
@@ -1370,6 +1312,8 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
}
+#define U32_MAX_DIGITS 10
+
/**
* v9fs_vfs_link - create a hardlink
* @old_dentry: dentry for file to link to
@@ -1383,7 +1327,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int retval;
- char *name;
+ char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */
struct p9_fid *oldfid;
p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
@@ -1393,20 +1337,12 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
if (IS_ERR(oldfid))
return PTR_ERR(oldfid);
- name = __getname();
- if (unlikely(!name)) {
- retval = -ENOMEM;
- goto clunk_fid;
- }
-
sprintf(name, "%d\n", oldfid->fid);
retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
- __putname(name);
if (!retval) {
- v9fs_refresh_inode(oldfid, old_dentry->d_inode);
+ v9fs_refresh_inode(oldfid, d_inode(old_dentry));
v9fs_invalidate_inode_attr(dir);
}
-clunk_fid:
p9_client_clunk(oldfid);
return retval;
}
@@ -1425,7 +1361,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
{
struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
int retval;
- char *name;
+ char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1];
u32 perm;
p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -1435,26 +1371,16 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
if (!new_valid_dev(rdev))
return -EINVAL;
- name = __getname();
- if (!name)
- return -ENOMEM;
/* build extension */
if (S_ISBLK(mode))
sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
else if (S_ISCHR(mode))
sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
- else if (S_ISFIFO(mode))
- *name = 0;
- else if (S_ISSOCK(mode))
+ else
*name = 0;
- else {
- __putname(name);
- return -EINVAL;
- }
perm = unixmode2p9mode(v9ses, mode);
retval = v9fs_vfs_mkspecial(dir, dentry, perm, name);
- __putname(name);
return retval;
}
@@ -1530,7 +1456,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
static const struct inode_operations v9fs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = v9fs_vfs_follow_link,
- .put_link = v9fs_vfs_put_link,
+ .put_link = kfree_put_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 6054c16b8fae..e8aa57dc8d6d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -149,8 +149,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
unlock_new_inode(inode);
return inode;
error:
- unlock_new_inode(inode);
- iput(inode);
+ iget_failed(inode);
return ERR_PTR(retval);
}
@@ -265,7 +264,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
}
/* Only creates */
- if (!(flags & O_CREAT) || dentry->d_inode)
+ if (!(flags & O_CREAT) || d_really_is_positive(dentry))
return finish_no_open(file, res);
v9ses = v9fs_inode2v9ses(dir);
@@ -481,7 +480,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
return 0;
}
fid = v9fs_fid_lookup(dentry);
@@ -496,8 +495,8 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
if (IS_ERR(st))
return PTR_ERR(st);
- v9fs_stat2inode_dotl(st, dentry->d_inode);
- generic_fillattr(dentry->d_inode, stat);
+ v9fs_stat2inode_dotl(st, d_inode(dentry));
+ generic_fillattr(d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -557,7 +556,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
int retval;
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
p9_debug(P9_DEBUG_VFS, "\n");
@@ -795,10 +794,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (IS_ERR(fid))
return PTR_ERR(fid);
- v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
+ v9fs_refresh_inode_dotl(fid, d_inode(old_dentry));
}
- ihold(old_dentry->d_inode);
- d_instantiate(dentry, old_dentry->d_inode);
+ ihold(d_inode(old_dentry));
+ d_instantiate(dentry, d_inode(old_dentry));
return err;
}
@@ -905,41 +904,24 @@ error:
/**
* v9fs_vfs_follow_link_dotl - follow a symlink path
* @dentry: dentry for symlink
- * @nd: nameidata
- *
+ * @cookie: place to pass the data to put_link()
*/
-static void *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
+static const char *
+v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
{
- int retval;
- struct p9_fid *fid;
- char *link = __getname();
+ struct p9_fid *fid = v9fs_fid_lookup(dentry);
char *target;
+ int retval;
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
- if (!link) {
- link = ERR_PTR(-ENOMEM);
- goto ndset;
- }
- fid = v9fs_fid_lookup(dentry);
- if (IS_ERR(fid)) {
- __putname(link);
- link = ERR_CAST(fid);
- goto ndset;
- }
+ if (IS_ERR(fid))
+ return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
- if (!retval) {
- strcpy(link, target);
- kfree(target);
- goto ndset;
- }
- __putname(link);
- link = ERR_PTR(retval);
-ndset:
- nd_set_link(nd, link);
- return NULL;
+ if (retval)
+ return ERR_PTR(retval);
+ return *cookie = target;
}
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -1006,7 +988,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
.follow_link = v9fs_vfs_follow_link_dotl,
- .put_link = v9fs_vfs_put_link,
+ .put_link = kfree_put_link,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.setxattr = generic_setxattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 0afd0382822b..bf495cedec26 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
fid = v9fs_session_init(v9ses, dev_name, data);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
- /*
- * we need to call session_close to tear down some
- * of the data structure setup by session_init
- */
- goto close_session;
+ goto free_session;
}
sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
@@ -168,8 +164,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(st);
goto release_sb;
}
- root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode_dotl(st, root->d_inode);
+ d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
+ v9fs_stat2inode_dotl(st, d_inode(root));
kfree(st);
} else {
struct p9_wstat *st = NULL;
@@ -179,8 +175,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
goto release_sb;
}
- root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
- v9fs_stat2inode(st, root->d_inode, sb);
+ d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
+ v9fs_stat2inode(st, d_inode(root), sb);
p9stat_free(st);
kfree(st);
@@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
clunk_fid:
p9_client_clunk(fid);
-close_session:
v9fs_session_close(v9ses);
+free_session:
kfree(v9ses);
return ERR_PTR(retval);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index f95e01e058e4..0cf44b6cccd6 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/sched.h>
+#include <linux/uio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
@@ -25,50 +26,34 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
void *buffer, size_t buffer_size)
{
ssize_t retval;
- int msize, read_count;
- u64 offset = 0, attr_size;
+ u64 attr_size;
struct p9_fid *attr_fid;
+ struct kvec kvec = {.iov_base = buffer, .iov_len = buffer_size};
+ struct iov_iter to;
+ int err;
+
+ iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size);
attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
if (IS_ERR(attr_fid)) {
retval = PTR_ERR(attr_fid);
p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
retval);
- attr_fid = NULL;
- goto error;
- }
- if (!buffer_size) {
- /* request to get the attr_size */
- retval = attr_size;
- goto error;
+ return retval;
}
if (attr_size > buffer_size) {
- retval = -ERANGE;
- goto error;
- }
- msize = attr_fid->clnt->msize;
- while (attr_size) {
- if (attr_size > (msize - P9_IOHDRSZ))
- read_count = msize - P9_IOHDRSZ;
+ if (!buffer_size) /* request to get the attr_size */
+ retval = attr_size;
else
- read_count = attr_size;
- read_count = p9_client_read(attr_fid, ((char *)buffer)+offset,
- NULL, offset, read_count);
- if (read_count < 0) {
- /* error in xattr read */
- retval = read_count;
- goto error;
- }
- offset += read_count;
- attr_size -= read_count;
+ retval = -ERANGE;
+ } else {
+ iov_iter_truncate(&to, attr_size);
+ retval = p9_client_read(attr_fid, 0, &to, &err);
+ if (err)
+ retval = err;
}
- /* Total read xattr bytes */
- retval = offset;
-error:
- if (attr_fid)
- p9_client_clunk(attr_fid);
+ p9_client_clunk(attr_fid);
return retval;
-
}
@@ -120,8 +105,11 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
const void *value, size_t value_len, int flags)
{
- u64 offset = 0;
- int retval, msize, write_count;
+ struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len};
+ struct iov_iter from;
+ int retval;
+
+ iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len);
p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
name, value_len, flags);
@@ -135,29 +123,11 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
* On success fid points to xattr
*/
retval = p9_client_xattrcreate(fid, name, value_len, flags);
- if (retval < 0) {
+ if (retval < 0)
p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
retval);
- goto err;
- }
- msize = fid->clnt->msize;
- while (value_len) {
- if (value_len > (msize - P9_IOHDRSZ))
- write_count = msize - P9_IOHDRSZ;
- else
- write_count = value_len;
- write_count = p9_client_write(fid, ((char *)value)+offset,
- NULL, offset, write_count);
- if (write_count < 0) {
- /* error in xattr write */
- retval = write_count;
- goto err;
- }
- offset += write_count;
- value_len -= write_count;
- }
- retval = 0;
-err:
+ else
+ p9_client_write(fid, 0, &from, &retval);
p9_client_clunk(fid);
return retval;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index ec35851e5b71..011f43365d7b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -32,6 +32,7 @@ source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
+source "fs/f2fs/Kconfig"
config FS_DAX
bool "Direct Access (DAX) support"
@@ -217,7 +218,6 @@ source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
-source "fs/f2fs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 270c48148f79..2d0cbbd14cfc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF
bool
depends on COMPAT && BINFMT_ELF
-config ARCH_BINFMT_ELF_RANDOMIZE_PIE
- bool
-
config ARCH_BINFMT_ELF_STATE
bool
diff --git a/fs/Makefile b/fs/Makefile
index a88ac4838c9e..cb20e4bf2303 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -115,9 +115,9 @@ obj-$(CONFIG_AFS_FS) += afs/
obj-$(CONFIG_NILFS2_FS) += nilfs2/
obj-$(CONFIG_BEFS_FS) += befs/
obj-$(CONFIG_HOSTFS) += hostfs/
-obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_TRACING) += tracefs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index f2ba88ab4aed..82d14cdf70f9 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -61,6 +61,7 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
kcalloc(size, sizeof(struct buffer_head *),
GFP_KERNEL);
if (!bh_fplus) {
+ ret = -ENOMEM;
adfs_error(sb, "not enough memory for"
" dir object %X (%d blocks)", id, size);
goto out;
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 07c9edce5aa7..46c0d5671cd5 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -23,11 +23,9 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index b9acadafa4a1..335055d828e4 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -298,7 +298,7 @@ out:
int
adfs_notify_change(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
unsigned int ia_valid = attr->ia_valid;
int error;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9852bdf34d76..4d4a0df8344f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -316,7 +316,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di
dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL);
if (dm == NULL) {
adfs_error(sb, "not enough memory");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
for (zone = 0; zone < nzones; zone++, map_addr++) {
@@ -349,7 +349,7 @@ error_free:
brelse(dm[zone].dm_bh);
kfree(dm);
- return NULL;
+ return ERR_PTR(-EIO);
}
static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits)
@@ -370,6 +370,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
unsigned char *b_data;
struct adfs_sb_info *asb;
struct inode *root;
+ int ret = -EINVAL;
sb->s_flags |= MS_NODIRATIME;
@@ -391,6 +392,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
sb_set_blocksize(sb, BLOCK_SIZE);
if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) {
adfs_error(sb, "unable to read superblock");
+ ret = -EIO;
goto error;
}
@@ -400,6 +402,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk("VFS: Can't find an adfs filesystem on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error_free_bh;
}
@@ -412,6 +415,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk("VPS: Can't find an adfs filesystem on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error_free_bh;
}
@@ -421,11 +425,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!bh) {
adfs_error(sb, "couldn't read superblock on "
"2nd try.");
+ ret = -EIO;
goto error;
}
b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize);
if (adfs_checkbblk(b_data)) {
adfs_error(sb, "disc record mismatch, very weird!");
+ ret = -EINVAL;
goto error_free_bh;
}
dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET);
@@ -433,6 +439,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
if (!silent)
printk(KERN_ERR "VFS: Unsupported blocksize on dev "
"%s.\n", sb->s_id);
+ ret = -EINVAL;
goto error;
}
@@ -447,10 +454,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits);
asb->s_version = dr->format_version;
asb->s_log2sharesize = dr->log2sharesize;
-
+
asb->s_map = adfs_read_map(sb, dr);
- if (!asb->s_map)
+ if (IS_ERR(asb->s_map)) {
+ ret = PTR_ERR(asb->s_map);
goto error_free_bh;
+ }
brelse(bh);
@@ -499,6 +508,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
adfs_error(sb, "get root inode failed\n");
+ ret = -EIO;
goto error;
}
return 0;
@@ -508,7 +518,7 @@ error_free_bh:
error:
sb->s_fs_info = NULL;
kfree(asb);
- return -EINVAL;
+ return ret;
}
static struct dentry *adfs_mount(struct file_system_type *fs_type,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c8764bd7497d..c69a87eaf57d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -64,7 +64,7 @@ struct affs_inode_info {
/* short cut to get to the affs specific inode data */
static inline struct affs_inode_info *AFFS_I(struct inode *inode)
{
- return list_entry(inode, struct affs_inode_info, vfs_inode);
+ return container_of(inode, struct affs_inode_info, vfs_inode);
}
/*
@@ -106,18 +106,22 @@ struct affs_sb_info {
spinlock_t work_lock; /* protects sb_work and work_queued */
};
-#define SF_INTL 0x0001 /* International filesystem. */
-#define SF_BM_VALID 0x0002 /* Bitmap is valid. */
-#define SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */
-#define SF_QUIET 0x0008 /* chmod errors will be not reported */
-#define SF_SETUID 0x0010 /* Ignore Amiga uid */
-#define SF_SETGID 0x0020 /* Ignore Amiga gid */
-#define SF_SETMODE 0x0040 /* Ignore Amiga protection bits */
-#define SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */
-#define SF_OFS 0x0200 /* Old filesystem */
-#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
-#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */
-#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
+#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */
+#define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. */
+#define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */
+#define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */
+#define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */
+#define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */
+#define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */
+#define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */
+#define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */
+#define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
+#define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */
+#define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
+
+#define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt)
+#define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt)
+#define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt)
/* short cut to get to the affs specific sb data */
static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 388da1ea815d..5fa92bc790ef 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -138,9 +138,9 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino)
static int
affs_remove_link(struct dentry *dentry)
{
- struct inode *dir, *inode = dentry->d_inode;
+ struct inode *dir, *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
- struct buffer_head *bh = NULL, *link_bh = NULL;
+ struct buffer_head *bh, *link_bh = NULL;
u32 link_ino, ino;
int retval;
@@ -268,11 +268,11 @@ affs_remove_header(struct dentry *dentry)
struct buffer_head *bh = NULL;
int retval;
- dir = dentry->d_parent->d_inode;
+ dir = d_inode(dentry->d_parent);
sb = dir->i_sb;
retval = -ENOENT;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (!inode)
goto done;
@@ -471,9 +471,9 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
bool
affs_nofilenametruncate(const struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
- return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+ struct inode *inode = d_inode(dentry);
+ return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE);
}
/* Check if the name is valid for a affs object. */
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a91795e01a7f..659c579c4588 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,7 +12,7 @@
* affs regular file handling primitives
*/
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
@@ -389,8 +389,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
}
static ssize_t
-affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -398,15 +397,15 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
- if (rw == WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
loff_t size = offset + count;
if (AFFS_I(inode)->mmu_private < size)
return 0;
}
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
- if (ret < 0 && (rw & WRITE))
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
+ if (ret < 0 && iov_iter_rw(iter) == WRITE)
affs_write_failed(mapping, offset + count);
return ret;
}
@@ -915,7 +914,7 @@ affs_truncate(struct inode *inode)
if (inode->i_size) {
AFFS_I(inode)->i_blkcnt = last_blk + 1;
AFFS_I(inode)->i_extcnt = ext + 1;
- if (AFFS_SB(sb)->s_flags & SF_OFS) {
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS)) {
struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
u32 tmp;
if (IS_ERR(bh)) {
@@ -969,9 +968,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
}
const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = affs_file_open,
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 6f34510449e8..17349500592d 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -66,23 +66,23 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
AFFS_I(inode)->i_lastalloc = 0;
AFFS_I(inode)->i_pa_cnt = 0;
- if (sbi->s_flags & SF_SETMODE)
+ if (affs_test_opt(sbi->s_flags, SF_SETMODE))
inode->i_mode = sbi->s_mode;
else
inode->i_mode = prot_to_mode(prot);
id = be16_to_cpu(tail->uid);
- if (id == 0 || sbi->s_flags & SF_SETUID)
+ if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID))
inode->i_uid = sbi->s_uid;
- else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
+ else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
i_uid_write(inode, 0);
else
i_uid_write(inode, id);
id = be16_to_cpu(tail->gid);
- if (id == 0 || sbi->s_flags & SF_SETGID)
+ if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETGID))
inode->i_gid = sbi->s_gid;
- else if (id == 0xFFFF && sbi->s_flags & SF_MUFS)
+ else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS))
i_gid_write(inode, 0);
else
i_gid_write(inode, id);
@@ -94,7 +94,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
/* fall through */
case ST_USERDIR:
if (be32_to_cpu(tail->stype) == ST_USERDIR ||
- sbi->s_flags & SF_SETMODE) {
+ affs_test_opt(sbi->s_flags, SF_SETMODE)) {
if (inode->i_mode & S_IRUSR)
inode->i_mode |= S_IXUSR;
if (inode->i_mode & S_IRGRP)
@@ -133,7 +133,8 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
}
if (tail->link_chain)
set_nlink(inode, 2);
- inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
+ inode->i_mapping->a_ops = affs_test_opt(sbi->s_flags, SF_OFS) ?
+ &affs_aops_ofs : &affs_aops;
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
break;
@@ -190,15 +191,15 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) {
uid = i_uid_read(inode);
gid = i_gid_read(inode);
- if (AFFS_SB(sb)->s_flags & SF_MUFS) {
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_MUFS)) {
if (uid == 0 || uid == 0xFFFF)
uid = uid ^ ~0;
if (gid == 0 || gid == 0xFFFF)
gid = gid ^ ~0;
}
- if (!(AFFS_SB(sb)->s_flags & SF_SETUID))
+ if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETUID))
tail->uid = cpu_to_be16(uid);
- if (!(AFFS_SB(sb)->s_flags & SF_SETGID))
+ if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETGID))
tail->gid = cpu_to_be16(gid);
}
}
@@ -212,7 +213,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
int
affs_notify_change(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
@@ -221,11 +222,14 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if (error)
goto out;
- if (((attr->ia_valid & ATTR_UID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETUID)) ||
- ((attr->ia_valid & ATTR_GID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETGID)) ||
+ if (((attr->ia_valid & ATTR_UID) &&
+ affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETUID)) ||
+ ((attr->ia_valid & ATTR_GID) &&
+ affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETGID)) ||
((attr->ia_valid & ATTR_MODE) &&
- (AFFS_SB(inode->i_sb)->s_flags & (SF_SETMODE | SF_IMMUTABLE)))) {
- if (!(AFFS_SB(inode->i_sb)->s_flags & SF_QUIET))
+ (AFFS_SB(inode->i_sb)->s_flags &
+ (AFFS_MOUNT_SF_SETMODE | AFFS_MOUNT_SF_IMMUTABLE)))) {
+ if (!affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_QUIET))
error = -EPERM;
goto out;
}
@@ -342,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
{
struct super_block *sb = dir->i_sb;
struct buffer_head *inode_bh = NULL;
- struct buffer_head *bh = NULL;
+ struct buffer_head *bh;
u32 block = 0;
int retval;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ffb7bd82c2a5..181e05b46e72 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -53,7 +53,8 @@ affs_intl_toupper(int ch)
static inline toupper_t
affs_get_toupper(struct super_block *sb)
{
- return AFFS_SB(sb)->s_flags & SF_INTL ? affs_intl_toupper : affs_toupper;
+ return affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL) ?
+ affs_intl_toupper : affs_toupper;
}
/*
@@ -250,7 +251,7 @@ int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
- dentry->d_inode->i_ino, dentry);
+ d_inode(dentry)->i_ino, dentry);
return affs_remove_header(dentry);
}
@@ -275,7 +276,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
inode->i_op = &affs_file_inode_operations;
inode->i_fop = &affs_file_operations;
- inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops;
+ inode->i_mapping->a_ops = affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS) ?
+ &affs_aops_ofs : &affs_aops;
error = affs_add_entry(dir, inode, dentry, ST_FILE);
if (error) {
clear_nlink(inode);
@@ -318,7 +320,7 @@ int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
- dentry->d_inode->i_ino, dentry);
+ d_inode(dentry)->i_ino, dentry);
return affs_remove_header(dentry);
}
@@ -401,7 +403,7 @@ err:
int
affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
dentry);
@@ -428,13 +430,13 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
/* Unlink destination if it already exists */
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
retval = affs_remove_header(new_dentry);
if (retval)
return retval;
}
- bh = affs_bread(sb, old_dentry->d_inode->i_ino);
+ bh = affs_bread(sb, d_inode(old_dentry)->i_ino);
if (!bh)
return -EIO;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4cf0e9113fb6..3f89c9e05b40 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -227,22 +227,22 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
if (match_octal(&args[0], &option))
return 0;
*mode = option & 0777;
- *mount_opts |= SF_SETMODE;
+ affs_set_opt(*mount_opts, SF_SETMODE);
break;
case Opt_mufs:
- *mount_opts |= SF_MUFS;
+ affs_set_opt(*mount_opts, SF_MUFS);
break;
case Opt_notruncate:
- *mount_opts |= SF_NO_TRUNCATE;
+ affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
break;
case Opt_prefix:
*prefix = match_strdup(&args[0]);
if (!*prefix)
return 0;
- *mount_opts |= SF_PREFIX;
+ affs_set_opt(*mount_opts, SF_PREFIX);
break;
case Opt_protect:
- *mount_opts |= SF_IMMUTABLE;
+ affs_set_opt(*mount_opts, SF_IMMUTABLE);
break;
case Opt_reserved:
if (match_int(&args[0], reserved))
@@ -258,7 +258,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
*gid = make_kgid(current_user_ns(), option);
if (!gid_valid(*gid))
return 0;
- *mount_opts |= SF_SETGID;
+ affs_set_opt(*mount_opts, SF_SETGID);
break;
case Opt_setuid:
if (match_int(&args[0], &option))
@@ -266,10 +266,10 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
*uid = make_kuid(current_user_ns(), option);
if (!uid_valid(*uid))
return 0;
- *mount_opts |= SF_SETUID;
+ affs_set_opt(*mount_opts, SF_SETUID);
break;
case Opt_verbose:
- *mount_opts |= SF_VERBOSE;
+ affs_set_opt(*mount_opts, SF_VERBOSE);
break;
case Opt_volume: {
char *vol = match_strdup(&args[0]);
@@ -435,30 +435,31 @@ got_root:
case MUFS_FS:
case MUFS_INTLFFS:
case MUFS_DCFFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
/* fall thru */
case FS_INTLFFS:
case FS_DCFFS:
- sbi->s_flags |= SF_INTL;
+ affs_set_opt(sbi->s_flags, SF_INTL);
break;
case MUFS_FFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
break;
case FS_FFS:
break;
case MUFS_OFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
/* fall thru */
case FS_OFS:
- sbi->s_flags |= SF_OFS;
+ affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= MS_NOEXEC;
break;
case MUFS_DCOFS:
case MUFS_INTLOFS:
- sbi->s_flags |= SF_MUFS;
+ affs_set_opt(sbi->s_flags, SF_MUFS);
case FS_DCOFS:
case FS_INTLOFS:
- sbi->s_flags |= SF_INTL | SF_OFS;
+ affs_set_opt(sbi->s_flags, SF_INTL);
+ affs_set_opt(sbi->s_flags, SF_OFS);
sb->s_flags |= MS_NOEXEC;
break;
default:
@@ -467,7 +468,7 @@ got_root:
return -EINVAL;
}
- if (mount_flags & SF_VERBOSE) {
+ if (affs_test_opt(mount_flags, SF_VERBOSE)) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
@@ -478,7 +479,7 @@ got_root:
sb->s_flags |= MS_NODEV | MS_NOSUID;
sbi->s_data_blksize = sb->s_blocksize;
- if (sbi->s_flags & SF_OFS)
+ if (affs_test_opt(sbi->s_flags, SF_OFS))
sbi->s_data_blksize -= 24;
tmp_flags = sb->s_flags;
@@ -493,7 +494,7 @@ got_root:
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
- if (AFFS_SB(sb)->s_flags & SF_INTL)
+ if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
sb->s_d_op = &affs_intl_dentry_operations;
else
sb->s_d_op = &affs_dentry_operations;
@@ -520,10 +521,14 @@ affs_remount(struct super_block *sb, int *flags, char *data)
int root_block;
unsigned long mount_flags;
int res = 0;
- char *new_opts = kstrdup(data, GFP_KERNEL);
+ char *new_opts;
char volume[32];
char *prefix = NULL;
+ new_opts = kstrdup(data, GFP_KERNEL);
+ if (!new_opts)
+ return -ENOMEM;
+
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index f39b71c3981e..ea5b69a18ba9 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
char *link = kmap(page);
struct slink_front *lf;
- int err;
int i, j;
char c;
char lc;
pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
- err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
goto fail;
@@ -66,7 +64,7 @@ fail:
SetPageError(page);
kunmap(page);
unlock_page(page);
- return err;
+ return -EIO;
}
const struct address_space_operations affs_symlink_aops = {
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 4ec35e9130e1..e10e17788f06 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -505,7 +505,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
_enter("{%x:%u},%p{%pd},",
vnode->fid.vid, vnode->fid.vnode, dentry, dentry);
- ASSERTCMP(dentry->d_inode, ==, NULL);
+ ASSERTCMP(d_inode(dentry), ==, NULL);
if (dentry->d_name.len >= AFSNAMEMAX) {
_leave(" = -ENAMETOOLONG");
@@ -563,8 +563,8 @@ success:
_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }",
fid.vnode,
fid.unique,
- dentry->d_inode->i_ino,
- dentry->d_inode->i_generation);
+ d_inode(dentry)->i_ino,
+ d_inode(dentry)->i_generation);
return NULL;
}
@@ -586,9 +586,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- vnode = AFS_FS_I(dentry->d_inode);
+ vnode = AFS_FS_I(d_inode(dentry));
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
_enter("{v={%x:%u} n=%pd fl=%lx},",
vnode->fid.vid, vnode->fid.vnode, dentry,
vnode->flags);
@@ -601,7 +601,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
/* lock down the parent dentry so we can peer at it */
parent = dget_parent(dentry);
- dir = AFS_FS_I(parent->d_inode);
+ dir = AFS_FS_I(d_inode(parent));
/* validate the parent directory */
if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
@@ -623,9 +623,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
switch (ret) {
case 0:
/* the filename maps to something */
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
goto out_bad;
- if (is_bad_inode(dentry->d_inode)) {
+ if (is_bad_inode(d_inode(dentry))) {
printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
dentry);
goto out_bad;
@@ -647,7 +647,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
- dentry->d_inode->i_generation);
+ d_inode(dentry)->i_generation);
spin_lock(&vnode->lock);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
spin_unlock(&vnode->lock);
@@ -658,7 +658,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
case -ENOENT:
/* the filename is unknown */
_debug("%pd: dirent not found", dentry);
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
goto not_found;
goto out_valid;
@@ -703,9 +703,9 @@ static int afs_d_delete(const struct dentry *dentry)
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto zap;
- if (dentry->d_inode &&
- (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) ||
- test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
+ if (d_really_is_positive(dentry) &&
+ (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) ||
+ test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags)))
goto zap;
_leave(" = 0 [keep]");
@@ -814,8 +814,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret < 0)
goto rmdir_error;
- if (dentry->d_inode) {
- vnode = AFS_FS_I(dentry->d_inode);
+ if (d_really_is_positive(dentry)) {
+ vnode = AFS_FS_I(d_inode(dentry));
clear_nlink(&vnode->vfs_inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
afs_discard_callback_on_delete(vnode);
@@ -856,8 +856,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
goto error;
}
- if (dentry->d_inode) {
- vnode = AFS_FS_I(dentry->d_inode);
+ if (d_really_is_positive(dentry)) {
+ vnode = AFS_FS_I(d_inode(dentry));
/* make sure we have a callback promise on the victim */
ret = afs_validate(vnode, key);
@@ -869,7 +869,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
if (ret < 0)
goto remove_error;
- if (dentry->d_inode) {
+ if (d_really_is_positive(dentry)) {
/* if the file wasn't deleted due to excess hard links, the
* fileserver will break the callback promise on the file - if
* it had one - before it returns to us, and if it was deleted,
@@ -879,7 +879,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
* or it was outstanding on a different server, then it won't
* break it either...
*/
- vnode = AFS_FS_I(dentry->d_inode);
+ vnode = AFS_FS_I(d_inode(dentry));
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
_debug("AFS_VNODE_DELETED");
if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
@@ -977,7 +977,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
struct key *key;
int ret;
- vnode = AFS_FS_I(from->d_inode);
+ vnode = AFS_FS_I(d_inode(from));
dvnode = AFS_FS_I(dir);
_enter("{%x:%u},{%x:%u},{%pd}",
@@ -1089,7 +1089,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct key *key;
int ret;
- vnode = AFS_FS_I(old_dentry->d_inode);
+ vnode = AFS_FS_I(d_inode(old_dentry));
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 932ce07948b3..999bc3caec92 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -31,8 +31,6 @@ const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = afs_file_write,
.mmap = generic_file_readonly_mmap,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 8a1d38ef0fc2..e06f5a23352a 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -379,7 +379,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
{
struct inode *inode;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
@@ -458,7 +458,7 @@ void afs_evict_inode(struct inode *inode)
*/
int afs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
struct key *key;
int ret;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 0dd4dafee10b..91ea1aa0d8b3 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -22,9 +22,12 @@
int afs_abort_to_error(u32 abort_code)
{
switch (abort_code) {
+ /* low errno codes inserted into abort namespace */
case 13: return -EACCES;
case 27: return -EFBIG;
case 30: return -EROFS;
+
+ /* VICE "special error" codes; 101 - 111 */
case VSALVAGE: return -EIO;
case VNOVNODE: return -ENOENT;
case VNOVOL: return -ENOMEDIUM;
@@ -36,11 +39,18 @@ int afs_abort_to_error(u32 abort_code)
case VOVERQUOTA: return -EDQUOT;
case VBUSY: return -EBUSY;
case VMOVED: return -ENXIO;
- case 0x2f6df0a: return -EWOULDBLOCK;
+
+ /* Unified AFS error table; ET "uae" == 0x2f6df00 */
+ case 0x2f6df00: return -EPERM;
+ case 0x2f6df01: return -ENOENT;
+ case 0x2f6df04: return -EIO;
+ case 0x2f6df0a: return -EAGAIN;
+ case 0x2f6df0b: return -ENOMEM;
case 0x2f6df0c: return -EACCES;
case 0x2f6df0f: return -EBUSY;
case 0x2f6df10: return -EEXIST;
case 0x2f6df11: return -EXDEV;
+ case 0x2f6df12: return -ENODEV;
case 0x2f6df13: return -ENOTDIR;
case 0x2f6df14: return -EISDIR;
case 0x2f6df15: return -EINVAL;
@@ -54,8 +64,12 @@ int afs_abort_to_error(u32 abort_code)
case 0x2f6df23: return -ENAMETOOLONG;
case 0x2f6df24: return -ENOLCK;
case 0x2f6df26: return -ENOTEMPTY;
+ case 0x2f6df28: return -EWOULDBLOCK;
+ case 0x2f6df69: return -ENOTCONN;
+ case 0x2f6df6c: return -ETIMEDOUT;
case 0x2f6df78: return -EDQUOT;
+ /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */
case RXKADINCONSISTENCY: return -EPROTO;
case RXKADPACKETSHORT: return -EPROTO;
case RXKADLEVELFAIL: return -EKEYREJECTED;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 938c5ab06d5a..ccd0b212e82a 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -134,7 +134,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
_enter("{%pd}", mntpt);
- BUG_ON(!mntpt->d_inode);
+ BUG_ON(!d_inode(mntpt));
ret = -ENOMEM;
devname = (char *) get_zeroed_page(GFP_KERNEL);
@@ -145,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
if (!options)
goto error_no_options;
- vnode = AFS_FS_I(mntpt->d_inode);
+ vnode = AFS_FS_I(d_inode(mntpt));
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
/* if the directory is a pseudo directory, use the d_name */
static const char afs_root_cell[] = ":root.cell.";
@@ -169,14 +169,14 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
}
} else {
/* read the contents of the AFS special symlink */
- loff_t size = i_size_read(mntpt->d_inode);
+ loff_t size = i_size_read(d_inode(mntpt));
char *buf;
ret = -EINVAL;
if (size > PAGE_SIZE - 1)
goto error_no_page;
- page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+ page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto error_no_page;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index dbc732e9a5c0..b50642870a43 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -85,7 +85,7 @@ int afs_open_socket(void)
return -ENOMEM;
}
- ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+ ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
if (ret < 0) {
destroy_workqueue(afs_async_calls);
_leave(" = %d [socket]", ret);
@@ -770,15 +770,12 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
void afs_send_empty_reply(struct afs_call *call)
{
struct msghdr msg;
- struct kvec iov[1];
_enter("");
- iov[0].iov_base = NULL;
- iov[0].iov_len = 0;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? */
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index c4861557e385..1fb4a5129f7d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -529,7 +529,7 @@ static void afs_destroy_inode(struct inode *inode)
static int afs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct afs_volume_status vs;
- struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
struct key *key;
int ret;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c13cb08964ed..0714abcd7f32 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,7 +14,6 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
-#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index f8e52a1854c1..480440f4701f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -77,6 +77,11 @@ struct kioctx_cpu {
unsigned reqs_available;
};
+struct ctx_rq_wait {
+ struct completion comp;
+ atomic_t count;
+};
+
struct kioctx {
struct percpu_ref users;
atomic_t dead;
@@ -115,7 +120,7 @@ struct kioctx {
/*
* signals when all in-flight requests are done
*/
- struct completion *requests_done;
+ struct ctx_rq_wait *rq_wait;
struct {
/*
@@ -151,6 +156,38 @@ struct kioctx {
unsigned id;
};
+/*
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ */
+#define KIOCB_CANCELLED ((void *) (~0ULL))
+
+struct aio_kiocb {
+ struct kiocb common;
+
+ struct kioctx *ki_ctx;
+ kiocb_cancel_fn *ki_cancel;
+
+ struct iocb __user *ki_user_iocb; /* user's aiocb */
+ __u64 ki_user_data; /* user's data for completion */
+
+ struct list_head ki_list; /* the aio core uses this
+ * for cancellation */
+
+ /*
+ * If the aio_resfd field of the userspace iocb is not zero,
+ * this is the underlying eventfd context to deliver events to.
+ */
+ struct eventfd_ctx *ki_eventfd;
+};
+
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr; /* current system wide number of aio requests */
@@ -220,7 +257,7 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
- kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@ -278,11 +315,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
- int i;
+ int i, res = -EINVAL;
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
@@ -292,13 +329,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
ctx = table->table[i];
if (ctx && ctx->aio_ring_file == file) {
- ctx->user_id = ctx->mmap_base = vma->vm_start;
+ if (!atomic_read(&ctx->dead)) {
+ ctx->user_id = ctx->mmap_base = vma->vm_start;
+ res = 0;
+ }
break;
}
}
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
+ return res;
}
static const struct file_operations aio_ring_fops = {
@@ -480,8 +521,9 @@ static int aio_setup_ring(struct kioctx *ctx)
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
+ struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
@@ -496,7 +538,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct kiocb *kiocb)
+static int kiocb_cancel(struct aio_kiocb *kiocb)
{
kiocb_cancel_fn *old, *cancel;
@@ -514,7 +556,7 @@ static int kiocb_cancel(struct kiocb *kiocb)
cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
} while (cancel != old);
- return cancel(kiocb);
+ return cancel(&kiocb->common);
}
static void free_ioctx(struct work_struct *work)
@@ -535,8 +577,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
/* At this point we know that there are no any in-flight requests */
- if (ctx->requests_done)
- complete(ctx->requests_done);
+ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
+ complete(&ctx->rq_wait->comp);
INIT_WORK(&ctx->free_work, free_ioctx);
schedule_work(&ctx->free_work);
@@ -550,13 +592,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
static void free_ioctx_users(struct percpu_ref *ref)
{
struct kioctx *ctx = container_of(ref, struct kioctx, users);
- struct kiocb *req;
+ struct aio_kiocb *req;
spin_lock_irq(&ctx->ctx_lock);
while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
- struct kiocb, ki_list);
+ struct aio_kiocb, ki_list);
list_del_init(&req->ki_list);
kiocb_cancel(req);
@@ -655,8 +697,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
nr_events *= 2;
/* Prevent overflows */
- if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
- (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
+ if (nr_events > (0x10000000U / sizeof(struct io_event))) {
pr_debug("ENOMEM: nr_events too high\n");
return ERR_PTR(-EINVAL);
}
@@ -727,6 +768,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
err_cleanup:
aio_nr_sub(ctx->max_reqs);
err_ctx:
+ atomic_set(&ctx->dead, 1);
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
aio_free_ring(ctx);
err:
mutex_unlock(&ctx->ring_lock);
@@ -744,15 +788,16 @@ err:
* the rapid destruction of the kioctx.
*/
static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
- struct completion *requests_done)
+ struct ctx_rq_wait *wait)
{
struct kioctx_table *table;
- if (atomic_xchg(&ctx->dead, 1))
+ spin_lock(&mm->ioctx_lock);
+ if (atomic_xchg(&ctx->dead, 1)) {
+ spin_unlock(&mm->ioctx_lock);
return -EINVAL;
+ }
-
- spin_lock(&mm->ioctx_lock);
table = rcu_dereference_raw(mm->ioctx_table);
WARN_ON(ctx != table->table[ctx->id]);
table->table[ctx->id] = NULL;
@@ -773,27 +818,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
- ctx->requests_done = requests_done;
+ ctx->rq_wait = wait;
percpu_ref_kill(&ctx->users);
return 0;
}
-/* wait_on_sync_kiocb:
- * Waits on the given sync kiocb to complete.
- */
-ssize_t wait_on_sync_kiocb(struct kiocb *req)
-{
- while (!req->ki_ctx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (req->ki_ctx)
- break;
- io_schedule();
- }
- __set_current_state(TASK_RUNNING);
- return req->ki_user_data;
-}
-EXPORT_SYMBOL(wait_on_sync_kiocb);
-
/*
* exit_aio: called when the last user of mm goes away. At this point, there is
* no way for any new requests to be submited or any of the io_* syscalls to be
@@ -805,18 +834,24 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
void exit_aio(struct mm_struct *mm)
{
struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
- int i;
+ struct ctx_rq_wait wait;
+ int i, skipped;
if (!table)
return;
+ atomic_set(&wait.count, table->nr);
+ init_completion(&wait.comp);
+
+ skipped = 0;
for (i = 0; i < table->nr; ++i) {
struct kioctx *ctx = table->table[i];
- struct completion requests_done =
- COMPLETION_INITIALIZER_ONSTACK(requests_done);
- if (!ctx)
+ if (!ctx) {
+ skipped++;
continue;
+ }
+
/*
* We don't need to bother with munmap() here - exit_mmap(mm)
* is coming and it'll unmap everything. And we simply can't,
@@ -825,10 +860,12 @@ void exit_aio(struct mm_struct *mm)
* that it needs to unmap the area, just set it to 0.
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx, &requests_done);
+ kill_ioctx(mm, ctx, &wait);
+ }
+ if (!atomic_sub_and_test(skipped, &wait.count)) {
/* Wait until all IO for the context are done. */
- wait_for_completion(&requests_done);
+ wait_for_completion(&wait.comp);
}
RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -948,9 +985,9 @@ static void user_refill_reqs_available(struct kioctx *ctx)
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
*/
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
- struct kiocb *req;
+ struct aio_kiocb *req;
if (!get_reqs_available(ctx)) {
user_refill_reqs_available(ctx);
@@ -971,10 +1008,10 @@ out_put:
return NULL;
}
-static void kiocb_free(struct kiocb *req)
+static void kiocb_free(struct aio_kiocb *req)
{
- if (req->ki_filp)
- fput(req->ki_filp);
+ if (req->common.ki_filp)
+ fput(req->common.ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
kmem_cache_free(kiocb_cachep, req);
@@ -1010,8 +1047,9 @@ out:
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-void aio_complete(struct kiocb *iocb, long res, long res2)
+static void aio_complete(struct kiocb *kiocb, long res, long res2)
{
+ struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
@@ -1025,13 +1063,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
* ref, no other paths have a way to get another ref
* - the sync task helpfully left a reference to itself in the iocb
*/
- if (is_sync_kiocb(iocb)) {
- iocb->ki_user_data = res;
- smp_wmb();
- iocb->ki_ctx = ERR_PTR(-EXDEV);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
+ BUG_ON(is_sync_kiocb(kiocb));
if (iocb->ki_list.next) {
unsigned long flags;
@@ -1057,7 +1089,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+ event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
event->data = iocb->ki_user_data;
event->res = res;
event->res2 = res2;
@@ -1066,7 +1098,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
res, res2);
/* after flagging the request as done, we
@@ -1113,7 +1145,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
percpu_ref_put(&ctx->reqs);
}
-EXPORT_SYMBOL(aio_complete);
/* aio_read_events_ring
* Pull an event off of the ioctx's event ring. Returns the number of
@@ -1313,15 +1344,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- struct completion requests_done =
- COMPLETION_INITIALIZER_ONSTACK(requests_done);
+ struct ctx_rq_wait wait;
int ret;
+ init_completion(&wait.comp);
+ atomic_set(&wait.count, 1);
+
/* Pass requests_done to kill_ioctx() where it can be set
* in a thread-safe way. If we try to set it here then we have
* a race condition if two io_destroy() called simultaneously.
*/
- ret = kill_ioctx(current->mm, ioctx, &requests_done);
+ ret = kill_ioctx(current->mm, ioctx, &wait);
percpu_ref_put(&ioctx->users);
/* Wait until all IO for the context are done. Otherwise kernel
@@ -1329,7 +1362,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
* is destroyed.
*/
if (!ret)
- wait_for_completion(&requests_done);
+ wait_for_completion(&wait.comp);
return ret;
}
@@ -1337,50 +1370,21 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
return -EINVAL;
}
-typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
-static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
- int rw, char __user *buf,
- unsigned long *nr_segs,
- struct iovec **iovec,
- bool compat)
+static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
+ struct iovec **iovec,
+ bool compat,
+ struct iov_iter *iter)
{
- ssize_t ret;
-
- *nr_segs = kiocb->ki_nbytes;
-
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(rw,
+ return compat_import_iovec(rw,
(struct compat_iovec __user *)buf,
- *nr_segs, UIO_FASTIOV, *iovec, iovec);
- else
+ len, UIO_FASTIOV, iovec, iter);
#endif
- ret = rw_copy_check_uvector(rw,
- (struct iovec __user *)buf,
- *nr_segs, UIO_FASTIOV, *iovec, iovec);
- if (ret < 0)
- return ret;
-
- /* ki_nbytes now reflect bytes instead of segs */
- kiocb->ki_nbytes = ret;
- return 0;
-}
-
-static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
- int rw, char __user *buf,
- unsigned long *nr_segs,
- struct iovec *iovec)
-{
- if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
- return -EFAULT;
-
- iovec->iov_base = buf;
- iovec->iov_len = kiocb->ki_nbytes;
- *nr_segs = 1;
- return 0;
+ return import_iovec(rw, (struct iovec __user *)buf,
+ len, UIO_FASTIOV, iovec, iter);
}
/*
@@ -1388,14 +1392,12 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
* Performs the initial checks and io submission.
*/
static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
- char __user *buf, bool compat)
+ char __user *buf, size_t len, bool compat)
{
struct file *file = req->ki_filp;
ssize_t ret;
- unsigned long nr_segs;
int rw;
fmode_t mode;
- aio_rw_op *rw_op;
rw_iter_op *iter_op;
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1405,7 +1407,6 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
case IOCB_CMD_PREADV:
mode = FMODE_READ;
rw = READ;
- rw_op = file->f_op->aio_read;
iter_op = file->f_op->read_iter;
goto rw_common;
@@ -1413,51 +1414,40 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
case IOCB_CMD_PWRITEV:
mode = FMODE_WRITE;
rw = WRITE;
- rw_op = file->f_op->aio_write;
iter_op = file->f_op->write_iter;
goto rw_common;
rw_common:
if (unlikely(!(file->f_mode & mode)))
return -EBADF;
- if (!rw_op && !iter_op)
+ if (!iter_op)
return -EINVAL;
- ret = (opcode == IOCB_CMD_PREADV ||
- opcode == IOCB_CMD_PWRITEV)
- ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
- &iovec, compat)
- : aio_setup_single_vector(req, rw, buf, &nr_segs,
- iovec);
+ if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+ ret = aio_setup_vectored_rw(rw, buf, len,
+ &iovec, compat, &iter);
+ else {
+ ret = import_single_range(rw, buf, len, iovec, &iter);
+ iovec = NULL;
+ }
if (!ret)
- ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ ret = rw_verify_area(rw, file, &req->ki_pos,
+ iov_iter_count(&iter));
if (ret < 0) {
- if (iovec != inline_vecs)
- kfree(iovec);
+ kfree(iovec);
return ret;
}
- req->ki_nbytes = ret;
-
- /* XXX: move/kill - rw_verify_area()? */
- /* This matches the pread()/pwrite() logic */
- if (req->ki_pos < 0) {
- ret = -EINVAL;
- break;
- }
+ len = ret;
if (rw == WRITE)
file_start_write(file);
- if (iter_op) {
- iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
- ret = iter_op(req, &iter);
- } else {
- ret = rw_op(req, iovec, nr_segs, req->ki_pos);
- }
+ ret = iter_op(req, &iter);
if (rw == WRITE)
file_end_write(file);
+ kfree(iovec);
break;
case IOCB_CMD_FDSYNC:
@@ -1479,9 +1469,6 @@ rw_common:
return -EINVAL;
}
- if (iovec != inline_vecs)
- kfree(iovec);
-
if (ret != -EIOCBQUEUED) {
/*
* There's no easy way to restart the syscall since other AIO's
@@ -1500,7 +1487,7 @@ rw_common:
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb, bool compat)
{
- struct kiocb *req;
+ struct aio_kiocb *req;
ssize_t ret;
/* enforce forwards compatibility on users */
@@ -1523,11 +1510,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!req))
return -EAGAIN;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp)) {
+ req->common.ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->common.ki_filp)) {
ret = -EBADF;
goto out_put_req;
}
+ req->common.ki_pos = iocb->aio_offset;
+ req->common.ki_complete = aio_complete;
+ req->common.ki_flags = iocb_flags(req->common.ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
@@ -1542,6 +1532,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_eventfd = NULL;
goto out_put_req;
}
+
+ req->common.ki_flags |= IOCB_EVENTFD;
}
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1550,13 +1542,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;
}
- req->ki_obj.user = user_iocb;
+ req->ki_user_iocb = user_iocb;
req->ki_user_data = iocb->aio_data;
- req->ki_pos = iocb->aio_offset;
- req->ki_nbytes = iocb->aio_nbytes;
- ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+ ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
(char __user *)(unsigned long)iocb->aio_buf,
+ iocb->aio_nbytes,
compat);
if (ret)
goto out_put_req;
@@ -1643,10 +1634,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
/* lookup_kiocb
* Finds a given iocb for cancellation.
*/
-static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
- u32 key)
+static struct aio_kiocb *
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
{
- struct list_head *pos;
+ struct aio_kiocb *kiocb;
assert_spin_locked(&ctx->ctx_lock);
@@ -1654,9 +1645,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
return NULL;
/* TODO: use a hash or array, this sucks. */
- list_for_each(pos, &ctx->active_reqs) {
- struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb)
+ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+ if (kiocb->ki_user_iocb == iocb)
return kiocb;
}
return NULL;
@@ -1676,7 +1666,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
struct kioctx *ctx;
- struct kiocb *kiocb;
+ struct aio_kiocb *kiocb;
u32 key;
int ret;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8e98cf954bab..c37149b929be 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -213,7 +213,7 @@ void autofs4_clean_ino(struct autofs_info *);
static inline int autofs_prepare_pipe(struct file *pipe)
{
- if (!pipe->f_op->write)
+ if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
@@ -235,12 +235,7 @@ static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
{
- return sbi->sb->s_root->d_inode->i_ino;
-}
-
-static inline int simple_positive(struct dentry *dentry)
-{
- return dentry->d_inode && !d_unhashed(dentry);
+ return d_inode(sbi->sb->s_root)->i_ino;
}
static inline void __autofs4_add_expiring(struct dentry *dentry)
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 11dd118f75e2..1cebc3c52fa5 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (dentry->d_inode && d_is_symlink(dentry)) {
+ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
DPRINTK("checking symlink %p %pd", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 1c55388ae633..a3ae0b2aeb5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -71,7 +71,7 @@ void autofs4_kill_sb(struct super_block *sb)
static int autofs4_show_options(struct seq_file *m, struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
- struct inode *root_inode = root->d_sb->s_root->d_inode;
+ struct inode *root_inode = d_inode(root->d_sb->s_root);
if (!sbi)
return 0;
@@ -352,8 +352,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
inode->i_mode = mode;
if (sb->s_root) {
- inode->i_uid = sb->s_root->d_inode->i_uid;
- inode->i_gid = sb->s_root->d_inode->i_gid;
+ inode->i_uid = d_inode(sb->s_root)->i_uid;
+ inode->i_gid = d_inode(sb->s_root)->i_gid;
}
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_ino = get_next_ino();
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 7e44fdd03e2d..c6d7d3dbd52a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -240,7 +240,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
spin_lock(&expiring->d_lock);
/* We've already been dentry_iput or unlinked */
- if (!expiring->d_inode)
+ if (d_really_is_negative(expiring))
goto next;
qstr = &expiring->d_name;
@@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
* having d_mountpoint() true, so there's no need to call back
* to the daemon.
*/
- if (dentry->d_inode && d_is_symlink(dentry)) {
+ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
spin_unlock(&sbi->fs_lock);
goto done;
}
@@ -459,7 +459,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
return 0;
if (d_mountpoint(dentry))
return 0;
- inode = ACCESS_ONCE(dentry->d_inode);
+ inode = d_inode_rcu(dentry);
if (inode && S_ISLNK(inode->i_mode))
return -EISDIR;
if (list_empty(&dentry->d_subdirs))
@@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* an incorrect ELOOP error return.
*/
if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
- (dentry->d_inode && d_is_symlink(dentry)))
+ (d_really_is_positive(dentry) && d_is_symlink(dentry)))
status = -EISDIR;
}
spin_unlock(&sbi->fs_lock);
@@ -625,8 +625,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
}
dput(ino->dentry);
- dentry->d_inode->i_size = 0;
- clear_nlink(dentry->d_inode);
+ d_inode(dentry)->i_size = 0;
+ clear_nlink(d_inode(dentry));
dir->i_mtime = CURRENT_TIME;
@@ -719,8 +719,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
atomic_dec(&p_ino->count);
}
dput(ino->dentry);
- dentry->d_inode->i_size = 0;
- clear_nlink(dentry->d_inode);
+ d_inode(dentry)->i_size = 0;
+ clear_nlink(d_inode(dentry));
if (dir->i_nlink)
drop_nlink(dir);
@@ -839,7 +839,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
*/
int is_autofs4_dentry(struct dentry *dentry)
{
- return dentry && dentry->d_inode &&
+ return dentry && d_really_is_positive(dentry) &&
dentry->d_op == &autofs4_dentry_operations &&
dentry->d_fsdata != NULL;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 1e8ea192be2b..da0c33481bc0 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,14 +12,13 @@
#include "autofs_i.h"
-static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
if (ino && !autofs4_oz_mode(sbi))
ino->last_used = jiffies;
- nd_set_link(nd, dentry->d_inode->i_private);
- return NULL;
+ return d_inode(dentry)->i_private;
}
const struct inode_operations autofs4_symlink_inode_operations = {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 116fd38ee472..35b755e79c2d 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -70,7 +70,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes &&
- (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
+ (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
data += wr;
bytes -= wr;
}
@@ -322,7 +322,7 @@ static int validate_request(struct autofs_wait_queue **wait,
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (dentry->d_inode && d_unhashed(dentry)) {
+ if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
new = d_lookup(parent, &dentry->d_name);
if (new)
@@ -364,7 +364,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
if (pid == 0 || tgid == 0)
return -ENOENT;
- if (!dentry->d_inode) {
+ if (d_really_is_negative(dentry)) {
/*
* A wait for a negative dentry is invalid for certain
* cases. A direct or offset mount "always" has its mount
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 3a7813ab8c95..35d19e8731e3 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -19,16 +19,16 @@ typedef u64 befs_blocknr_t;
* BeFS in memory structures
*/
-typedef struct befs_mount_options {
+struct befs_mount_options {
kgid_t gid;
kuid_t uid;
int use_gid;
int use_uid;
int debug;
char *iocharset;
-} befs_mount_options;
+};
-typedef struct befs_sb_info {
+struct befs_sb_info {
u32 magic1;
u32 block_size;
u32 block_shift;
@@ -52,12 +52,11 @@ typedef struct befs_sb_info {
befs_inode_addr indices;
u32 magic3;
- befs_mount_options mount_opts;
+ struct befs_mount_options mount_opts;
struct nls_table *nls;
+};
-} befs_sb_info;
-
-typedef struct befs_inode_info {
+struct befs_inode_info {
u32 i_flags;
u32 i_type;
@@ -71,8 +70,7 @@ typedef struct befs_inode_info {
} i_data;
struct inode vfs_inode;
-
-} befs_inode_info;
+};
enum befs_err {
BEFS_OK,
@@ -105,16 +103,16 @@ void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
/* Gets a pointer to the private portion of the super_block
* structure from the public part
*/
-static inline befs_sb_info *
+static inline struct befs_sb_info *
BEFS_SB(const struct super_block *super)
{
- return (befs_sb_info *) super->s_fs_info;
+ return (struct befs_sb_info *) super->s_fs_info;
}
-static inline befs_inode_info *
+static inline struct befs_inode_info *
BEFS_I(const struct inode *inode)
{
- return list_entry(inode, struct befs_inode_info, vfs_inode);
+ return container_of(inode, struct befs_inode_info, vfs_inode);
}
static inline befs_blocknr_t
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 0826e91dacda..22c166280883 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,8 +137,8 @@ static int
befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup)
{
- struct buffer_head *bh = NULL;
- befs_disk_btree_super *od_sup = NULL;
+ struct buffer_head *bh;
+ befs_disk_btree_super *od_sup;
befs_debug(sb, "---> %s", __func__);
@@ -250,7 +250,7 @@ int
befs_btree_find(struct super_block *sb, befs_data_stream * ds,
const char *key, befs_off_t * value)
{
- struct befs_btree_node *this_node = NULL;
+ struct befs_btree_node *this_node;
befs_btree_super bt_super;
befs_off_t node_off;
int res;
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 1e8e0b8d8836..ebd50718659f 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -168,7 +168,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
befs_blocknr_t blocks;
befs_blocknr_t datablocks; /* File data blocks */
befs_blocknr_t metablocks; /* FS metadata blocks */
- befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_debug(sb, "---> %s", __func__);
@@ -428,7 +428,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *indir_block;
befs_block_run indir_run;
befs_disk_inode_addr *iaddr_array = NULL;
- befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
data->max_indirect_range >> befs_sb->block_shift;
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 0408a3d601d0..7a5b4ec21c56 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -28,7 +28,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
{
struct buffer_head *bh = NULL;
befs_blocknr_t block = 0;
- befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
befs_debug(sb, "---> Enter %s "
"[%u, %hu, %hu]", __func__, iaddr.allocation_group,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e089f1985fca..46aedacfa6a8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,8 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static void *befs_follow_link(struct dentry *, struct nameidata *);
-static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
+static const char *befs_follow_link(struct dentry *, void **);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -51,7 +50,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
static void befs_put_super(struct super_block *);
static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
-static int parse_options(char *, befs_mount_options *);
+static int parse_options(char *, struct befs_mount_options *);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
@@ -80,11 +79,6 @@ static const struct address_space_operations befs_aops = {
.bmap = befs_bmap,
};
-static const struct inode_operations befs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = befs_fast_follow_link,
-};
-
static const struct inode_operations befs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = befs_follow_link,
@@ -304,9 +298,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
{
struct buffer_head *bh = NULL;
befs_inode *raw_inode = NULL;
-
- befs_sb_info *befs_sb = BEFS_SB(sb);
- befs_inode_info *befs_ino = NULL;
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_inode_info *befs_ino = NULL;
struct inode *inode;
long ret = -EIO;
@@ -404,10 +397,12 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &befs_dir_inode_operations;
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (befs_ino->i_flags & BEFS_LONG_SYMLINK)
+ if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
inode->i_op = &befs_symlink_inode_operations;
- else
- inode->i_op = &befs_fast_symlink_inode_operations;
+ } else {
+ inode->i_link = befs_ino->i_data.symlink;
+ inode->i_op = &simple_symlink_inode_operations;
+ }
} else {
befs_error(sb, "Inode %lu is not a regular file, "
"directory or symlink. THAT IS WRONG! BeFS has no "
@@ -468,43 +463,31 @@ befs_destroy_inodecache(void)
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static void *
-befs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *
+befs_follow_link(struct dentry *dentry, void **cookie)
{
struct super_block *sb = dentry->d_sb;
- befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
+ struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
char *link;
if (len == 0) {
befs_error(sb, "Long symlink with illegal length");
- link = ERR_PTR(-EIO);
- } else {
- befs_debug(sb, "Follow long symlink");
-
- link = kmalloc(len, GFP_NOFS);
- if (!link) {
- link = ERR_PTR(-ENOMEM);
- } else if (befs_read_lsymlink(sb, data, link, len) != len) {
- kfree(link);
- befs_error(sb, "Failed to read entire long symlink");
- link = ERR_PTR(-EIO);
- } else {
- link[len - 1] = '\0';
- }
+ return ERR_PTR(-EIO);
}
- nd_set_link(nd, link);
- return NULL;
-}
-
+ befs_debug(sb, "Follow long symlink");
-static void *
-befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- befs_inode_info *befs_ino = BEFS_I(dentry->d_inode);
- nd_set_link(nd, befs_ino->i_data.symlink);
- return NULL;
+ link = kmalloc(len, GFP_NOFS);
+ if (!link)
+ return ERR_PTR(-ENOMEM);
+ if (befs_read_lsymlink(sb, data, link, len) != len) {
+ kfree(link);
+ befs_error(sb, "Failed to read entire long symlink");
+ return ERR_PTR(-EIO);
+ }
+ link[len - 1] = '\0';
+ return *cookie = link;
}
/*
@@ -669,7 +652,7 @@ static const match_table_t befs_tokens = {
};
static int
-parse_options(char *options, befs_mount_options * opts)
+parse_options(char *options, struct befs_mount_options *opts)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -769,7 +752,7 @@ static int
befs_fill_super(struct super_block *sb, void *data, int silent)
{
struct buffer_head *bh;
- befs_sb_info *befs_sb;
+ struct befs_sb_info *befs_sb;
befs_super_block *disk_sb;
struct inode *root;
long ret = -EINVAL;
diff --git a/fs/befs/super.c b/fs/befs/super.c
index ca40f828f64d..aeafc4d84278 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -24,7 +24,7 @@
int
befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
{
- befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
/* Check the byte order of the filesystem */
if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
@@ -59,7 +59,7 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
int
befs_check_sb(struct super_block *sb)
{
- befs_sb_info *befs_sb = BEFS_SB(sb);
+ struct befs_sb_info *befs_sb = BEFS_SB(sb);
/* Check magic headers of super block */
if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 08063ae0a17c..3ec6113146c0 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -86,7 +86,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode = new_inode(s);
if (!inode)
- return -ENOSPC;
+ return -ENOMEM;
mutex_lock(&info->bfs_lock);
ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
if (ino > info->si_lasti) {
@@ -153,7 +153,7 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
static int bfs_link(struct dentry *old, struct inode *dir,
struct dentry *new)
{
- struct inode *inode = old->d_inode;
+ struct inode *inode = d_inode(old);
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
int err;
@@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
static int bfs_unlink(struct inode *dir, struct dentry *dentry)
{
int error = -ENOENT;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct buffer_head *bh;
struct bfs_dirent *de;
struct bfs_sb_info *info = BFS_SB(inode->i_sb);
@@ -216,7 +216,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int error = -ENOENT;
old_bh = new_bh = NULL;
- old_inode = old_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
if (S_ISDIR(old_inode->i_mode))
return -EINVAL;
@@ -231,7 +231,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
error = -EPERM;
- new_inode = new_dentry->d_inode;
+ new_inode = d_inode(new_dentry);
new_bh = bfs_find_entry(new_dir,
new_dentry->d_name.name,
new_dentry->d_name.len, &new_de);
@@ -293,7 +293,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
for (block = sblock; block <= eblock; block++) {
bh = sb_bread(dir->i_sb, block);
if (!bh)
- return -ENOSPC;
+ return -EIO;
for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) {
de = (struct bfs_dirent *)(bh->b_data + off);
if (!de->ino) {
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index e7f88ace1a25..97f1b5160155 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -23,9 +23,7 @@
const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 90bc079d9982..fdcb4d69f430 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include "bfs.h"
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 995986b8e36b..6b659967898e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
#include <linux/security.h>
#include <linux/random.h>
#include <linux/elf.h>
+#include <linux/elf-randomize.h>
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
@@ -862,6 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
int elf_prot = 0, elf_flags;
unsigned long k, vaddr;
+ unsigned long total_size = 0;
if (elf_ppnt->p_type != PT_LOAD)
continue;
@@ -909,25 +911,20 @@ static int load_elf_binary(struct linux_binprm *bprm)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
- /* Memory randomization might have been switched off
- * in runtime via sysctl or explicit setting of
- * personality flags.
- * If that is the case, retain the original non-zero
- * load_bias value in order to establish proper
- * non-randomized mappings.
- */
+ load_bias = ELF_ET_DYN_BASE - vaddr;
if (current->flags & PF_RANDOMIZE)
- load_bias = 0;
- else
- load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#else
- load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#endif
+ load_bias += arch_mmap_rnd();
+ load_bias = ELF_PAGESTART(load_bias);
+ total_size = total_mapping_size(elf_phdata,
+ loc->elf_ex.e_phnum);
+ if (!total_size) {
+ retval = -EINVAL;
+ goto out_free_dentry;
+ }
}
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
- elf_prot, elf_flags, 0);
+ elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR((void *)error) ?
PTR_ERR((void*)error) : -EINVAL;
@@ -1053,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
current->mm->end_data = end_data;
current->mm->start_stack = bprm->p;
-#ifdef arch_randomize_brk
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
current->mm->brk = current->mm->start_brk =
arch_randomize_brk(current->mm);
-#ifdef CONFIG_COMPAT_BRK
+#ifdef compat_brk_randomized
current->brk_randomized = 1;
#endif
}
-#endif
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
@@ -1535,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note)
file = vma->vm_file;
if (!file)
continue;
- filename = d_path(&file->f_path, name_curpos, remaining);
+ filename = file_path(file, name_curpos, remaining);
if (IS_ERR(filename)) {
if (PTR_ERR(filename) == -ENAMETOOLONG) {
vfree(data);
@@ -1545,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note)
continue;
}
- /* d_path() fills at the end, move name down */
+ /* file_path() fills at the end, move name down */
/* n = strlen(filename) + 1: */
n = (name_curpos + remaining) - filename;
remaining = filename - name_curpos;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 97aff2879cda..78f005f37847 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -9,6 +9,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
@@ -521,9 +522,8 @@ static int parse_command(const char __user *buffer, size_t count)
static void entry_status(Node *e, char *page)
{
- char *dp;
- char *status = "disabled";
- const char *flags = "flags: ";
+ char *dp = page;
+ const char *status = "disabled";
if (test_bit(Enabled, &e->flags))
status = "enabled";
@@ -533,12 +533,10 @@ static void entry_status(Node *e, char *page)
return;
}
- sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter);
- dp = page + strlen(page);
+ dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter);
/* print the special flags */
- sprintf(dp, "%s", flags);
- dp += strlen(flags);
+ dp += sprintf(dp, "flags: ");
if (e->flags & MISC_FMT_PRESERVE_ARGV0)
*dp++ = 'P';
if (e->flags & MISC_FMT_OPEN_BINARY)
@@ -550,21 +548,11 @@ static void entry_status(Node *e, char *page)
if (!test_bit(Magic, &e->flags)) {
sprintf(dp, "extension .%s\n", e->magic);
} else {
- int i;
-
- sprintf(dp, "offset %i\nmagic ", e->offset);
- dp = page + strlen(page);
- for (i = 0; i < e->size; i++) {
- sprintf(dp, "%02x", 0xff & (int) (e->magic[i]));
- dp += 2;
- }
+ dp += sprintf(dp, "offset %i\nmagic ", e->offset);
+ dp = bin2hex(dp, e->magic, e->size);
if (e->mask) {
- sprintf(dp, "\nmask ");
- dp += 6;
- for (i = 0; i < e->size; i++) {
- sprintf(dp, "%02x", 0xff & (int) (e->mask[i]));
- dp += 2;
- }
+ dp += sprintf(dp, "\nmask ");
+ dp = bin2hex(dp, e->mask, e->size);
}
*dp++ = '\n';
*dp = '\0';
@@ -603,7 +591,7 @@ static void kill_node(Node *e)
write_unlock(&entries_lock);
if (dentry) {
- drop_nlink(dentry->d_inode);
+ drop_nlink(d_inode(dentry));
d_drop(dentry);
dput(dentry);
simple_release_fs(&bm_mnt, &entry_count);
@@ -650,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
kill_node(e);
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
dput(root);
break;
default:
@@ -687,14 +675,14 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
root = dget(sb->s_root);
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out;
err = -EEXIST;
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
goto out2;
inode = bm_get_inode(sb, S_IFREG | 0644);
@@ -723,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
out2:
dput(dentry);
out:
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
dput(root);
if (err) {
@@ -766,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
while (!list_empty(&entries))
kill_node(list_entry(entries.next, Node, list));
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
dput(root);
break;
default:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..198243717da5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -14,6 +14,7 @@
#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/magic.h>
@@ -27,7 +28,6 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
-#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode)
return container_of(inode, struct bdev_inode, vfs_inode);
}
-inline struct block_device *I_BDEV(struct inode *inode)
+struct block_device *I_BDEV(struct inode *inode)
{
return &BDEV_I(inode)->bdev;
}
@@ -147,15 +147,17 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
}
static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
- offset, blkdev_get_block,
- NULL, NULL, 0);
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+ NULL, DIO_SKIP_DIO_COUNT);
+ return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+ blkdev_get_block, NULL, NULL,
+ DIO_SKIP_DIO_COUNT);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -378,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
}
@@ -409,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
- if (!ops->rw_page)
+ if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
@@ -444,6 +446,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
long avail;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ /*
+ * The device driver is allowed to sleep, in order to make the
+ * memory directly accessible.
+ */
+ might_sleep();
+
if (size < 0)
return size;
if (!ops->direct_access)
@@ -548,7 +556,8 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
@@ -689,11 +698,6 @@ static struct block_device *bd_acquire(struct inode *inode)
return bdev;
}
-int sb_is_blkdev_sb(struct super_block *sb)
-{
- return sb == blockdev_superblock;
-}
-
/* Call when you free inode */
void bd_forget(struct inode *inode)
@@ -1175,6 +1179,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
+ bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
@@ -1598,9 +1603,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = file->f_mapping->host;
+ loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
+ if (bdev_read_only(I_BDEV(bd_inode)))
+ return -EPERM;
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if (iocb->ki_pos >= size)
+ return -ENOSPC;
+
+ iov_iter_truncate(from, size - iocb->ki_pos);
+
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0) {
@@ -1660,8 +1678,6 @@ const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
@@ -1708,7 +1724,7 @@ struct block_device *lookup_bdev(const char *pathname)
if (error)
return ERR_PTR(error);
- inode = path.dentry->d_inode;
+ inode = d_backing_inode(path.dentry);
error = -ENOTBLK;
if (!S_ISBLK(inode->i_mode))
goto fail;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..1ce06c849a86 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,9 +85,10 @@ BTRFS_WORK_HELPER(extent_refs_helper);
BTRFS_WORK_HELPER(scrub_helper);
BTRFS_WORK_HELPER(scrubwrc_helper);
BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -132,7 +133,7 @@ static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
- int flags,
+ unsigned int flags,
int max_active,
int thresh)
{
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e386c29ef1f6..b0b093b6afec 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,9 +64,11 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
BTRFS_WORK_HELPER_PROTO(scrub_helper);
BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
- int flags,
+ unsigned int flags,
int max_active,
int thresh);
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f55721ff9385..802fabb30e15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
* the first item to check. But sometimes, we may enter it with
* slot==nritems. In that case, go to the next leaf before we continue.
*/
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
- ret = btrfs_next_old_leaf(root, path, time_seq);
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_leaf(root, path);
+ else
+ ret = btrfs_next_old_leaf(root, path, time_seq);
+ }
while (!ret && count < total_refs) {
eb = path->nodes[0];
@@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- ret = btrfs_next_old_item(root, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_next_item(root, path);
+ else
+ ret = btrfs_next_old_item(root, path, time_seq);
}
if (ret > 0)
@@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
+ else if (time_seq == (u64)-1)
+ root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
}
path->lowest_level = level;
- ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+ if (time_seq == (u64)-1)
+ ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+ 0, 0);
+ else
+ ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+ time_seq);
/* root node has been locked, we can release @subvol_srcu safely here */
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(!ref->wanted_disk_byte);
eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
}
/*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
*
* mode = 1: merge identical keys, if key is set
* FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode)
ref2 = list_entry(pos2, struct __prelim_ref, list);
+ if (!ref_for_same_block(ref1, ref2))
+ continue;
if (mode == 1) {
- if (!ref_for_same_block(ref1, ref2))
- continue;
if (!ref1->parent && ref2->parent) {
xchg = ref1;
ref1 = ref2;
@@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct list_head *prefs, u64 *total_refs,
u64 inum)
{
+ struct btrfs_delayed_ref_node *node;
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
- struct rb_node *n = &head->node.rb_node;
struct btrfs_key key;
struct btrfs_key op_key = {0};
int sgn;
@@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
spin_lock(&head->lock);
- n = rb_first(&head->ref_root);
- while (n) {
- struct btrfs_delayed_ref_node *node;
- node = rb_entry(n, struct btrfs_delayed_ref_node,
- rb_node);
- n = rb_next(n);
+ list_for_each_entry(node, &head->ref_list, list) {
if (node->seq > seq)
continue;
@@ -880,6 +891,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
+ * NOTE: This can return values > 0
+ *
+ * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * much like trans == NULL case, the difference only lies in it will not
+ * commit root.
+ * The special case is for qgroup to search roots in commit_transaction().
+ *
* FIXME some caching might speed things up
*/
static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -918,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
+ if (time_seq == (u64)-1)
+ path->skip_locking = 1;
+
/*
* grab both a lock on the path and a lock on the delayed ref head.
* We need both to get a consistent picture of how the refs look
@@ -932,9 +953,10 @@ again:
BUG_ON(ret == 0);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (trans && likely(trans->type != __TRANS_DUMMY)) {
+ if (trans && likely(trans->type != __TRANS_DUMMY) &&
+ time_seq != (u64)-1) {
#else
- if (trans) {
+ if (trans && time_seq != (u64)-1) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1032,7 +1054,10 @@ again:
eb = read_tree_block(fs_info->extent_root,
ref->parent, 0);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
@@ -1198,6 +1223,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
+/**
+ * btrfs_check_shared - tell us whether an extent is shared
+ *
+ * @trans: optional trans handle
+ *
+ * btrfs_check_shared uses the backref walking code but will short
+ * circuit as soon as it finds a root or inode that doesn't match the
+ * one passed in. This provides a significant performance benefit for
+ * callers (such as fiemap) which want to know whether the extent is
+ * shared but do not need a ref count.
+ *
+ * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
+ */
int btrfs_check_shared(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 root_objectid,
u64 inum, u64 bytenr)
@@ -1206,7 +1244,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
struct ulist *roots = NULL;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = {};
+ struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
tmp = ulist_alloc(GFP_NOFS);
@@ -1226,11 +1264,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, root_objectid, inum);
if (ret == BACKREF_FOUND_SHARED) {
+ /* this is the only condition under which we return 1 */
ret = 1;
break;
}
if (ret < 0 && ret != -ENOENT)
break;
+ ret = 0;
node = ulist_next(tmp, &uiter);
if (!node)
break;
@@ -1610,7 +1650,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..81220b2203c6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
+/* DIO is ready to submit */
+#define BTRFS_INODE_DIO_READY 12
/*
* The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an
@@ -66,7 +68,11 @@ struct btrfs_inode {
*/
struct btrfs_key location;
- /* Lock for counters */
+ /*
+ * Lock for counters and all fields used to determine if the inode is in
+ * the log or not (last_trans, last_sub_trans, last_log_commit,
+ * logged_trans).
+ */
spinlock_t lock;
/* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +256,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
{
+ int ret = 0;
+
+ spin_lock(&BTRFS_I(inode)->lock);
if (BTRFS_I(inode)->logged_trans == generation &&
BTRFS_I(inode)->last_sub_trans <=
BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +272,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
*/
smp_mb();
if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
- return 1;
+ ret = 1;
}
- return 0;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return ret;
}
#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..ce7dec88f4b8 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
(unsigned long long)bio->bi_iter.bi_sector,
dev_bytenr, bio->bi_bdev);
- mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
- GFP_NOFS);
+ mapped_datav = kmalloc_array(bio->bi_vcnt,
+ sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
@@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root,
mutex_unlock(&btrfsic_mutex);
- if (is_vmalloc_addr(state))
- vfree(state);
- else
- kfree(state);
+ kvfree(state);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..ce62324c78e7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->orig_bio = bio;
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
- cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
+ cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
if (!cb->compressed_pages)
goto fail1;
@@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
-static struct btrfs_compress_op *btrfs_compress_op[] = {
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
&btrfs_lzo_compress,
};
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d181f70caae0..13a4dc0436c9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -77,7 +77,7 @@ struct btrfs_compress_op {
size_t srclen, size_t destlen);
};
-extern struct btrfs_compress_op btrfs_zlib_compress;
-extern struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d67f32e648d..54114b4887dd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
if (!tree_mod_need_log(fs_info, eb))
return 0;
- tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
if (!tm_list)
return -ENOMEM;
@@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
- tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
flags);
if (!tm_list) {
ret = -ENOMEM;
@@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
return 0;
- tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list)
return -ENOMEM;
@@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
return 0;
nritems = btrfs_header_nritems(eb);
- tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
- GFP_NOFS);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
@@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, buf, 1);
BUG_ON(ret); /* -ENOMEM */
}
- clean_tree_block(trans, root, buf);
+ clean_tree_block(trans, root->fs_info, buf);
*last_ref = 1;
}
return 0;
@@ -1440,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(root, logical, 0);
- if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
- free_extent_buffer(old);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
btrfs_warn(root->fs_info,
"failed to read tree block %llu from get_old_root", logical);
} else {
@@ -1678,7 +1678,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
continue;
}
- cur = btrfs_find_tree_block(root, blocknr);
+ cur = btrfs_find_tree_block(root->fs_info, blocknr);
if (cur)
uptodate = btrfs_buffer_uptodate(cur, gen, 0);
else
@@ -1686,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur || !uptodate) {
if (!cur) {
cur = read_tree_block(root, blocknr, gen);
- if (!cur || !extent_buffer_uptodate(cur)) {
+ if (IS_ERR(cur)) {
+ return PTR_ERR(cur);
+ } else if (!extent_buffer_uptodate(cur)) {
free_extent_buffer(cur);
return -EIO;
}
@@ -1865,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (eb && !extent_buffer_uptodate(eb)) {
- free_extent_buffer(eb);
+ if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+ if (!IS_ERR(eb))
+ free_extent_buffer(eb);
eb = NULL;
}
@@ -1943,7 +1946,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
path->nodes[level] = NULL;
- clean_tree_block(trans, root, mid);
+ clean_tree_block(trans, root->fs_info, mid);
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
@@ -1997,7 +2000,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- clean_tree_block(trans, root, right);
+ clean_tree_block(trans, root->fs_info, right);
btrfs_tree_unlock(right);
del_ptr(root, path, level + 1, pslot + 1);
root_sub_used(root, right->len);
@@ -2041,7 +2044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- clean_tree_block(trans, root, mid);
+ clean_tree_block(trans, root->fs_info, mid);
btrfs_tree_unlock(mid);
del_ptr(root, path, level + 1, pslot);
root_sub_used(root, mid->len);
@@ -2259,7 +2262,7 @@ static void reada_for_search(struct btrfs_root *root,
search = btrfs_node_blockptr(node, slot);
blocksize = root->nodesize;
- eb = btrfs_find_tree_block(root, search);
+ eb = btrfs_find_tree_block(root->fs_info, search);
if (eb) {
free_extent_buffer(eb);
return;
@@ -2319,7 +2322,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
gen = btrfs_node_ptr_generation(parent, slot - 1);
- eb = btrfs_find_tree_block(root, block1);
+ eb = btrfs_find_tree_block(root->fs_info, block1);
/*
* if we get -eagain from btrfs_buffer_uptodate, we
* don't want to return eagain here. That will loop
@@ -2332,7 +2335,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1);
- eb = btrfs_find_tree_block(root, block2);
+ eb = btrfs_find_tree_block(root->fs_info, block2);
if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
block2 = 0;
free_extent_buffer(eb);
@@ -2450,7 +2453,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
- tmp = btrfs_find_tree_block(root, blocknr);
+ tmp = btrfs_find_tree_block(root->fs_info, blocknr);
if (tmp) {
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2495,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, 0);
- if (tmp) {
+ if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
@@ -3126,7 +3129,8 @@ again:
* higher levels
*
*/
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_disk_key *key, int level)
{
int i;
@@ -3137,7 +3141,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+ tree_mod_log_set_node_key(fs_info, t, tslot, 1);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -3151,7 +3155,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
* This function isn't completely safe. It's the caller's responsibility
* that the new key won't break the order
*/
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_key *new_key)
{
struct btrfs_disk_key disk_key;
@@ -3173,7 +3178,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_item_key(eb, &disk_key, slot);
btrfs_mark_buffer_dirty(eb);
if (slot == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
/*
@@ -3692,7 +3697,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
else
- clean_tree_block(trans, root, left);
+ clean_tree_block(trans, root->fs_info, left);
btrfs_mark_buffer_dirty(right);
@@ -3704,7 +3709,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (path->slots[0] >= left_nritems) {
path->slots[0] -= left_nritems;
if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root, path->nodes[0]);
+ clean_tree_block(trans, root->fs_info, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = right;
@@ -3928,10 +3933,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
if (right_nritems)
btrfs_mark_buffer_dirty(right);
else
- clean_tree_block(trans, root, right);
+ clean_tree_block(trans, root->fs_info, right);
btrfs_item_key(right, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
@@ -4168,6 +4173,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int mid;
int slot;
struct extent_buffer *right;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int wret;
int split;
@@ -4271,10 +4277,10 @@ again:
btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(right, root->root_key.objectid);
btrfs_set_header_level(right, 0);
- write_extent_buffer(right, root->fs_info->fsid,
+ write_extent_buffer(right, fs_info->fsid,
btrfs_header_fsid(), BTRFS_FSID_SIZE);
- write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+ write_extent_buffer(right, fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
@@ -4297,7 +4303,7 @@ again:
path->nodes[0] = right;
path->slots[0] = 0;
if (path->slots[1] == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(fs_info, path, &disk_key, 1);
}
btrfs_mark_buffer_dirty(right);
return ret;
@@ -4615,7 +4621,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
btrfs_set_item_key(leaf, &disk_key, slot);
if (slot == 0)
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
item = btrfs_item_nr(slot);
@@ -4716,7 +4722,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
btrfs_unlock_up_safe(path, 1);
@@ -4888,7 +4894,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_disk_key disk_key;
btrfs_node_key(parent, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, level + 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
}
btrfs_mark_buffer_dirty(parent);
}
@@ -4981,7 +4987,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_level(leaf, 0);
} else {
btrfs_set_path_blocking(path);
- clean_tree_block(trans, root, leaf);
+ clean_tree_block(trans, root->fs_info, leaf);
btrfs_del_leaf(trans, root, path, leaf);
}
} else {
@@ -4990,7 +4996,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_disk_key disk_key;
btrfs_item_key(leaf, &disk_key, 0);
- fixup_low_keys(root, path, &disk_key, 1);
+ fixup_low_keys(root->fs_info, path, &disk_key, 1);
}
/* delete the leaf if it is mostly empty */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9c89cae39ee..aac314e14188 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -1061,6 +1061,12 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+#define BTRFS_QGROUP_LEVEL_SHIFT 48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+ return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
/*
* is subvolume quota turned on?
*/
@@ -1256,6 +1262,20 @@ struct btrfs_caching_control {
atomic_t count;
};
+struct btrfs_io_ctl {
+ void *cur, *orig;
+ struct page *page;
+ struct page **pages;
+ struct btrfs_root *root;
+ struct inode *inode;
+ unsigned long size;
+ int index;
+ int num_pages;
+ int entries;
+ int bitmaps;
+ unsigned check_crcs:1;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache {
/* For dirty block groups */
struct list_head dirty_list;
+ struct list_head io_list;
+
+ struct btrfs_io_ctl io_ctl;
};
/* delayed seq elem */
@@ -1329,6 +1352,8 @@ struct seq_list {
u64 seq;
};
+#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -1472,6 +1497,12 @@ struct btrfs_fs_info {
struct mutex chunk_mutex;
struct mutex volume_mutex;
+ /*
+ * this is taken to make sure we don't set block groups ro after
+ * the free space cache has been allocated on them
+ */
+ struct mutex ro_block_group_mutex;
+
/* this is used during read/modify/write to make sure
* no two ios are trying to mod the same stripe at the same
* time
@@ -1513,6 +1544,7 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
+ struct rw_semaphore delayed_iput_sem;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1587,10 +1619,7 @@ struct btrfs_fs_info {
struct task_struct *cleaner_kthread;
int thread_pool_size;
- struct kobject super_kobj;
struct kobject *space_info_kobj;
- struct kobject *device_dir_kobj;
- struct completion kobj_unregister;
int do_barriers;
int closing;
int log_root_recovering;
@@ -1666,6 +1695,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_parity_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1703,7 +1733,7 @@ struct btrfs_fs_info {
/* list of dirty qgroups to be written at next commit */
struct list_head dirty_qgroups;
- /* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+ /* used by qgroup for an efficient tree traversal */
u64 qgroup_seq;
/* qgroup rescan items */
@@ -1748,6 +1778,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
+ struct mutex delete_unused_bgs_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -3295,6 +3326,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
}
/* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
@@ -3385,6 +3419,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset, int no_quota);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -3417,10 +3453,11 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode);
void btrfs_orphan_release_metadata(struct inode *inode);
@@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
@@ -3477,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ const u64 type);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3486,7 +3527,8 @@ int btrfs_previous_item(struct btrfs_root *root,
int type);
int btrfs_previous_extent_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
@@ -4011,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#ifdef CONFIG_BTRFS_ASSERT
+__cold
static inline void assfail(char *expr, char *file, int line)
{
pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4026,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line)
#define btrfs_assert()
__printf(5, 6)
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno);
@@ -4072,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
*/
-
#define btrfs_abort_transaction(trans, root, errno) \
do { \
- __btrfs_abort_transaction(trans, root, __func__, \
- __LINE__, errno); \
+ /* Report first abort since mount */ \
+ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
+ &((root)->fs_info->fs_state))) { \
+ WARN(1, KERN_DEBUG \
+ "BTRFS: Transaction aborted (error %d)\n", \
+ (errno)); \
+ } \
+ __btrfs_abort_transaction((trans), (root), __func__, \
+ __LINE__, (errno)); \
} while (0)
#define btrfs_std_error(fs_info, errno) \
@@ -4093,6 +4144,7 @@ do { \
} while (0)
__printf(5, 6)
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
@@ -4180,7 +4232,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
- (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+ !btrfs_qgroup_level(rootid)))
return 1;
return 0;
}
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 82f0c7c95474..a2ae42720a6a 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1383,7 +1383,7 @@ out:
static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
- struct btrfs_root *root, int nr)
+ struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_async_delayed_work *async_work;
@@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
btrfs_async_run_delayed_root, NULL, NULL);
async_work->nr = nr;
- btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
return 0;
}
@@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
void btrfs_balance_delayed_items(struct btrfs_root *root)
{
struct btrfs_delayed_root *delayed_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
delayed_root = btrfs_get_delayed_root(root);
@@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
seq = atomic_read(&delayed_root->items_seq);
- ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+ ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
if (ret)
return;
@@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
return;
}
- btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+ btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
}
/* Will return 0 or -ENOMEM */
@@ -1801,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
+ BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
+
inode->i_version = btrfs_stack_inode_sequence(inode_item);
inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6d16bea94e1c..ac3e81da6d4e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
+#include "qgroup.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
return 0;
}
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
- struct btrfs_delayed_ref_node *ref1,
- bool compare_seq)
-{
- if (ref1->bytenr < ref2->bytenr)
- return -1;
- if (ref1->bytenr > ref2->bytenr)
- return 1;
- if (ref1->is_head && ref2->is_head)
- return 0;
- if (ref2->is_head)
- return -1;
- if (ref1->is_head)
- return 1;
- if (ref1->type < ref2->type)
- return -1;
- if (ref1->type > ref2->type)
- return 1;
- if (ref1->no_quota > ref2->no_quota)
- return 1;
- if (ref1->no_quota < ref2->no_quota)
- return -1;
- /* merging of sequenced refs is not allowed */
- if (compare_seq) {
- if (ref1->seq < ref2->seq)
- return -1;
- if (ref1->seq > ref2->seq)
- return 1;
- }
- if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
- btrfs_delayed_node_to_tree_ref(ref1),
- ref1->type);
- } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
- ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
- return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
- btrfs_delayed_node_to_data_ref(ref1));
- }
- BUG();
- return 0;
-}
-
-/*
- * insert a new ref into the rbtree. This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_node *entry;
- struct btrfs_delayed_ref_node *ins;
- int cmp;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
- rb_node);
-
- cmp = comp_entry(entry, ins, 1);
- if (cmp < 0)
- p = &(*p)->rb_left;
- else if (cmp > 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
- return NULL;
-}
-
/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
struct rb_node *node)
@@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
rb_erase(&head->href_node, &delayed_refs->href_root);
} else {
assert_spin_locked(&head->lock);
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
}
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
@@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
-static int merge_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *ref, u64 seq)
-{
- struct rb_node *node;
- int mod = 0;
- int done = 0;
-
- node = rb_next(&ref->rb_node);
- while (!done && node) {
- struct btrfs_delayed_ref_node *next;
-
- next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
- if (seq && next->seq >= seq)
- break;
- if (comp_entry(ref, next, 0))
- continue;
-
- if (ref->action == next->action) {
- mod = next->ref_mod;
- } else {
- if (ref->ref_mod < next->ref_mod) {
- struct btrfs_delayed_ref_node *tmp;
-
- tmp = ref;
- ref = next;
- next = tmp;
- done = 1;
- }
- mod = -next->ref_mod;
- }
-
- drop_delayed_ref(trans, delayed_refs, head, next);
- ref->ref_mod += mod;
- if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, head, ref);
- done = 1;
- } else {
- /*
- * You can't have multiples of the same ref on a tree
- * block.
- */
- WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
- ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
- }
- }
- return done;
-}
-
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- struct rb_node *node;
- u64 seq = 0;
-
- assert_spin_locked(&head->lock);
- /*
- * We don't have too much refs to merge in the case of delayed data
- * refs.
- */
- if (head->is_data)
- return;
-
- spin_lock(&fs_info->tree_mod_seq_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- spin_unlock(&fs_info->tree_mod_seq_lock);
-
- node = rb_first(&head->ref_root);
- while (node) {
- struct btrfs_delayed_ref_node *ref;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- /* We can't merge refs that are outside of our seq count */
- if (seq && ref->seq >= seq)
- break;
- if (merge_ref(trans, delayed_refs, head, ref, seq))
- node = rb_first(&head->ref_root);
- else
- node = rb_next(&ref->rb_node);
- }
-}
-
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -443,45 +270,71 @@ again:
}
/*
- * helper function to update an extent delayed ref in the
- * rbtree. existing and update must both have the same
- * bytenr and parent
+ * Helper to insert the ref_node to the tail or merge with tail.
*
- * This may free existing if the update cancels out whatever
- * operation it was doing.
+ * Return 0 for insert.
+ * Return >0 for merge.
*/
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head,
- struct btrfs_delayed_ref_node *existing,
- struct btrfs_delayed_ref_node *update)
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *root,
+ struct btrfs_delayed_ref_head *href,
+ struct btrfs_delayed_ref_node *ref)
{
- if (update->action != existing->action) {
- /*
- * this is effectively undoing either an add or a
- * drop. We decrement the ref_mod, and if it goes
- * down to zero we just delete the entry without
- * every changing the extent allocation tree.
- */
- existing->ref_mod--;
- if (existing->ref_mod == 0)
- drop_delayed_ref(trans, delayed_refs, head, existing);
- else
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ struct btrfs_delayed_ref_node *exist;
+ int mod;
+ int ret = 0;
+
+ spin_lock(&href->lock);
+ /* Check whether we can merge the tail node with ref */
+ if (list_empty(&href->ref_list))
+ goto add_tail;
+ exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+ list);
+ /* No need to compare bytenr nor is_head */
+ if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+ exist->seq != ref->seq)
+ goto add_tail;
+
+ if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+ btrfs_delayed_node_to_tree_ref(ref),
+ ref->type))
+ goto add_tail;
+ if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+ btrfs_delayed_node_to_data_ref(ref)))
+ goto add_tail;
+
+ /* Now we are sure we can merge */
+ ret = 1;
+ if (exist->action == ref->action) {
+ mod = ref->ref_mod;
} else {
- WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
- existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
- /*
- * the action on the existing ref matches
- * the action on the ref we're trying to add.
- * Bump the ref_mod by one so the backref that
- * is eventually added/removed has the correct
- * reference count
- */
- existing->ref_mod += update->ref_mod;
+ /* Need to change action */
+ if (exist->ref_mod < ref->ref_mod) {
+ exist->action = ref->action;
+ mod = -exist->ref_mod;
+ exist->ref_mod = ref->ref_mod;
+ } else
+ mod = -ref->ref_mod;
}
+ exist->ref_mod += mod;
+
+ /* remove existing tail if its ref_mod is zero */
+ if (exist->ref_mod == 0)
+ drop_delayed_ref(trans, root, href, exist);
+ spin_unlock(&href->lock);
+ return ret;
+
+add_tail:
+ list_add_tail(&ref->list, &href->ref_list);
+ atomic_inc(&root->num_entries);
+ trans->delayed_ref_updates++;
+ spin_unlock(&href->lock);
+ return ret;
}
/*
@@ -489,11 +342,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
* existing and update must have the same bytenr
*/
static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
struct btrfs_delayed_ref_node *update)
{
struct btrfs_delayed_ref_head *existing_ref;
struct btrfs_delayed_ref_head *ref;
+ int old_ref_mod;
existing_ref = btrfs_delayed_node_to_head(existing);
ref = btrfs_delayed_node_to_head(update);
@@ -541,7 +396,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* only need the lock for this case cause we could be processing it
* currently, for refs we just added we know we're a-ok.
*/
+ old_ref_mod = existing_ref->total_ref_mod;
existing->ref_mod += update->ref_mod;
+ existing_ref->total_ref_mod += update->ref_mod;
+
+ /*
+ * If we are going to from a positive ref mod to a negative or vice
+ * versa we need to make sure to adjust pending_csums accordingly.
+ */
+ if (existing_ref->is_data) {
+ if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+ delayed_refs->pending_csums -= existing->num_bytes;
+ if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+ delayed_refs->pending_csums += existing->num_bytes;
+ }
spin_unlock(&existing_ref->lock);
}
@@ -553,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref, u64 bytenr,
- u64 num_bytes, int action, int is_data)
+ struct btrfs_delayed_ref_node *ref,
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr, u64 num_bytes, int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -603,8 +473,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
- head_ref->ref_root = RB_ROOT;
+ INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
+ head_ref->total_ref_mod = count_mod;
+
+ /* Record qgroup extent info if provided */
+ if (qrecord) {
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qrecord);
+ if (qexisting)
+ kfree(qrecord);
+ }
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
@@ -614,7 +497,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
- update_existing_head_ref(&existing->node, ref);
+ update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
@@ -622,6 +505,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ if (is_data && count_mod < 0)
+ delayed_refs->pending_csums += num_bytes;
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
atomic_inc(&delayed_refs->num_entries);
@@ -641,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, int level,
int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -675,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ /*
+ * XXX: memory should be freed at the same level allocated.
+ * But bad practice is anywhere... Follow it now. Need cleanup.
+ */
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -703,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
u64 offset, int action, int no_quota)
{
- struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
u64 seq = 0;
+ int ret;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -740,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(ref, full_ref, action);
- spin_lock(&head_ref->lock);
- existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
- if (existing) {
- update_existing_ref(trans, delayed_refs, head_ref, existing,
- ref);
- /*
- * we've updated the existing ref, free the newly
- * allocated ref
- */
+ ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+ if (ret > 0)
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
- } else {
- atomic_inc(&delayed_refs->num_entries);
- trans->delayed_ref_updates++;
- }
- spin_unlock(&head_ref->lock);
}
/*
@@ -772,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -782,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
- if (!head_ref) {
- kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
- return -ENOMEM;
+ if (!head_ref)
+ goto free_ref;
+
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record)
+ goto free_head_ref;
}
head_ref->extent_op = extent_op;
@@ -796,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -805,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
spin_unlock(&delayed_refs->lock);
return 0;
+
+free_head_ref:
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ return -ENOMEM;
}
/*
@@ -821,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_qgroup_extent_record *record = NULL;
if (!is_fstree(ref_root) || !fs_info->quota_enabled)
no_quota = 0;
@@ -836,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return -ENOMEM;
}
+ if (fs_info->quota_enabled && is_fstree(ref_root)) {
+ record = kmalloc(sizeof(*record), GFP_NOFS);
+ if (!record) {
+ kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+ kmem_cache_free(btrfs_delayed_ref_head_cachep,
+ head_ref);
+ return -ENOMEM;
+ }
+ }
+
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
@@ -845,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
bytenr, num_bytes, action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -873,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
- extent_op->is_data);
+ add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+ num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ extent_op->is_data);
spin_unlock(&delayed_refs->lock);
return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a764e2340d48..13fb5e6090fe 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
+ * same ref_node structure.
+ * Ref_head is in a higher logic level than tree/data ref, and duplicated
+ * bytenr/num_bytes in ref_node is really a waste or memory, they should be
+ * referred from ref_head.
+ * This gets more disgusting after we use list to store tree/data ref in
+ * ref_head. Must clean this mess up later.
+ */
struct btrfs_delayed_ref_node {
+ /*
+ * ref_head use rb tree, stored in ref_root->href.
+ * indexed by bytenr
+ */
struct rb_node rb_node;
+ /*data/tree ref use list, stored in ref_head->ref_list. */
+ struct list_head list;
+
/* the starting bytenr of the extent */
u64 bytenr;
@@ -83,11 +99,19 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_root;
+ struct list_head ref_list;
struct rb_node href_node;
struct btrfs_delayed_extent_op *extent_op;
+
+ /*
+ * This is used to track the final ref_mod from all the refs associated
+ * with this head ref, this is not adjusted as delayed refs are run,
+ * this is meant to track if we need to do the csum accounting or not.
+ */
+ int total_ref_mod;
+
/*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
@@ -124,6 +148,9 @@ struct btrfs_delayed_ref_root {
/* head ref rbtree */
struct rb_root href_root;
+ /* dirty extent records */
+ struct rb_root dirty_extent_root;
+
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -138,6 +165,8 @@ struct btrfs_delayed_ref_root {
/* total number of head nodes ready for processing */
unsigned long num_heads_ready;
+ u64 pending_csums;
+
/*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
@@ -146,6 +175,14 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+ * To make qgroup to skip given root.
+ * This is for snapshot, as btrfs_qgroup_inherit() will manully
+ * modify counters for snapshot and its source, so we should skip
+ * the snapshot in new_root/old_roots or it will get calculated twice
+ */
+ u64 qgroup_to_skip;
};
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..564a7de17d99 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
+ ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
printk_in_rcu(KERN_INFO
"BTRFS: dev_replace from %s (devid %llu) to %s started\n",
src_device->missing ? "<missing disk>" :
@@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info, src_device);
- btrfs_kobj_add_device(fs_info, tgt_device);
+ btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
@@ -670,8 +673,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
srcdev = dev_replace->srcdev;
- args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
- div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+ args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+ div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +809,7 @@ static int btrfs_dev_replace_kthread(void *data)
btrfs_dev_replace_status(fs_info, status_args);
progress = status_args->status.progress_1000;
kfree(status_args);
- do_div(progress, 10);
+ progress = div_u64(progress, 10);
printk_in_rcu(KERN_INFO
"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
dev_replace->srcdev->missing ? "<missing disk>" :
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 639f2663ed3f..f556c3732c2c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,7 +54,7 @@
#include <asm/cpufeature.h>
#endif
-static struct extent_io_ops btree_extent_io_ops;
+static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result)
* compute the csum for a btree block, and either verify it or write it
* into the csum field of the block.
*/
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *buf,
int verify)
{
- u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
@@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
offset += cur_len;
}
if (csum_size > sizeof(inline_result)) {
- result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+ result = kzalloc(csum_size, GFP_NOFS);
if (!result)
return 1;
} else {
@@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
printk_ratelimited(KERN_WARNING
"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
"level %d\n",
- root->fs_info->sb->s_id, buf->start,
+ fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
@@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
if (memcmp(raw_disk_sb, result, csum_size))
ret = 1;
-
- if (ret && btrfs_super_generation(disk_sb) < 10) {
- printk(KERN_WARNING
- "BTRFS: super block crcs don't match, older mkfs detected\n");
- ret = 0;
- }
}
if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
@@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
* we only fill in the checksum field in the first page of a multi-page block
*/
-static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
u64 start = page_offset(page);
u64 found_start;
@@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
found_start = btrfs_header_bytenr(eb);
if (WARN_ON(found_start != start || !PageUptodate(page)))
return 0;
- csum_tree_block(root, eb, 0);
+ csum_tree_block(fs_info, eb, 0);
return 0;
}
-static int check_tree_block_fsid(struct btrfs_root *root,
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
- struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u8 fsid[BTRFS_UUID_SIZE];
int ret = 1;
@@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
ret = -EIO;
goto err;
}
- if (check_tree_block_fsid(root, eb)) {
+ if (check_tree_block_fsid(root->fs_info, eb)) {
printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
@@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(root, eb, 1);
+ ret = csum_tree_block(root->fs_info, eb, 1);
if (ret) {
ret = -EIO;
goto err;
@@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
- ret = csum_dirty_buffer(root, bvec->bv_page);
+ ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
if (ret)
break;
}
@@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
return 0;
}
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr)
{
- return find_extent_buffer(root->fs_info, bytenr);
+ return find_extent_buffer(fs_info, bytenr);
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -1154,22 +1149,21 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
- return NULL;
+ return ERR_PTR(-ENOMEM);
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret) {
free_extent_buffer(buf);
- return NULL;
+ return ERR_PTR(ret);
}
return buf;
}
-void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+void clean_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
struct extent_buffer *buf)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
-
if (btrfs_header_generation(buf) ==
fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
@@ -1515,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
generation);
- if (!root->node) {
- ret = -ENOMEM;
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
goto find_fail;
} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
- goto read_fail;
+ free_extent_buffer(root->node);
+ goto find_fail;
}
root->commit_root = btrfs_root_node(root);
out:
btrfs_free_path(path);
return root;
-read_fail:
- free_extent_buffer(root->node);
find_fail:
kfree(root);
alloc_fail:
@@ -1751,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
- bio_endio_nodec(bio, error);
+ bio_endio(bio, error);
}
static int cleaner_kthread(void *arg)
{
struct btrfs_root *root = arg;
int again;
+ struct btrfs_trans_handle *trans;
do {
again = 0;
@@ -1779,7 +1773,6 @@ static int cleaner_kthread(void *arg)
}
btrfs_run_delayed_iputs(root);
- btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -1788,6 +1781,16 @@ static int cleaner_kthread(void *arg)
* needn't do anything special here.
*/
btrfs_run_defrag_inodes(root->fs_info);
+
+ /*
+ * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * with relocation (btrfs_relocate_chunk) and relocation
+ * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
+ * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * can't hold, nor need to, fs_info->cleaner_mutex when deleting
+ * unused block groups.
+ */
+ btrfs_delete_unused_bgs(root->fs_info);
sleep:
if (!try_to_freeze() && !again) {
set_current_state(TASK_INTERRUPTIBLE);
@@ -1796,6 +1799,34 @@ sleep:
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
+
+ /*
+ * Transaction kthread is stopped before us and wakes us up.
+ * However we might have started a new transaction and COWed some
+ * tree blocks when deleting unused block groups for example. So
+ * make sure we commit the transaction we started to have a clean
+ * shutdown when evicting the btree inode - if it has dirty pages
+ * when we do the final iput() on it, eviction will trigger a
+ * writeback for it which will fail with null pointer dereferences
+ * since work queues and other resources were already released and
+ * destroyed by the time the iput/eviction/writeback is made.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (IS_ERR(trans)) {
+ if (PTR_ERR(trans) != -ENOENT)
+ btrfs_err(root->fs_info,
+ "cleaner transaction attach returned %ld",
+ PTR_ERR(trans));
+ } else {
+ int ret;
+
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "cleaner open transaction commit returned %d",
+ ret);
+ }
+
return 0;
}
@@ -2146,6 +2177,271 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
}
}
+static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
+{
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ fs_info->scrub_workers_refcnt = 0;
+}
+
+static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
+}
+
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ set_nlink(fs_info->btree_inode, 1);
+ /*
+ * we set the i_size on the btree inode to the max possible int.
+ * the real end of the address space is determined by all of
+ * the devices in the system
+ */
+ fs_info->btree_inode->i_size = OFFSET_MAX;
+ fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+
+ RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
+ extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
+ fs_info->btree_inode->i_mapping);
+ BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
+ BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ set_bit(BTRFS_INODE_DUMMY,
+ &BTRFS_I(fs_info->btree_inode)->runtime_flags);
+ btrfs_insert_inode_hash(fs_info->btree_inode);
+}
+
+static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
+{
+ fs_info->dev_replace.lock_owner = 0;
+ atomic_set(&fs_info->dev_replace.nesting_level, 0);
+ mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+ mutex_init(&fs_info->dev_replace.lock_management_lock);
+ mutex_init(&fs_info->dev_replace.lock);
+ init_waitqueue_head(&fs_info->replace_wait);
+}
+
+static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
+{
+ spin_lock_init(&fs_info->qgroup_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ fs_info->qgroup_seq = 1;
+ fs_info->quota_enabled = 0;
+ fs_info->pending_quota_state = 0;
+ fs_info->qgroup_ulist = NULL;
+ mutex_init(&fs_info->qgroup_rescan_lock);
+}
+
+static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int max_active = fs_info->thread_pool_size;
+ unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
+
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
+
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+ /*
+ * a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+ * devices
+ */
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
+
+ /*
+ * endios are largely parallel and should have a very
+ * low idle thresh
+ */
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->endio_repair_workers =
+ btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_repair_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_root *log_tree_root;
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ u64 bytenr = btrfs_super_log_root(disk_super);
+
+ if (fs_devices->rw_devices == 0) {
+ printk(KERN_WARNING "BTRFS: log replay required "
+ "on RO media\n");
+ return -EIO;
+ }
+
+ log_tree_root = btrfs_alloc_root(fs_info);
+ if (!log_tree_root)
+ return -ENOMEM;
+
+ __setup_root(tree_root->nodesize, tree_root->sectorsize,
+ tree_root->stripesize, log_tree_root, fs_info,
+ BTRFS_TREE_LOG_OBJECTID);
+
+ log_tree_root->node = read_tree_block(tree_root, bytenr,
+ fs_info->generation + 1);
+ if (IS_ERR(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ ret = PTR_ERR(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return -EIO;
+ }
+ /* returns with log_tree_root freed on success */
+ ret = btrfs_recover_log_trees(log_tree_root);
+ if (ret) {
+ btrfs_error(tree_root->fs_info, ret,
+ "Failed to recover log tree");
+ free_extent_buffer(log_tree_root->node);
+ kfree(log_tree_root);
+ return ret;
+ }
+
+ if (fs_info->sb->s_flags & MS_RDONLY) {
+ ret = btrfs_commit_super(tree_root);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *tree_root)
+{
+ struct btrfs_root *root;
+ struct btrfs_key location;
+ int ret;
+
+ location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+ location.type = BTRFS_ROOT_ITEM_KEY;
+ location.offset = 0;
+
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->extent_root = root;
+
+ location.objectid = BTRFS_DEV_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->dev_root = root;
+ btrfs_init_devices_late(fs_info);
+
+ location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->csum_root = root;
+
+ location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (!IS_ERR(root)) {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->quota_enabled = 1;
+ fs_info->pending_quota_state = 1;
+ fs_info->quota_root = root;
+ }
+
+ location.objectid = BTRFS_UUID_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT)
+ return ret;
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->uuid_root = root;
+ }
+
+ return 0;
+}
+
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -2160,21 +2456,12 @@ int open_ctree(struct super_block *sb,
struct btrfs_super_block *disk_super;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *tree_root;
- struct btrfs_root *extent_root;
- struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
- struct btrfs_root *dev_root;
- struct btrfs_root *quota_root;
- struct btrfs_root *uuid_root;
- struct btrfs_root *log_tree_root;
int ret;
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
int max_active;
- int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
- bool create_uuid_tree;
- bool check_uuid_tree;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2241,13 +2528,14 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
- mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
+ mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
+ init_rwsem(&fs_info->delayed_iput_sem);
- init_completion(&fs_info->kobj_unregister);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
@@ -2276,7 +2564,7 @@ int open_ctree(struct super_block *sb,
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
- fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
+ fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
spin_lock_init(&fs_info->reada_lock);
@@ -2294,55 +2582,18 @@ int open_ctree(struct super_block *sb,
}
btrfs_init_delayed_root(fs_info->delayed_root);
- mutex_init(&fs_info->scrub_lock);
- atomic_set(&fs_info->scrubs_running, 0);
- atomic_set(&fs_info->scrub_pause_req, 0);
- atomic_set(&fs_info->scrubs_paused, 0);
- atomic_set(&fs_info->scrub_cancel_req, 0);
- init_waitqueue_head(&fs_info->replace_wait);
- init_waitqueue_head(&fs_info->scrub_pause_wait);
- fs_info->scrub_workers_refcnt = 0;
+ btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
#endif
-
- spin_lock_init(&fs_info->balance_lock);
- mutex_init(&fs_info->balance_mutex);
- atomic_set(&fs_info->balance_running, 0);
- atomic_set(&fs_info->balance_pause_req, 0);
- atomic_set(&fs_info->balance_cancel_req, 0);
- fs_info->balance_ctl = NULL;
- init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
- fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- set_nlink(fs_info->btree_inode, 1);
- /*
- * we set the i_size on the btree inode to the max possible int.
- * the real end of the address space is determined by all of
- * the devices in the system
- */
- fs_info->btree_inode->i_size = OFFSET_MAX;
- fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-
- RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
- extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping);
- BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
-
- BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
-
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- set_bit(BTRFS_INODE_DUMMY,
- &BTRFS_I(fs_info->btree_inode)->runtime_flags);
- btrfs_insert_inode_hash(fs_info->btree_inode);
+ btrfs_init_btree_inode(fs_info, tree_root);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2363,26 +2614,14 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
+ mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
- fs_info->dev_replace.lock_owner = 0;
- atomic_set(&fs_info->dev_replace.nesting_level, 0);
- mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
- mutex_init(&fs_info->dev_replace.lock_management_lock);
- mutex_init(&fs_info->dev_replace.lock);
- spin_lock_init(&fs_info->qgroup_lock);
- mutex_init(&fs_info->qgroup_ioctl_lock);
- fs_info->qgroup_tree = RB_ROOT;
- fs_info->qgroup_op_tree = RB_ROOT;
- INIT_LIST_HEAD(&fs_info->dirty_qgroups);
- fs_info->qgroup_seq = 1;
- fs_info->quota_enabled = 0;
- fs_info->pending_quota_state = 0;
- fs_info->qgroup_ulist = NULL;
- mutex_init(&fs_info->qgroup_rescan_lock);
+ btrfs_init_dev_replace_locks(fs_info);
+ btrfs_init_qgroup(fs_info);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2554,75 +2793,9 @@ int open_ctree(struct super_block *sb,
max_active = fs_info->thread_pool_size;
- fs_info->workers =
- btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
- max_active, 16);
-
- fs_info->delalloc_workers =
- btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
-
- fs_info->flush_workers =
- btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
-
- fs_info->caching_workers =
- btrfs_alloc_workqueue("cache", flags, max_active, 0);
-
- /*
- * a higher idle thresh on the submit workers makes it much more
- * likely that bios will be send down in a sane order to the
- * devices
- */
- fs_info->submit_workers =
- btrfs_alloc_workqueue("submit", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 64);
-
- fs_info->fixup_workers =
- btrfs_alloc_workqueue("fixup", flags, 1, 0);
-
- /*
- * endios are largely parallel and should have a very
- * low idle thresh
- */
- fs_info->endio_workers =
- btrfs_alloc_workqueue("endio", flags, max_active, 4);
- fs_info->endio_meta_workers =
- btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
- fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
- fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
- fs_info->endio_repair_workers =
- btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
- fs_info->rmw_workers =
- btrfs_alloc_workqueue("rmw", flags, max_active, 2);
- fs_info->endio_write_workers =
- btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
- fs_info->endio_freespace_worker =
- btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
- fs_info->delayed_workers =
- btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
- fs_info->readahead_workers =
- btrfs_alloc_workqueue("readahead", flags, max_active, 2);
- fs_info->qgroup_rescan_workers =
- btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
- fs_info->extent_workers =
- btrfs_alloc_workqueue("extent-refs", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 8);
-
- if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->submit_workers && fs_info->flush_workers &&
- fs_info->endio_workers && fs_info->endio_meta_workers &&
- fs_info->endio_meta_write_workers &&
- fs_info->endio_repair_workers &&
- fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
- fs_info->endio_freespace_worker && fs_info->rmw_workers &&
- fs_info->caching_workers && fs_info->readahead_workers &&
- fs_info->fixup_workers && fs_info->delayed_workers &&
- fs_info->extent_workers &&
- fs_info->qgroup_rescan_workers)) {
- err = -ENOMEM;
+ ret = btrfs_init_workqueues(fs_info, fs_devices);
+ if (ret) {
+ err = ret;
goto fail_sb_buffer;
}
@@ -2665,10 +2838,11 @@ int open_ctree(struct super_block *sb,
chunk_root->node = read_tree_block(chunk_root,
btrfs_super_chunk_root(disk_super),
generation);
- if (!chunk_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ if (IS_ERR(chunk_root->node) ||
+ !extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
+ chunk_root->node = NULL;
goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
@@ -2688,7 +2862,7 @@ int open_ctree(struct super_block *sb,
* keep the device that is marked to be the target device for the
* dev_replace procedure
*/
- btrfs_close_extra_devices(fs_info, fs_devices, 0);
+ btrfs_close_extra_devices(fs_devices, 0);
if (!fs_devices->latest_bdev) {
printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
@@ -2702,11 +2876,11 @@ retry_root_backup:
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
generation);
- if (!tree_root->node ||
- !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
-
+ tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2714,61 +2888,9 @@ retry_root_backup:
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
- location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = 0;
-
- extent_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(extent_root)) {
- ret = PTR_ERR(extent_root);
- goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
- fs_info->extent_root = extent_root;
-
- location.objectid = BTRFS_DEV_TREE_OBJECTID;
- dev_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(dev_root)) {
- ret = PTR_ERR(dev_root);
- goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
- fs_info->dev_root = dev_root;
- btrfs_init_devices_late(fs_info);
-
- location.objectid = BTRFS_CSUM_TREE_OBJECTID;
- csum_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(csum_root)) {
- ret = PTR_ERR(csum_root);
+ ret = btrfs_read_roots(fs_info, tree_root);
+ if (ret)
goto recovery_tree_root;
- }
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
- fs_info->csum_root = csum_root;
-
- location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
- quota_root = btrfs_read_tree_root(tree_root, &location);
- if (!IS_ERR(quota_root)) {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
- fs_info->quota_enabled = 1;
- fs_info->pending_quota_state = 1;
- fs_info->quota_root = quota_root;
- }
-
- location.objectid = BTRFS_UUID_TREE_OBJECTID;
- uuid_root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(uuid_root)) {
- ret = PTR_ERR(uuid_root);
- if (ret != -ENOENT)
- goto recovery_tree_root;
- create_uuid_tree = true;
- check_uuid_tree = false;
- } else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
- fs_info->uuid_root = uuid_root;
- create_uuid_tree = false;
- check_uuid_tree =
- generation != btrfs_super_uuid_tree_generation(disk_super);
- }
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
@@ -2792,12 +2914,24 @@ retry_root_backup:
goto fail_block_groups;
}
- btrfs_close_extra_devices(fs_info, fs_devices, 1);
+ btrfs_close_extra_devices(fs_devices, 1);
+
+ ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+ goto fail_block_groups;
+ }
+
+ ret = btrfs_sysfs_add_device(fs_devices);
+ if (ret) {
+ pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+ goto fail_fsdev_sysfs;
+ }
ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
- goto fail_block_groups;
+ goto fail_fsdev_sysfs;
}
ret = btrfs_init_space_info(fs_info);
@@ -2806,7 +2940,7 @@ retry_root_backup:
goto fail_sysfs;
}
- ret = btrfs_read_block_groups(extent_root);
+ ret = btrfs_read_block_groups(fs_info->extent_root);
if (ret) {
printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
goto fail_sysfs;
@@ -2864,48 +2998,11 @@ retry_root_backup:
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0) {
- u64 bytenr = btrfs_super_log_root(disk_super);
-
- if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
- err = -EIO;
- goto fail_qgroup;
- }
-
- log_tree_root = btrfs_alloc_root(fs_info);
- if (!log_tree_root) {
- err = -ENOMEM;
- goto fail_qgroup;
- }
-
- __setup_root(nodesize, sectorsize, stripesize,
- log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
-
- log_tree_root->node = read_tree_block(tree_root, bytenr,
- generation + 1);
- if (!log_tree_root->node ||
- !extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
- goto fail_qgroup;
- }
- /* returns with log_tree_root freed on success */
- ret = btrfs_recover_log_trees(log_tree_root);
+ ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
- "Failed to recover log tree");
- free_extent_buffer(log_tree_root->node);
- kfree(log_tree_root);
+ err = ret;
goto fail_qgroup;
}
-
- if (sb->s_flags & MS_RDONLY) {
- ret = btrfs_commit_super(tree_root);
- if (ret)
- goto fail_qgroup;
- }
}
ret = btrfs_find_orphan_roots(tree_root);
@@ -2966,7 +3063,7 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
- if (create_uuid_tree) {
+ if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
@@ -2975,8 +3072,9 @@ retry_root_backup:
close_ctree(tree_root);
return ret;
}
- } else if (check_uuid_tree ||
- btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+ } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+ fs_info->generation !=
+ btrfs_super_uuid_tree_generation(disk_super)) {
pr_info("BTRFS: checking UUID tree\n");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
@@ -3011,6 +3109,9 @@ fail_cleaner:
fail_sysfs:
btrfs_sysfs_remove_one(fs_info);
+fail_fsdev_sysfs:
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
@@ -3225,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device,
*/
static void btrfs_end_empty_barrier(struct bio *bio, int err)
{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ if (err)
clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
if (bio->bi_private)
complete(bio->bi_private);
bio_put(bio);
@@ -3257,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
- rcu_str_deref(device->name));
- device->nobarriers = 1;
- } else if (!bio_flagged(bio, BIO_UPTODATE)) {
+ if (!bio_flagged(bio, BIO_UPTODATE)) {
ret = -EIO;
btrfs_dev_stat_inc_and_print(device,
BTRFS_DEV_STAT_FLUSH_ERRS);
@@ -3668,7 +3762,7 @@ void close_ctree(struct btrfs_root *root)
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
- btrfs_err(root->fs_info, "commit super ret %d", ret);
+ btrfs_err(fs_info, "commit super ret %d", ret);
}
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
@@ -3680,14 +3774,15 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 2;
smp_mb();
- btrfs_free_qgroup_config(root->fs_info);
+ btrfs_free_qgroup_config(fs_info);
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
- btrfs_info(root->fs_info, "at unmount delalloc count %lld",
+ btrfs_info(fs_info, "at unmount delalloc count %lld",
percpu_counter_sum(&fs_info->delalloc_bytes));
}
btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -3723,7 +3818,7 @@ void close_ctree(struct btrfs_root *root)
btrfs_free_stripe_hash_table(fs_info);
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ __btrfs_free_block_rsv(root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
lock_chunks(root);
@@ -4016,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_node *tmp;
bool pin_bytes = false;
head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4031,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((node = rb_first(&head->ref_root)) != NULL) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+ list) {
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &head->ref_root);
+ list_del(&ref->list);
atomic_dec(&delayed_refs->num_entries);
btrfs_put_delayed_ref(ref);
}
@@ -4134,7 +4229,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
while (start <= end) {
- eb = btrfs_find_tree_block(root, start);
+ eb = btrfs_find_tree_block(root->fs_info, start);
start += root->nodesize;
if (!eb)
continue;
@@ -4285,7 +4380,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
return 0;
}
-static struct extent_io_ops btree_extent_io_ops = {
+static const struct extent_io_ops btree_extent_io_ops = {
.readpage_end_io_hook = btree_readpage_end_io_hook,
.readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0fd236..d4cbfeeeedd4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *buf);
+ struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
@@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_commit_super(struct btrfs_root *root);
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 37d164540c3a..8d052209f473 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = child->d_inode;
+ struct inode *dir = d_inode(child);
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -220,8 +220,8 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = child->d_inode;
- struct inode *dir = parent->d_inode;
+ struct inode *inode = d_inode(child);
+ struct inode *dir = d_inode(parent);
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_inode_ref *iref;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b353ad02f03..07204bf601ed 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes,
+ struct btrfs_delayed_ref_node *node,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
- int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
+ int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
@@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+ if ((ret < 0 && ret != -EAGAIN) || !ret)
goto out;
- /*
- * Ok we were able to insert an inline extent and it appears to be a new
- * reference, deal with the qgroup accounting.
- */
- if (!ret && !no_quota) {
- ASSERT(root->fs_info->quota_enabled);
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
- btrfs_release_path(path);
-
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- goto out;
- }
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
- if (refs)
- type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- bytenr, num_bytes, type, 0);
- if (ret)
- goto out;
- }
-
path->reada = 1;
path->leave_spinning = 1;
/* now insert the actual backref */
@@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->objectid, ref->offset,
&ins, node->ref_mod);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_inc_extent_ref(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- node->no_quota, extent_op);
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent,
+ ret = __btrfs_free_extent(trans, root, node, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op, node->no_quota);
+ extent_op);
} else {
BUG();
}
@@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->level, &ins,
node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
- ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, node->no_quota,
+ ret = __btrfs_inc_extent_ref(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1,
extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, root, node->bytenr,
- node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op,
- node->no_quota);
+ ret = __btrfs_free_extent(trans, root, node,
+ parent, ref_root,
+ ref->level, 0, 1, extent_op);
} else {
BUG();
}
@@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref, *last = NULL;;
+ struct btrfs_delayed_ref_node *ref;
+
+ if (list_empty(&head->ref_list))
+ return NULL;
/*
- * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
- * this prevents ref count from going down to zero when
- * there still are pending delayed ref.
+ * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * This is to prevent a ref count from going down to zero, which deletes
+ * the extent item from the extent tree, when there still are references
+ * to add, which would fail because they would not find the extent item.
*/
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
+ list_for_each_entry(ref, &head->ref_list, list) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
return ref;
- else if (last == NULL)
- last = ref;
- node = rb_next(node);
}
- return last;
+
+ return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+ list);
}
/*
@@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- */
spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
- locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&locked_ref->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&locked_ref->lock);
- if (rb_first(&locked_ref->ref_root) ||
+ if (!list_empty(&locked_ref->ref_list) ||
locked_ref->extent_op) {
spin_unlock(&locked_ref->lock);
spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
} else {
actual_count++;
ref->in_tree = 0;
- rb_erase(&ref->rb_node, &locked_ref->ref_root);
+ list_del(&ref->list);
}
atomic_dec(&delayed_refs->num_entries);
@@ -2538,6 +2498,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
* list before we release it.
*/
if (btrfs_delayed_ref_is_head(ref)) {
+ if (locked_ref->is_data &&
+ locked_ref->total_ref_mod < 0) {
+ spin_lock(&delayed_refs->lock);
+ delayed_refs->pending_csums -= ref->num_bytes;
+ spin_unlock(&delayed_refs->lock);
+ }
btrfs_delayed_ref_unlock(locked_ref);
locked_ref = NULL;
}
@@ -2561,8 +2527,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
*/
spin_lock(&delayed_refs->lock);
avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
- avg = div64_u64(avg, 4);
- fs_info->avg_delayed_ref_runtime = avg;
+ fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
spin_unlock(&delayed_refs->lock);
}
return 0;
@@ -2624,7 +2589,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
* We don't ever fill up leaves all the way so multiply by 2 just to be
* closer to what we're really going to want to ouse.
*/
- return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+/*
+ * Takes the number of bytes to be csumm'ed and figures out how many leaves it
+ * would require to store the csums for that many bytes.
+ */
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+{
+ u64 csum_size;
+ u64 num_csums_per_leaf;
+ u64 num_csums;
+
+ csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ num_csums_per_leaf = div64_u64(csum_size,
+ (u64)btrfs_super_csum_size(root->fs_info->super_copy));
+ num_csums = div64_u64(csum_bytes, root->sectorsize);
+ num_csums += num_csums_per_leaf - 1;
+ num_csums = div64_u64(num_csums, num_csums_per_leaf);
+ return num_csums;
}
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
@@ -2632,7 +2616,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
{
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
- u64 num_bytes;
+ u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
+ u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
+ u64 num_bytes, num_dirty_bgs_bytes;
int ret = 0;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2626,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
if (num_heads > 1)
num_bytes += (num_heads - 1) * root->nodesize;
num_bytes <<= 1;
+ num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
+ num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+ num_dirty_bgs);
global_rsv = &root->fs_info->global_block_rsv;
/*
* If we can't allocate any more chunks lets make sure we have _lots_ of
* wiggle room since running delayed refs can create more delayed refs.
*/
- if (global_rsv->space_info->full)
+ if (global_rsv->space_info->full) {
+ num_dirty_bgs_bytes <<= 1;
num_bytes <<= 1;
+ }
spin_lock(&global_rsv->lock);
- if (global_rsv->reserved <= num_bytes)
+ if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
ret = 1;
spin_unlock(&global_rsv->lock);
return ret;
@@ -2833,9 +2824,6 @@ again:
goto again;
}
out:
- ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
- if (ret)
- return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2874,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_data_ref *data_ref;
struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
@@ -2903,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
spin_lock(&head->lock);
- node = rb_first(&head->ref_root);
- while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_next(node);
-
+ list_for_each_entry(ref, &head->ref_list, list) {
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
ret = 1;
@@ -3147,10 +3130,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
fail:
- if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_release_path(path);
return ret;
}
@@ -3193,7 +3174,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
struct inode *inode = NULL;
u64 alloc_hint = 0;
int dcs = BTRFS_DC_ERROR;
- int num_pages = 0;
+ u64 num_pages = 0;
int retries = 0;
int ret = 0;
@@ -3267,15 +3248,14 @@ again:
if (ret)
goto out_put;
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret)
goto out_put;
}
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE) ||
- block_group->delalloc_bytes) {
+ !btrfs_test_opt(root, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3293,14 +3273,14 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages);
+ ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
if (ret)
goto out_put;
@@ -3351,16 +3331,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
return 0;
}
+/*
+ * transaction commit does final block group cache writeback during a
+ * critical section where nothing is allowed to change the FS. This is
+ * required in order for the cache to actually match the block group,
+ * but can introduce a lot of latency into the commit.
+ *
+ * So, btrfs_start_dirty_block_groups is here to kick off block group
+ * cache IO. There's a chance we'll have to redo some of it if the
+ * block group changes again during the commit, but it greatly reduces
+ * the commit latency by getting rid of the easy block groups while
+ * we're still allowing others to join the commit.
+ */
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
+ int should_put;
+ struct btrfs_path *path = NULL;
+ LIST_HEAD(dirty);
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
+ int loops = 0;
+
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cur_trans->dirty_bgs)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ return 0;
+ }
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+again:
+ /*
+ * make sure all the block groups on our dirty list actually
+ * exist
+ */
+ btrfs_create_pending_block_groups(trans, root);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ /*
+ * cache_write_mutex is here only to save us from balance or automatic
+ * removal of empty block groups deleting this block group while we are
+ * writing out the cache
+ */
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ while (!list_empty(&dirty)) {
+ cache = list_first_entry(&dirty,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ /*
+ * this can happen if something re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+
+ /*
+ * btrfs_wait_cache_io uses the cache->dirty_list to decide
+ * if it should update the cache_state. Don't delete
+ * until after we wait.
+ *
+ * Since we're not running in the commit critical section
+ * we need the dirty_bgs_lock to protect from update_block_group
+ */
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
+ if (cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+
+ /*
+ * the cache_write_mutex is protecting
+ * the io_list
+ */
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
+ ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * Our block group might still be attached to the list
+ * of new block groups in the transaction handle of some
+ * other task (struct btrfs_trans_handle->new_bgs). This
+ * means its block group item isn't yet in the extent
+ * tree. If this happens ignore the error, as we will
+ * try again later in the critical section of the
+ * transaction commit.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &cur_trans->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ } else if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ }
+ }
+
+ /* if its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+
+ if (ret)
+ break;
+
+ /*
+ * Avoid blocking other tasks for too long. It might even save
+ * us from writing caches for block groups that are going to be
+ * removed.
+ */
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ }
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
+ /*
+ * go through delayed refs for all the stuff we've just kicked off
+ * and then loop back (just once)
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ if (!ret && loops == 0) {
+ loops++;
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&cur_trans->dirty_bgs, &dirty);
+ /*
+ * dirty_bgs_lock protects us from concurrent block group
+ * deletes too (not just cache_write_mutex).
+ */
+ if (!list_empty(&dirty)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ goto again;
+ }
+ spin_unlock(&cur_trans->dirty_bgs_lock);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
+ int should_put;
struct btrfs_path *path;
-
- if (list_empty(&cur_trans->dirty_bgs))
- return 0;
+ struct list_head *io = &cur_trans->io_bgs;
+ int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3376,16 +3528,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
+
+ /*
+ * this can happen if cache_save_setup re-dirties a block
+ * group that is already under IO. Just wait for it to
+ * finish and then do it all again
+ */
+ if (!list_empty(&cache->io_list)) {
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path,
+ cache->key.objectid);
+ btrfs_put_block_group(cache);
+ }
+
+ /*
+ * don't remove from the dirty list until after we've waited
+ * on any pending IO
+ */
list_del_init(&cache->dirty_list);
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- cache_save_setup(cache, trans, path);
- if (!ret)
- ret = btrfs_run_delayed_refs(trans, root,
- (unsigned long) -1);
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
- btrfs_write_out_cache(root, trans, cache, path);
+ should_put = 1;
+
+ cache_save_setup(cache, trans, path);
+
if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
+ cache->io_ctl.inode = NULL;
+ ret = btrfs_write_out_cache(root, trans, cache, path);
+ if (ret == 0 && cache->io_ctl.inode) {
+ num_started++;
+ should_put = 0;
+ list_add_tail(&cache->io_list, io);
+ } else {
+ /*
+ * if we failed to write the cache, the
+ * generation will be bad and life goes on
+ */
+ ret = 0;
+ }
+ }
+ if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ }
+
+ /* if its not on the io list, we need to put the block group */
+ if (should_put)
+ btrfs_put_block_group(cache);
+ }
+
+ while (!list_empty(io)) {
+ cache = list_first_entry(io, struct btrfs_block_group_cache,
+ io_list);
+ list_del_init(&cache->io_list);
+ btrfs_wait_cache_io(root, trans, cache,
+ &cache->io_ctl, path, cache->key.objectid);
btrfs_put_block_group(cache);
}
@@ -3445,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3473,7 +3674,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- found->full = 0;
+ if (total_bytes > 0)
+ found->full = 0;
+ else
+ found->full = 1;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3635,19 +3839,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
*/
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 used;
- int ret = 0, committed = 0, alloc_chunk = 1;
+ int ret = 0;
+ int need_commit = 2;
+ int have_pinned_space;
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
if (btrfs_is_free_space_inode(inode)) {
- committed = 1;
+ need_commit = 0;
ASSERT(current->journal_info);
}
@@ -3669,7 +3875,7 @@ again:
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
*/
- if (!data_sinfo->full && alloc_chunk) {
+ if (!data_sinfo->full) {
u64 alloc_target;
data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3697,8 +3903,10 @@ alloc:
if (ret < 0) {
if (ret != -ENOSPC)
return ret;
- else
+ else {
+ have_pinned_space = 1;
goto commit_trans;
+ }
}
if (!data_sinfo)
@@ -3709,26 +3917,42 @@ alloc:
/*
* If we don't have enough pinned space to deal with this
- * allocation don't bother committing the transaction.
+ * allocation, and no removed chunk in current transaction,
+ * don't bother committing the transaction.
*/
- if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
- bytes) < 0)
- committed = 1;
+ have_pinned_space = percpu_counter_compare(
+ &data_sinfo->total_bytes_pinned,
+ used + bytes - data_sinfo->total_bytes);
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
commit_trans:
- if (!committed &&
+ if (need_commit &&
!atomic_read(&root->fs_info->open_ioctl_trans)) {
- committed = 1;
+ need_commit--;
+
+ if (need_commit > 0)
+ btrfs_wait_ordered_roots(fs_info, -1);
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- goto again;
+ if (have_pinned_space >= 0 ||
+ trans->transaction->have_free_bgs ||
+ need_commit > 0) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ /*
+ * make sure that all running delayed iput are
+ * done
+ */
+ down_write(&root->fs_info->delayed_iput_sem);
+ up_write(&root->fs_info->delayed_iput_sem);
+ goto again;
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
}
trace_btrfs_space_reservation(root->fs_info,
@@ -3736,12 +3960,16 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
+ ret = btrfs_qgroup_reserve(root, write_bytes);
+ if (ret)
+ goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
+out:
spin_unlock(&data_sinfo->lock);
- return 0;
+ return ret;
}
/*
@@ -3819,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
return 1;
}
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
{
u64 num_dev;
@@ -3833,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
else
num_dev = 1; /* DUP or single */
- /* metadata for updaing devices and chunk tree */
- return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+ return num_dev;
}
-static void check_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 type)
{
struct btrfs_space_info *info;
u64 left;
u64 thresh;
+ int ret = 0;
+ u64 num_devs;
+
+ /*
+ * Needed because we can end up allocating a system chunk and for an
+ * atomic and race free space reservation in the chunk block reserve.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
left = info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly;
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use;
spin_unlock(&info->lock);
- thresh = get_system_chunk_thresh(root, type);
+ num_devs = get_profile_num_devs(root, type);
+
+ /* num_devs device items to update and 1 chunk item to add or remove */
+ thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+ btrfs_calc_trans_metadata_size(root, 1);
+
if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
@@ -3861,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
u64 flags;
flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
- btrfs_alloc_chunk(trans, root, flags);
+ /*
+ * Ignore failure to create system chunk. We might end up not
+ * needing it, as we might not need to COW all nodes/leafs from
+ * the paths we visit in the chunk tree (they were already COWed
+ * or created in the current transaction for example).
+ */
+ ret = btrfs_alloc_chunk(trans, root, flags);
+ }
+
+ if (!ret) {
+ ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+ &root->fs_info->chunk_block_rsv,
+ thresh, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->chunk_bytes_reserved += thresh;
}
}
@@ -3966,6 +4227,24 @@ out:
space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
mutex_unlock(&fs_info->chunk_mutex);
+ /*
+ * When we allocate a new chunk we reserve space in the chunk block
+ * reserve to make sure we can COW nodes/leafs in the chunk tree or
+ * add new nodes/leafs to it if we end up needing to do it when
+ * inserting the chunk item and updating device items as part of the
+ * second phase of chunk allocation, performed by
+ * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+ * large number of new block groups to create in our transaction
+ * handle's new_bgs list to avoid exhausting the chunk block reserve
+ * in extreme cases - like having a single transaction create many new
+ * block groups when starting to write out the free space caches of all
+ * the block groups that were made dirty during the lifetime of the
+ * transaction.
+ */
+ if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ btrfs_create_pending_block_groups(trans, trans->root);
+ btrfs_trans_release_chunk_metadata(trans);
+ }
return ret;
}
@@ -4298,8 +4577,13 @@ out:
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
struct btrfs_fs_info *fs_info, u64 used)
{
- return (used >= div_factor_fine(space_info->total_bytes, 98) &&
- !btrfs_fs_closing(fs_info) &&
+ u64 thresh = div_factor_fine(space_info->total_bytes, 98);
+
+ /* If we're just plain full then async reclaim just slows us down. */
+ if (space_info->bytes_used >= thresh)
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
@@ -4354,10 +4638,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (!btrfs_need_do_async_reclaim(space_info, fs_info,
flush_state))
return;
- } while (flush_state <= COMMIT_TRANS);
-
- if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
- queue_work(system_unbound_wq, work);
+ } while (flush_state < COMMIT_TRANS);
}
void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4700,6 +4981,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
kfree(rsv);
}
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ kfree(rsv);
+}
+
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush)
@@ -4812,10 +5098,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
csum_size * 2;
- num_bytes += div64_u64(data_used + meta_used, 50);
+ num_bytes += div_u64(data_used + meta_used, 50);
if (num_bytes * 3 > meta_used)
- num_bytes = div64_u64(meta_used, 3);
+ num_bytes = div_u64(meta_used, 3);
return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
}
@@ -4912,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
trans->bytes_reserved = 0;
}
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+ if (!trans->chunk_bytes_reserved)
+ return;
+
+ WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+ block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+ trans->chunk_bytes_reserved);
+ trans->chunk_bytes_reserved = 0;
+}
+
/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
@@ -4998,8 +5302,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved)
{
btrfs_block_rsv_release(root, rsv, (u64)-1);
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
}
/**
@@ -5066,30 +5368,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
int reserve)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 csum_size;
- int num_csums_per_leaf;
- int num_csums;
- int old_csums;
+ u64 old_csums, num_csums;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
BTRFS_I(inode)->csum_bytes == 0)
return 0;
- old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+ old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
if (reserve)
BTRFS_I(inode)->csum_bytes += num_bytes;
else
BTRFS_I(inode)->csum_bytes -= num_bytes;
- csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
- num_csums_per_leaf = (int)div64_u64(csum_size,
- sizeof(struct btrfs_csum_item) +
- sizeof(struct btrfs_disk_key));
- num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
- num_csums = num_csums + num_csums_per_leaf - 1;
- num_csums = num_csums / num_csums_per_leaf;
-
- old_csums = old_csums + num_csums_per_leaf - 1;
- old_csums = old_csums / num_csums_per_leaf;
+ num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
/* No change, no need to reserve more */
if (old_csums == num_csums)
@@ -5163,8 +5453,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, num_bytes +
- nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
@@ -5172,8 +5461,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, num_bytes +
- nr_extents * root->nodesize);
+ btrfs_qgroup_free(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5290,10 +5578,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
- if (root->fs_info->quota_enabled) {
- btrfs_qgroup_free(root, num_bytes +
- dropped * root->nodesize);
- }
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
@@ -5318,7 +5602,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
if (ret)
return ret;
@@ -5390,14 +5674,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
- spin_lock(&trans->transaction->dirty_bgs_lock);
- if (list_empty(&cache->dirty_list)) {
- list_add_tail(&cache->dirty_list,
- &trans->transaction->dirty_bgs);
- btrfs_get_block_group(cache);
- }
- spin_unlock(&trans->transaction->dirty_bgs_lock);
-
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -5446,6 +5722,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->unused_bgs_lock);
}
}
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ trans->transaction->num_dirty_bgs++;
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -5834,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
+ struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -5852,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
+ int no_quota = node->no_quota;
u32 item_size;
u64 refs;
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
int last_ref = 0;
- enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6036,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
- type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -6098,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- /* Deal with the quota accounting */
- if (!ret && last_ref && !no_quota) {
- int mod_seq = 0;
-
- if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
- type == BTRFS_QGROUP_OPER_SUB_SHARED)
- mod_seq = 1;
-
- ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
- bytenr, num_bytes, type,
- mod_seq);
- }
out:
btrfs_free_path(path);
return ret;
@@ -6135,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (rb_first(&head->ref_root))
+ if (!list_empty(&head->ref_list))
goto out;
if (head->extent_op) {
@@ -6956,15 +7230,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
return -ENOSPC;
}
- if (btrfs_test_opt(root, DISCARD))
- ret = btrfs_discard_extent(root, start, len, NULL);
-
if (pin)
pin_down_extent(root, cache, start, len, 1);
else {
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
+
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, start, len);
@@ -7045,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- /* Always set parent to 0 here since its exclusive anyway. */
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, ins->offset,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
-
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7095,9 +7362,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
+ btrfs_free_path(path);
btrfs_free_and_pin_reserved_extent(root, ins->objectid,
root->nodesize);
- btrfs_free_path(path);
return ret;
}
@@ -7133,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- if (!no_quota) {
- ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
- ins->objectid, num_bytes,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
- if (ret)
- return ret;
- }
-
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7217,7 +7476,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_header_generation(buf, trans->transid);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
- clean_tree_block(trans, root, buf);
+ clean_tree_block(trans, root->fs_info, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
@@ -7311,7 +7570,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
* returns the key for the extent through ins, and a tree buffer for
* the first block of the extent through buf.
*
- * returns the tree buffer or NULL.
+ * returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7322,6 +7581,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ struct btrfs_delayed_extent_op *extent_op;
u64 flags = 0;
int ret;
u32 blocksize = root->nodesize;
@@ -7342,13 +7602,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ret = btrfs_reserve_extent(root, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
- if (ret) {
- unuse_block_rsv(root->fs_info, block_rsv, blocksize);
- return ERR_PTR(ret);
- }
+ if (ret)
+ goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
- BUG_ON(IS_ERR(buf)); /* -ENOMEM */
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_free_reserved;
+ }
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -7358,9 +7619,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(parent > 0);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
extent_op = btrfs_alloc_delayed_extent_op();
- BUG_ON(!extent_op); /* -ENOMEM */
+ if (!extent_op) {
+ ret = -ENOMEM;
+ goto out_free_buf;
+ }
if (key)
memcpy(&extent_op->key, key, sizeof(extent_op->key));
else
@@ -7375,13 +7638,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
- ins.objectid,
- ins.offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
- BUG_ON(ret); /* -ENOMEM */
+ ins.objectid, ins.offset,
+ parent, root_objectid, level,
+ BTRFS_ADD_DELAYED_EXTENT,
+ extent_op, 0);
+ if (ret)
+ goto out_free_delayed;
}
return buf;
+
+out_free_delayed:
+ btrfs_free_delayed_extent_op(extent_op);
+out_free_buf:
+ free_extent_buffer(buf);
+out_free_reserved:
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
+out_unuse:
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
+ return ERR_PTR(ret);
}
struct walk_control {
@@ -7482,12 +7756,18 @@ reada:
wc->reada_slot = slot;
}
+/*
+ * TODO: Modify related function to add related node/leaf to dirty_extent_root,
+ * for later qgroup accounting.
+ *
+ * Current, this function does nothing.
+ */
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type, ret;
+ int i, extent_type;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
@@ -7510,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- bytenr, num_bytes,
- BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
- if (ret)
- return ret;
}
return 0;
}
@@ -7585,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to mark all (including complete shared node)
+ * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -7647,7 +7922,11 @@ walk_down:
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
eb = read_tree_block(root, child_bytenr, child_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
ret = -EIO;
goto out;
}
@@ -7658,16 +7937,6 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
- ret = btrfs_qgroup_record_ref(trans, root->fs_info,
- root->objectid,
- child_bytenr,
- root->nodesize,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
- 0);
- if (ret)
- goto out;
-
}
if (level == 0) {
@@ -7815,7 +8084,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
blocksize = root->nodesize;
- next = btrfs_find_tree_block(root, bytenr);
+ next = btrfs_find_tree_block(root->fs_info, bytenr);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
@@ -7878,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, generation);
- if (!next || !extent_buffer_uptodate(next)) {
+ if (IS_ERR(next)) {
+ return PTR_ERR(next);
+ } else if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
return -EIO;
}
@@ -8016,7 +8287,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(eb);
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
}
- clean_tree_block(trans, root, eb);
+ clean_tree_block(trans, root->fs_info, eb);
}
if (eb == root->node) {
@@ -8260,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out_end_trans;
}
- /*
- * Qgroup update accounting is run from
- * delayed ref handling. This usually works
- * out because delayed refs are normally the
- * only way qgroup updates are added. However,
- * we may have added updates during our tree
- * walk so run qgroups here to make sure we
- * don't lose any updates.
- */
- ret = btrfs_delayed_qgroup_accounting(trans,
- root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8331,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
root_dropped = true;
out_end_trans:
- ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
- if (ret)
- printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
- "running qgroup updates "
- "during snapshot delete. "
- "Quota is out of sync, "
- "rescan required.\n", ret);
-
btrfs_end_transaction_throttle(trans, tree_root);
out_free:
kfree(wc);
@@ -8533,10 +8778,48 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
BUG_ON(cache->ro);
+again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
+ /*
+ * we're not allowed to set block groups readonly after the dirty
+ * block groups cache has started writing. If it already started,
+ * back off and let this transaction commit
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (trans->transaction->dirty_bg_run) {
+ u64 transid = trans->transid;
+
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans, root);
+
+ ret = btrfs_wait_for_commit(root, transid);
+ if (ret)
+ return ret;
+ goto again;
+ }
+
+ /*
+ * if we are changing raid levels, try to allocate a corresponding
+ * block group with the new raid level.
+ */
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = do_chunk_alloc(trans, root, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to
+ * carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
+
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8549,8 +8832,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(root, cache->flags);
+ lock_chunks(root->fs_info->chunk_root);
check_system_chunk(trans, root, alloc_flags);
+ unlock_chunks(root->fs_info->chunk_root);
}
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans, root);
return ret;
@@ -8720,7 +9006,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
min_free <<= 1;
} else if (index == BTRFS_RAID_RAID0) {
dev_min = fs_devices->rw_devices;
- do_div(min_free, dev_min);
+ min_free = div64_u64(min_free, dev_min);
}
/* We need to do this so that we can look at pending chunks */
@@ -8992,6 +9278,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->dirty_list);
+ INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@@ -9247,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+ /*
+ * Call to ensure the corresponding space_info object is created and
+ * assigned to our block group, but don't update its counters just yet.
+ * We want our bg to be added to the rbtree with its ->space_info set.
+ */
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ &cache->space_info);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = btrfs_add_block_group_cache(root->fs_info, cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -9254,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return ret;
}
+ /*
+ * Now that our block group has its ->space_info set and is inserted in
+ * the rbtree, update the space info's counters.
+ */
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
if (ret) {
@@ -9355,7 +9659,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * get the inode first so any iput calls done for the io_list
+ * aren't the final iput (no unlinks allowed now)
+ */
inode = lookup_free_space_inode(tree_root, block_group, path);
+
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ /*
+ * make sure our free spache cache IO is done before remove the
+ * free space inode
+ */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
+
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ }
+
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ mutex_unlock(&trans->transaction->cache_write_mutex);
+
if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
if (ret) {
@@ -9448,18 +9783,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->dirty_list)) {
- list_del_init(&block_group->dirty_list);
- btrfs_put_block_group(block_group);
+ WARN_ON(1);
+ }
+ if (!list_empty(&block_group->io_list)) {
+ WARN_ON(1);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
-
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
+
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->key.offset);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->key.offset * factor);
+ }
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
block_group->space_info->disk_total -= block_group->key.offset * factor;
+
spin_unlock(&block_group->space_info->lock);
memcpy(&key, &block_group->key, sizeof(key));
@@ -9574,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
@@ -9647,8 +9995,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+
+ space_info->bytes_pinned -= block_group->pinned;
+ space_info->bytes_readonly += block_group->pinned;
+ percpu_counter_add(&space_info->total_bytes_pinned,
+ -block_group->pinned);
block_group->pinned = 0;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
@@ -9658,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans, root);
next:
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/fs/btrfs/extent-tree.h
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d688cfe5d496..02d05817cbdf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
else
btrfsic_submit_bio(rw, bio);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
bio_put(bio);
return ret;
}
@@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
free_extent_map(em);
em = NULL;
@@ -4514,8 +4519,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
em_len, flags);
- if (ret)
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out_free;
+ }
}
out_free:
free_extent_map(em);
@@ -4557,36 +4565,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
do {
index--;
page = eb->pages[index];
- if (page && mapped) {
+ if (!page)
+ continue;
+ if (mapped)
spin_lock(&page->mapping->private_lock);
+ /*
+ * We do this since we'll remove the pages after we've
+ * removed the eb from the radix tree, so we could race
+ * and have this page now attached to the new eb. So
+ * only clear page_private if it's still connected to
+ * this eb.
+ */
+ if (PagePrivate(page) &&
+ page->private == (unsigned long)eb) {
+ BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(PageDirty(page));
+ BUG_ON(PageWriteback(page));
/*
- * We do this since we'll remove the pages after we've
- * removed the eb from the radix tree, so we could race
- * and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
- * this eb.
+ * We need to make sure we haven't be attached
+ * to a new eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
- BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- ClearPagePrivate(page);
- set_page_private(page, 0);
- /* One for the page private */
- page_cache_release(page);
- }
- spin_unlock(&page->mapping->private_lock);
-
- }
- if (page) {
- /* One for when we alloced the page */
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ /* One for the page private */
page_cache_release(page);
}
+
+ if (mapped)
+ spin_unlock(&page->mapping->private_lock);
+
+ /* One for when we alloced the page */
+ page_cache_release(page);
} while (index != 0);
}
@@ -4768,6 +4777,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
+ /*
+ * Lock our eb's refs_lock to avoid races with
+ * free_extent_buffer. When we get our eb it might be flagged
+ * with EXTENT_BUFFER_STALE and another task running
+ * free_extent_buffer might have seen that flag set,
+ * eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process
+ * of decrementing the extent buffer's reference count twice.
+ * So here we could race and increment the eb's reference count,
+ * clear its stale flag, mark it as dirty and drop our reference
+ * before the other task finishes executing free_extent_buffer,
+ * which would later result in an attempt to free an extent
+ * buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
+ }
mark_extent_buffer_accessed(eb, NULL);
return eb;
}
@@ -4867,6 +4895,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
+ exists = NULL;
/*
* Do this so attach doesn't complain and we need to
@@ -4930,12 +4959,12 @@ again:
return eb;
free_eb:
+ WARN_ON(!atomic_dec_and_test(&eb->refs));
for (i = 0; i < num_pages; i++) {
if (eb->pages[i])
unlock_page(eb->pages[i]);
}
- WARN_ON(!atomic_dec_and_test(&eb->refs));
btrfs_release_extent_buffer(eb);
return exists;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 695b0ccfb755..c668f36898d3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -97,7 +97,7 @@ struct extent_io_tree {
u64 dirty_bytes;
int track_uptodate;
spinlock_t lock;
- struct extent_io_ops *ops;
+ const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..58ece6558430 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
- btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
- GFP_NOFS);
+ btrfs_bio->csum_allocated = kmalloc_array(nblocks,
+ csum_size, GFP_NOFS);
if (!btrfs_bio->csum_allocated) {
btrfs_free_path(path);
return -ENOMEM;
@@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
btrfs_truncate_item(root, path, new_size, 0);
key->offset = end_byte;
- btrfs_set_item_key_safe(root, path, key);
+ btrfs_set_item_key_safe(root->fs_info, path, key);
} else {
BUG();
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 30982bbd31c3..b823fac91c92 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,7 +24,6 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
@@ -32,6 +31,7 @@
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
defrag = rb_entry(node, struct inode_defrag, rb_node);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- if (need_resched()) {
- spin_unlock(&fs_info->defrag_inodes_lock);
- cond_resched();
- spin_lock(&fs_info->defrag_inodes_lock);
- }
+ cond_resched_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
}
@@ -868,7 +864,7 @@ next_slot:
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
extent_offset += end - key.offset;
btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1126,7 +1122,7 @@ again:
ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi,
@@ -1160,7 +1156,7 @@ again:
trans->transid);
path->slots[0]++;
new_key.offset = start;
- btrfs_set_item_key_safe(root, path, &new_key);
+ btrfs_set_item_key_safe(root->fs_info, path, &new_key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
PAGE_CACHE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
- pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+ pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
@@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes);
+ ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
if (ret == -ENOSPC &&
(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC))) {
@@ -1635,8 +1631,8 @@ again:
btrfs_end_write_no_snapshoting(root);
if (only_release_metadata && copied > 0) {
- u64 lockstart = round_down(pos, root->sectorsize);
- u64 lockend = lockstart +
+ lockstart = round_down(pos, root->sectorsize);
+ lockend = lockstart +
(dirty_pages << PAGE_CACHE_SHIFT) - 1;
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -1739,28 +1735,20 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
u64 start_pos;
u64 end_pos;
ssize_t num_written = 0;
- ssize_t err = 0;
- size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
- loff_t pos = iocb->ki_pos;
+ ssize_t err;
+ loff_t pos;
+ size_t count;
mutex_lock(&inode->i_mutex);
-
- current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err) {
+ err = generic_write_checks(iocb, from);
+ if (err <= 0) {
mutex_unlock(&inode->i_mutex);
- goto out;
- }
-
- if (count == 0) {
- mutex_unlock(&inode->i_mutex);
- goto out;
+ return err;
}
- iov_iter_truncate(from, count);
-
- err = file_remove_suid(file);
+ current->backing_dev_info = inode_to_bdi(inode);
+ err = file_remove_privs(file);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1786,6 +1774,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
*/
update_time_for_write(inode);
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
/* Expand hole size to cover write data, preventing empty gap */
@@ -1800,7 +1790,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (sync)
atomic_inc(&BTRFS_I(inode)->sync_writers);
- if (file->f_flags & O_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
num_written = __btrfs_direct_write(iocb, from, pos);
} else {
num_written = __btrfs_buffered_write(file, from, pos);
@@ -1815,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occured.
*/
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ spin_unlock(&BTRFS_I(inode)->lock);
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
if (err < 0)
@@ -1870,12 +1862,13 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
+ const u64 len = end - start + 1;
trace_btrfs_sync_file(file, datasync);
@@ -1904,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* all extents are persisted and the respective file extent
* items are in the fs/subvol btree.
*/
- ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
} else {
/*
* Start any new ordered operations before starting to log the
@@ -1976,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (full_sync && BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed)) {
+ (BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed &&
+ (full_sync ||
+ !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2168,7 +2163,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
u64 num_bytes;
key.offset = offset;
- btrfs_set_item_key_safe(root, path, &key);
+ btrfs_set_item_key_safe(root->fs_info, path, &key);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2551,7 +2546,6 @@ static long btrfs_fallocate(struct file *file, int mode,
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
@@ -2576,14 +2570,9 @@ static long btrfs_fallocate(struct file *file, int mode,
* Make sure we have enough space before we do the
* allocation.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
if (ret)
return ret;
- if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
- if (ret)
- goto out_reserve_fail;
- }
mutex_lock(&inode->i_mutex);
ret = inode_newsize_ok(inode, alloc_end);
@@ -2673,23 +2662,35 @@ static long btrfs_fallocate(struct file *file, int mode,
1 << inode->i_blkbits,
offset + len,
&alloc_hint);
-
- if (ret < 0) {
- free_extent_map(em);
- break;
- }
} else if (actual_end > inode->i_size &&
!(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
/*
* We didn't need to allocate any more space, but we
* still extended the size of the file so we need to
- * update i_size.
+ * update i_size and the inode item.
*/
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end,
+ NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans,
+ root);
+ }
}
free_extent_map(em);
+ if (ret < 0)
+ break;
cur_offset = last_byte;
if (cur_offset >= alloc_end) {
@@ -2701,9 +2702,6 @@ static long btrfs_fallocate(struct file *file, int mode,
&cached_state, GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, alloc_end - alloc_start);
-out_reserve_fail:
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
return ret;
@@ -2806,8 +2804,6 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..fb5a6b1c62a6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_mask(inode->i_mapping) &
+ ~(__GFP_FS | __GFP_HIGHMEM));
return inode;
}
@@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;
-
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
btrfs_release_path(path);
return ret;
}
+
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
@@ -225,9 +226,39 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
struct inode *inode)
{
int ret = 0;
+ struct btrfs_path *path = btrfs_alloc_path();
+ bool locked = false;
+
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ if (block_group) {
+ locked = true;
+ mutex_lock(&trans->transaction->cache_write_mutex);
+ if (!list_empty(&block_group->io_list)) {
+ list_del_init(&block_group->io_list);
+
+ btrfs_wait_cache_io(root, trans, block_group,
+ &block_group->io_ctl, path,
+ block_group->key.objectid);
+ btrfs_put_block_group(block_group);
+ }
+
+ /*
+ * now that we've truncated the cache away, its no longer
+ * setup or written
+ */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ }
+ btrfs_free_path(path);
btrfs_i_size_write(inode, 0);
truncate_pagecache(inode, 0);
@@ -235,15 +266,19 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
/*
* We don't need an orphan item because truncating the free space cache
* will never be split across transactions.
+ * We don't need to check for -EAGAIN because we're a free space
+ * cache inode
*/
ret = btrfs_truncate_inode_items(trans, root, inode,
0, BTRFS_EXTENT_DATA_KEY);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
+ if (ret)
+ goto fail;
ret = btrfs_update_inode(trans, root, inode);
+
+fail:
+ if (locked)
+ mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
btrfs_abort_transaction(trans, root, ret);
@@ -269,18 +304,7 @@ static int readahead_cache(struct inode *inode)
return 0;
}
-struct io_ctl {
- void *cur, *orig;
- struct page *page;
- struct page **pages;
- struct btrfs_root *root;
- unsigned long size;
- int index;
- int num_pages;
- unsigned check_crcs:1;
-};
-
-static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
struct btrfs_root *root, int write)
{
int num_pages;
@@ -296,45 +320,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
(num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
return -ENOSPC;
- memset(io_ctl, 0, sizeof(struct io_ctl));
+ memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
- io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+ io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
io_ctl->num_pages = num_pages;
io_ctl->root = root;
io_ctl->check_crcs = check_crcs;
+ io_ctl->inode = inode;
return 0;
}
-static void io_ctl_free(struct io_ctl *io_ctl)
+static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
kfree(io_ctl->pages);
+ io_ctl->pages = NULL;
}
-static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
if (io_ctl->cur) {
- kunmap(io_ctl->page);
io_ctl->cur = NULL;
io_ctl->orig = NULL;
}
}
-static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
ASSERT(io_ctl->index < io_ctl->num_pages);
io_ctl->page = io_ctl->pages[io_ctl->index++];
- io_ctl->cur = kmap(io_ctl->page);
+ io_ctl->cur = page_address(io_ctl->page);
io_ctl->orig = io_ctl->cur;
io_ctl->size = PAGE_CACHE_SIZE;
if (clear)
memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
}
-static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
int i;
@@ -349,7 +374,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
}
}
-static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int uptodate)
{
struct page *page;
@@ -383,7 +408,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
return 0;
}
-static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
__le64 *val;
@@ -406,7 +431,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
io_ctl->cur += sizeof(u64);
}
-static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
__le64 *gen;
@@ -435,7 +460,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
return 0;
}
-static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp;
u32 crc = ~(u32)0;
@@ -453,13 +478,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
io_ctl_unmap_page(io_ctl);
- tmp = kmap(io_ctl->pages[0]);
+ tmp = page_address(io_ctl->pages[0]);
tmp += index;
*tmp = crc;
- kunmap(io_ctl->pages[0]);
}
-static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
u32 *tmp, val;
u32 crc = ~(u32)0;
@@ -473,10 +497,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
- tmp = kmap(io_ctl->pages[0]);
+ tmp = page_address(io_ctl->pages[0]);
tmp += index;
val = *tmp;
- kunmap(io_ctl->pages[0]);
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
@@ -492,7 +515,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
return 0;
}
-static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
void *bitmap)
{
struct btrfs_free_space_entry *entry;
@@ -522,7 +545,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
return 0;
}
-static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
if (!io_ctl->cur)
return -ENOSPC;
@@ -545,7 +568,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
return 0;
}
-static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
/*
* If we're not on the boundary we know we've modified the page and we
@@ -562,7 +585,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
}
}
-static int io_ctl_read_entry(struct io_ctl *io_ctl,
+static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry, u8 *type)
{
struct btrfs_free_space_entry *e;
@@ -589,7 +612,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
return 0;
}
-static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space *entry)
{
int ret;
@@ -648,7 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct io_ctl io_ctl;
+ struct btrfs_io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
LIST_HEAD(bitmaps);
@@ -877,7 +900,7 @@ out:
}
static noinline_for_stack
-int write_cache_extent_entries(struct io_ctl *io_ctl,
+int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
int *entries, int *bitmaps,
@@ -885,6 +908,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
{
int ret;
struct btrfs_free_cluster *cluster = NULL;
+ struct btrfs_free_cluster *cluster_locked = NULL;
struct rb_node *node = rb_first(&ctl->free_space_offset);
struct btrfs_trim_range *trim_entry;
@@ -896,6 +920,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
}
if (!node && cluster) {
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
node = rb_first(&cluster->root);
cluster = NULL;
}
@@ -919,9 +945,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
node = rb_next(node);
if (!node && cluster) {
node = rb_first(&cluster->root);
+ cluster_locked = cluster;
+ spin_lock(&cluster_locked->lock);
cluster = NULL;
}
}
+ if (cluster_locked) {
+ spin_unlock(&cluster_locked->lock);
+ cluster_locked = NULL;
+ }
/*
* Make sure we don't miss any range that was removed from our rbtree
@@ -939,6 +971,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
return 0;
fail:
+ if (cluster_locked)
+ spin_unlock(&cluster_locked->lock);
return -ENOSPC;
}
@@ -1000,7 +1034,7 @@ fail:
static noinline_for_stack int
write_pinned_extent_entries(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
- struct io_ctl *io_ctl,
+ struct btrfs_io_ctl *io_ctl,
int *entries)
{
u64 start, extent_start, extent_end, len;
@@ -1050,7 +1084,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
}
static noinline_for_stack int
-write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
struct list_head *pos, *n;
int ret;
@@ -1083,10 +1117,7 @@ static int flush_dirty_cache(struct inode *inode)
}
static void noinline_for_stack
-cleanup_write_cache_enospc(struct inode *inode,
- struct io_ctl *io_ctl,
- struct extent_state **cached_state,
- struct list_head *bitmap_list)
+cleanup_bitmap_list(struct list_head *bitmap_list)
{
struct list_head *pos, *n;
@@ -1095,12 +1126,85 @@ cleanup_write_cache_enospc(struct inode *inode,
list_entry(pos, struct btrfs_free_space, list);
list_del_init(&entry->list);
}
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct btrfs_io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, cached_state,
GFP_NOFS);
}
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset)
+{
+ int ret;
+ struct inode *inode = io_ctl->inode;
+
+ if (!inode)
+ return 0;
+
+ if (block_group)
+ root = root->fs_info->tree_root;
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ io_ctl->entries, io_ctl->bitmaps);
+out:
+ io_ctl_free(io_ctl);
+ if (ret) {
+ invalidate_inode_pages2(inode->i_mapping);
+ BTRFS_I(inode)->generation = 0;
+ if (block_group) {
+#ifdef DEBUG
+ btrfs_err(root->fs_info,
+ "failed to write free space cache for block group %llu",
+ block_group->key.objectid);
+#endif
+ }
+ }
+ btrfs_update_inode(trans, root, inode);
+
+ if (block_group) {
+ /* the dirty list is protected by the dirty_bgs_lock */
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+
+ /* the disk_cache_state is protected by the block group lock */
+ spin_lock(&block_group->lock);
+
+ /*
+ * only mark this as written if we didn't get put back on
+ * the dirty list while waiting for IO. Otherwise our
+ * cache state won't be right, and we won't get written again
+ */
+ if (!ret && list_empty(&block_group->dirty_list))
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ else if (ret)
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+
+ spin_unlock(&block_group->lock);
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+ io_ctl->inode = NULL;
+ iput(inode);
+ }
+
+ return ret;
+
+}
+
/**
* __btrfs_write_out_cache - write out cached info to an inode
* @root - the root the inode belongs to
@@ -1112,27 +1216,29 @@ cleanup_write_cache_enospc(struct inode *inode,
*
* This function writes out a free space cache struct to disk for quick recovery
* on mount. This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
+ * or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 offset)
{
struct extent_state *cached_state = NULL;
- struct io_ctl io_ctl;
LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
+ int must_iput = 0;
if (!i_size_read(inode))
- return -1;
+ return -EIO;
- ret = io_ctl_init(&io_ctl, inode, root, 1);
+ WARN_ON(io_ctl->pages);
+ ret = io_ctl_init(io_ctl, inode, root, 1);
if (ret)
- return -1;
+ return ret;
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
down_write(&block_group->data_rwsem);
@@ -1143,55 +1249,59 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
up_write(&block_group->data_rwsem);
BTRFS_I(inode)->generation = 0;
ret = 0;
+ must_iput = 1;
goto out;
}
spin_unlock(&block_group->lock);
}
/* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
+ ret = io_ctl_prepare_pages(io_ctl, inode, 0);
+ if (ret)
+ goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);
- io_ctl_set_generation(&io_ctl, trans->transid);
+ io_ctl_set_generation(io_ctl, trans->transid);
mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
- ret = write_cache_extent_entries(&io_ctl, ctl,
+ spin_lock(&ctl->tree_lock);
+ ret = write_cache_extent_entries(io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
- if (ret) {
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out_nospc;
- }
+ if (ret)
+ goto out_nospc_locked;
/*
* Some spaces that are freed in the current transaction are pinned,
* they will be added into free space cache after the transaction is
* committed, we shouldn't lose them.
+ *
+ * If this changes while we are working we'll get added back to
+ * the dirty list and redo it. No locking needed
*/
- ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
- if (ret) {
- mutex_unlock(&ctl->cache_writeout_mutex);
- goto out_nospc;
- }
+ ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
+ if (ret)
+ goto out_nospc_locked;
/*
* At last, we write out all the bitmaps and keep cache_writeout_mutex
* locked while doing it because a concurrent trim can be manipulating
* or freeing the bitmap.
*/
- ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ ret = write_bitmap_entries(io_ctl, &bitmap_list);
+ spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;
/* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
+ io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
0, i_size_read(inode), &cached_state);
if (ret)
goto out_nospc;
@@ -1202,30 +1312,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* Release the pages and unlock the extent, we will flush
* them out later
*/
- io_ctl_drop_pages(&io_ctl);
+ io_ctl_drop_pages(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
- /* Flush the dirty pages in the cache file. */
- ret = flush_dirty_cache(inode);
+ /*
+ * at this point the pages are under IO and we're happy,
+ * The caller is responsible for waiting on them and updating the
+ * the cache and the inode
+ */
+ io_ctl->entries = entries;
+ io_ctl->bitmaps = bitmaps;
+
+ ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
if (ret)
goto out;
- /* Update the cache item to tell everyone this cache file is valid. */
- ret = update_cache_item(trans, root, inode, path, offset,
- entries, bitmaps);
+ return 0;
+
out:
- io_ctl_free(&io_ctl);
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
+ if (must_iput)
+ iput(inode);
return ret;
+out_nospc_locked:
+ cleanup_bitmap_list(&bitmap_list);
+ spin_unlock(&ctl->tree_lock);
+ mutex_unlock(&ctl->cache_writeout_mutex);
+
out_nospc:
- cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+ cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -1241,7 +1365,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
- enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@@ -1250,34 +1373,34 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
-
- if (block_group->delalloc_bytes) {
- block_group->disk_cache_state = BTRFS_DC_WRITTEN;
- spin_unlock(&block_group->lock);
- return 0;
- }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode))
return 0;
- ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+ &block_group->io_ctl, trans,
path, block_group->key.objectid);
if (ret) {
- dcs = BTRFS_DC_ERROR;
- ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->key.objectid);
#endif
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+
+ block_group->io_ctl.inode = NULL;
+ iput(inode);
}
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = dcs;
- spin_unlock(&block_group->lock);
- iput(inode);
+ /*
+ * if ret == 0 the caller is expected to call btrfs_wait_cache_io
+ * to wait for IO and put the inode
+ */
+
return ret;
}
@@ -1298,11 +1421,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
- u64 bytes_per_bitmap;
+ u32 bytes_per_bitmap;
bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
bitmap_start = offset - ctl->start;
- bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
bitmap_start += ctl->start;
@@ -1521,10 +1644,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
- u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
- int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+ u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
+ u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
- max_bitmaps = max(max_bitmaps, 1);
+ max_bitmaps = max_t(u32, max_bitmaps, 1);
ASSERT(ctl->total_bitmaps <= max_bitmaps);
@@ -1537,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div64_u64(size, 1024 * 1024 * 1024);
+ div_u64(size, 1024 * 1024 * 1024);
/*
* we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1675,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
}
/*
- * we want the extent entry threshold to always be at most 1/2 the maxw
+ * we want the extent entry threshold to always be at most 1/2 the max
* bytes we can have, or whatever is less than that.
*/
extent_bytes = max_bytes - bitmap_bytes;
- extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+ extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
ctl->extents_thresh =
- div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+ div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -1673,7 +1796,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
*/
if (*bytes >= align) {
tmp = entry->offset - ctl->start + align - 1;
- do_div(tmp, align);
+ tmp = div64_u64(tmp, align);
tmp = tmp * align + ctl->start;
align_off = tmp - entry->offset;
} else {
@@ -2402,11 +2525,8 @@ static void __btrfs_remove_free_space_cache_locked(
} else {
free_bitmap(ctl, info);
}
- if (need_resched()) {
- spin_unlock(&ctl->tree_lock);
- cond_resched();
- spin_lock(&ctl->tree_lock);
- }
+
+ cond_resched_lock(&ctl->tree_lock);
}
}
@@ -2431,11 +2551,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
- if (need_resched()) {
- spin_unlock(&ctl->tree_lock);
- cond_resched();
- spin_lock(&ctl->tree_lock);
- }
+
+ cond_resched_lock(&ctl->tree_lock);
}
__btrfs_remove_free_space_cache_locked(ctl);
spin_unlock(&ctl->tree_lock);
@@ -3346,13 +3463,29 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
int ret;
+ struct btrfs_io_ctl io_ctl;
+ bool release_metadata = true;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;
- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+ memset(&io_ctl, 0, sizeof(io_ctl));
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+ trans, path, 0);
+ if (!ret) {
+ /*
+ * At this point writepages() didn't error out, so our metadata
+ * reservation is released when the writeback finishes, at
+ * inode.c:btrfs_finish_ordered_io(), regardless of it finishing
+ * with or without an error.
+ */
+ release_metadata = false;
+ ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
+ }
+
if (ret) {
- btrfs_delalloc_release_metadata(inode, inode->i_size);
+ if (release_metadata)
+ btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238a0aed..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
struct btrfs_free_space *info);
};
+struct btrfs_io_ctl;
+
struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache
*block_group, struct btrfs_path *path);
@@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
+int btrfs_wait_cache_io(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_io_ctl *io_ctl,
+ struct btrfs_path *path, u64 offset);
int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
-
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
struct btrfs_path *path);
int create_free_ino_inode(struct btrfs_root *root,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..d4a582ac3f73 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
struct btrfs_free_space *info;
struct rb_node *n;
u64 count;
@@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
return;
while (1) {
+ bool add_to_ctl = true;
+
+ spin_lock(rbroot_lock);
n = rb_first(rbroot);
- if (!n)
+ if (!n) {
+ spin_unlock(rbroot_lock);
break;
+ }
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->ino_cache_progress)
- goto free;
+ add_to_ctl = false;
else if (info->offset + info->bytes > root->ino_cache_progress)
count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
- __btrfs_add_free_space(ctl, info->offset, count);
-free:
rb_erase(&info->offset_index, rbroot);
- kfree(info);
+ spin_unlock(rbroot_lock);
+ if (add_to_ctl)
+ __btrfs_add_free_space(ctl, info->offset, count);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
}
@@ -456,7 +463,7 @@ again:
}
if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret) {
if (ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d2e732d7af52..e33dff356460 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,6 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
-#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -43,6 +42,7 @@
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -59,6 +59,7 @@
#include "backref.h"
#include "hash.h"
#include "props.h"
+#include "qgroup.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -470,7 +471,7 @@ again:
*/
if (inode_need_compress(inode)) {
WARN_ON(pages);
- pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages) {
/* just bail out to the uncompressed code */
goto cont;
@@ -752,7 +753,6 @@ retry:
}
goto out_free;
}
-
/*
* here we're doing allocation and writeback of the
* compressed pages
@@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
if (empty)
return;
+ down_read(&fs_info->delayed_iput_sem);
+
spin_lock(&fs_info->delayed_iput_lock);
list_splice_init(&fs_info->delayed_iputs, &list);
spin_unlock(&fs_info->delayed_iput_lock);
@@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
iput(delayed->inode);
kfree(delayed);
}
+
+ up_read(&root->fs_info->delayed_iput_sem);
}
/*
@@ -3628,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode->i_version = btrfs_inode_sequence(leaf, inode_item);
+ inode->i_generation = BTRFS_I(inode)->generation;
+ inode->i_rdev = 0;
+ rdev = btrfs_inode_rdev(leaf, inode_item);
+
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
/*
* If we were modified in the current generation and evicted from memory
* and then re-read we need to do a full sync since we don't have any
* idea about which extents were modified before we were evicted from
* cache.
+ *
+ * This is required for both inode re-read from disk and delayed inode
+ * in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- inode->i_version = btrfs_inode_sequence(leaf, inode_item);
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
- rdev = btrfs_inode_rdev(leaf, inode_item);
-
- BTRFS_I(inode)->index_cnt = (u64)-1;
- BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-
-cache_index:
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -4016,16 +4023,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int ret;
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
return PTR_ERR(trans);
- btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+ btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
- ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
if (ret)
goto out;
@@ -4124,7 +4131,7 @@ out:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
@@ -4151,7 +4158,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out;
/* now the directory is empty */
- err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
@@ -4162,6 +4169,21 @@ out:
return err;
}
+static int truncate_space_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytes_deleted)
+{
+ int ret;
+
+ bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+ ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+ bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
+ if (!ret)
+ trans->bytes_reserved += bytes_deleted;
+ return ret;
+
+}
+
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -4187,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 extent_num_bytes = 0;
u64 extent_offset = 0;
u64 item_end = 0;
- u64 last_size = (u64)-1;
+ u64 last_size = new_size;
u32 found_type = (u8)-1;
int found_extent;
int del_item;
@@ -4197,9 +4219,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
u64 ino = btrfs_ino(inode);
+ u64 bytes_deleted = 0;
+ bool be_nice = 0;
+ bool should_throttle = 0;
+ bool should_end = 0;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+ /*
+ * for non-free space inodes and ref cows, we want to back off from
+ * time to time
+ */
+ if (!btrfs_is_free_space_inode(inode) &&
+ test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ be_nice = 1;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4229,6 +4263,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.type = (u8)-1;
search_again:
+ /*
+ * with a 16K leaf size and 128MB extents, you can actually queue
+ * up a huge file in a single leaf. Most of the time that
+ * bytes_deleted is > 0, it will be huge by the time we get here
+ */
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (btrfs_should_end_transaction(trans, root)) {
+ err = -EAGAIN;
+ goto error;
+ }
+ }
+
+
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
@@ -4371,22 +4418,39 @@ delete:
} else {
break;
}
+ should_throttle = 0;
+
if (found_extent &&
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
+ bytes_deleted += extent_num_bytes;
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
ino, extent_offset, 0);
BUG_ON(ret);
+ if (btrfs_should_throttle_delayed_refs(trans, root))
+ btrfs_async_run_delayed_refs(root,
+ trans->delayed_ref_updates * 2, 0);
+ if (be_nice) {
+ if (truncate_space_check(trans, root,
+ extent_num_bytes)) {
+ should_end = 1;
+ }
+ if (btrfs_should_throttle_delayed_refs(trans,
+ root)) {
+ should_throttle = 1;
+ }
+ }
}
if (found_type == BTRFS_INODE_ITEM_KEY)
break;
if (path->slots[0] == 0 ||
- path->slots[0] != pending_del_slot) {
+ path->slots[0] != pending_del_slot ||
+ should_throttle || should_end) {
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path,
pending_del_slot,
@@ -4399,6 +4463,23 @@ delete:
pending_del_nr = 0;
}
btrfs_release_path(path);
+ if (should_throttle) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
+ /*
+ * if we failed to refill our space rsv, bail out
+ * and let the transaction restart
+ */
+ if (should_end) {
+ err = -EAGAIN;
+ goto error;
+ }
goto search_again;
} else {
path->slots[0]--;
@@ -4412,10 +4493,20 @@ out:
btrfs_abort_transaction(trans, root, ret);
}
error:
- if (last_size != (u64)-1 &&
- root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
btrfs_ordered_update_i_size(inode, last_size, NULL);
+
btrfs_free_path(path);
+
+ if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ unsigned long updates = trans->delayed_ref_updates;
+ if (updates) {
+ trans->delayed_ref_updates = 0;
+ ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+ if (ret && !err)
+ err = ret;
+ }
+ }
return err;
}
@@ -4826,7 +4917,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
int err;
@@ -4894,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
write_unlock(&map_tree->lock);
+ /*
+ * Keep looping until we have no more ranges in the io tree.
+ * We can have ongoing bios started by readpages (called from readahead)
+ * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ * still in progress (unlocked the pages in the bio but did not yet
+ * unlocked the ranges in the io tree). Therefore this means some
+ * ranges can still be locked and eviction started because before
+ * submitting those bios, which are executed by a separate task (work
+ * queue kthread), inode references (inode->i_count) were not taken
+ * (which would be dropped in the end io callback of each bio).
+ * Therefore here we effectively end up waiting for those bios and
+ * anyone else holding locked ranges without having bumped the inode's
+ * reference count - if we don't do it, when they access the inode's
+ * io_tree to unlock a range it may be too late, leading to an
+ * use-after-free issue.
+ */
spin_lock(&io_tree->lock);
while (!RB_EMPTY_ROOT(&io_tree->state)) {
struct extent_state *state;
struct extent_state *cached_state = NULL;
+ u64 start;
+ u64 end;
node = rb_first(&io_tree->state);
state = rb_entry(node, struct extent_state, rb_node);
- atomic_inc(&state->refs);
+ start = state->start;
+ end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, state->start, state->end,
- 0, &cached_state);
- clear_extent_bit(io_tree, state->start, state->end,
+ lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 1,
&cached_state, GFP_NOFS);
- free_extent_state(state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -4924,6 +5032,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
+ int steal_from_global = 0;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
int ret;
@@ -4991,9 +5100,20 @@ void btrfs_evict_inode(struct inode *inode)
* hard as possible to get this to work.
*/
if (ret)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
+ steal_from_global++;
+ else
+ steal_from_global = 0;
+ ret = 0;
- if (ret) {
+ /*
+ * steal_from_global == 0: we reserved stuff, hooray!
+ * steal_from_global == 1: we didn't reserve stuff, boo!
+ * steal_from_global == 2: we've committed, still not a lot of
+ * room but maybe we'll have room in the global reserve this
+ * time.
+ * steal_from_global == 3: abandon all hope!
+ */
+ if (steal_from_global > 2) {
btrfs_warn(root->fs_info,
"Could not get space for a delete, will truncate on mount %d",
ret);
@@ -5009,10 +5129,40 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
+ /*
+ * We can't just steal from the global reserve, we need tomake
+ * sure there is room to do it, if not we need to commit and try
+ * again.
+ */
+ if (steal_from_global) {
+ if (!btrfs_check_space_for_delayed_refs(trans, root))
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv,
+ min_size);
+ else
+ ret = -ENOSPC;
+ }
+
+ /*
+ * Couldn't steal from the global reserve, we have too much
+ * pending stuff built up, commit the transaction and try it
+ * again.
+ */
+ if (ret) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret) {
+ btrfs_orphan_del(NULL, inode);
+ btrfs_free_block_rsv(root, rsv);
+ goto no_delete;
+ }
+ continue;
+ } else {
+ steal_from_global = 0;
+ }
+
trans->block_rsv = rsv;
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
- if (ret != -ENOSPC)
+ if (ret != -ENOSPC && ret != -EAGAIN)
break;
trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -5416,10 +5566,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
static int btrfs_dentry_delete(const struct dentry *dentry)
{
struct btrfs_root *root;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (!inode && !IS_ROOT(dentry))
- inode = dentry->d_parent->d_inode;
+ inode = d_inode(dentry->d_parent);
if (inode) {
root = BTRFS_I(inode)->root;
@@ -6226,7 +6376,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
u64 index;
int err;
int drop_inode = 0;
@@ -7396,6 +7546,7 @@ unlock:
current->journal_info = outstanding_extents;
btrfs_free_reserved_data_space(inode, len);
+ set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
}
/*
@@ -7721,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
struct bio *dio_bio;
int ret;
- if (err)
- goto out_done;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
@@ -7745,7 +7894,6 @@ out_test:
ordered = NULL;
goto again;
}
-out_done:
dio_bio = dip->dio_bio;
kfree(dip);
@@ -8013,9 +8161,8 @@ out_err:
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
struct inode *inode, loff_t file_offset)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_dio_private *dip;
- struct bio *io_bio;
+ struct btrfs_dio_private *dip = NULL;
+ struct bio *io_bio = NULL;
struct btrfs_io_bio *btrfs_bio;
int skip_sum;
int write = rw & REQ_WRITE;
@@ -8032,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip = kzalloc(sizeof(*dip), GFP_NOFS);
if (!dip) {
ret = -ENOMEM;
- goto free_io_bio;
+ goto free_ordered;
}
dip->private = dio_bio->bi_private;
@@ -8060,28 +8207,58 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
if (btrfs_bio->end_io)
btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
- bio_put(io_bio);
free_ordered:
/*
- * If this is a write, we need to clean up the reserved space and kill
- * the ordered extent.
+ * If we arrived here it means either we failed to submit the dip
+ * or we either failed to clone the dio_bio or failed to allocate the
+ * dip. If we cloned the dio_bio and allocated the dip, we can just
+ * call bio_endio against our io_bio so that we get proper resource
+ * cleanup if we fail to submit the dip, otherwise, we must do the
+ * same as btrfs_endio_direct_[write|read] because we can't call these
+ * callbacks - they require an allocated dip and a clone of dio_bio.
*/
- if (write) {
- struct btrfs_ordered_extent *ordered;
- ordered = btrfs_lookup_ordered_extent(inode, file_offset);
- if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
- !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
- btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len, 1);
- btrfs_put_ordered_extent(ordered);
- btrfs_put_ordered_extent(ordered);
+ if (io_bio && dip) {
+ bio_endio(io_bio, ret);
+ /*
+ * The end io callbacks free our dip, do the final put on io_bio
+ * and all the cleanup and final put for dio_bio (through
+ * dio_end_io()).
+ */
+ dip = NULL;
+ io_bio = NULL;
+ } else {
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_extent(inode,
+ file_offset);
+ set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+ /*
+ * Decrements our ref on the ordered extent and removes
+ * the ordered extent from the inode's ordered tree,
+ * doing all the proper resource cleanup such as for the
+ * reserved space and waking up any waiters for this
+ * ordered extent (through btrfs_remove_ordered_extent).
+ */
+ btrfs_finish_ordered_io(ordered);
+ } else {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+ /*
+ * Releases and cleans up our dio_bio, no need to bio_put()
+ * nor bio_endio()/bio_io_error() against dio_bio.
+ */
+ dio_end_io(dio_bio, ret);
}
- bio_endio(dio_bio, ret);
+ if (io_bio)
+ bio_put(io_bio);
+ kfree(dip);
}
-static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
const struct iov_iter *iter, loff_t offset)
{
int seg;
@@ -8096,7 +8273,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
goto out;
/* If this is a write we don't need to check anymore */
- if (rw & WRITE)
+ if (iov_iter_rw(iter) == WRITE)
return 0;
/*
* Check to make sure we don't have duplicate iov_base's in this
@@ -8114,8 +8291,8 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -8126,10 +8303,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
+ if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
return 0;
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
smp_mb__after_atomic();
/*
@@ -8144,7 +8321,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
filemap_fdatawrite_range(inode->i_mapping, offset,
offset + count - 1);
- if (rw & WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
* the isize, but it is protected by i_mutex. So we can
@@ -8169,26 +8346,35 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
current->journal_info = &outstanding_extents;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
- inode_dio_done(inode);
+ inode_dio_end(inode);
flags = DIO_LOCKING | DIO_SKIP_HOLES;
wakeup = false;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iter, offset, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (rw & WRITE) {
+ ret = __blockdev_direct_IO(iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iter, offset, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, flags);
+ if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED)
- btrfs_delalloc_release_space(inode, count);
- else if (ret >= 0 && (size_t)ret < count)
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ /*
+ * If the error comes from submitting stage,
+ * btrfs_get_blocsk_direct() has free'd data space,
+ * and metadata space will be handled by
+ * finish_ordered_fn, don't do that again to make
+ * sure bytes_may_use is correct.
+ */
+ if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
+ &BTRFS_I(inode)->runtime_flags))
+ btrfs_delalloc_release_space(inode, count);
+ } else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
}
out:
if (wakeup)
- inode_dio_done(inode);
+ inode_dio_end(inode);
if (relock)
mutex_lock(&inode->i_mutex);
@@ -8581,7 +8767,7 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -ENOSPC) {
+ if (ret != -ENOSPC && ret != -EAGAIN) {
err = ret;
break;
}
@@ -8875,7 +9061,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
u64 delalloc_bytes;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
u32 blocksize = inode->i_sb->s_blocksize;
generic_fillattr(inode, stat);
@@ -8896,8 +9082,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
- struct inode *new_inode = new_dentry->d_inode;
- struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *old_inode = d_inode(old_dentry);
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
u64 root_objectid;
@@ -9009,7 +9195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_name.len);
} else {
ret = __btrfs_unlink_inode(trans, root, old_dir,
- old_dentry->d_inode,
+ d_inode(old_dentry),
old_dentry->d_name.name,
old_dentry->d_name.len);
if (!ret)
@@ -9033,12 +9219,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, new_dir,
- new_dentry->d_inode,
+ d_inode(new_dentry),
new_dentry->d_name.name,
new_dentry->d_name.len);
}
if (!ret && new_inode->i_nlink == 0)
- ret = btrfs_orphan_add(trans, new_dentry->d_inode);
+ ret = btrfs_orphan_add(trans, d_inode(new_dentry));
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_fail;
@@ -9451,6 +9637,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans, root);
break;
}
+
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74609b931ba5..0770c91586ca 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+ u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+ int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -456,6 +457,13 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
return ret;
+ /*
+ * Don't create subvolume whose level is not zero. Or qgroup will be
+ * screwed up since it assume subvolme qgroup's level to be 0.
+ */
+ if (btrfs_qgroup_level(objectid))
+ return -ENOSPC;
+
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* The same as the snapshot creation, please see the comment
@@ -546,8 +554,8 @@ static noinline int create_subvol(struct inode *dir,
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
- btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
ret = PTR_ERR(new_root);
+ btrfs_abort_transaction(trans, root, ret);
goto fail;
}
@@ -717,7 +725,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
- inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+ inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
@@ -761,10 +769,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
int error;
- if (!victim->d_inode)
+ if (d_really_is_negative(victim))
return -ENOENT;
- BUG_ON(victim->d_parent->d_inode != dir);
+ BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -772,8 +780,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
- IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+ if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
+ IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
@@ -792,7 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
- if (child->d_inode)
+ if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
@@ -810,7 +818,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = parent->dentry->d_inode;
+ struct inode *dir = d_inode(parent->dentry);
struct dentry *dentry;
int error;
@@ -824,7 +832,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_unlock;
error = -EEXIST;
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
goto out_dput;
error = btrfs_may_create(dir, dentry);
@@ -1311,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index + 1;
+ max_to_defrag = last_index - i + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1361,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index = max(i, ra_index);
btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
cluster);
- ra_index += max_cluster;
+ ra_index += cluster;
}
mutex_lock(&inode->i_mutex);
@@ -1564,7 +1572,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
goto out_free;
}
- do_div(new_size, root->sectorsize);
+ new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
@@ -2264,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
{
struct btrfs_ioctl_ino_lookup_args *args;
struct inode *inode;
- int ret;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
@@ -2275,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
inode = file_inode(file);
+ /*
+ * Unprivileged query to obtain the containing subvolume root id. The
+ * path is reset so it's consistent with btrfs_search_path_in_tree.
+ */
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+ args->name[0] = 0;
+ goto out;
+ }
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
+ }
+
ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
args->treeid, args->objectid,
args->name);
+out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
@@ -2294,7 +2314,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
{
struct dentry *parent = file->f_path.dentry;
struct dentry *dentry;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
@@ -2333,12 +2353,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_unlock_dir;
}
- if (!dentry->d_inode) {
+ if (d_really_is_negative(dentry)) {
err = -ENOENT;
goto out_dput;
}
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
dest = BTRFS_I(inode)->root;
if (!capable(CAP_SYS_ADMIN)) {
/*
@@ -2403,11 +2423,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
"Attempt to delete subvolume %llu during send",
dest->root_key.objectid);
err = -EPERM;
- goto out_dput;
+ goto out_unlock_inode;
}
- d_invalidate(dentry);
-
down_write(&root->fs_info->subvol_sem);
err = may_destroy_subvol(dest);
@@ -2498,9 +2516,10 @@ out_up_write:
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
}
+out_unlock_inode:
mutex_unlock(&inode->i_mutex);
if (!err) {
- shrink_dcache_sb(root->fs_info->sb);
+ d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
ASSERT(dest->send_in_progress == 0);
@@ -2747,14 +2766,11 @@ out:
return ret;
}
-static struct page *extent_same_get_page(struct inode *inode, u64 off)
+static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- pgoff_t index;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
- index = off >> PAGE_CACHE_SHIFT;
-
page = grab_cache_page(inode->i_mapping, index);
if (!page)
return NULL;
@@ -2775,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off)
return page;
}
+static int gather_extent_pages(struct inode *inode, struct page **pages,
+ int num_pages, u64 off)
+{
+ int i;
+ pgoff_t index = off >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = extent_same_get_page(inode, index + i);
+ if (!pages[i])
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
{
/* do any pending delalloc/csum calc on src, one way or
@@ -2800,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
}
}
-static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
- unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
-
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
}
-static void btrfs_double_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ if (inode1 != inode2)
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+}
+
+static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
+{
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+ unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+}
+
+static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len)
{
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
-
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
lock_extent_range(inode1, loff1, len);
- if (inode1 != inode2) {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ if (inode1 != inode2)
lock_extent_range(inode2, loff2, len);
+}
+
+struct cmp_pages {
+ int num_pages;
+ struct page **src_pages;
+ struct page **dst_pages;
+};
+
+static void btrfs_cmp_data_free(struct cmp_pages *cmp)
+{
+ int i;
+ struct page *pg;
+
+ for (i = 0; i < cmp->num_pages; i++) {
+ pg = cmp->src_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ pg = cmp->dst_pages[i];
+ if (pg)
+ page_cache_release(pg);
+ }
+ kfree(cmp->src_pages);
+ kfree(cmp->dst_pages);
+}
+
+static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
+ struct inode *dst, u64 dst_loff,
+ u64 len, struct cmp_pages *cmp)
+{
+ int ret;
+ int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ struct page **src_pgarr, **dst_pgarr;
+
+ /*
+ * We must gather up all the pages before we initiate our
+ * extent locking. We use an array for the page pointers. Size
+ * of the array is bounded by len, which is in turn bounded by
+ * BTRFS_MAX_DEDUPE_LEN.
+ */
+ src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ if (!src_pgarr || !dst_pgarr) {
+ kfree(src_pgarr);
+ kfree(dst_pgarr);
+ return -ENOMEM;
}
+ cmp->num_pages = num_pages;
+ cmp->src_pages = src_pgarr;
+ cmp->dst_pages = dst_pgarr;
+
+ ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff);
+ if (ret)
+ goto out;
+
+ ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff);
+
+out:
+ if (ret)
+ btrfs_cmp_data_free(cmp);
+ return 0;
}
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
- u64 dst_loff, u64 len)
+ u64 dst_loff, u64 len, struct cmp_pages *cmp)
{
int ret = 0;
+ int i;
struct page *src_page, *dst_page;
unsigned int cmp_len = PAGE_CACHE_SIZE;
void *addr, *dst_addr;
+ i = 0;
while (len) {
if (len < PAGE_CACHE_SIZE)
cmp_len = len;
- src_page = extent_same_get_page(src, loff);
- if (!src_page)
- return -EINVAL;
- dst_page = extent_same_get_page(dst, dst_loff);
- if (!dst_page) {
- page_cache_release(src_page);
- return -EINVAL;
- }
+ BUG_ON(i >= cmp->num_pages);
+
+ src_page = cmp->src_pages[i];
+ dst_page = cmp->dst_pages[i];
+
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -2857,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
- page_cache_release(src_page);
- page_cache_release(dst_page);
if (ret)
break;
- loff += cmp_len;
- dst_loff += cmp_len;
len -= cmp_len;
+ i++;
}
return ret;
}
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+ u64 olen)
{
+ u64 len = *plen;
u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
- if (off + len > inode->i_size || off + len < off)
+ if (off + olen > inode->i_size || off + olen < off)
return -EINVAL;
+
+ /* if we extend to eof, continue to block boundary */
+ if (off + len == inode->i_size)
+ *plen = len = ALIGN(inode->i_size, bs) - off;
+
/* Check that we are block aligned - btrfs_clone() requires this */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
return -EINVAL;
@@ -2884,28 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
return 0;
}
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
+ u64 len = olen;
+ struct cmp_pages cmp;
+ int same_inode = 0;
+ u64 same_lock_start = 0;
+ u64 same_lock_len = 0;
- /*
- * btrfs_clone() can't handle extents in the same file
- * yet. Once that works, we can drop this check and replace it
- * with a check for the same inode, but overlapping extents.
- */
if (src == dst)
- return -EINVAL;
+ same_inode = 1;
+
+ if (len == 0)
+ return 0;
- btrfs_double_lock(src, loff, dst, dst_loff, len);
+ if (same_inode) {
+ mutex_lock(&src->i_mutex);
- ret = extent_same_check_offsets(src, loff, len);
- if (ret)
- goto out_unlock;
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
- ret = extent_same_check_offsets(dst, dst_loff, len);
- if (ret)
- goto out_unlock;
+ /*
+ * Single inode case wants the same checks, except we
+ * don't want our length pushed out past i_size as
+ * comparing that data range makes no sense.
+ *
+ * extent_same_check_offsets() will do this for an
+ * unaligned length at i_size, so catch it here and
+ * reject the request.
+ *
+ * This effectively means we require aligned extents
+ * for the single-inode case, whereas the other cases
+ * allow an unaligned length so long as it ends at
+ * i_size.
+ */
+ if (len != olen) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Check for overlapping ranges */
+ if (dst_loff + len > loff && dst_loff < loff + len) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ same_lock_start = min_t(u64, loff, dst_loff);
+ same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+ } else {
+ btrfs_double_inode_lock(src, dst);
+
+ ret = extent_same_check_offsets(src, loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+
+ ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
+ if (ret)
+ goto out_unlock;
+ }
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
@@ -2914,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
goto out_unlock;
}
- ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+ ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
+ if (ret)
+ goto out_unlock;
+
+ if (same_inode)
+ lock_extent_range(src, same_lock_start, same_lock_len);
+ else
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+
+ /* pass original length for comparison so we stay within i_size */
+ ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
if (ret == 0)
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+ ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
+
+ if (same_inode)
+ unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
+ same_lock_start + same_lock_len - 1);
+ else
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+ btrfs_cmp_data_free(&cmp);
out_unlock:
- btrfs_double_unlock(src, loff, dst, dst_loff, len);
+ if (same_inode)
+ mutex_unlock(&src->i_mutex);
+ else
+ btrfs_double_inode_unlock(src, dst);
return ret;
}
@@ -2929,7 +3090,7 @@ out_unlock:
static long btrfs_ioctl_file_extent_same(struct file *file,
struct btrfs_ioctl_same_args __user *argp)
{
- struct btrfs_ioctl_same_args *same;
+ struct btrfs_ioctl_same_args *same = NULL;
struct btrfs_ioctl_same_extent_info *info;
struct inode *src = file_inode(file);
u64 off;
@@ -2959,6 +3120,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
if (IS_ERR(same)) {
ret = PTR_ERR(same);
+ same = NULL;
goto out;
}
@@ -3029,6 +3191,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
out:
mnt_drop_write_file(file);
+ kfree(same);
return ret;
}
@@ -3039,7 +3202,7 @@ out:
static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 disko)
{
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
struct ulist *roots;
struct ulist_iterator uiter;
struct ulist_node *root_node = NULL;
@@ -3071,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
const u64 destoff,
- const u64 olen)
+ const u64 olen,
+ int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (!no_time_update)
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -3162,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode,
* @inode: Inode to clone to
* @off: Offset within source to start clone from
* @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- * identical values here
+ * @olen_aligned: Block-aligned value of olen
* @destoff: Offset within @inode to start clone
+ * @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
- const u64 destoff)
+ const u64 destoff, int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -3202,6 +3367,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
+ u64 next_key_min_offset = key.offset + 1;
+
/*
* note the key will change type as we walk through the
* tree.
@@ -3282,7 +3449,7 @@ process_slot:
} else if (key.offset >= off + len) {
break;
}
-
+ next_key_min_offset = key.offset + datal;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf,
btrfs_item_ptr_offset(leaf, slot),
@@ -3421,6 +3588,20 @@ process_slot:
u64 trim = 0;
u64 aligned_end = 0;
+ /*
+ * Don't copy an inline extent into an offset
+ * greater than zero. Having an inline extent
+ * at such an offset results in chaos as btrfs
+ * isn't prepared for such cases. Just skip
+ * this case for the same reasons as commented
+ * at btrfs_ioctl_clone().
+ */
+ if (last_dest_end > 0) {
+ ret = -EOPNOTSUPP;
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
@@ -3490,14 +3671,15 @@ process_slot:
root->sectorsize);
ret = clone_finish_inode_update(trans, inode,
last_dest_end,
- destoff, olen);
+ destoff, olen,
+ no_time_update);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
break;
}
btrfs_release_path(path);
- key.offset++;
+ key.offset = next_key_min_offset;
}
ret = 0;
@@ -3528,7 +3710,7 @@ process_slot:
clone_update_extent_map(inode, trans, NULL, last_dest_end,
destoff + len - last_dest_end);
ret = clone_finish_inode_update(trans, inode, destoff + len,
- destoff, olen);
+ destoff, olen, no_time_update);
}
out:
@@ -3626,6 +3808,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
+ if (len == 0) {
+ ret = 0;
+ goto out_unlock;
+ }
+
/* verify the end result is block aligned */
if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
!IS_ALIGNED(destoff, bs))
@@ -3660,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
lock_extent_range(inode, destoff, len);
}
- ret = btrfs_clone(src, inode, off, olen, len, destoff);
+ ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
if (same_inode) {
u64 lock_start = min_t(u64, off, destoff);
@@ -4624,6 +4811,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
sa->src, sa->dst);
}
+ /* update qgroup status and info */
+ err = btrfs_run_qgroups(trans, root->fs_info);
+ if (err < 0)
+ btrfs_error(root->fs_info, ret,
+ "failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
ret = err;
@@ -4669,8 +4861,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
/* FIXME: check if the IDs really exist */
if (sa->create) {
- ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
- NULL);
+ ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
} else {
ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
}
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 617553cdb7d3..a2f051347731 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -434,7 +434,7 @@ out:
return ret;
}
-struct btrfs_compress_op btrfs_lzo_compress = {
+const struct btrfs_compress_op btrfs_lzo_compress = {
.alloc_workspace = lzo_alloc_workspace,
.free_workspace = lzo_free_workspace,
.compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
if (factor == 10)
return num;
num *= factor;
- do_div(num, 10);
- return num;
+ return div_u64(num, 10);
}
static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
if (factor == 100)
return num;
num *= factor;
- do_div(num, 100);
- return num;
+ return div_u64(num, 100);
}
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 157cc54fc634..52170cf1757e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->file_offset = file_offset;
entry->start = start;
entry->len = len;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
- !(type == BTRFS_ORDERED_NOCOW))
- entry->csum_bytes_left = disk_len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
list_add_tail(&sum->list, &entry->list);
- WARN_ON(entry->csum_bytes_left < sum->len);
- entry->csum_bytes_left -= sum->len;
- if (entry->csum_bytes_left == 0)
- wake_up(&entry->wait);
spin_unlock_irq(&tree->lock);
}
@@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ /*
+ * If our ordered extent completed it means it updated the
+ * fs/subvol and csum trees already, so no need to make the
+ * current transaction's commit wait for it, as we end up
+ * holding memory unnecessarily and delaying the inode's iput
+ * until the transaction commit (we schedule an iput for the
+ * inode when the ordered extent's refcount drops to 0), which
+ * prevents it from being evictable until the transaction
+ * commits.
+ */
+ if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+ btrfs_put_ordered_extent(ordered);
+ else
+ list_add_tail(&ordered->trans_list, &trans->ordered);
+
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (atomic_dec_and_test(&entry->refs)) {
+ ASSERT(list_empty(&entry->log_list));
+ ASSERT(list_empty(&entry->trans_list));
+ ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
while (!list_empty(&entry->list)) {
@@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
+ RB_CLEAR_NODE(node);
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
@@ -722,6 +734,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
int ret = 0;
+ int ret_wb = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
@@ -741,9 +754,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
if (ret)
return ret;
- ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
- if (ret)
- return ret;
+ /*
+ * If we have a writeback error don't return immediately. Wait first
+ * for any ordered extents that haven't completed yet. This is to make
+ * sure no one can dirty the same page ranges and call writepages()
+ * before the ordered extents complete - to avoid failures (-EEXIST)
+ * when adding the new ordered extents to the ordered tree.
+ */
+ ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
while (1) {
@@ -767,7 +785,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
break;
end--;
}
- return ret;
+ return ret_wb ? ret_wb : ret;
}
/*
@@ -838,6 +856,20 @@ out:
return entry;
}
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_extent *oe;
+
+ oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+ if (oe) {
+ btrfs_put_ordered_extent(oe);
+ return true;
+ }
+ return false;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd805..7176cc0fe43f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
/* number of bytes that still need writing */
u64 bytes_left;
- /* number of bytes that still need csumming */
- u64 csum_bytes_left;
-
/*
* the end of the ordered extent which is behind it but
* didn't update disk_i_size. Please see the comment of
@@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
u64 file_offset,
u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 129b1dd28527..dca137b04095 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode)
return NULL;
}
+
+
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 058c79eecbfb..8a8202956576 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
#include "extent_io.h"
#include "qgroup.h"
+
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
* - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
+ * Refer to qgroup_shared_accouting() for details.
*/
u64 old_refcnt;
u64 new_refcnt;
};
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq;
+ qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+ int mod)
+{
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq;
+ qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->old_refcnt < seq)
+ return 0;
+ return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+ if (qg->new_refcnt < seq)
+ return 0;
+ return qg->new_refcnt - seq;
+}
+
/*
* glue structure to represent the relations between qgroups.
*/
@@ -644,9 +676,8 @@ out:
}
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 qgroupid,
- u64 flags, u64 max_rfer, u64 max_excl,
- u64 rsv_rfer, u64 rsv_excl)
+ struct btrfs_root *root,
+ struct btrfs_qgroup *qgroup)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -657,7 +688,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
key.objectid = 0;
key.type = BTRFS_QGROUP_LIMIT_KEY;
- key.offset = qgroupid;
+ key.offset = qgroup->qgroupid;
path = btrfs_alloc_path();
if (!path)
@@ -673,11 +704,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
- btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
- btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
- btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
- btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer);
- btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl);
+ btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
+ btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
+ btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
+ btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
+ btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
btrfs_mark_buffer_dirty(l);
@@ -967,6 +998,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
fs_info->pending_quota_state = 0;
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -982,7 +1014,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
list_del(&quota_root->dirty_list);
btrfs_tree_lock(quota_root->node);
- clean_tree_block(trans, tree_root, quota_root->node);
+ clean_tree_block(trans, tree_root->fs_info, quota_root->node);
btrfs_tree_unlock(quota_root->node);
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
@@ -1001,6 +1033,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ *
+ * Caller should hold fs_info->qgroup_lock.
+ */
+static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 ref_root,
+ u64 num_bytes, int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * num_bytes;
+ qgroup->rfer_cmpr += sign * num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < num_bytes);
+ qgroup->excl += sign * num_bytes;
+ if (sign > 0)
+ qgroup->reserved -= num_bytes;
+ qgroup->excl_cmpr += sign * num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+
+/*
+ * Quick path for updating qgroup with only excl refs.
+ *
+ * In that case, just update all parent will be enough.
+ * Or we needs to do a full rescan.
+ * Caller should also hold fs_info->qgroup_lock.
+ *
+ * Return 0 for quick update, return >0 for need to full rescan
+ * and mark INCONSISTENT flag.
+ * Return < 0 for other error.
+ */
+static int quick_update_accounting(struct btrfs_fs_info *fs_info,
+ struct ulist *tmp, u64 src, u64 dst,
+ int sign)
+{
+ struct btrfs_qgroup *qgroup;
+ int ret = 1;
+ int err = 0;
+
+ qgroup = find_qgroup_rb(fs_info, src);
+ if (!qgroup)
+ goto out;
+ if (qgroup->excl == qgroup->rfer) {
+ ret = 0;
+ err = __qgroup_excl_accounting(fs_info, tmp, dst,
+ qgroup->excl, sign);
+ if (err < 0) {
+ ret = err;
+ goto out;
+ }
+ }
+out:
+ if (ret)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
@@ -1008,8 +1144,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
int ret = 0;
+ /* Check the level of src and dst first */
+ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
+ return -EINVAL;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -1043,23 +1188,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->qgroup_lock);
ret = add_relation_rb(quota_root->fs_info, src, dst);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ goto out;
+ }
+ ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ ulist_free(tmp);
return ret;
}
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+int __del_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
+ struct ulist *tmp;
int ret = 0;
int err;
- mutex_lock(&fs_info->qgroup_ioctl_lock);
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
quota_root = fs_info->quota_root;
if (!quota_root) {
ret = -EINVAL;
@@ -1088,14 +1243,27 @@ exist:
spin_lock(&fs_info->qgroup_lock);
del_relation_rb(fs_info, src, dst);
+ ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
spin_unlock(&fs_info->qgroup_lock);
out:
+ ulist_free(tmp);
+ return ret;
+}
+
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ ret = __del_qgroup_relation(trans, fs_info, src, dst);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
return ret;
}
int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid, char *name)
+ struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -1133,6 +1301,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
+ struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1147,15 +1316,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
} else {
- /* check if there are no relations to this qgroup */
- if (!list_empty(&qgroup->groups) ||
- !list_empty(&qgroup->members)) {
+ /* check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
goto out;
}
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
+ while (!list_empty(&qgroup->groups)) {
+ list = list_first_entry(&qgroup->groups,
+ struct btrfs_qgroup_list, next_group);
+ ret = __del_qgroup_relation(trans, fs_info,
+ qgroupid,
+ list->group->qgroupid);
+ if (ret)
+ goto out;
+ }
+
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
@@ -1171,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
+ /* Sometimes we would want to clear the limit on this qgroup.
+ * To meet this requirement, we treat the -1 as a special value
+ * which tell kernel to clear the limit on this qgroup.
+ */
+ const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
@@ -1184,307 +1367,140 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
}
- ret = update_qgroup_limit_item(trans, quota_root, qgroupid,
- limit->flags, limit->max_rfer,
- limit->max_excl, limit->rsv_rfer,
- limit->rsv_excl);
- if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_info(fs_info, "unable to update quota limit for %llu",
- qgroupid);
- }
spin_lock(&fs_info->qgroup_lock);
- qgroup->lim_flags = limit->flags;
- qgroup->max_rfer = limit->max_rfer;
- qgroup->max_excl = limit->max_excl;
- qgroup->rsv_rfer = limit->rsv_rfer;
- qgroup->rsv_excl = limit->rsv_excl;
- spin_unlock(&fs_info->qgroup_lock);
-out:
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
- return ret;
-}
-
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
-{
- /*
- * Ignore seq and type here, we're looking for any operation
- * at all related to this extent on that root.
- */
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- return 0;
-}
-
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct rb_node *n;
- struct btrfs_qgroup_operation *cur;
- int cmp;
-
- spin_lock(&fs_info->qgroup_op_lock);
- n = fs_info->qgroup_op_tree.rb_node;
- while (n) {
- cur = rb_entry(n, struct btrfs_qgroup_operation, n);
- cmp = comp_oper_exist(cur, oper);
- if (cmp < 0) {
- n = n->rb_right;
- } else if (cmp) {
- n = n->rb_left;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+ if (limit->max_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+ qgroup->max_rfer = 0;
} else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
+ qgroup->max_rfer = limit->max_rfer;
}
}
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
- struct btrfs_qgroup_operation *oper2)
-{
- if (oper1->bytenr < oper2->bytenr)
- return -1;
- if (oper1->bytenr > oper2->bytenr)
- return 1;
- if (oper1->seq < oper2->seq)
- return -1;
- if (oper1->seq > oper2->seq)
- return 1;
- if (oper1->ref_root < oper2->ref_root)
- return -1;
- if (oper1->ref_root > oper2->ref_root)
- return 1;
- if (oper1->type < oper2->type)
- return -1;
- if (oper1->type > oper2->type)
- return 1;
- return 0;
-}
-
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct rb_node **p;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup_operation *cur;
- int cmp;
-
- spin_lock(&fs_info->qgroup_op_lock);
- p = &fs_info->qgroup_op_tree.rb_node;
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
- cmp = comp_oper(cur, oper);
- if (cmp < 0) {
- p = &(*p)->rb_right;
- } else if (cmp) {
- p = &(*p)->rb_left;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+ if (limit->max_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+ qgroup->max_excl = 0;
} else {
- spin_unlock(&fs_info->qgroup_op_lock);
- return -EEXIST;
+ qgroup->max_excl = limit->max_excl;
}
}
- rb_link_node(&oper->n, parent, p);
- rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point. If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
- u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type, int mod_seq)
-{
- struct btrfs_qgroup_operation *oper;
- int ret;
-
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- return 0;
-
- oper = kmalloc(sizeof(*oper), GFP_NOFS);
- if (!oper)
- return -ENOMEM;
-
- oper->ref_root = ref_root;
- oper->bytenr = bytenr;
- oper->num_bytes = num_bytes;
- oper->type = type;
- oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
- INIT_LIST_HEAD(&oper->elem.list);
- oper->elem.seq = 0;
-
- trace_btrfs_qgroup_record_ref(oper);
-
- if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
- /*
- * If any operation for this bytenr/ref_root combo
- * exists, then we know it's not exclusively owned and
- * shouldn't be queued up.
- *
- * This also catches the case where we have a cloned
- * extent that gets queued up multiple times during
- * drop snapshot.
- */
- if (qgroup_oper_exists(fs_info, oper)) {
- kfree(oper);
- return 0;
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+ if (limit->rsv_rfer == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+ qgroup->rsv_rfer = 0;
+ } else {
+ qgroup->rsv_rfer = limit->rsv_rfer;
+ }
+ }
+ if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+ if (limit->rsv_excl == CLEAR_VALUE) {
+ qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+ qgroup->rsv_excl = 0;
+ } else {
+ qgroup->rsv_excl = limit->rsv_excl;
}
}
+ qgroup->lim_flags |= limit->flags;
+
+ spin_unlock(&fs_info->qgroup_lock);
- ret = insert_qgroup_oper(fs_info, oper);
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
if (ret) {
- /* Shouldn't happen so have an assert for developers */
- ASSERT(0);
- kfree(oper);
- return ret;
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ qgroupid);
}
- list_add_tail(&oper->list, &trans->qgroup_ref_list);
- if (mod_seq)
- btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
- return 0;
+out:
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ return ret;
}
-/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
- * exclusive counts adjusted.
- */
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
- struct btrfs_qgroup *qgroup;
- struct ulist *tmp;
- struct btrfs_qgroup_list *glist;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int sign = 0;
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
int ret = 0;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
- spin_lock(&fs_info->qgroup_lock);
- if (!fs_info->quota_root)
- goto out;
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- sign = 1;
- break;
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- sign = -1;
- break;
- default:
- ASSERT(0);
- }
- qgroup->rfer += sign * oper->num_bytes;
- qgroup->rfer_cmpr += sign * oper->num_bytes;
-
- WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
- qgroup->excl += sign * oper->num_bytes;
- qgroup->excl_cmpr += sign * oper->num_bytes;
-
- qgroup_dirty(fs_info, qgroup);
-
- /* Get all of the parent groups that contain this qgroup */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
+ /*
+ * No need to do lock, since this function will only be called in
+ * btrfs_commmit_transaction().
+ */
+ node = rb_first(&delayed_refs->dirty_extent_root);
+ while (node) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
+ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+ &record->old_roots);
if (ret < 0)
- goto out;
+ break;
+ if (qgroup_to_skip)
+ ulist_del(record->old_roots, qgroup_to_skip, 0);
+ node = rb_next(node);
}
+ return ret;
+}
- /* Iterate all of the parents and adjust their reference counts */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- qgroup = u64_to_ptr(unode->aux);
- qgroup->rfer += sign * oper->num_bytes;
- qgroup->rfer_cmpr += sign * oper->num_bytes;
- WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
- qgroup->excl += sign * oper->num_bytes;
- qgroup->excl_cmpr += sign * oper->num_bytes;
- qgroup_dirty(fs_info, qgroup);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
+{
+ struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_qgroup_extent_record *entry;
+ u64 bytenr = record->bytenr;
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qgroup->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- goto out;
- }
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+ node);
+ if (bytenr < entry->bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
}
- ret = 0;
-out:
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(tmp);
- return ret;
+
+ rb_link_node(&record->node, parent_node, p);
+ rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+ return NULL;
}
+#define UPDATE_NEW 0
+#define UPDATE_OLD 1
/*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that points to the bytenr and adjust their refcnts.
*/
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, struct ulist *tmp,
- struct ulist *roots, struct ulist *qgroups,
- u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+ struct ulist *roots, struct ulist *tmp,
+ struct ulist *qgroups, u64 seq, int update_old)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct ulist_node *tmp_unode;
struct ulist_iterator tmp_uiter;
struct btrfs_qgroup *qg;
- int ret;
+ int ret = 0;
+ if (!roots)
+ return 0;
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
- /* We don't count our current root here */
- if (unode->val == root_to_skip)
- continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
- /*
- * We could have a pending removal of this same ref so we may
- * not have actually found our ref root when doing
- * btrfs_find_all_roots, so we need to keep track of how many
- * old roots we find in case we removed ours and added a
- * different one at the same time. I don't think this could
- * happen in practice but that sort of thinking leads to pain
- * and suffering and to the dark side.
- */
- (*old_roots)++;
ulist_reinit(tmp);
ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1499,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_list *glist;
qg = u64_to_ptr(tmp_unode->aux);
- /*
- * We use this sequence number to keep from having to
- * run the whole list and 0 out the refcnt every time.
- * We basically use sequnce as the known 0 count and
- * then add 1 everytime we see a qgroup. This is how we
- * get how many of the roots actually point up to the
- * upper level qgroups in order to determine exclusive
- * counts.
- *
- * For rescan we want to set old_refcnt to seq so our
- * exclusive calculations end up correct.
- */
- if (rescan)
- qg->old_refcnt = seq;
- else if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
+ if (update_old)
+ btrfs_qgroup_update_old_refcnt(qg, seq, 1);
else
- qg->old_refcnt++;
-
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
+ btrfs_qgroup_update_new_refcnt(qg, seq, 1);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(qgroups, glist->group->qgroupid,
ptr_to_u64(glist->group),
@@ -1540,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
}
/*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct ulist *tmp,
- struct ulist *qgroups, u64 seq,
- int *old_roots)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_operation *tmp_oper;
- struct rb_node *n;
- int ret;
-
- ulist_reinit(tmp);
-
- /*
- * We only walk forward in the tree since we're only interested in
- * removals that happened _after_ our operation.
- */
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- return 0;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- while (tmp_oper->bytenr == oper->bytenr) {
- /*
- * If it's not a removal we don't care, additions work out
- * properly with our refcnt tracking.
- */
- if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
- tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
- goto next;
- qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
- if (!qg)
- goto next;
- ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret) {
- if (ret < 0)
- return ret;
- /*
- * We only want to increase old_roots if this qgroup is
- * not already in the list of qgroups. If it is already
- * there then that means it must have been re-added or
- * the delete will be discarded because we had an
- * existing ref that we haven't looked up yet. In this
- * case we don't want to increase old_roots. So if ret
- * == 1 then we know that this is the first time we've
- * seen this qgroup and we can bump the old_roots.
- */
- (*old_roots)++;
- ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
-next:
- spin_lock(&fs_info->qgroup_op_lock);
- n = rb_next(&tmp_oper->n);
- spin_unlock(&fs_info->qgroup_op_lock);
- if (!n)
- break;
- tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
- }
-
- /* Ok now process the qgroups we found */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper,
- struct btrfs_qgroup *qgroup,
- struct ulist *tmp, struct ulist *qgroups,
- u64 seq)
-{
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup *qg;
- int ret;
-
- ulist_reinit(tmp);
- ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup_list *glist;
-
- qg = u64_to_ptr(unode->aux);
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- if (qg->new_refcnt < seq)
- qg->new_refcnt = seq + 1;
- else
- qg->new_refcnt++;
- } else {
- if (qg->old_refcnt < seq)
- qg->old_refcnt = seq + 1;
- else
- qg->old_refcnt++;
- }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- ret = ulist_add(qgroups, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
- }
- return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, codes can explain themselves.
+ *
+ * Excl update is tricky, the update is split into 2 part.
+ * Part 1: Possible exclusive <-> sharing detect:
+ * | A | !A |
+ * -------------------------------------
+ * B | * | - |
+ * -------------------------------------
+ * !B | + | ** |
+ * -------------------------------------
+ *
+ * Conditions:
+ * A: cur_old_roots < nr_old_roots (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots (possible exclusive before)
+ * B: cur_new_roots < nr_new_roots (not exclusive now)
+ * !B: cur_new_roots == nr_new_roots (possible exclsuive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive -: Possible exclusive -> sharing
+ * *: Definitely not changed. **: Possible unchanged.
+ *
+ * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use condition A and B to split
+ * combination into 4 results.
+ *
+ * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
+ * only on variant maybe 0.
+ *
+ * Lastly, check result **, since there are 2 variants maybe 0, split them
+ * again(2x2).
+ * But this time we don't need to consider other things, the codes and logic
+ * is easy to understand now.
*/
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
- u64 root_to_skip, u64 num_bytes,
- struct ulist *qgroups, u64 seq,
- int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+ struct ulist *qgroups,
+ u64 nr_old_roots,
+ u64 nr_new_roots,
+ u64 num_bytes, u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1706,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
bool dirty = false;
qg = u64_to_ptr(unode->aux);
- /*
- * Wasn't referenced before but is now, add to the reference
- * counters.
- */
- if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+ cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+ /* Rfer update part */
+ if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
qg->rfer_cmpr += num_bytes;
dirty = true;
}
-
- /*
- * Was referenced before but isn't now, subtract from the
- * reference counters.
- */
- if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ if (cur_old_count > 0 && cur_new_count == 0) {
qg->rfer -= num_bytes;
qg->rfer_cmpr -= num_bytes;
dirty = true;
}
- if (qg->old_refcnt < seq)
- cur_old_count = 0;
- else
- cur_old_count = qg->old_refcnt - seq;
- if (qg->new_refcnt < seq)
- cur_new_count = 0;
- else
- cur_new_count = qg->new_refcnt - seq;
+ /* Excl update part */
+ /* Exclusive/none -> shared case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count < nr_new_roots) {
+ /* Exclusive -> shared */
+ if (cur_old_count != 0) {
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ }
- /*
- * If our refcount was the same as the roots previously but our
- * new count isn't the same as the number of roots now then we
- * went from having a exclusive reference on this range to not.
- */
- if (old_roots && cur_old_count == old_roots &&
- (cur_new_count != new_roots || new_roots == 0)) {
- WARN_ON(cur_new_count != new_roots && new_roots == 0);
- qg->excl -= num_bytes;
- qg->excl_cmpr -= num_bytes;
- dirty = true;
+ /* Shared -> exclusive/none case */
+ if (cur_old_count < nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ /* Shared->exclusive */
+ if (cur_new_count != 0) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
}
- /*
- * If we didn't reference all the roots before but now we do we
- * have an exclusive reference to this range.
- */
- if ((!old_roots || (old_roots && cur_old_count != old_roots))
- && cur_new_count == new_roots) {
- qg->excl += num_bytes;
- qg->excl_cmpr += num_bytes;
- dirty = true;
+ /* Exclusive/none -> exclusive/none case */
+ if (cur_old_count == nr_old_roots &&
+ cur_new_count == nr_new_roots) {
+ if (cur_old_count == 0) {
+ /* None -> exclusive/none */
+
+ if (cur_new_count != 0) {
+ /* None -> exclusive */
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
+ }
+ /* None -> none, nothing changed */
+ } else {
+ /* Exclusive -> exclusive/none */
+
+ if (cur_new_count == 0) {
+ /* Exclusive -> none */
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
+ /* Exclusive -> exclusive, nothing changed */
+ }
}
+ /* For exclusive extent, free its reserved bytes too */
+ if (nr_old_roots == 0 && nr_new_roots == 1 &&
+ cur_new_count == nr_new_roots)
+ qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
return 0;
}
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr. If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- int ret = 0;
-
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- oper->elem.seq, &roots);
- if (ret < 0)
- return ret;
- ret = 0;
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- if (unode->val == oper->ref_root) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
- return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters. The basic premise is this
- *
- * 1) We have seq to represent a 0 count. Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count. This means that if
- * anybody is equal to or below this sequence they were never referenced. We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently. This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently. We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- * [qgroup 1/0]
- * / | \
- * [qg 0/0] [qg 0/1] [qg 0/2]
- * \ | /
- * [ extent ]
- *
- * Say we are adding a reference that is covered by qg 0/0. The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2. Because it is adding new_roots will be 1. We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes. Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 bytenr, u64 num_bytes,
+ struct ulist *old_roots, struct ulist *new_roots)
{
- struct ulist *roots = NULL;
- struct ulist *qgroups, *tmp;
- struct btrfs_qgroup *qgroup;
- struct seq_list elem = {};
+ struct ulist *qgroups = NULL;
+ struct ulist *tmp = NULL;
u64 seq;
- int old_roots = 0;
- int new_roots = 0;
+ u64 nr_new_roots = 0;
+ u64 nr_old_roots = 0;
int ret = 0;
- if (oper->elem.seq) {
- ret = check_existing_refs(trans, fs_info, oper);
- if (ret < 0)
- return ret;
- if (ret)
- return 0;
- }
+ if (new_roots)
+ nr_new_roots = new_roots->nnodes;
+ if (old_roots)
+ nr_old_roots = old_roots->nnodes;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- return -ENOMEM;
+ if (!fs_info->quota_enabled)
+ goto out_free;
+ BUG_ON(!fs_info->quota_root);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
tmp = ulist_alloc(GFP_NOFS);
if (!tmp) {
- ulist_free(qgroups);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_free;
}
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
- &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0) {
- ulist_free(qgroups);
- ulist_free(tmp);
- return ret;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ ret = 0;
+ goto out_free;
+ }
}
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
spin_lock(&fs_info->qgroup_lock);
- qgroup = find_qgroup_rb(fs_info, oper->ref_root);
- if (!qgroup)
- goto out;
seq = fs_info->qgroup_seq;
- /*
- * So roots is the list of all the roots currently pointing at the
- * bytenr, including the ref we are adding if we are adding, or not if
- * we are removing a ref. So we pass in the ref_root to skip that root
- * in our calculations. We set old_refnct and new_refcnt cause who the
- * hell knows what everything looked like before, and it doesn't matter
- * except...
- */
- ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
- seq, &old_roots, 0);
- if (ret < 0)
- goto out;
-
- /*
- * Now adjust the refcounts of the qgroups that care about this
- * reference, either the old_count in the case of removal or new_count
- * in the case of an addition.
- */
- ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
- seq);
+ /* Update old refcnts using old_roots */
+ ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+ UPDATE_OLD);
if (ret < 0)
goto out;
- /*
- * ...in the case of removals. If we had a removal before we got around
- * to processing this operation then we need to find that guy and count
- * his references as if they really existed so we don't end up screwing
- * up the exclusive counts. Then whenever we go to process the delete
- * everything will be grand and we can account for whatever exclusive
- * changes need to be made there. We also have to pass in old_roots so
- * we have an accurate count of the roots as it pertains to this
- * operations view of the world.
- */
- ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
- &old_roots);
+ /* Update new refcnts using new_roots */
+ ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+ UPDATE_NEW);
if (ret < 0)
goto out;
- /*
- * We are adding our root, need to adjust up the number of roots,
- * otherwise old_roots is the number of roots we want.
- */
- if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
- new_roots = old_roots + 1;
- } else {
- new_roots = old_roots;
- old_roots++;
- }
- fs_info->qgroup_seq += old_roots + 1;
-
+ qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+ num_bytes, seq);
/*
- * And now the magic happens, bless Arne for having a pretty elegant
- * solution for this.
+ * Bump qgroup_seq to avoid seq overlap
*/
- qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
- qgroups, seq, old_roots, new_roots, 0);
+ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
spin_unlock(&fs_info->qgroup_lock);
- ulist_free(qgroups);
- ulist_free(roots);
+out_free:
ulist_free(tmp);
+ ulist_free(qgroups);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
return ret;
}
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
-{
- struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
- struct btrfs_qgroup_list *glist;
- struct ulist *parents;
- int ret = 0;
- int err;
- struct btrfs_qgroup *qg;
- u64 root_obj = 0;
- struct seq_list elem = {};
-
- parents = ulist_alloc(GFP_NOFS);
- if (!parents)
- return -ENOMEM;
-
- btrfs_get_tree_mod_seq(fs_info, &elem);
- ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
- elem.seq, &roots);
- btrfs_put_tree_mod_seq(fs_info, &elem);
- if (ret < 0)
- goto out;
-
- if (roots->nnodes != 1)
- goto out;
-
- ULIST_ITER_INIT(&uiter);
- unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
- /*
- * If we find our ref root then that means all refs
- * this extent has to the root have not yet been
- * deleted. In that case, we do nothing and let the
- * last ref for this bytenr drive our update.
- *
- * This can happen for example if an extent is
- * referenced multiple times in a snapshot (clone,
- * etc). If we are in the middle of snapshot removal,
- * queued updates for such an extent will find the
- * root if we have not yet finished removing the
- * snapshot.
- */
- if (unode->val == oper->ref_root)
- goto out;
-
- root_obj = unode->val;
- BUG_ON(!root_obj);
-
- spin_lock(&fs_info->qgroup_lock);
- qg = find_qgroup_rb(fs_info, root_obj);
- if (!qg)
- goto out_unlock;
-
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /*
- * Adjust counts for parent groups. First we find all
- * parents, then in the 2nd loop we do the adjustment
- * while adding parents of the parents to our ulist.
- */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
-
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(parents, &uiter))) {
- qg = u64_to_ptr(unode->aux);
- qg->excl += oper->num_bytes;
- qg->excl_cmpr += oper->num_bytes;
- qgroup_dirty(fs_info, qg);
-
- /* Add any parents of the parents */
- list_for_each_entry(glist, &qg->groups, next_group) {
- err = ulist_add(parents, glist->group->qgroupid,
- ptr_to_u64(glist->group), GFP_ATOMIC);
- if (err < 0) {
- ret = err;
- goto out_unlock;
- }
- }
- }
-
-out_unlock:
- spin_unlock(&fs_info->qgroup_lock);
-
-out:
- ulist_free(roots);
- ulist_free(parents);
- return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct ulist *new_roots = NULL;
+ struct rb_node *node;
+ u64 qgroup_to_skip;
int ret = 0;
- if (!fs_info->quota_enabled)
- return 0;
+ delayed_refs = &trans->transaction->delayed_refs;
+ qgroup_to_skip = delayed_refs->qgroup_to_skip;
+ while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+ record = rb_entry(node, struct btrfs_qgroup_extent_record,
+ node);
- BUG_ON(!fs_info->quota_root);
-
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ if (!ret) {
+ /*
+ * Use (u64)-1 as time_seq to do special search, which
+ * doesn't lock tree or delayed_refs and search current
+ * root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+ record->bytenr, (u64)-1, &new_roots);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip)
+ ulist_del(new_roots, qgroup_to_skip, 0);
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ record->bytenr, record->num_bytes,
+ record->old_roots, new_roots);
+ record->old_roots = NULL;
+ new_roots = NULL;
}
- }
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+cleanup:
+ ulist_free(record->old_roots);
+ ulist_free(new_roots);
+ new_roots = NULL;
+ rb_erase(node, &delayed_refs->dirty_extent_root);
+ kfree(record);
- ASSERT(is_fstree(oper->ref_root));
-
- trace_btrfs_qgroup_account(oper);
-
- switch (oper->type) {
- case BTRFS_QGROUP_OPER_ADD_EXCL:
- case BTRFS_QGROUP_OPER_SUB_EXCL:
- ret = qgroup_excl_accounting(fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_ADD_SHARED:
- case BTRFS_QGROUP_OPER_SUB_SHARED:
- ret = qgroup_shared_accounting(trans, fs_info, oper);
- break;
- case BTRFS_QGROUP_OPER_SUB_SUBTREE:
- ret = qgroup_subtree_accounting(trans, fs_info, oper);
- break;
- default:
- ASSERT(0);
- }
- return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_qgroup_operation *oper;
- int ret = 0;
-
- while (!list_empty(&trans->qgroup_ref_list)) {
- oper = list_first_entry(&trans->qgroup_ref_list,
- struct btrfs_qgroup_operation, list);
- list_del_init(&oper->list);
- if (!ret || !trans->aborted)
- ret = btrfs_qgroup_account(trans, fs_info, oper);
- spin_lock(&fs_info->qgroup_op_lock);
- rb_erase(&oper->n, &fs_info->qgroup_op_tree);
- spin_unlock(&fs_info->qgroup_op_lock);
- btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- kfree(oper);
}
return ret;
}
@@ -2156,6 +1811,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ ret = update_qgroup_limit_item(trans, quota_root, qgroup);
+ if (ret)
+ fs_info->qgroup_flags |=
+ BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
spin_lock(&fs_info->qgroup_lock);
}
if (fs_info->quota_enabled)
@@ -2219,6 +1878,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
+
+ if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
+ ret = -EINVAL;
+ goto out;
+ }
++i_qgroups;
}
}
@@ -2230,17 +1894,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
- ret = update_qgroup_limit_item(trans, quota_root, objectid,
- inherit->lim.flags,
- inherit->lim.max_rfer,
- inherit->lim.max_excl,
- inherit->lim.rsv_rfer,
- inherit->lim.rsv_excl);
- if (ret)
- goto out;
- }
-
if (srcid) {
struct btrfs_root *srcroot;
struct btrfs_key srckey;
@@ -2286,6 +1939,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
goto unlock;
}
+ if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
+ dstgroup->lim_flags = inherit->lim.flags;
+ dstgroup->max_rfer = inherit->lim.max_rfer;
+ dstgroup->max_excl = inherit->lim.max_excl;
+ dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
+ dstgroup->rsv_excl = inherit->lim.rsv_excl;
+
+ ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
+ if (ret) {
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
+ dstgroup->qgroupid);
+ goto unlock;
+ }
+ }
+
if (srcid) {
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
@@ -2302,6 +1971,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
+
+ /* inherit the limit info */
+ dstgroup->lim_flags = srcgroup->lim_flags;
+ dstgroup->max_rfer = srcgroup->max_rfer;
+ dstgroup->max_excl = srcgroup->max_excl;
+ dstgroup->rsv_rfer = srcgroup->rsv_rfer;
+ dstgroup->rsv_excl = srcgroup->rsv_excl;
+
qgroup_dirty(fs_info, dstgroup);
qgroup_dirty(fs_info, srcgroup);
}
@@ -2358,12 +2035,6 @@ out:
return ret;
}
-/*
- * reserve some space for a qgroup and all its parents. The reservation takes
- * place with start_transaction or dealloc_reserve, similar to ENOSPC
- * accounting. If not enough space is available, EDQUOT is returned.
- * We assume that the requested space is new for all qgroups.
- */
int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
@@ -2513,19 +2184,17 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
/*
* returns < 0 on error, 0 when more leafs are to be scanned.
- * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
+ * returns 1 when done.
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *qgroups,
- struct ulist *tmp, struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans,
+ struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
- struct seq_list tree_mod_seq_elem = {};
+ struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
- u64 seq;
- int new_roots;
int slot;
int ret;
@@ -2575,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
else
num_bytes = found.offset;
- ulist_reinit(qgroups);
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots);
if (ret < 0)
goto out;
- spin_lock(&fs_info->qgroup_lock);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
- new_roots = 0;
- ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
- seq, &new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
-
- ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
- seq, 0, new_roots, 1);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
+ /* For rescan, just pass old_roots as NULL */
+ ret = btrfs_qgroup_account_extent(trans, fs_info,
+ found.objectid, num_bytes, NULL, roots);
+ if (ret < 0)
goto out;
- }
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
}
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -2615,19 +2266,13 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
goto out;
- qgroups = ulist_alloc(GFP_NOFS);
- if (!qgroups)
- goto out;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- goto out;
scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
if (!scratch_leaf)
goto out;
@@ -2643,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- qgroups, tmp, scratch_leaf);
+ scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2653,14 +2298,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
- ulist_free(qgroups);
- ulist_free(tmp);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
- if (err == 2 &&
+ if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
} else if (err < 0) {
@@ -2668,13 +2311,33 @@ out:
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
+ /*
+ * only update status, since the previous part has alreay updated the
+ * qgroup info.
+ */
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d\n",
+ err);
+ goto done;
+ }
+ ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
+ }
+ btrfs_end_transaction(trans, fs_info->quota_root);
+
if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
- err == 2 ? " (inconsistency flag cleared)" : "");
+ err > 0 ? " (inconsistency flag cleared)" : "");
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
+done:
complete_all(&fs_info->qgroup_rescan_completion);
}
@@ -2709,7 +2372,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
mutex_unlock(&fs_info->qgroup_rescan_lock);
goto err;
}
-
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 18cc68ca3090..6387dcfa354c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,43 +19,18 @@
#ifndef __BTRFS_QGROUP__
#define __BTRFS_QGROUP__
+#include "ulist.h"
+#include "delayed-ref.h"
+
/*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction. The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding. This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols. This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
+ * Record a dirty extent, and info qgroup to update quota on it
+ * TODO: Use kmem cache to alloc it.
*/
-enum btrfs_qgroup_operation_type {
- BTRFS_QGROUP_OPER_ADD_EXCL,
- BTRFS_QGROUP_OPER_ADD_SHARED,
- BTRFS_QGROUP_OPER_SUB_EXCL,
- BTRFS_QGROUP_OPER_SUB_SHARED,
- BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
- u64 ref_root;
+struct btrfs_qgroup_extent_record {
+ struct rb_node node;
u64 bytenr;
u64 num_bytes;
- u64 seq;
- enum btrfs_qgroup_operation_type type;
- struct seq_list elem;
- struct rb_node n;
- struct list_head list;
+ struct ulist *old_roots;
};
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
@@ -70,8 +45,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- char *name);
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
@@ -80,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 ref_root,
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
- enum btrfs_qgroup_operation_type type,
- int mod_seq);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup_operation *oper);
+ struct ulist *old_roots, struct ulist *new_roots);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..fa72068bd256 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
}
x = cmpxchg(&info->stripe_hash_table, NULL, table);
- if (x) {
- if (is_vmalloc_addr(x))
- vfree(x);
- else
- kfree(x);
- }
+ if (x)
+ kvfree(x);
return 0;
}
@@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
if (!info->stripe_hash_table)
return;
btrfs_clear_rbio_cache(info);
- if (is_vmalloc_addr(info->stripe_hash_table))
- vfree(info->stripe_hash_table);
- else
- kfree(info->stripe_hash_table);
+ kvfree(info->stripe_hash_table);
info->stripe_hash_table = NULL;
}
@@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int err;
int i;
- pointers = kzalloc(rbio->real_stripes * sizeof(void *),
- GFP_NOFS);
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers) {
err = -ENOMEM;
goto cleanup_io;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..88cbb5995667 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
}
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
- ret = (!eb) ? -ENOMEM : -EIO;
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
+ ret = -EIO;
free_extent_buffer(eb);
break;
}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
eb = read_tree_block(root, bytenr, ptr_gen);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, generation);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ err = PTR_ERR(eb);
+ goto next;
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
err = -EIO;
goto next;
@@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.offset);
- if (!eb || !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb)) {
+ return PTR_ERR(eb);
+ } else if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -3027,7 +3036,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
mutex_lock(&inode->i_mutex);
ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start);
+ 1 - cluster->start, 0);
if (ret)
goto out;
@@ -3430,7 +3439,9 @@ static int block_use_full_backref(struct reloc_control *rc,
}
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct inode *inode, u64 ino)
+ struct btrfs_block_group_cache *block_group,
+ struct inode *inode,
+ u64 ino)
{
struct btrfs_key key;
struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3474,7 @@ truncate:
goto out;
}
- ret = btrfs_truncate_free_space_cache(root, trans, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
@@ -3509,6 +3520,7 @@ static int find_data_references(struct reloc_control *rc,
*/
if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
ret = delete_block_group_cache(rc->extent_root->fs_info,
+ rc->block_group,
NULL, ref_objectid);
if (ret != -ENOENT)
return ret;
@@ -4037,7 +4049,7 @@ restart:
if (trans && progress && err == -ENOSPC) {
ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
rc->block_group->flags);
- if (ret == 0) {
+ if (ret == 1) {
err = 0;
progress = 0;
goto restart;
@@ -4223,7 +4235,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, inode, 0);
+ ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec57687c9a4d..94db0fa5225a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
* the statistics.
*/
- sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
- sizeof(*sblocks_for_recheck),
- GFP_NOFS);
+ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
+ sizeof(*sblocks_for_recheck), GFP_NOFS);
if (!sblocks_for_recheck) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
unsigned long *bitmap,
u64 start, u64 len)
{
- int offset;
+ u32 offset;
int nsectors;
int sectorsize = sparity->sctx->dev_root->sectorsize;
@@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
}
start -= sparity->logic_start;
- offset = (int)do_div(start, sparity->stripe_len);
+ start = div_u64_rem(start, sparity->stripe_len, &offset);
offset /= sectorsize;
nsectors = (int)len / sectorsize;
@@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
int j = 0;
u64 stripe_nr;
u64 last_offset;
- int stripe_index;
- int rot;
+ u32 stripe_index;
+ u32 rot;
last_offset = (physical - map->stripes[num].physical) *
nr_data_stripes(map);
@@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num,
for (i = 0; i < nr_data_stripes(map); i++) {
*offset = last_offset + i * map->stripe_len;
- stripe_nr = *offset;
- do_div(stripe_nr, map->stripe_len);
- do_div(stripe_nr, nr_data_stripes(map));
+ stripe_nr = div_u64(*offset, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
/* Work out the disk rotation on this stripe-set */
- rot = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
/* calculate which stripe this data locates */
rot += i;
stripe_index = rot % map->num_stripes;
@@ -2664,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
kfree(sparity);
}
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+{
+ struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+ work);
+ struct scrub_ctx *sctx = sparity->sctx;
+
+ scrub_free_parity(sparity);
+ scrub_pending_bio_dec(sctx);
+}
+
static void scrub_parity_bio_endio(struct bio *bio, int error)
{
struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
- struct scrub_ctx *sctx = sparity->sctx;
if (error)
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
- scrub_free_parity(sparity);
- scrub_pending_bio_dec(sctx);
bio_put(bio);
+
+ btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+ scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+ &sparity->work);
}
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -2995,10 +3005,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
int extent_mirror_num;
int stop_loop = 0;
- nstripes = length;
physical = map->stripes[num].physical;
offset = 0;
- do_div(nstripes, map->stripe_len);
+ nstripes = div_u64(length, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
offset = map->stripe_len * num;
increment = map->stripe_len * map->num_stripes;
@@ -3562,8 +3571,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
- int ret = 0;
- int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
if (fs_info->scrub_workers_refcnt == 0) {
@@ -3575,27 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
fs_info->scrub_workers =
btrfs_alloc_workqueue("btrfs-scrub", flags,
max_active, 4);
- if (!fs_info->scrub_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_workers)
+ goto fail_scrub_workers;
+
fs_info->scrub_wr_completion_workers =
btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_wr_completion_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_wr_completion_workers)
+ goto fail_scrub_wr_completion_workers;
+
fs_info->scrub_nocow_workers =
btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
- if (!fs_info->scrub_nocow_workers) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!fs_info->scrub_nocow_workers)
+ goto fail_scrub_nocow_workers;
+ fs_info->scrub_parity_workers =
+ btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ max_active, 2);
+ if (!fs_info->scrub_parity_workers)
+ goto fail_scrub_parity_workers;
}
++fs_info->scrub_workers_refcnt;
-out:
- return ret;
+ return 0;
+
+fail_scrub_parity_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+fail_scrub_nocow_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+fail_scrub_wr_completion_workers:
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+fail_scrub_workers:
+ return -ENOMEM;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
@@ -3604,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d6033f540cc7..aa72bfd28f7d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
* after this directory is moved, we can try to rmdir the ino rmdir_ino.
*/
u64 rmdir_ino;
+ bool orphanized;
};
struct orphan_dir_info {
@@ -1158,6 +1159,9 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
+ /* data offset in the file extent item */
+ u64 data_offset;
+
/* Just to check for bugs in backref resolving */
int found_itself;
};
@@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
if (ret < 0)
return ret;
- if (offset + bctx->extent_len > i_size)
+ if (offset + bctx->data_offset + bctx->extent_len > i_size)
return 0;
/*
@@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx,
backref_ctx->cur_offset = data_offset;
backref_ctx->found_itself = 0;
backref_ctx->extent_len = num_bytes;
+ /*
+ * For non-compressed extents iterate_extent_inodes() gives us extent
+ * offsets that already take into account the data offset, but not for
+ * compressed extents, since the offset is logical and not relative to
+ * the physical extent locations. We must take this into account to
+ * avoid sending clone offsets that go beyond the source file's size,
+ * which would result in the clone ioctl failing with -EINVAL on the
+ * receiving end.
+ */
+ if (compressed == BTRFS_COMPRESS_NONE)
+ backref_ctx->data_offset = 0;
+ else
+ backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
@@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
goto out;
}
- /* we know that it is or will be overwritten. check this now */
- if (ow_inode < sctx->send_progress)
+ /*
+ * We know that it is or will be overwritten. Check this now.
+ * The current inode being processed might have been the one that caused
+ * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+ * same as sctx->send_progress.
+ */
+ if (ow_inode <= sctx->send_progress)
ret = 1;
else
ret = 0;
@@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
fs_path_reset(dest);
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+ struct waiting_dir_move *wdm;
+
fs_path_reset(name);
if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
break;
}
- if (is_waiting_for_move(sctx, ino)) {
+ wdm = get_waiting_dir_move(sctx, ino);
+ if (wdm && wdm->orphanized) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ stop = 1;
+ } else if (wdm) {
ret = get_first_ref(sctx->parent_root, ino,
&parent_inode, &parent_gen, name);
} else {
@@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- sctx->parent_root->root_item.uuid);
+ if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
@@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
return entry != NULL;
}
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
return -ENOMEM;
dm->ino = ino;
dm->rmdir_ino = 0;
+ dm->orphanized = orphanized;
while (*p) {
parent = *p;
@@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
goto out;
}
- ret = add_waiting_dir_move(sctx, pm->ino);
+ ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
if (ret)
goto out;
@@ -3067,48 +3100,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
-static int path_loop(struct send_ctx *sctx, struct fs_path *name,
- u64 ino, u64 gen, u64 *ancestor_ino)
-{
- int ret = 0;
- u64 parent_inode = 0;
- u64 parent_gen = 0;
- u64 start_ino = ino;
-
- *ancestor_ino = 0;
- while (ino != BTRFS_FIRST_FREE_OBJECTID) {
- fs_path_reset(name);
-
- if (is_waiting_for_rm(sctx, ino))
- break;
- if (is_waiting_for_move(sctx, ino)) {
- if (*ancestor_ino == 0)
- *ancestor_ino = ino;
- ret = get_first_ref(sctx->parent_root, ino,
- &parent_inode, &parent_gen, name);
- } else {
- ret = __get_cur_name_and_parent(sctx, ino, gen,
- &parent_inode,
- &parent_gen, name);
- if (ret > 0) {
- ret = 0;
- break;
- }
- }
- if (ret < 0)
- break;
- if (parent_inode == start_ino) {
- ret = 1;
- if (*ancestor_ino == 0)
- *ancestor_ino = ino;
- break;
- }
- ino = parent_inode;
- gen = parent_gen;
- }
- return ret;
-}
-
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
@@ -3120,7 +3111,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
int ret;
- u64 ancestor = 0;
name = fs_path_alloc();
from_path = fs_path_alloc();
@@ -3152,22 +3142,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
goto out;
sctx->send_progress = sctx->cur_ino + 1;
- ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
- if (ret) {
- LIST_HEAD(deleted_refs);
- ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
- ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
- &pm->update_refs, &deleted_refs,
- pm->is_orphan);
- if (ret < 0)
- goto out;
- if (rmdir_ino) {
- dm = get_waiting_dir_move(sctx, pm->ino);
- ASSERT(dm);
- dm->rmdir_ino = rmdir_ino;
- }
- goto out;
- }
fs_path_reset(name);
to_path = name;
name = NULL;
@@ -3412,8 +3386,40 @@ out:
return ret;
}
+/*
+ * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+ const u64 ino1,
+ const u64 ino1_gen,
+ const u64 ino2,
+ struct fs_path *fs_path)
+{
+ u64 ino = ino2;
+
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ int ret;
+ u64 parent;
+ u64 parent_gen;
+
+ fs_path_reset(fs_path);
+ ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+ if (ret < 0) {
+ if (ret == -ENOENT && ino == ino2)
+ ret = 0;
+ return ret;
+ }
+ if (parent == ino1)
+ return parent_gen == ino1_gen ? 1 : 0;
+ ino = parent;
+ }
+ return 0;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
- struct recorded_ref *parent_ref)
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
{
int ret = 0;
u64 ino = parent_ref->dir;
@@ -3433,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
* Our current directory inode may not yet be renamed/moved because some
* ancestor (immediate or not) has to be renamed/moved first. So find if
* such ancestor exists and make sure our own rename/move happens after
- * that ancestor is processed.
+ * that ancestor is processed to avoid path build infinite loops (done
+ * at get_cur_path()).
*/
while (ino > BTRFS_FIRST_FREE_OBJECTID) {
if (is_waiting_for_move(sctx, ino)) {
- ret = 1;
+ /*
+ * If the current inode is an ancestor of ino in the
+ * parent root, we need to delay the rename of the
+ * current inode, otherwise don't delayed the rename
+ * because we can end up with a circular dependency
+ * of renames, resulting in some directories never
+ * getting the respective rename operations issued in
+ * the send stream or getting into infinite path build
+ * loops.
+ */
+ ret = is_ancestor(sctx->parent_root,
+ sctx->cur_ino, sctx->cur_inode_gen,
+ ino, path_before);
break;
}
@@ -3479,7 +3498,7 @@ out:
ino,
&sctx->new_refs,
&sctx->deleted_refs,
- false);
+ is_orphan);
if (!ret)
ret = 1;
}
@@ -3610,10 +3629,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
+ struct name_cache_entry *nce;
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
goto out;
+ /*
+ * Make sure we clear our orphanized inode's
+ * name from the name cache. This is because the
+ * inode ow_inode might be an ancestor of some
+ * other inode that will be orphanized as well
+ * later and has an inode number greater than
+ * sctx->send_progress. We need to prevent
+ * future name lookups from using the old name
+ * and get instead the orphan name.
+ */
+ nce = name_cache_search(sctx, ow_inode, ow_gen);
+ if (nce) {
+ name_cache_delete(sctx, nce);
+ kfree(nce);
+ }
} else {
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
@@ -3631,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+ can_rename) {
+ ret = wait_for_parent_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
@@ -3651,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = wait_for_parent_move(sctx, cur);
- if (ret < 0)
- goto out;
- if (ret) {
- *pending_move = 1;
- } else {
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
- cur->full_path);
- }
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
if (ret < 0)
goto out;
} else {
@@ -4550,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
if (ret < 0)
goto out;
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
- clone_root->root->root_item.uuid);
+ /*
+ * If the parent we're using has a received_uuid set then use that as
+ * our clone source as that is what we will look for when doing a
+ * receive.
+ *
+ * This covers the case that we create a snapshot off of a received
+ * subvolume and then use that as the parent and try to receive on a
+ * different host.
+ */
+ if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+ clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
@@ -5852,19 +5905,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
ret = PTR_ERR(clone_root);
goto out;
}
- clone_sources_to_rollback = i + 1;
spin_lock(&clone_root->root_item_lock);
- clone_root->send_in_progress++;
- if (!btrfs_root_readonly(clone_root)) {
+ if (!btrfs_root_readonly(clone_root) ||
+ btrfs_root_dead(clone_root)) {
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -EPERM;
goto out;
}
+ clone_root->send_in_progress++;
spin_unlock(&clone_root->root_item_lock);
srcu_read_unlock(&fs_info->subvol_srcu, index);
sctx->clone_roots[i].root = clone_root;
+ clone_sources_to_rollback = i + 1;
}
vfree(clone_sources_tmp);
clone_sources_tmp = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 05fef198ff94..cd7ef34d2dce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
*/
+__cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
+__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *function,
unsigned int line, int errno)
{
- /*
- * Report first abort since mount
- */
- if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
- &root->fs_info->fs_state)) {
- WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
- errno);
- }
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
@@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
+__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
@@ -841,33 +836,153 @@ out:
return error;
}
-static struct dentry *get_default_root(struct super_block *sb,
- u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- struct btrfs_root *new_root;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
- struct btrfs_key location;
- struct inode *inode;
- u64 dir_id;
- int new = 0;
+ struct btrfs_root *fs_root;
+ struct btrfs_root_ref *root_ref;
+ struct btrfs_inode_ref *inode_ref;
+ struct btrfs_key key;
+ struct btrfs_path *path = NULL;
+ char *name = NULL, *ptr;
+ u64 dirid;
+ int len;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ path->leave_spinning = 1;
+
+ name = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ ptr = name + PATH_MAX - 1;
+ ptr[0] = '\0';
/*
- * We have a specific subvol we want to mount, just setup location and
- * go look up the root.
+ * Walk up the subvolume trees in the tree of tree roots by root
+ * backrefs until we hit the top-level subvolume.
*/
- if (subvol_objectid) {
- location.objectid = subvol_objectid;
- location.type = BTRFS_ROOT_ITEM_KEY;
- location.offset = (u64)-1;
- goto find_root;
+ while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(root, path, subvol_objectid,
+ BTRFS_ROOT_BACKREF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ subvol_objectid = key.offset;
+
+ root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+ len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(root_ref + 1), len);
+ ptr[0] = '/';
+ dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+ btrfs_release_path(path);
+
+ key.objectid = subvol_objectid;
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+ fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(fs_root)) {
+ ret = PTR_ERR(fs_root);
+ goto err;
+ }
+
+ /*
+ * Walk up the filesystem tree by inode refs until we hit the
+ * root directory.
+ */
+ while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = dirid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = btrfs_previous_item(fs_root, path, dirid,
+ BTRFS_INODE_REF_KEY);
+ if (ret < 0) {
+ goto err;
+ } else if (ret > 0) {
+ ret = -ENOENT;
+ goto err;
+ }
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ dirid = key.offset;
+
+ inode_ref = btrfs_item_ptr(path->nodes[0],
+ path->slots[0],
+ struct btrfs_inode_ref);
+ len = btrfs_inode_ref_name_len(path->nodes[0],
+ inode_ref);
+ ptr -= len + 1;
+ if (ptr < name) {
+ ret = -ENAMETOOLONG;
+ goto err;
+ }
+ read_extent_buffer(path->nodes[0], ptr + 1,
+ (unsigned long)(inode_ref + 1), len);
+ ptr[0] = '/';
+ btrfs_release_path(path);
+ }
+ }
+
+ btrfs_free_path(path);
+ if (ptr == name + PATH_MAX - 1) {
+ name[0] = '/';
+ name[1] = '\0';
+ } else {
+ memmove(name, ptr, name + PATH_MAX - ptr);
}
+ return name;
+
+err:
+ btrfs_free_path(path);
+ kfree(name);
+ return ERR_PTR(ret);
+}
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+ struct btrfs_root *root = fs_info->tree_root;
+ struct btrfs_dir_item *di;
+ struct btrfs_path *path;
+ struct btrfs_key location;
+ u64 dir_id;
path = btrfs_alloc_path();
if (!path)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
path->leave_spinning = 1;
/*
@@ -879,49 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
- return ERR_CAST(di);
+ return PTR_ERR(di);
}
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
* it's always been there, but don't freak out, just try and
- * mount to root most subvolume.
+ * mount the top-level subvolume.
*/
btrfs_free_path(path);
- dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = fs_info->fs_root;
- goto setup_root;
+ *objectid = BTRFS_FS_TREE_OBJECTID;
+ return 0;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
btrfs_free_path(path);
-
-find_root:
- new_root = btrfs_read_fs_root_no_name(fs_info, &location);
- if (IS_ERR(new_root))
- return ERR_CAST(new_root);
-
- dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
- location.objectid = dir_id;
- location.type = BTRFS_INODE_ITEM_KEY;
- location.offset = 0;
-
- inode = btrfs_iget(sb, &location, new_root, &new);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- /*
- * If we're just mounting the root most subvol put the inode and return
- * a reference to the dentry. We will have already gotten a reference
- * to the inode in btrfs_fill_super so we're good to go.
- */
- if (!new && sb->s_root->d_inode == inode) {
- iput(inode);
- return dget(sb->s_root);
- }
-
- return d_obtain_root(inode);
+ *objectid = location.objectid;
+ return 0;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -1099,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ seq_puts(seq, ",subvol=");
+ seq_dentry(seq, dentry, " \t\n\\");
return 0;
}
@@ -1129,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode)
}
/*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
*/
static char *setup_root_args(char *args)
{
- unsigned len = strlen(args) + 2 + 1;
- char *src, *dst, *buf;
+ char *buf, *dst, *sep;
- /*
- * We need the same args as before, but with this substitution:
- * s!subvol=[^,]+!subvolid=0!
- *
- * Since the replacement string is up to 2 bytes longer than the
- * original, allocate strlen(args) + 2 + 1 bytes.
- */
-
- src = strstr(args, "subvol=");
- /* This shouldn't happen, but just in case.. */
- if (!src)
- return NULL;
+ if (!args)
+ return kstrdup("subvolid=0", GFP_NOFS);
- buf = dst = kmalloc(len, GFP_NOFS);
+ /* The worst case is that we add ",subvolid=0" to the end. */
+ buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
if (!buf)
return NULL;
- /*
- * If the subvol= arg is not at the start of the string,
- * copy whatever precedes it into buf.
- */
- if (src != args) {
- *src++ = '\0';
- strcpy(buf, args);
- dst += strlen(args);
+ while (1) {
+ sep = strchrnul(args, ',');
+ if (!strstarts(args, "subvol=") &&
+ !strstarts(args, "subvolid=")) {
+ memcpy(dst, args, sep - args);
+ dst += sep - args;
+ *dst++ = ',';
+ }
+ if (*sep)
+ args = sep + 1;
+ else
+ break;
}
-
strcpy(dst, "subvolid=0");
- dst += strlen("subvolid=0");
-
- /*
- * If there is a "," after the original subvol=... string,
- * copy that suffix into our buffer. Otherwise, we're done.
- */
- src = strchr(src, ',');
- if (src)
- strcpy(dst, src);
return buf;
}
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
- const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+ int flags, const char *device_name,
+ char *data)
{
struct dentry *root;
- struct vfsmount *mnt;
+ struct vfsmount *mnt = NULL;
char *newargs;
+ int ret;
newargs = setup_root_args(data);
- if (!newargs)
- return ERR_PTR(-ENOMEM);
- mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
- newargs);
+ if (!newargs) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
- if (PTR_RET(mnt) == -EBUSY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
- mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+ device_name, newargs);
} else {
- int r;
- mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
- newargs);
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+ device_name, newargs);
if (IS_ERR(mnt)) {
- kfree(newargs);
- return ERR_CAST(mnt);
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
}
- r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
- if (r < 0) {
- /* FIXME: release vfsmount mnt ??*/
- kfree(newargs);
- return ERR_PTR(r);
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret < 0) {
+ root = ERR_PTR(ret);
+ goto out;
}
}
}
+ if (IS_ERR(mnt)) {
+ root = ERR_CAST(mnt);
+ mnt = NULL;
+ goto out;
+ }
- kfree(newargs);
+ if (!subvol_name) {
+ if (!subvol_objectid) {
+ ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+ &subvol_objectid);
+ if (ret) {
+ root = ERR_PTR(ret);
+ goto out;
+ }
+ }
+ subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+ subvol_objectid);
+ if (IS_ERR(subvol_name)) {
+ root = ERR_CAST(subvol_name);
+ subvol_name = NULL;
+ goto out;
+ }
- if (IS_ERR(mnt))
- return ERR_CAST(mnt);
+ }
root = mount_subtree(mnt, subvol_name);
+ /* mount_subtree() drops our reference on the vfsmount. */
+ mnt = NULL;
- if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+ if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
- dput(root);
- root = ERR_PTR(-EINVAL);
- deactivate_locked_super(s);
- printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
- subvol_name);
+ struct inode *root_inode = d_inode(root);
+ u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+ ret = 0;
+ if (!is_subvolume_inode(root_inode)) {
+ pr_err("BTRFS: '%s' is not a valid subvolume\n",
+ subvol_name);
+ ret = -EINVAL;
+ }
+ if (subvol_objectid && root_objectid != subvol_objectid) {
+ /*
+ * This will also catch a race condition where a
+ * subvolume which was passed by ID is renamed and
+ * another subvolume is renamed over the old location.
+ */
+ pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+ subvol_name, subvol_objectid);
+ ret = -EINVAL;
+ }
+ if (ret) {
+ dput(root);
+ root = ERR_PTR(ret);
+ deactivate_locked_super(s);
+ }
}
+out:
+ mntput(mnt);
+ kfree(newargs);
+ kfree(subvol_name);
return root;
}
@@ -1294,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct block_device *bdev = NULL;
struct super_block *s;
- struct dentry *root;
struct btrfs_fs_devices *fs_devices = NULL;
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
@@ -1314,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
return ERR_PTR(error);
}
- if (subvol_name) {
- root = mount_subvol(subvol_name, flags, device_name, data);
- kfree(subvol_name);
- return root;
+ if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+ /* mount_subvol() will free subvol_name. */
+ return mount_subvol(subvol_name, subvol_objectid, flags,
+ device_name, data);
}
security_init_mnt_opts(&new_sec_opts);
@@ -1383,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
}
-
- root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
- if (IS_ERR(root)) {
+ if (error) {
deactivate_locked_super(s);
- error = PTR_ERR(root);
goto error_sec_opts;
}
fs_info = btrfs_sb(s);
error = setup_security_options(fs_info, s, &new_sec_opts);
if (error) {
- dput(root);
deactivate_locked_super(s);
goto error_sec_opts;
}
- return root;
+ return dget(s->s_root);
error_close_devices:
btrfs_close_devices(fs_devices);
@@ -1714,7 +1834,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
- do_div(avail_space, BTRFS_STRIPE_LEN);
+ avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
avail_space *= BTRFS_STRIPE_LEN;
/*
@@ -1886,8 +2006,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
+ buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
+ buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
return 0;
}
@@ -1908,6 +2028,17 @@ static struct file_system_type btrfs_fs_type = {
};
MODULE_ALIAS_FS("btrfs");
+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The control file's private_data is used to hold the
+ * transaction when it is started and is used to keep
+ * track of whether a transaction is already in progress.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
/*
* used by btrfsctl to scan devices when no FS is mounted
*/
@@ -2009,6 +2140,7 @@ static const struct super_operations btrfs_super_ops = {
};
static const struct file_operations btrfs_ctl_fops = {
+ .open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
.compat_ioctl = btrfs_control_ioctl,
.owner = THIS_MODULE,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 94edb0a2a026..603b0cc2b9bb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
#include "volumes.h"
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
static u64 get_features(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set)
@@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
@@ -438,28 +439,36 @@ static struct attribute *btrfs_attrs[] = {
static void btrfs_release_super_kobj(struct kobject *kobj)
{
- struct btrfs_fs_info *fs_info = to_fs_info(kobj);
- complete(&fs_info->kobj_unregister);
+ struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
+
+ memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
.release = btrfs_release_super_kobj,
- .default_attrs = btrfs_attrs,
};
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_info, super_kobj);
+ return to_fs_devs(kobj)->fs_info;
}
#define NUM_FEATURE_BITS 64
static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
-static u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[3] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
[FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
@@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
&agroup);
}
@@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
return 0;
}
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+ if (fs_devs->device_dir_kobj) {
+ kobject_del(fs_devs->device_dir_kobj);
+ kobject_put(fs_devs->device_dir_kobj);
+ fs_devs->device_dir_kobj = NULL;
+ }
+
+ if (fs_devs->super_kobj.state_initialized) {
+ kobject_del(&fs_devs->super_kobj);
+ kobject_put(&fs_devs->super_kobj);
+ wait_for_completion(&fs_devs->kobj_unregister);
+ }
+}
+
+/* when fs_devs is NULL it will remove all fsid kobject */
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
{
- kobject_del(&fs_info->super_kobj);
- kobject_put(&fs_info->super_kobj);
- wait_for_completion(&fs_info->kobj_unregister);
+ struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+ if (fs_devs) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ return;
+ }
+
+ list_for_each_entry(fs_devs, fs_uuids, list) {
+ __btrfs_sysfs_remove_fsid(fs_devs);
+ }
}
void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
{
+ btrfs_reset_fs_info_ptr(fs_info);
+
if (fs_info->space_info_kobj) {
sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
kobject_del(fs_info->space_info_kobj);
kobject_put(fs_info->space_info_kobj);
}
- kobject_del(fs_info->device_dir_kobj);
- kobject_put(fs_info->device_dir_kobj);
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
- __btrfs_sysfs_remove_one(fs_info);
+ sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+ btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -602,40 +635,60 @@ static void init_feature_attrs(void)
}
}
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+/* when one_device is NULL, it removes all device links */
+
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
- if (!fs_info->device_dir_kobj)
+ if (!fs_devices->device_dir_kobj)
return -EINVAL;
if (one_device && one_device->bdev) {
disk = one_device->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_info->device_dir_kobj,
+ sysfs_remove_link(fs_devices->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ if (one_device)
+ return 0;
+
+ list_for_each_entry(one_device,
+ &fs_devices->devices, dev_list) {
+ if (!one_device->bdev)
+ continue;
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_devices->device_dir_kobj,
disk_kobj->name);
}
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
- int error = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- struct btrfs_device *dev;
-
- if (!fs_info->device_dir_kobj)
- fs_info->device_dir_kobj = kobject_create_and_add("devices",
- &fs_info->super_kobj);
+ if (!fs_devs->device_dir_kobj)
+ fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_devs->super_kobj);
- if (!fs_info->device_dir_kobj)
+ if (!fs_devs->device_dir_kobj)
return -ENOMEM;
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_device *dev;
+
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
disk = dev->bdev->bd_part;
disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_info->device_dir_kobj,
+ error = sysfs_create_link(fs_devices->device_dir_kobj,
disk_kobj, disk_kobj->name);
if (error)
break;
@@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry;
/* Debugging tunables and exported data */
u64 btrfs_debugfs_test;
+/*
+ * Can be called by the device discovery thread.
+ * And parent can be specified for seed device
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent)
+{
+ int error;
+
+ init_completion(&fs_devs->kobj_unregister);
+ fs_devs->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->super_kobj,
+ &btrfs_ktype, parent, "%pU", fs_devs->fsid);
+ return error;
+}
+
int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
{
int error;
+ struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+ struct kobject *super_kobj = &fs_devs->super_kobj;
+
+ btrfs_set_fs_info_ptr(fs_info);
- init_completion(&fs_info->kobj_unregister);
- fs_info->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
- "%pU", fs_info->fsid);
+ error = btrfs_kobj_add_device(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_group(&fs_info->super_kobj,
- &btrfs_feature_attr_group);
+ error = sysfs_create_files(super_kobj, btrfs_attrs);
if (error) {
- __btrfs_sysfs_remove_one(fs_info);
+ btrfs_kobj_rm_device(fs_devs, NULL);
return error;
}
- error = addrm_unknown_feature_attrs(fs_info, true);
+ error = sysfs_create_group(super_kobj,
+ &btrfs_feature_attr_group);
if (error)
goto failure;
- error = btrfs_kobj_add_device(fs_info, NULL);
+ error = addrm_unknown_feature_attrs(fs_info, true);
if (error)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- &fs_info->super_kobj);
+ super_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f7dd298b3cf6..6392527bcc15 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -61,17 +61,33 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
/* convert from attribute */
-#define to_btrfs_feature_attr(a) \
- container_of(a, struct btrfs_feature_attr, kobj_attr)
-#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
-#define attr_to_btrfs_feature_attr(a) \
- to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+ return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+ return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+ return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+ struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 73f299ebdabb..846d277b1901 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
#include "../transaction.h"
#include "../disk-io.h"
#include "../qgroup.h"
+#include "../backref.h"
static void init_dummy_trans(struct btrfs_trans_handle *trans)
{
@@ -227,21 +228,28 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
- ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ ret = btrfs_create_qgroup(NULL, fs_info, 5);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ /*
+ * Since the test trans doesn't havee the complicated delayed refs,
+ * we can only call btrfs_qgroup_account_extent() directly to test
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
if (ret)
return ret;
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
test_msg("Qgroup counts didn't match expected values\n");
return -EINVAL;
}
+ old_roots = NULL;
+ new_roots = NULL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
ret = remove_extent_item(root, 4096, 4096);
if (ret)
return -EINVAL;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't remove space from the qgroup %d\n", ret);
- return -EINVAL;
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return -EINVAL;
}
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct ulist *old_roots = NULL;
+ struct ulist *new_roots = NULL;
int ret;
init_dummy_trans(&trans);
@@ -301,26 +331,35 @@ static int test_multiple_refs(struct btrfs_root *root)
test_msg("Qgroup multiple refs test\n");
/* We have 5 created already from the previous test */
- ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ ret = btrfs_create_qgroup(NULL, fs_info, 256);
if (ret) {
test_msg("Couldn't create a qgroup %d\n", ret);
return ret;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Couldn't add space to a qgroup %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Delayed qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = add_tree_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
return -EINVAL;
}
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+ if (ret) {
+ ulist_free(old_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
+ return ret;
+ }
+
ret = remove_extent_ref(root, 4096, 4096, 0, 256);
if (ret)
return ret;
- ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
- BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
if (ret) {
- test_msg("Qgroup record ref failed %d\n", ret);
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+ test_msg("Couldn't find old roots: %d\n", ret);
return ret;
}
- ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+ old_roots, new_roots);
if (ret) {
- test_msg("Qgroup accounting failed %d\n", ret);
+ test_msg("Couldn't account space for a qgroup %d\n", ret);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8be4278e25e8..f5021fcb154e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,7 +35,7 @@
#define BTRFS_ROOT_TRANS_TAG 0
-static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
[TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
__TRANS_START),
@@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ if (transaction->delayed_refs.pending_csums)
+ printk(KERN_ERR "pending csums is %llu\n",
+ transaction->delayed_refs.pending_csums);
while (!list_empty(&transaction->pending_chunks)) {
struct extent_map *em;
@@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
*/
ASSERT(!waitqueue_active(&state->wq));
free_extent_state(state);
- if (need_resched()) {
- spin_unlock(&tree->lock);
- cond_resched();
- spin_lock(&tree->lock);
- }
+
+ cond_resched_lock(&tree->lock);
}
spin_unlock(&tree->lock);
}
@@ -222,13 +222,17 @@ loop:
atomic_set(&cur_trans->use_count, 2);
cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
+ cur_trans->dirty_bg_run = 0;
cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.pending_csums = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -250,6 +254,9 @@ loop:
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ INIT_LIST_HEAD(&cur_trans->io_bgs);
+ mutex_init(&cur_trans->cache_write_mutex);
+ cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -504,6 +511,7 @@ again:
h->transaction = cur_trans;
h->blocks_used = 0;
h->bytes_reserved = 0;
+ h->chunk_bytes_reserved = 0;
h->root = root;
h->delayed_ref_updates = 0;
h->use_count = 1;
@@ -721,7 +729,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates) {
- err = btrfs_run_delayed_refs(trans, root, updates);
+ err = btrfs_run_delayed_refs(trans, root, updates * 2);
if (err) /* Error code will also eval true */
return err;
}
@@ -753,7 +761,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->ordered)) {
spin_lock(&info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
spin_unlock(&info->trans_lock);
}
@@ -787,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
+
if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
should_end_transaction(trans, root) &&
ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -1057,6 +1067,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+ struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1110,7 +1121,7 @@ again:
return ret;
}
- while (!list_empty(dirty_bgs)) {
+ while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
ret = btrfs_write_dirty_block_groups(trans, root);
if (ret)
return ret;
@@ -1284,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (pending->error)
goto no_free_objectid;
+ /*
+ * Make qgroup to skip current new snapshot's qgroupid, as it is
+ * accounted by later btrfs_qgroup_inherit().
+ */
+ btrfs_set_skip_qgroup(trans, objectid);
+
btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
if (to_reserve > 0) {
@@ -1292,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
to_reserve,
BTRFS_RESERVE_NO_FLUSH);
if (pending->error)
- goto no_free_objectid;
+ goto clear_skip_qgroup;
}
key.objectid = objectid;
@@ -1390,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
goto fail;
}
-
- /*
- * We need to flush delayed refs in order to make sure all of our quota
- * operations have been done before we call btrfs_qgroup_inherit.
- */
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
- ret = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- goto fail;
- }
-
/* see comments in should_cow_block() */
set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
@@ -1491,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
}
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ /*
+ * account qgroup counters before qgroup_inherit()
+ */
+ ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_account_extents(trans, fs_info);
+ if (ret)
+ goto fail;
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
fail:
pending->error = ret;
dir_item_existed:
trans->block_rsv = rsv;
trans->bytes_reserved = 0;
+clear_skip_qgroup:
+ btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
root_item_alloc_fail:
@@ -1810,8 +1834,39 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
+ if (!cur_trans->dirty_bg_run) {
+ int run_it = 0;
+
+ /* this mutex is also taken before trying to set
+ * block groups readonly. We need to make sure
+ * that nobody has set a block group readonly
+ * after a extents from that block group have been
+ * allocated for cache files. btrfs_set_block_group_ro
+ * will wait for the transaction to commit if it
+ * finds dirty_bg_run = 1
+ *
+ * The dirty_bg_run flag is also used to make sure only
+ * one process starts all the block group IO. It wouldn't
+ * hurt to have more than one go through, but there's no
+ * real advantage to it either.
+ */
+ mutex_lock(&root->fs_info->ro_block_group_mutex);
+ if (!cur_trans->dirty_bg_run) {
+ run_it = 1;
+ cur_trans->dirty_bg_run = 1;
+ }
+ mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+ if (run_it)
+ ret = btrfs_start_dirty_block_groups(trans, root);
+ }
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
spin_lock(&root->fs_info->trans_lock);
- list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1926,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto scrub_continue;
}
+ /* Reocrd old roots for later qgroup accounting */
+ ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+ if (ret) {
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
/*
* make sure none of the code above managed to slip in a
* delayed item
@@ -1967,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_free_log_root_tree(trans, root->fs_info);
+ /*
+ * Since fs roots are all committed, we can get a quite accurate
+ * new_roots. So let's do quota accounting.
+ */
+ ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto scrub_continue;
+ }
+
ret = commit_cowonly_roots(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2003,6 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2016,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+ btrfs_trans_release_chunk_metadata(trans);
+
spin_lock(&root->fs_info->trans_lock);
cur_trans->state = TRANS_STATE_UNBLOCKED;
root->fs_info->running_transaction = NULL;
@@ -2076,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- if (current != root->fs_info->transaction_kthread)
+ if (current != root->fs_info->transaction_kthread &&
+ current != root->fs_info->cleaner_kthread)
btrfs_run_delayed_iputs(root);
return ret;
@@ -2085,6 +2162,7 @@ scrub_continue:
btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
+ btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
if (trans->qgroup_reserved) {
btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..eb09c2067fa8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,9 +64,19 @@ struct btrfs_transaction {
struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
+ struct list_head io_bgs;
+ u64 num_dirty_bgs;
+
+ /*
+ * we need to make sure block group deletion doesn't race with
+ * free space cache writeout. This mutex keeps them from stomping
+ * on each other
+ */
+ struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
+ int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -92,6 +102,7 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
+ u64 chunk_bytes_reserved;
u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
@@ -136,9 +147,34 @@ struct btrfs_pending_snapshot {
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
struct inode *inode)
{
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ spin_unlock(&BTRFS_I(inode)->lock);
+}
+
+/*
+ * Make qgroup codes to skip given qgroupid, means the old/new_roots for
+ * qgroup won't contain the qgroupid in it.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+ u64 qgroupid)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ WARN_ON(!delayed_refs->qgroup_to_skip);
+ delayed_refs->qgroup_to_skip = 0;
}
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9578..a4b9c8b2d35a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
- if (btrfs_test_opt(root, SSD))
- goto out;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c5b8ba37f88e..9c45431e69ab 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:
if (btrfs_inode_generation(eb, src_item) == 0) {
struct extent_buffer *dst_eb = path->nodes[0];
+ const u64 ino_size = btrfs_inode_size(eb, src_item);
+ /*
+ * For regular files an ino_size == 0 is used only when
+ * logging that an inode exists, as part of a directory
+ * fsync, and the inode wasn't fsynced before. In this
+ * case don't set the size of the inode in the fs/subvol
+ * tree, otherwise we would be throwing valid data away.
+ */
if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
- S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+ ino_size != 0) {
struct btrfs_map_token token;
- u64 ino_size = btrfs_inode_size(eb, src_item);
btrfs_init_map_token(&token);
btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -1951,6 +1959,104 @@ out:
return ret;
}
+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ const u64 ino)
+{
+ struct btrfs_key search_key;
+ struct btrfs_path *log_path;
+ int i;
+ int nritems;
+ int ret;
+
+ log_path = btrfs_alloc_path();
+ if (!log_path)
+ return -ENOMEM;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_XATTR_ITEM_KEY;
+ search_key.offset = 0;
+again:
+ ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+process_leaf:
+ nritems = btrfs_header_nritems(path->nodes[0]);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct btrfs_dir_item *log_di;
+ u32 total_size;
+ u32 cur;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+ total_size = btrfs_item_size_nr(path->nodes[0], i);
+ cur = 0;
+ while (cur < total_size) {
+ u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+ u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ char *name;
+
+ name = kmalloc(name_len, GFP_NOFS);
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(path->nodes[0], name,
+ (unsigned long)(di + 1), name_len);
+
+ log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+ name, name_len, 0);
+ btrfs_release_path(log_path);
+ if (!log_di) {
+ /* Doesn't exist in log tree, so delete it. */
+ btrfs_release_path(path);
+ di = btrfs_lookup_xattr(trans, root, path, ino,
+ name, name_len, -1);
+ kfree(name);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+ ASSERT(di);
+ ret = btrfs_delete_one_dir_name(trans, root,
+ path, di);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ search_key = key;
+ goto again;
+ }
+ kfree(name);
+ if (IS_ERR(log_di)) {
+ ret = PTR_ERR(log_di);
+ goto out;
+ }
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ goto process_leaf;
+out:
+ btrfs_free_path(log_path);
+ btrfs_release_path(path);
+ return ret;
+}
+
+
/*
* deletion replay happens before we copy any new directory items
* out of the log or out of backreferences from inodes. It
@@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ ret = replay_xattr_deletes(wc->trans, root, log,
+ path, key.objectid);
+ if (ret)
+ break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
@@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
+ clean_tree_block(trans, root->fs_info,
+ next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
+ clean_tree_block(trans, root->fs_info,
+ next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
if (trans) {
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
- clean_tree_block(trans, log, next);
+ clean_tree_block(trans, log->fs_info, next);
btrfs_wait_tree_block_writeback(next);
btrfs_tree_unlock(next);
}
@@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path, int key_type,
+ struct btrfs_log_ctx *ctx,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
@@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
src = path->nodes[0];
nritems = btrfs_header_nritems(src);
for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+
btrfs_item_key_to_cpu(src, &min_key, i);
if (min_key.objectid != ino || min_key.type != key_type)
@@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
err = ret;
goto done;
}
+
+ /*
+ * We must make sure that when we log a directory entry,
+ * the corresponding inode, after log replay, has a
+ * matching link count. For example:
+ *
+ * touch foo
+ * mkdir mydir
+ * sync
+ * ln foo mydir/bar
+ * xfs_io -c "fsync" mydir
+ * <crash>
+ * <mount fs and log replay>
+ *
+ * Would result in a fsync log that when replayed, our
+ * file inode would have a link count of 1, but we get
+ * two directory entries pointing to the same inode.
+ * After removing one of the names, it would not be
+ * possible to remove the other name, which resulted
+ * always in stale file handle errors, and would not
+ * be possible to rmdir the parent directory, since
+ * its i_size could never decrement to the value
+ * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+ */
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+ btrfs_dir_item_key_to_cpu(src, di, &tmp);
+ if (ctx &&
+ (btrfs_dir_transid(src, di) == trans->transid ||
+ btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+ tmp.type != BTRFS_ROOT_ITEM_KEY)
+ ctx->log_new_dentries = true;
}
path->slots[0] = nritems;
@@ -3175,7 +3321,8 @@ done:
static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path,
- struct btrfs_path *dst_path)
+ struct btrfs_path *dst_path,
+ struct btrfs_log_ctx *ctx)
{
u64 min_key;
u64 max_key;
@@ -3187,7 +3334,7 @@ again:
max_key = 0;
while (1) {
ret = log_dir_items(trans, root, inode, path,
- dst_path, key_type, min_key,
+ dst_path, key_type, ctx, min_key,
&max_key);
if (ret)
return ret;
@@ -3734,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
&ordered->flags))
continue;
- if (ordered->csum_bytes_left) {
- btrfs_start_ordered_extent(inode, ordered, 0);
- wait_event(ordered->wait,
- ordered->csum_bytes_left == 0);
- }
-
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
if (ret)
@@ -3963,7 +4104,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
if (ret < 0) {
return ret;
} else if (ret > 0) {
- *size_ret = i_size_read(inode);
+ *size_ret = 0;
} else {
struct btrfs_inode_item *item;
@@ -3976,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
return 0;
}
+/*
+ * At the moment we always log all xattrs. This is to figure out at log replay
+ * time which xattrs must have their deletion replayed. If a xattr is missing
+ * in the log tree and exists in the fs/subvol tree, we delete it. This is
+ * because if a xattr is deleted, the inode is fsynced and a power failure
+ * happens, causing the log to be replayed the next time the fs is mounted,
+ * we want the xattr to not exist anymore (same behaviour as other filesystems
+ * with a journal, ext3/4, xfs, f2fs, etc).
+ */
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path)
+{
+ int ret;
+ struct btrfs_key key;
+ const u64 ino = btrfs_ino(inode);
+ int ins_nr = 0;
+ int start_slot = 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_XATTR_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ int nritems = btrfs_header_nritems(leaf);
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ if (ins_nr == 0)
+ start_slot = slot;
+ ins_nr++;
+ path->slots[0]++;
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+ u64 last_extent = 0;
+
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, start_slot,
+ ins_nr, 1, 0);
+ /* can't be 1, extent items aren't processed */
+ ASSERT(ret <= 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * If the no holes feature is enabled we need to make sure any hole between the
+ * last extent and the i_size of our inode is explicitly marked in the log. This
+ * is to make sure that doing something like:
+ *
+ * 1) create file with 128Kb of data
+ * 2) truncate file to 64Kb
+ * 3) truncate file to 256Kb
+ * 4) fsync file
+ * 5) <crash/power failure>
+ * 6) mount fs and trigger log replay
+ *
+ * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+ * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+ * file correspond to a hole. The presence of explicit holes in a log tree is
+ * what guarantees that log replay will remove/adjust file extent items in the
+ * fs/subvol tree.
+ *
+ * Here we do not need to care about holes between extents, that is already done
+ * by copy_items(). We also only need to do this in the full sync path, where we
+ * lookup for extents from the fs/subvol tree only. In the fast path case, we
+ * lookup the list of modified extent maps and if any represents a hole, we
+ * insert a corresponding extent representing a hole in the log tree.
+ */
+static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ u64 hole_start;
+ u64 hole_size;
+ struct extent_buffer *leaf;
+ struct btrfs_root *log = root->log_root;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(inode);
+
+ if (!btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+ ASSERT(path->slots[0] > 0);
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* inode does not have any extents */
+ hole_start = 0;
+ hole_size = i_size;
+ } else {
+ struct btrfs_file_extent_item *extent;
+ u64 len;
+
+ /*
+ * If there's an extent beyond i_size, an explicit hole was
+ * already inserted by copy_items().
+ */
+ if (key.offset >= i_size)
+ return 0;
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(leaf,
+ path->slots[0],
+ extent);
+ ASSERT(len == i_size);
+ return 0;
+ }
+
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ /* Last extent goes beyond i_size, no need to log a hole. */
+ if (key.offset + len > i_size)
+ return 0;
+ hole_start = key.offset + len;
+ hole_size = i_size - hole_start;
+ }
+ btrfs_release_path(path);
+
+ /* Last extent ends at i_size. */
+ if (hole_size == 0)
+ return 0;
+
+ hole_size = ALIGN(hole_size, root->sectorsize);
+ ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+ hole_size, 0, hole_size, 0, 0, 0);
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4014,6 +4336,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 logged_isize = 0;
+ bool need_log_inode_item = true;
path = btrfs_alloc_path();
if (!path)
@@ -4070,10 +4393,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
- if (inode_only == LOG_INODE_EXISTS) {
- max_key_type = BTRFS_INODE_EXTREF_KEY;
- max_key.type = max_key_type;
- }
+ if (inode_only == LOG_INODE_EXISTS)
+ max_key_type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
if (inode_only == LOG_INODE_EXISTS) {
@@ -4098,7 +4419,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
- max_key.type = BTRFS_INODE_EXTREF_KEY;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4106,30 +4427,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
&BTRFS_I(inode)->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
+ while(1) {
+ ret = btrfs_truncate_inode_items(trans,
+ log, inode, 0, 0);
+ if (ret != -EAGAIN)
+ break;
+ }
}
- } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags) ||
+ } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode_only == LOG_INODE_ALL)
fast_search = true;
- max_key.type = BTRFS_XATTR_ITEM_KEY;
- } else {
- max_key.type = BTRFS_INODE_EXTREF_KEY;
- }
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
- ret = log_inode_item(trans, log, dst_path, inode);
- if (ret) {
- err = ret;
- goto out_unlock;
- }
goto log_extents;
}
@@ -4152,6 +4467,28 @@ again:
if (min_key.type > max_key.type)
break;
+ if (min_key.type == BTRFS_INODE_ITEM_KEY)
+ need_log_inode_item = false;
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ if (ret) {
+ btrfs_release_path(path);
+ continue;
+ }
+ goto next_slot;
+ }
+
src = path->nodes[0];
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
ins_nr++;
@@ -4219,9 +4556,26 @@ next_slot:
ins_nr = 0;
}
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
+ if (err)
+ goto out_unlock;
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+ err = btrfs_log_trailing_hole(trans, root, inode, path);
+ if (err)
+ goto out_unlock;
+ }
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
+ if (need_log_inode_item) {
+ err = log_inode_item(trans, log, dst_path, inode);
+ if (err)
+ goto out_unlock;
+ }
if (fast_search) {
/*
* Some ordered extents started by fsync might have completed
@@ -4277,15 +4631,18 @@ log_extents:
}
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- ret = log_directory_changes(trans, root, inode, path, dst_path);
+ ret = log_directory_changes(trans, root, inode, path, dst_path,
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
}
}
+ spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+ spin_unlock(&BTRFS_I(inode)->lock);
out_unlock:
if (unlikely(err))
btrfs_put_logged_extents(&logged_list);
@@ -4327,9 +4684,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
goto out;
if (!S_ISDIR(inode->i_mode)) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
goto out;
- inode = parent->d_inode;
+ inode = d_inode(parent);
}
while (1) {
@@ -4355,7 +4712,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
break;
}
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
if (IS_ROOT(parent))
@@ -4364,7 +4721,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
parent = dget_parent(parent);
dput(old_parent);
old_parent = parent;
- inode = parent->d_inode;
+ inode = d_inode(parent);
}
dput(old_parent);
@@ -4372,6 +4729,181 @@ out:
return ret;
}
+struct btrfs_dir_list {
+ u64 ino;
+ struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about the why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(sb_internal#2);
+ * lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ * that while logging the inode new references (names) are added or removed
+ * from the inode, leaving the logged inode item with a link count that does
+ * not match the number of logged inode reference items. This is fine because
+ * at log replay time we compute the real number of links and correct the
+ * link count in the inode item (see replay_one_buffer() and
+ * link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ * has a size that doesn't match the sum of the lengths of all the logged
+ * names. This does not result in a problem because if a dir_item key is
+ * logged but its matching dir_index key is not logged, at log replay time we
+ * don't use it to replay the respective name (see replay_one_name()). On the
+ * other hand if only the dir_index key ends up being logged, the respective
+ * name is added to the fs/subvol tree with both the dir_item and dir_index
+ * keys created (see replay_one_name()).
+ * The directory's inode item with a wrong i_size is not a problem as well,
+ * since we don't use it at log replay time to set the i_size in the inode
+ * item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *start_inode,
+ struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_root *log = root->log_root;
+ struct btrfs_path *path;
+ LIST_HEAD(dir_list);
+ struct btrfs_dir_list *dir_elem;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+ if (!dir_elem) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+ dir_elem->ino = btrfs_ino(start_inode);
+ list_add_tail(&dir_elem->list, &dir_list);
+
+ while (!list_empty(&dir_list)) {
+ struct extent_buffer *leaf;
+ struct btrfs_key min_key;
+ int nritems;
+ int i;
+
+ dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+ list);
+ if (ret)
+ goto next_dir_inode;
+
+ min_key.objectid = dir_elem->ino;
+ min_key.type = BTRFS_DIR_ITEM_KEY;
+ min_key.offset = 0;
+again:
+ btrfs_release_path(path);
+ ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+
+process_leaf:
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
+ struct btrfs_key di_key;
+ struct inode *di_inode;
+ struct btrfs_dir_list *new_dir_elem;
+ int log_mode = LOG_INODE_EXISTS;
+ int type;
+
+ btrfs_item_key_to_cpu(leaf, &min_key, i);
+ if (min_key.objectid != dir_elem->ino ||
+ min_key.type != BTRFS_DIR_ITEM_KEY)
+ goto next_dir_inode;
+
+ di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+ type = btrfs_dir_type(leaf, di);
+ if (btrfs_dir_transid(leaf, di) < trans->transid &&
+ type != BTRFS_FT_DIR)
+ continue;
+ btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+ if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+ continue;
+
+ di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+ root, NULL);
+ if (IS_ERR(di_inode)) {
+ ret = PTR_ERR(di_inode);
+ goto next_dir_inode;
+ }
+
+ if (btrfs_inode_in_log(di_inode, trans->transid)) {
+ iput(di_inode);
+ continue;
+ }
+
+ ctx->log_new_dentries = false;
+ if (type == BTRFS_FT_DIR)
+ log_mode = LOG_INODE_ALL;
+ btrfs_release_path(path);
+ ret = btrfs_log_inode(trans, root, di_inode,
+ log_mode, 0, LLONG_MAX, ctx);
+ iput(di_inode);
+ if (ret)
+ goto next_dir_inode;
+ if (ctx->log_new_dentries) {
+ new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+ GFP_NOFS);
+ if (!new_dir_elem) {
+ ret = -ENOMEM;
+ goto next_dir_inode;
+ }
+ new_dir_elem->ino = di_key.objectid;
+ list_add_tail(&new_dir_elem->list, &dir_list);
+ }
+ break;
+ }
+ if (i == nritems) {
+ ret = btrfs_next_leaf(log, path);
+ if (ret < 0) {
+ goto next_dir_inode;
+ } else if (ret > 0) {
+ ret = 0;
+ goto next_dir_inode;
+ }
+ goto process_leaf;
+ }
+ if (min_key.offset < (u64)-1) {
+ min_key.offset++;
+ goto again;
+ }
+next_dir_inode:
+ list_del(&dir_elem->list);
+ kfree(dir_elem);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -4394,6 +4926,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
const struct dentry * const first_parent = parent;
const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
last_committed);
+ bool log_dentries = false;
+ struct inode *orig_inode = inode;
sb = inode->i_sb;
@@ -4449,11 +4983,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+ log_dentries = true;
+
while (1) {
- if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
- inode = parent->d_inode;
+ inode = d_inode(parent);
if (root != BTRFS_I(inode)->root)
break;
@@ -4485,7 +5022,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
dput(old_parent);
old_parent = parent;
}
- ret = 0;
+ if (log_dentries)
+ ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+ else
+ ret = 0;
end_trans:
dput(old_parent);
if (ret < 0) {
@@ -4515,7 +5055,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent,
start, end, 0, ctx);
dput(parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
int io_err;
+ bool log_new_dentries;
struct list_head list;
};
@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->io_err = 0;
+ ctx->log_new_dentries = false;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b2778a..91feb2bdefee 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
return NULL;
}
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+ rb_erase(&node->rb_node, &ulist->root);
+ list_del(&node->list);
+ kfree(node);
+ BUG_ON(ulist->nnodes == 0);
+ ulist->nnodes--;
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
node->val = val;
node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
- node->seqnum = ulist->nnodes;
-#endif
ret = ulist_rbtree_insert(ulist, node);
ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
return 1;
}
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist: ulist to remove node from
+ * @val: value to delete
+ * @aux: aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+ struct ulist_node *node;
+
+ node = ulist_rbtree_search(ulist, val);
+ /* Not found */
+ if (!node)
+ return 1;
+
+ if (node->aux != aux)
+ return 1;
+
+ /* Found and delete */
+ ulist_rbtree_erase(ulist, node);
+ return 0;
+}
+
/**
* ulist_next - iterate ulist
* @ulist: ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
uiter->cur_list = uiter->cur_list->next;
} else {
uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
- uiter->i = 0;
-#endif
}
node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
- ASSERT(node->seqnum == uiter->i);
- ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
- uiter->i++;
-#endif
return node;
}
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604bbe..a01a2c45825f 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
/* just like ulist_add_merge() but take a pointer for the aux data */
static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..fbe7c104531c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+ return &fs_uuids;
+}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
@@ -345,7 +349,7 @@ loop_lock:
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
- BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
/*
* if we're doing the sync list, record that our
@@ -366,8 +370,8 @@ loop_lock:
btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
- if (need_resched())
- cond_resched();
+
+ cond_resched();
/*
* we made progress, there is more work to do and the bdi
@@ -400,8 +404,7 @@ loop_lock:
* against it before looping
*/
last_waited = ioc->last_waited;
- if (need_resched())
- cond_resched();
+ cond_resched();
continue;
}
spin_lock(&device->io_lock);
@@ -442,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct btrfs_device *dev;
+
+ if (!cur_dev->name)
+ return;
+
+ list_for_each_entry(fs_devs, &fs_uuids, list) {
+ int del = 1;
+
+ if (fs_devs->opened)
+ continue;
+ if (fs_devs->seeding)
+ continue;
+
+ list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+ if (dev == cur_dev)
+ continue;
+ if (!dev->name)
+ continue;
+
+ /*
+ * Todo: This won't be enough. What if the same device
+ * comes back (with new uuid and) with its mapper path?
+ * But for now, this does help as mostly an admin will
+ * either use mapper or non mapper path throughout.
+ */
+ rcu_read_lock();
+ del = strcmp(rcu_str_deref(dev->name),
+ rcu_str_deref(cur_dev->name));
+ rcu_read_unlock();
+ if (!del)
+ break;
+ }
+
+ if (!del) {
+ /* delete the stale device */
+ if (fs_devs->num_devices == 1) {
+ btrfs_sysfs_remove_fsid(fs_devs);
+ list_del(&fs_devs->list);
+ free_fs_devices(fs_devs);
+ } else {
+ fs_devs->num_devices--;
+ list_del(&dev->dev_list);
+ rcu_string_free(dev->name);
+ kfree(dev);
+ }
+ break;
+ }
+ }
+}
+
/*
* Add new device to list of registered devices
*
@@ -557,6 +615,12 @@ static noinline int device_list_add(const char *path,
if (!fs_devices->opened)
device->generation = found_transid;
+ /*
+ * if there is new btrfs on an already registered device,
+ * then remove the stale device entry.
+ */
+ btrfs_free_stale_device(device);
+
*fs_devices_ret = fs_devices;
return ret;
@@ -609,8 +673,7 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
@@ -695,13 +758,13 @@ static void free_device(struct rcu_head *head)
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
+ struct btrfs_device *device, *tmp;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
@@ -1060,6 +1123,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans,
struct extent_map *em;
struct list_head *search_list = &trans->transaction->pending_chunks;
int ret = 0;
+ u64 physical_start = *start;
again:
list_for_each_entry(em, search_list, list) {
@@ -1068,15 +1132,31 @@ again:
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
+ u64 end;
+
if (map->stripes[i].dev != device)
continue;
- if (map->stripes[i].physical >= *start + len ||
+ if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
- *start)
+ physical_start)
continue;
- *start = map->stripes[i].physical +
- em->orig_block_len;
- ret = 1;
+ /*
+ * Make sure that while processing the pinned list we do
+ * not override our *start with a lower value, because
+ * we can have pinned chunks that fall within this
+ * device hole and that have lower physical addresses
+ * than the pending chunks we processed before. If we
+ * do not take this special care we can end up getting
+ * 2 pending chunks that start at the same physical
+ * device offsets because the end offset of a pinned
+ * chunk can be equal to the start offset of some
+ * pending chunk.
+ */
+ end = map->stripes[i].physical + em->orig_block_len;
+ if (end > *start) {
+ *start = end;
+ ret = 1;
+ }
}
}
if (search_list == &trans->transaction->pending_chunks) {
@@ -1136,11 +1216,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-again:
+
max_hole_start = search_start;
max_hole_size = 0;
- hole_size = 0;
+again:
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
goto out;
@@ -1195,8 +1275,14 @@ again:
*/
if (contains_pending_extent(trans, device,
&search_start,
- hole_size))
- hole_size = 0;
+ hole_size)) {
+ if (key.offset >= search_start) {
+ hole_size = key.offset - search_start;
+ } else {
+ WARN_ON_ONCE(1);
+ hole_size = 0;
+ }
+ }
if (hole_size > max_hole_size) {
max_hole_start = search_start;
@@ -1233,21 +1319,23 @@ next:
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
- if (search_end > search_start)
+ if (search_end > search_start) {
hole_size = search_end - search_start;
- if (hole_size > max_hole_size) {
- max_hole_start = search_start;
- max_hole_size = hole_size;
- }
+ if (contains_pending_extent(trans, device, &search_start,
+ hole_size)) {
+ btrfs_release_path(path);
+ goto again;
+ }
- if (contains_pending_extent(trans, device, &search_start, hole_size)) {
- btrfs_release_path(path);
- goto again;
+ if (hole_size > max_hole_size) {
+ max_hole_start = search_start;
+ max_hole_size = hole_size;
+ }
}
/* See above. */
- if (hole_size < num_bytes)
+ if (max_hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
@@ -1699,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1868,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+ btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
@@ -2204,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info, device);
+ btrfs_kobj_add_device(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2245,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
- goto error_trans;
+ if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ fsid_buf))
+ pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2282,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info, device);
+ btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2487,8 +2579,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
+ struct btrfs_root *root, u64 chunk_objectid,
u64 chunk_offset)
{
int ret;
@@ -2580,7 +2671,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct map_lookup *map;
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- u64 chunk_tree = root->fs_info->chunk_root->objectid;
int i, ret = 0;
/* Just in case */
@@ -2604,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
+ lock_chunks(root->fs_info->chunk_root);
+ check_system_chunk(trans, extent_root, map->type);
+ unlock_chunks(root->fs_info->chunk_root);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
@@ -2634,8 +2727,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
}
}
}
- ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
- chunk_offset);
+ ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
@@ -2664,8 +2756,8 @@ out:
}
static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_tree, u64 chunk_objectid,
- u64 chunk_offset)
+ u64 chunk_objectid,
+ u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
@@ -2674,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root;
+ /*
+ * Prevent races with automatic removal of unused block groups.
+ * After we relocate and before we remove the chunk with offset
+ * chunk_offset, automatic removal of the block group can kick in,
+ * resulting in a failure when calling btrfs_remove_chunk() below.
+ *
+ * Make sure to acquire this mutex before doing a tree search (dev
+ * or chunk trees) to find chunks. Otherwise the cleaner kthread might
+ * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
+ * we release the path used to search the chunk/dev tree and before
+ * the current task acquires this mutex and calls us.
+ */
+ ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
@@ -2707,7 +2813,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
- u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
bool retried = false;
int failed = 0;
@@ -2723,13 +2828,18 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto error;
+ }
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto error;
if (ret > 0)
@@ -2744,7 +2854,7 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+ ret = btrfs_relocate_chunk(chunk_root,
found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
@@ -2752,6 +2862,7 @@ again:
else
BUG_ON(ret);
}
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (found_key.offset == 0)
break;
@@ -3022,7 +3133,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
- do_div(stripe_length, factor);
+ stripe_length = div_u64(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
@@ -3208,9 +3319,12 @@ again:
goto error;
}
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
+ }
/*
* this shouldn't happen, it means the last relocate
@@ -3222,6 +3336,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
ret = 0;
break;
}
@@ -3230,8 +3345,10 @@ again:
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- if (found_key.objectid != key.objectid)
+ if (found_key.objectid != key.objectid) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
break;
+ }
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -3244,10 +3361,13 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
- if (!ret)
+ if (!ret) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto loop;
+ }
if (counting) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3255,9 +3375,9 @@ again:
}
ret = btrfs_relocate_chunk(chunk_root,
- chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
@@ -3906,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
uuid_root = btrfs_create_tree(trans, fs_info,
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
- btrfs_abort_transaction(trans, tree_root,
- PTR_ERR(uuid_root));
- return PTR_ERR(uuid_root);
+ ret = PTR_ERR(uuid_root);
+ btrfs_abort_transaction(trans, tree_root, ret);
+ return ret;
}
fs_info->uuid_root = uuid_root;
@@ -3957,13 +4077,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
- u64 chunk_tree;
u64 chunk_objectid;
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
+ bool checked_pending_chunks = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -3997,11 +4117,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
+ mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret < 0) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
goto done;
+ }
ret = btrfs_previous_item(root, path, 0, key.type);
+ if (ret)
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret < 0)
goto done;
if (ret) {
@@ -4015,6 +4140,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
@@ -4023,17 +4149,17 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
btrfs_release_path(path);
break;
}
- chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
- chunk_offset);
+ ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
@@ -4046,15 +4172,6 @@ again:
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
- lock_chunks(root);
-
- btrfs_device_set_total_bytes(device, old_size);
- if (device->writeable)
- device->fs_devices->total_rw_bytes += diff;
- spin_lock(&root->fs_info->free_chunk_lock);
- root->fs_info->free_chunk_space += diff;
- spin_unlock(&root->fs_info->free_chunk_lock);
- unlock_chunks(root);
goto done;
}
@@ -4066,6 +4183,35 @@ again:
}
lock_chunks(root);
+
+ /*
+ * We checked in the above loop all device extents that were already in
+ * the device tree. However before we have updated the device's
+ * total_bytes to the new size, we might have had chunk allocations that
+ * have not complete yet (new block groups attached to transaction
+ * handles), and therefore their device extents were not yet in the
+ * device tree and we missed them in the loop above. So if we have any
+ * pending chunk using a device extent that overlaps the device range
+ * that we can not use anymore, commit the current transaction and
+ * repeat the search on the device tree - this way we guarantee we will
+ * not have chunks using device extents that end beyond 'new_size'.
+ */
+ if (!checked_pending_chunks) {
+ u64 start = new_size;
+ u64 len = old_size - new_size;
+
+ if (contains_pending_extent(trans, device, &start, len)) {
+ unlock_chunks(root);
+ checked_pending_chunks = true;
+ failed = 0;
+ retried = false;
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ goto done;
+ goto again;
+ }
+ }
+
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
@@ -4080,6 +4226,16 @@ again:
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
+ if (ret) {
+ lock_chunks(root);
+ btrfs_device_set_total_bytes(device, old_size);
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ spin_lock(&root->fs_info->free_chunk_lock);
+ root->fs_info->free_chunk_space += diff;
+ spin_unlock(&root->fs_info->free_chunk_lock);
+ unlock_chunks(root);
+ }
return ret;
}
@@ -4131,7 +4287,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
@@ -4289,7 +4445,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
@@ -4400,8 +4556,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
*/
if (stripe_size * data_stripes > max_chunk_size) {
u64 mask = (1ULL << 24) - 1;
- stripe_size = max_chunk_size;
- do_div(stripe_size, data_stripes);
+
+ stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
stripe_size = (stripe_size + mask) & ~mask;
@@ -4413,10 +4569,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
stripe_size = devices_info[ndevs-1].max_avail;
}
- do_div(stripe_size, dev_stripes);
+ stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
- do_div(stripe_size, raid_stripe_len);
+ stripe_size = div_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -4626,6 +4782,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
+ ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
chunk_offset = find_next_chunk(extent_root->fs_info);
return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
@@ -4954,7 +5111,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
- int stripe_index;
+ u32 stripe_index;
int i;
int ret = 0;
int num_stripes;
@@ -4995,7 +5152,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
- do_div(stripe_nr, stripe_len);
+ stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
@@ -5011,7 +5168,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* allow a write of a full stripe, but make sure we don't
* allow straddling of stripes
*/
- do_div(raid56_full_stripe_start, full_stripe_len);
+ raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+ full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
}
@@ -5136,7 +5294,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_index = 0;
stripe_nr_orig = stripe_nr;
stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
- do_div(stripe_nr_end, map->stripe_len);
+ stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + *length);
@@ -5144,7 +5302,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
- stripe_index = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5170,9 +5329,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- int factor = map->num_stripes / map->sub_stripes;
+ u32 factor = map->num_stripes / map->sub_stripes;
- stripe_index = do_div(stripe_nr, factor);
+ stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5198,8 +5357,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
- stripe_nr = raid56_full_stripe_start;
- do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+ stripe_nr = div_u64(raid56_full_stripe_start,
+ stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@@ -5209,32 +5368,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_index = 0;
stripe_offset = 0;
} else {
- u64 tmp;
-
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
- stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+ stripe_nr = div_u64_rem(stripe_nr,
+ nr_data_stripes(map), &stripe_index);
if (mirror_num > 1)
stripe_index = nr_data_stripes(map) +
mirror_num - 2;
/* We distribute the parity blocks across stripes */
- tmp = stripe_nr + stripe_index;
- stripe_index = do_div(tmp, map->num_stripes);
+ div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+ &stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD |
REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
- * after this do_div call, stripe_nr is the number of stripes
- * on this device we have to walk to find the data, and
- * stripe_index is the number of our device in the stripe array
+ * after this, stripe_nr is the number of stripes on this
+ * device we have to walk to find the data, and stripe_index is
+ * the number of our device in the stripe array
*/
- stripe_index = do_div(stripe_nr, map->num_stripes);
+ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+ &stripe_index);
mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
@@ -5261,7 +5420,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
u64 tmp;
- int i, rot;
+ unsigned rot;
bbio->raid_map = (u64 *)((void *)bbio->stripes +
sizeof(struct btrfs_bio_stripe) *
@@ -5269,8 +5428,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
sizeof(int) * tgtdev_indexes);
/* Work out the disk rotation on this stripe-set */
- tmp = stripe_nr;
- rot = do_div(tmp, num_stripes);
+ div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * nr_data_stripes(map);
@@ -5285,8 +5443,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
if (rw & REQ_DISCARD) {
- int factor = 0;
- int sub_stripes = 0;
+ u32 factor = 0;
+ u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
@@ -5437,9 +5595,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
}
if (found) {
- u64 length = map->stripe_len;
-
- if (physical_of_found + length <=
+ if (physical_of_found + map->stripe_len <=
dev_replace->cursor_left) {
struct btrfs_bio_stripe *tgtdev_stripe =
bbio->stripes + num_stripes;
@@ -5535,15 +5691,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
- do_div(length, map->num_stripes / map->sub_stripes);
+ length = div_u64(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
- do_div(length, map->num_stripes);
+ length = div_u64(length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- do_div(length, nr_data_stripes(map));
+ length = div_u64(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
- buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
@@ -5554,11 +5710,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
continue;
stripe_nr = physical - map->stripes[i].physical;
- do_div(stripe_nr, map->stripe_len);
+ stripe_nr = div_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
- do_div(stripe_nr, map->sub_stripes);
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
} /* else if RAID[56], multiply by nr_data_stripes().
@@ -5587,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
{
- if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
- bio_endio_nodec(bio, err);
- else
- bio_endio(bio, err);
+ bio->bi_private = bbio->private;
+ bio->bi_end_io = bbio->end_io;
+ bio_endio(bio, err);
+
btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
- struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5605,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
+ struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5634,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
bio = bbio->orig_bio;
}
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
@@ -5817,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
/* Shoud be the original bio. */
WARN_ON(bio != bbio->orig_bio);
- bio->bi_private = bbio->private;
- bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
bio->bi_iter.bi_sector = logical >> 9;
@@ -5835,8 +5987,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 length = 0;
u64 map_length;
int ret;
- int dev_nr = 0;
- int total_devs = 1;
+ int dev_nr;
+ int total_devs;
struct btrfs_bio *bbio = NULL;
length = bio->bi_iter.bi_size;
@@ -5877,11 +6029,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG();
}
- while (dev_nr < total_devs) {
+ for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
bbio_error(bbio, first_bio, logical);
- dev_nr++;
continue;
}
@@ -5894,22 +6045,18 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
ret = breakup_stripe_bio(root, bbio, first_bio, dev,
dev_nr, rw, async_submit);
BUG_ON(ret);
- dev_nr++;
continue;
}
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
- } else {
+ } else
bio = first_bio;
- bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
- }
submit_stripe_bio(root, bbio, bio,
bbio->stripes[dev_nr].physical, dev_nr, rw,
async_submit);
- dev_nr++;
}
btrfs_bio_counter_dec(root->fs_info);
return 0;
@@ -6082,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
return -EIO;
}
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+ devid, uuid);
}
map->stripes[i].dev->in_fs_metadata = 1;
}
@@ -6201,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root,
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
- btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
+ btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+ devid, dev_uuid);
} else {
if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
return -EIO;
@@ -6732,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
}
unlock_chunks(root);
}
+
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = fs_info;
+ fs_devices = fs_devices->seed;
+ }
+}
+
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ while (fs_devices) {
+ fs_devices->fs_info = NULL;
+ fs_devices = fs_devices->seed;
+ }
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 83069dec6898..95842a909e7f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,12 @@ struct btrfs_fs_devices {
* nonrot flag set
*/
int rotating;
+
+ struct btrfs_fs_info *fs_info;
+ /* sysfs kobjects */
+ struct kobject super_kobj;
+ struct kobject *device_dir_kobj;
+ struct completion kobj_unregister;
};
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
@@ -292,8 +298,6 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
-#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
-
struct btrfs_bio {
atomic_t refs;
atomic_t stripes_pending;
@@ -421,8 +425,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
- struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device);
@@ -538,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root)
mutex_unlock(&root->fs_info->chunk_mutex);
}
+struct list_head *btrfs_get_fs_uuids(void);
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 883b93623bc5..6f518c90e1c1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -261,7 +261,7 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key key, found_key;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
/*
* Check if the attribute is in a supported namespace.
*
- * This applied after the check for the synthetic attributes in the system
+ * This is applied after the check for the synthetic attributes in the system
* namespace.
*/
-static bool btrfs_is_valid_xattr(const char *name)
+static int btrfs_is_valid_xattr(const char *name)
{
- return !strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN) ||
- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
- !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
+ int len = strlen(name);
+ int prefixlen = 0;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN))
+ prefixlen = XATTR_SECURITY_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ prefixlen = XATTR_SYSTEM_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+ prefixlen = XATTR_TRUSTED_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ prefixlen = XATTR_USER_PREFIX_LEN;
+ else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ prefixlen = XATTR_BTRFS_PREFIX_LEN;
+ else
+ return -EOPNOTSUPP;
+
+ /*
+ * The name cannot consist of just prefix
+ */
+ if (len <= prefixlen)
+ return -EINVAL;
+
+ return 0;
}
ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
+ int ret;
+
/*
* If this is a request for a synthetic attribute in the system.*
* namespace use the generic infrastructure to resolve a handler
@@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_getxattr(dentry, name, buffer, size);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
- return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
+ return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
}
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
- struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_setxattr(dentry, name, value, size, flags);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(dentry->d_inode, name,
+ return btrfs_set_prop(d_inode(dentry), name,
value, size, flags);
if (size == 0)
value = ""; /* empty EA, do not remove */
- return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
- struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
+ int ret;
/*
* The permission on security.* and system.* is not checked
@@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_removexattr(dentry, name);
- if (!btrfs_is_valid_xattr(name))
- return -EOPNOTSUPP;
+ ret = btrfs_is_valid_xattr(name);
+ if (ret)
+ return ret;
if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(dentry->d_inode, name,
+ return btrfs_set_prop(d_inode(dentry), name,
NULL, 0, XATTR_REPLACE);
- return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+ return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
XATTR_REPLACE);
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index fb22fd8d8fb8..82990b8f872b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -403,7 +403,7 @@ next:
return ret;
}
-struct btrfs_compress_op btrfs_zlib_compress = {
+const struct btrfs_compress_op btrfs_zlib_compress = {
.alloc_workspace = zlib_alloc_workspace,
.free_workspace = zlib_free_workspace,
.compress_pages = zlib_compress_pages,
diff --git a/fs/buffer.c b/fs/buffer.c
index 20805db2c987..1cf7a53a0277 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -30,6 +30,7 @@
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
+#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
@@ -44,6 +45,9 @@
#include <trace/events/block.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags,
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
*/
-static void __set_page_dirty(struct page *page,
- struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
/*
@@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
+ struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page;
} while (bh != head);
}
+ /*
+ * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __set_page_dirty(page, mapping, memcg, 1);
+
+ mem_cgroup_end_page_stat(memcg);
+
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
+ struct address_space *mapping = NULL;
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, 0);
+ __set_page_dirty(page, mapping, memcg, 0);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (mapping)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
}
EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *bh, *head;
unsigned int blocksize, bbits;
int nr_underway = 0;
- int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
- WRITE_SYNC : WRITE);
+ int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -1828,7 +1851,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh(write_op, bh);
+ submit_bh_wbc(write_op, bh, 0, wbc);
nr_underway++;
}
bh = next;
@@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
{
struct buffer_head *bh = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- }
-
if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
set_bit(BH_Quiet, &bh->b_state);
@@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio)
}
}
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+ unsigned long bio_flags, struct writeback_control *wbc)
{
struct bio *bio;
- int ret = 0;
BUG_ON(!buffer_locked(bh));
BUG_ON(!buffer_mapped(bh));
@@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
*/
bio = bio_alloc(GFP_NOIO, 1);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
if (buffer_prio(bh))
rw |= REQ_PRIO;
- bio_get(bio);
submit_bio(rw, bio);
+ return 0;
+}
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
-
- bio_put(bio);
- return ret;
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+ return submit_bh_wbc(rw, bh, bio_flags, NULL);
}
EXPORT_SYMBOL_GPL(_submit_bh);
int submit_bh(int rw, struct buffer_head *bh)
{
- return _submit_bh(rw, bh, 0);
+ return submit_bh_wbc(rw, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3244,7 +3267,7 @@ int try_to_free_buffers(struct page *page)
* dirty bit from being lost.
*/
if (ret)
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ cancel_dirty_page(page);
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index fbb08e97438d..6af790fc3df8 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -123,11 +123,11 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
/* check parameters */
ret = -EOPNOTSUPP;
- if (!root->d_inode ||
- !root->d_inode->i_op->lookup ||
- !root->d_inode->i_op->mkdir ||
- !root->d_inode->i_op->setxattr ||
- !root->d_inode->i_op->getxattr ||
+ if (d_is_negative(root) ||
+ !d_backing_inode(root)->i_op->lookup ||
+ !d_backing_inode(root)->i_op->mkdir ||
+ !d_backing_inode(root)->i_op->setxattr ||
+ !d_backing_inode(root)->i_op->getxattr ||
!root->d_sb->s_op->statfs ||
!root->d_sb->s_op->sync_fs)
goto error_unsupported;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 232426214fdd..afa023dded5b 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -441,12 +441,12 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
fscache_set_store_limit(&object->fscache, ni_size);
- oi_size = i_size_read(object->backer->d_inode);
+ oi_size = i_size_read(d_backing_inode(object->backer));
if (oi_size == ni_size)
return 0;
cachefiles_begin_secure(cache, &saved_cred);
- mutex_lock(&object->backer->d_inode->i_mutex);
+ mutex_lock(&d_inode(object->backer)->i_mutex);
/* if there's an extension to a partial page at the end of the backing
* file, we need to discard the partial page so that we pick up new
@@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
ret = notify_change(object->backer, &newattrs, NULL);
truncate_failed:
- mutex_unlock(&object->backer->d_inode->i_mutex);
+ mutex_unlock(&d_inode(object->backer)->i_mutex);
cachefiles_end_secure(cache, saved_cred);
if (ret == -EIO) {
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 8c52472d2efa..aecd0859eacb 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -43,7 +43,6 @@ struct cachefiles_object {
loff_t i_size; /* object size */
unsigned long flags;
#define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */
-#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */
atomic_t usage; /* object usage count */
uint8_t type; /* object type */
uint8_t new; /* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1e51714eb33e..fc1056f5c96a 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
* call vfs_unlink(), vfs_rmdir() or vfs_rename()
*/
static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
- struct dentry *dentry)
+ struct dentry *dentry,
+ enum fscache_why_object_killed why)
{
struct cachefiles_object *object;
struct rb_node *p;
@@ -132,8 +133,9 @@ found_dentry:
pr_err("\n");
pr_err("Error: Can't preemptively bury live object\n");
cachefiles_printk_object(object, NULL);
- } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
- pr_err("Error: Object already preemptively buried\n");
+ } else {
+ if (why != FSCACHE_OBJECT_IS_STALE)
+ fscache_object_mark_killed(&object->fscache, why);
}
write_unlock(&cache->active_lock);
@@ -265,7 +267,8 @@ requeue:
static int cachefiles_bury_object(struct cachefiles_cache *cache,
struct dentry *dir,
struct dentry *rep,
- bool preemptive)
+ bool preemptive,
+ enum fscache_why_object_killed why)
{
struct dentry *grave, *trap;
struct path path, path_to_graveyard;
@@ -286,13 +289,13 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
if (ret < 0) {
cachefiles_io_error(cache, "Unlink security error");
} else {
- ret = vfs_unlink(dir->d_inode, rep, NULL);
+ ret = vfs_unlink(d_inode(dir), rep, NULL);
if (preemptive)
- cachefiles_mark_object_buried(cache, rep);
+ cachefiles_mark_object_buried(cache, rep, why);
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
if (ret == -EIO)
cachefiles_io_error(cache, "Unlink failed");
@@ -303,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
/* directories have to be moved to the graveyard */
_debug("move stale object to graveyard");
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
try_again:
/* first step is to make up a grave dentry in the graveyard */
@@ -355,7 +358,7 @@ try_again:
return -EIO;
}
- if (grave->d_inode) {
+ if (d_is_positive(grave)) {
unlock_rename(cache->graveyard, dir);
dput(grave);
grave = NULL;
@@ -387,14 +390,14 @@ try_again:
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
- ret = vfs_rename(dir->d_inode, rep,
- cache->graveyard->d_inode, grave, NULL, 0);
+ ret = vfs_rename(d_inode(dir), rep,
+ d_inode(cache->graveyard), grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
if (preemptive)
- cachefiles_mark_object_buried(cache, rep);
+ cachefiles_mark_object_buried(cache, rep, why);
}
unlock_rename(cache->graveyard, dir);
@@ -415,30 +418,31 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
_enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
ASSERT(object->dentry);
- ASSERT(object->dentry->d_inode);
+ ASSERT(d_backing_inode(object->dentry));
ASSERT(object->dentry->d_parent);
dir = dget_parent(object->dentry);
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
- if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+ if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
/* object allocation for the same key preemptively deleted this
* object's file so that it could create its own file */
_debug("object preemptively buried");
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
ret = 0;
} else {
/* we need to check that our parent is _still_ our parent - it
* may have been renamed */
if (dir == object->dentry->d_parent) {
ret = cachefiles_bury_object(cache, dir,
- object->dentry, false);
+ object->dentry, false,
+ FSCACHE_OBJECT_WAS_RETIRED);
} else {
/* it got moved, presumably by cachefilesd culling it,
* so it's no longer in the key path and we can ignore
* it */
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
ret = 0;
}
}
@@ -473,7 +477,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
path.mnt = cache->mnt;
ASSERT(parent->dentry);
- ASSERT(parent->dentry->d_inode);
+ ASSERT(d_backing_inode(parent->dentry));
if (!(d_is_dir(parent->dentry))) {
// TODO: convert file to dir
@@ -497,7 +501,7 @@ lookup_again:
/* search the current directory for the element name */
_debug("lookup '%s'", name);
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
start = jiffies;
next = lookup_one_len(name, dir, nlen);
@@ -505,74 +509,74 @@ lookup_again:
if (IS_ERR(next))
goto lookup_error;
- _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative");
+ _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative");
if (!key)
- object->new = !next->d_inode;
+ object->new = !d_backing_inode(next);
/* if this element of the path doesn't exist, then the lookup phase
* failed, and we can release any readers in the certain knowledge that
* there's nothing for them to actually read */
- if (!next->d_inode)
+ if (d_is_negative(next))
fscache_object_lookup_negative(&object->fscache);
/* we need to create the object if it's negative */
if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) {
/* index objects and intervening tree levels must be subdirs */
- if (!next->d_inode) {
+ if (d_is_negative(next)) {
ret = cachefiles_has_space(cache, 1, 0);
if (ret < 0)
- goto create_error;
+ goto no_space_error;
path.dentry = dir;
ret = security_path_mkdir(&path, next, 0);
if (ret < 0)
goto create_error;
start = jiffies;
- ret = vfs_mkdir(dir->d_inode, next, 0);
+ ret = vfs_mkdir(d_inode(dir), next, 0);
cachefiles_hist(cachefiles_mkdir_histogram, start);
if (ret < 0)
goto create_error;
- ASSERT(next->d_inode);
+ ASSERT(d_backing_inode(next));
_debug("mkdir -> %p{%p{ino=%lu}}",
- next, next->d_inode, next->d_inode->i_ino);
+ next, d_backing_inode(next), d_backing_inode(next)->i_ino);
} else if (!d_can_lookup(next)) {
pr_err("inode %lu is not a directory\n",
- next->d_inode->i_ino);
+ d_backing_inode(next)->i_ino);
ret = -ENOBUFS;
goto error;
}
} else {
/* non-index objects start out life as files */
- if (!next->d_inode) {
+ if (d_is_negative(next)) {
ret = cachefiles_has_space(cache, 1, 0);
if (ret < 0)
- goto create_error;
+ goto no_space_error;
path.dentry = dir;
ret = security_path_mknod(&path, next, S_IFREG, 0);
if (ret < 0)
goto create_error;
start = jiffies;
- ret = vfs_create(dir->d_inode, next, S_IFREG, true);
+ ret = vfs_create(d_inode(dir), next, S_IFREG, true);
cachefiles_hist(cachefiles_create_histogram, start);
if (ret < 0)
goto create_error;
- ASSERT(next->d_inode);
+ ASSERT(d_backing_inode(next));
_debug("create -> %p{%p{ino=%lu}}",
- next, next->d_inode, next->d_inode->i_ino);
+ next, d_backing_inode(next), d_backing_inode(next)->i_ino);
} else if (!d_can_lookup(next) &&
!d_is_reg(next)
) {
pr_err("inode %lu is not a file or directory\n",
- next->d_inode->i_ino);
+ d_backing_inode(next)->i_ino);
ret = -ENOBUFS;
goto error;
}
@@ -581,7 +585,7 @@ lookup_again:
/* process the next component */
if (key) {
_debug("advance");
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(dir);
dir = next;
next = NULL;
@@ -602,7 +606,8 @@ lookup_again:
* mutex) */
object->dentry = NULL;
- ret = cachefiles_bury_object(cache, dir, next, true);
+ ret = cachefiles_bury_object(cache, dir, next, true,
+ FSCACHE_OBJECT_IS_STALE);
dput(next);
next = NULL;
@@ -610,6 +615,7 @@ lookup_again:
goto delete_error;
_debug("redo lookup");
+ fscache_object_retrying_stale(&object->fscache);
goto lookup_again;
}
}
@@ -617,7 +623,7 @@ lookup_again:
/* note that we're now using this object */
ret = cachefiles_mark_object_active(cache, object);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(dir);
dir = NULL;
@@ -646,7 +652,7 @@ lookup_again:
const struct address_space_operations *aops;
ret = -EPERM;
- aops = object->dentry->d_inode->i_mapping->a_ops;
+ aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
if (!aops->bmap)
goto check_error;
@@ -659,9 +665,11 @@ lookup_again:
object->new = 0;
fscache_obtained_object(&object->fscache);
- _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino);
+ _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino);
return 0;
+no_space_error:
+ fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE);
create_error:
_debug("create error %d", ret);
if (ret == -EIO)
@@ -695,7 +703,7 @@ lookup_error:
cachefiles_io_error(cache, "Lookup failed");
next = NULL;
error:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(next);
error_out2:
dput(dir);
@@ -719,7 +727,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
_enter(",,%s", dirname);
/* search the current directory for the element name */
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock(&d_inode(dir)->i_mutex);
start = jiffies;
subdir = lookup_one_len(dirname, dir, strlen(dirname));
@@ -731,10 +739,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
}
_debug("subdir -> %p %s",
- subdir, subdir->d_inode ? "positive" : "negative");
+ subdir, d_backing_inode(subdir) ? "positive" : "negative");
/* we need to create the subdir if it doesn't exist yet */
- if (!subdir->d_inode) {
+ if (d_is_negative(subdir)) {
ret = cachefiles_has_space(cache, 1, 0);
if (ret < 0)
goto mkdir_error;
@@ -746,22 +754,22 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
ret = security_path_mkdir(&path, subdir, 0700);
if (ret < 0)
goto mkdir_error;
- ret = vfs_mkdir(dir->d_inode, subdir, 0700);
+ ret = vfs_mkdir(d_inode(dir), subdir, 0700);
if (ret < 0)
goto mkdir_error;
- ASSERT(subdir->d_inode);
+ ASSERT(d_backing_inode(subdir));
_debug("mkdir -> %p{%p{ino=%lu}}",
subdir,
- subdir->d_inode,
- subdir->d_inode->i_ino);
+ d_backing_inode(subdir),
+ d_backing_inode(subdir)->i_ino);
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
/* we need to make sure the subdir is a directory */
- ASSERT(subdir->d_inode);
+ ASSERT(d_backing_inode(subdir));
if (!d_can_lookup(subdir)) {
pr_err("%s is not a directory\n", dirname);
@@ -770,18 +778,18 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
}
ret = -EPERM;
- if (!subdir->d_inode->i_op->setxattr ||
- !subdir->d_inode->i_op->getxattr ||
- !subdir->d_inode->i_op->lookup ||
- !subdir->d_inode->i_op->mkdir ||
- !subdir->d_inode->i_op->create ||
- (!subdir->d_inode->i_op->rename &&
- !subdir->d_inode->i_op->rename2) ||
- !subdir->d_inode->i_op->rmdir ||
- !subdir->d_inode->i_op->unlink)
+ if (!d_backing_inode(subdir)->i_op->setxattr ||
+ !d_backing_inode(subdir)->i_op->getxattr ||
+ !d_backing_inode(subdir)->i_op->lookup ||
+ !d_backing_inode(subdir)->i_op->mkdir ||
+ !d_backing_inode(subdir)->i_op->create ||
+ (!d_backing_inode(subdir)->i_op->rename &&
+ !d_backing_inode(subdir)->i_op->rename2) ||
+ !d_backing_inode(subdir)->i_op->rmdir ||
+ !d_backing_inode(subdir)->i_op->unlink)
goto check_error;
- _leave(" = [%lu]", subdir->d_inode->i_ino);
+ _leave(" = [%lu]", d_backing_inode(subdir)->i_ino);
return subdir;
check_error:
@@ -790,19 +798,19 @@ check_error:
return ERR_PTR(ret);
mkdir_error:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
ret = PTR_ERR(subdir);
pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
_leave(" = -ENOMEM");
return ERR_PTR(-ENOMEM);
}
@@ -827,7 +835,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir, filename);
/* look up the victim */
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
@@ -836,13 +844,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
goto lookup_error;
//_debug("victim -> %p %s",
- // victim, victim->d_inode ? "positive" : "negative");
+ // victim, d_backing_inode(victim) ? "positive" : "negative");
/* if the object is no longer there then we probably retired the object
* at the netfs's request whilst the cull was in progress
*/
- if (!victim->d_inode) {
- mutex_unlock(&dir->d_inode->i_mutex);
+ if (d_is_negative(victim)) {
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(victim);
_leave(" = -ENOENT [absent]");
return ERR_PTR(-ENOENT);
@@ -871,13 +879,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
object_in_use:
read_unlock(&cache->active_lock);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(victim);
//_leave(" = -EBUSY [in use]");
return ERR_PTR(-EBUSY);
lookup_error:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
ret = PTR_ERR(victim);
if (ret == -ENOENT) {
/* file or dir now absent - probably retired by netfs */
@@ -913,7 +921,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
return PTR_ERR(victim);
_debug("victim -> %p %s",
- victim, victim->d_inode ? "positive" : "negative");
+ victim, d_backing_inode(victim) ? "positive" : "negative");
/* okay... the victim is not being used so we can cull it
* - start by marking it as stale
@@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
/* actually remove the victim (drops the dir mutex) */
_debug("bury");
- ret = cachefiles_bury_object(cache, dir, victim, false);
+ ret = cachefiles_bury_object(cache, dir, victim, false,
+ FSCACHE_OBJECT_WAS_CULLED);
if (ret < 0)
goto error;
@@ -936,7 +945,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
return 0;
error_unlock:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
error:
dput(victim);
if (ret == -ENOENT) {
@@ -971,7 +980,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
if (IS_ERR(victim))
return PTR_ERR(victim);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(victim);
//_leave(" = 0");
return 0;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c6cd8d7a4eef..3cbb0e834694 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -74,12 +74,12 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
static int cachefiles_read_reissue(struct cachefiles_object *object,
struct cachefiles_one_read *monitor)
{
- struct address_space *bmapping = object->backer->d_inode->i_mapping;
+ struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
struct page *backpage = monitor->back_page, *backpage2;
int ret;
_enter("{ino=%lx},{%lx,%lx}",
- object->backer->d_inode->i_ino,
+ d_backing_inode(object->backer)->i_ino,
backpage->index, backpage->flags);
/* skip if the page was truncated away completely */
@@ -157,7 +157,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
object = container_of(op->op.object,
struct cachefiles_object, fscache);
- _enter("{ino=%lu}", object->backer->d_inode->i_ino);
+ _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino);
max = 8;
spin_lock_irq(&object->work_lock);
@@ -247,7 +247,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter);
/* attempt to get hold of the backing page */
- bmapping = object->backer->d_inode->i_mapping;
+ bmapping = d_backing_inode(object->backer)->i_mapping;
newpage = NULL;
for (;;) {
@@ -408,7 +408,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
if (!object->backer)
goto enobufs;
- inode = object->backer->d_inode;
+ inode = d_backing_inode(object->backer);
ASSERT(S_ISREG(inode->i_mode));
ASSERT(inode->i_mapping->a_ops->bmap);
ASSERT(inode->i_mapping->a_ops->readpages);
@@ -468,7 +468,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
struct list_head *list)
{
struct cachefiles_one_read *monitor = NULL;
- struct address_space *bmapping = object->backer->d_inode->i_mapping;
+ struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping;
struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
int ret = 0;
@@ -705,7 +705,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
space = 0;
- inode = object->backer->d_inode;
+ inode = d_backing_inode(object->backer);
ASSERT(S_ISREG(inode->i_mode));
ASSERT(inode->i_mapping->a_ops->bmap);
ASSERT(inode->i_mapping->a_ops->readpages);
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index 396c18ea2764..31bbc0528b11 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -55,14 +55,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
{
int ret;
- ret = security_inode_mkdir(root->d_inode, root, 0);
+ ret = security_inode_mkdir(d_backing_inode(root), root, 0);
if (ret < 0) {
pr_err("Security denies permission to make dirs: error %d",
ret);
return ret;
}
- ret = security_inode_create(root->d_inode, root, 0);
+ ret = security_inode_create(d_backing_inode(root), root, 0);
if (ret < 0)
pr_err("Security denies permission to create files: error %d",
ret);
@@ -95,7 +95,7 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
/* use the cache root dir's security context as the basis with
* which create files */
- ret = set_create_files_as(new, root->d_inode);
+ ret = set_create_files_as(new, d_backing_inode(root));
if (ret < 0) {
abort_creds(new);
cachefiles_begin_secure(cache, _saved_cred);
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index a8a68745e11d..d31c1a72d8a5 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -33,7 +33,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
int ret;
ASSERT(dentry);
- ASSERT(dentry->d_inode);
+ ASSERT(d_backing_inode(dentry));
if (!object->fscache.cookie)
strcpy(type, "C3");
@@ -52,7 +52,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret != -EEXIST) {
pr_err("Can't set xattr on %pd [%lu] (err %d)\n",
- dentry, dentry->d_inode->i_ino,
+ dentry, d_backing_inode(dentry)->i_ino,
-ret);
goto error;
}
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
goto bad_type_length;
pr_err("Can't read xattr on %pd [%lu] (err %d)\n",
- dentry, dentry->d_inode->i_ino,
+ dentry, d_backing_inode(dentry)->i_ino,
-ret);
goto error;
}
@@ -84,14 +84,14 @@ error:
bad_type_length:
pr_err("Cache object %lu type xattr length incorrect\n",
- dentry->d_inode->i_ino);
+ d_backing_inode(dentry)->i_ino);
ret = -EIO;
goto error;
bad_type:
xtype[2] = 0;
pr_err("Cache object %pd [%lu] type %s not %s\n",
- dentry, dentry->d_inode->i_ino,
+ dentry, d_backing_inode(dentry)->i_ino,
xtype, type);
ret = -EIO;
goto error;
@@ -165,7 +165,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object)
int ret;
ASSERT(dentry);
- ASSERT(dentry->d_inode);
+ ASSERT(d_backing_inode(dentry));
ASSERT(object->fscache.cookie->def->check_aux);
auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
@@ -204,7 +204,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
_enter("%p,#%d", object, auxdata->len);
ASSERT(dentry);
- ASSERT(dentry->d_inode);
+ ASSERT(d_backing_inode(dentry));
auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
if (!auxbuf) {
@@ -225,7 +225,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
cachefiles_io_error_obj(object,
"Can't read xattr on %lu (err %d)",
- dentry->d_inode->i_ino, -ret);
+ d_backing_inode(dentry)->i_ino, -ret);
goto error;
}
@@ -276,7 +276,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
cachefiles_io_error_obj(object,
"Can't update xattr on %lu"
" (error %d)",
- dentry->d_inode->i_ino, -ret);
+ d_backing_inode(dentry)->i_ino, -ret);
goto error;
}
}
@@ -291,7 +291,7 @@ error:
bad_type_length:
pr_err("Cache object %lu xattr length incorrect\n",
- dentry->d_inode->i_ino);
+ d_backing_inode(dentry)->i_ino);
ret = -EIO;
goto error;
@@ -316,7 +316,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
cachefiles_io_error(cache,
"Can't remove xattr from %lu"
" (error %d)",
- dentry->d_inode->i_ino, -ret);
+ d_backing_inode(dentry)->i_ino, -ret);
}
_leave(" = %d", ret);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 64fa248343f6..8f84646f10e9 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
val_size2 = posix_acl_xattr_size(default_acl->a_count);
err = -ENOMEM;
- tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
+ tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
- pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
+ pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
if (!pagelist)
goto out_err;
ceph_pagelist_init(pagelist);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index fd5599d32362..890c50971a69 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page)
inode = mapping->host;
ci = ceph_inode(inode);
- /*
- * Note that we're grabbing a snapc ref here without holding
- * any locks!
- */
- snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
-
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
- if (ci->i_head_snapc == NULL)
- ci->i_head_snapc = ceph_get_snap_context(snapc);
- ++ci->i_wrbuffer_ref_head;
+ BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ capsnap->dirty_pages++;
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ ++ci->i_wrbuffer_ref_head;
+ }
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
@@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
/* build page vector */
nr_pages = calc_pages_for(0, len);
- pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
+ pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
ret = -ENOMEM;
if (!pages)
goto out;
@@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
dout("start_read %p adding %p idx %lu\n", inode, page,
page->index);
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
- GFP_NOFS)) {
+ GFP_KERNEL)) {
ceph_fscache_uncache_page(inode, page);
page_cache_release(page);
dout("start_read %p add_to_page_cache failed %p\n",
@@ -436,7 +440,7 @@ out:
* only snap context we are allowed to write back.
*/
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- u64 *snap_size)
+ loff_t *snap_size)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc = NULL;
@@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct ceph_osd_client *osdc;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
+ loff_t snap_size = -1;
long writeback_stat;
- u64 truncate_size, snap_size = 0;
+ u64 truncate_size;
u32 truncate_seq;
int err = 0, len = PAGE_CACHE_SIZE;
@@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
spin_lock(&ci->i_ceph_lock);
truncate_seq = ci->i_truncate_seq;
truncate_size = ci->i_truncate_size;
- if (!snap_size)
+ if (snap_size == -1)
snap_size = i_size_read(inode);
spin_unlock(&ci->i_ceph_lock);
@@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping,
unsigned wsize = 1 << inode->i_blkbits;
struct ceph_osd_request *req = NULL;
int do_sync = 0;
- u64 truncate_size, snap_size;
+ loff_t snap_size, i_size;
+ u64 truncate_size;
u32 truncate_seq;
/*
@@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping,
retry:
/* find oldest snap context with dirty data */
ceph_put_snap_context(snapc);
- snap_size = 0;
+ snap_size = -1;
snapc = get_oldest_context(inode, &snap_size);
if (!snapc) {
/* hmm, why does writepages get called when there
@@ -749,16 +755,13 @@ retry:
dout(" no snap context with dirty data?\n");
goto out;
}
- if (snap_size == 0)
- snap_size = i_size_read(inode);
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
spin_lock(&ci->i_ceph_lock);
truncate_seq = ci->i_truncate_seq;
truncate_size = ci->i_truncate_size;
- if (!snap_size)
- snap_size = i_size_read(inode);
+ i_size = i_size_read(inode);
spin_unlock(&ci->i_ceph_lock);
if (last_snapc && snapc != last_snapc) {
@@ -828,8 +831,10 @@ get_more_pages:
dout("waiting on writeback %p\n", page);
wait_on_page_writeback(page);
}
- if (page_offset(page) >= snap_size) {
- dout("%p page eof %llu\n", page, snap_size);
+ if (page_offset(page) >=
+ (snap_size == -1 ? i_size : snap_size)) {
+ dout("%p page eof %llu\n", page,
+ (snap_size == -1 ? i_size : snap_size));
done = 1;
unlock_page(page);
break;
@@ -884,7 +889,8 @@ get_more_pages:
}
if (do_sync)
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+ osd_req_op_init(req, 1,
+ CEPH_OSD_OP_STARTSYNC, 0);
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -944,10 +950,18 @@ get_more_pages:
}
/* Format the osd request message and submit the write */
-
offset = page_offset(pages[0]);
- len = min(snap_size - offset,
- (u64)locked_pages << PAGE_CACHE_SHIFT);
+ len = (u64)locked_pages << PAGE_CACHE_SHIFT;
+ if (snap_size == -1) {
+ len = min(len, (u64)i_size_read(inode) - offset);
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ len = max(len, 1 +
+ ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
+ } else {
+ len = min(len, snap_size - offset);
+ }
dout("writepages got %d pages at %llu~%llu\n",
locked_pages, offset, len);
@@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file,
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
int pos_in_page = pos & ~PAGE_CACHE_MASK;
int end_in_page = pos_in_page + len;
@@ -1044,10 +1057,6 @@ retry_locked:
/* writepages currently holds page lock, but if we change that later, */
wait_on_page_writeback(page);
- /* check snap context */
- BUG_ON(!ci->i_snap_realm);
- down_read(&mdsc->snap_rwsem);
- BUG_ON(!ci->i_snap_realm->cached_context);
snapc = page_snap_context(page);
if (snapc && snapc != ci->i_head_snapc) {
/*
@@ -1055,7 +1064,6 @@ retry_locked:
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL);
- up_read(&mdsc->snap_rwsem);
if (snapc->seq > oldest->seq) {
ceph_put_snap_context(oldest);
@@ -1112,7 +1120,6 @@ retry_locked:
}
/* we need to read it. */
- up_read(&mdsc->snap_rwsem);
r = readpage_nounlock(file, page);
if (r < 0)
goto fail_nosnap;
@@ -1146,6 +1153,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
+ if (r < 0)
+ page_cache_release(page);
+ else
+ *pagep = page;
} while (r == -EAGAIN);
return r;
@@ -1153,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
/*
* we don't do anything in here that simple_write_end doesn't do
- * except adjust dirty page accounting and drop read lock on
- * mdsc->snap_rwsem.
+ * except adjust dirty page accounting
*/
static int ceph_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
@@ -1184,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- up_read(&mdsc->snap_rwsem);
page_cache_release(page);
if (check_cap)
@@ -1198,8 +1205,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* intercept O_DIRECT reads and writes early, this function should
* never get called.
*/
-static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- struct iov_iter *iter,
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos)
{
WARN_ON(1);
@@ -1311,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_cap_flush *prealloc_cf;
struct page *page = vmf->page;
loff_t off = page_offset(page);
loff_t size = i_size_read(inode);
size_t len;
int want, got, ret;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return VM_FAULT_SIGBUS;
+
if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL;
if (off == 0) {
@@ -1327,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page)
unlock_page(locked_page);
- if (ret < 0)
- return VM_FAULT_SIGBUS;
+ if (ret < 0) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_free;
+ }
}
if (off + PAGE_CACHE_SIZE <= size)
@@ -1350,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
break;
if (ret != -ERESTARTSYS) {
WARN_ON(1);
- return VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
+ goto out_free;
}
}
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1370,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret == 0) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
- up_read(&mdsc->snap_rwsem);
ret = VM_FAULT_LOCKED;
} else {
if (ret == -ENOMEM)
@@ -1386,7 +1398,8 @@ out:
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1395,6 +1408,8 @@ out:
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
inode, off, len, ceph_cap_string(got), ret);
ceph_put_cap_refs(ci, got);
+out_free:
+ ceph_free_cap_flush(prealloc_cf);
return ret;
}
@@ -1506,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ci->i_snap_realm->cached_context,
- 0, 0, false);
+ ceph_empty_snapc, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1525,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ci->i_snap_realm->cached_context,
+ ceph_empty_snapc,
ci->i_truncate_seq, ci->i_truncate_size,
false);
if (IS_ERR(req)) {
@@ -1535,19 +1549,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
- err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
- "inline_version", &inline_version,
- sizeof(inline_version),
- CEPH_OSD_CMPXATTR_OP_GT,
- CEPH_OSD_CMPXATTR_MODE_U64);
- if (err)
- goto out_put;
+ {
+ __le64 xattr_buf = cpu_to_le64(inline_version);
+ err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
+ "inline_version", &xattr_buf,
+ sizeof(xattr_buf),
+ CEPH_OSD_CMPXATTR_OP_GT,
+ CEPH_OSD_CMPXATTR_MODE_U64);
+ if (err)
+ goto out_put;
+ }
- err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
- "inline_version", &inline_version,
- sizeof(inline_version), 0, 0);
- if (err)
- goto out_put;
+ {
+ char xattr_buf[32];
+ int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
+ "%llu", inline_version);
+ err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
+ "inline_version",
+ xattr_buf, xattr_len, 0, 0);
+ if (err)
+ goto out_put;
+ }
ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
@@ -1586,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &ceph_vmops;
return 0;
}
+
+enum {
+ POOL_READ = 1,
+ POOL_WRITE = 2,
+};
+
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+{
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
+ struct rb_node **p, *parent;
+ struct ceph_pool_perm *perm;
+ struct page **pages;
+ int err = 0, err2 = 0, have = 0;
+
+ down_read(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
+ while (*p) {
+ perm = rb_entry(*p, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ up_read(&mdsc->pool_perm_rwsem);
+ if (*p)
+ goto out;
+
+ dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+
+ down_write(&mdsc->pool_perm_rwsem);
+ parent = NULL;
+ while (*p) {
+ parent = *p;
+ perm = rb_entry(parent, struct ceph_pool_perm, node);
+ if (pool < perm->pool)
+ p = &(*p)->rb_left;
+ else if (pool > perm->pool)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
+ }
+ if (*p) {
+ up_write(&mdsc->pool_perm_rwsem);
+ goto out;
+ }
+
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+ ceph_empty_snapc,
+ 1, false, GFP_NOFS);
+ if (!rd_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ rd_req->r_flags = CEPH_OSD_FLAG_READ;
+ osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
+ rd_req->r_base_oloc.pool = pool;
+ snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
+ "%llx.00000000", ci->i_vino.ino);
+ rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
+ ceph_empty_snapc,
+ 1, false, GFP_NOFS);
+ if (!wr_req) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+ osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
+ wr_req->r_base_oloc.pool = pool;
+ wr_req->r_base_oid = rd_req->r_base_oid;
+
+ /* one page should be large enough for STAT data */
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ err = PTR_ERR(pages);
+ goto out_unlock;
+ }
+
+ osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
+ 0, false, true);
+ ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
+ &ci->vfs_inode.i_mtime);
+ err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
+
+ ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
+ &ci->vfs_inode.i_mtime);
+ err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
+
+ if (!err)
+ err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
+ if (!err2)
+ err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
+
+ if (err >= 0 || err == -ENOENT)
+ have |= POOL_READ;
+ else if (err != -EPERM)
+ goto out_unlock;
+
+ if (err2 == 0 || err2 == -EEXIST)
+ have |= POOL_WRITE;
+ else if (err2 != -EPERM) {
+ err = err2;
+ goto out_unlock;
+ }
+
+ perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ if (!perm) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ perm->pool = pool;
+ perm->perm = have;
+ rb_link_node(&perm->node, parent, p);
+ rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
+ err = 0;
+out_unlock:
+ up_write(&mdsc->pool_perm_rwsem);
+
+ if (rd_req)
+ ceph_osdc_put_request(rd_req);
+ if (wr_req)
+ ceph_osdc_put_request(wr_req);
+out:
+ if (!err)
+ err = have;
+ dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ return err;
+}
+
+int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
+{
+ u32 pool;
+ int ret, flags;
+
+ if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
+ NOPOOLPERM))
+ return 0;
+
+ spin_lock(&ci->i_ceph_lock);
+ flags = ci->i_ceph_flags;
+ pool = ceph_file_layout_pg_pool(ci->i_layout);
+ spin_unlock(&ci->i_ceph_lock);
+check:
+ if (flags & CEPH_I_POOL_PERM) {
+ if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
+ dout("ceph_pool_perm_check pool %u no read perm\n",
+ pool);
+ return -EPERM;
+ }
+ if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
+ dout("ceph_pool_perm_check pool %u no write perm\n",
+ pool);
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ ret = __ceph_pool_perm_get(ci, pool);
+ if (ret < 0)
+ return ret;
+
+ flags = CEPH_I_POOL_PERM;
+ if (ret & POOL_READ)
+ flags |= CEPH_I_POOL_RD;
+ if (ret & POOL_WRITE)
+ flags |= CEPH_I_POOL_WR;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
+ ci->i_ceph_flags = flags;
+ } else {
+ pool = ceph_file_layout_pg_pool(ci->i_layout);
+ flags = ci->i_ceph_flags;
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ goto check;
+}
+
+void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
+{
+ struct ceph_pool_perm *perm;
+ struct rb_node *n;
+
+ while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
+ n = rb_first(&mdsc->pool_perm_tree);
+ perm = rb_entry(n, struct ceph_pool_perm, node);
+ rb_erase(n, &mdsc->pool_perm_tree);
+ kfree(perm);
+ }
+}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8172775428a0..ddd5e9471290 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_PIN;
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
- if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages)
+ if (ci->i_rdcache_ref ||
+ (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
+ ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -896,6 +898,18 @@ int ceph_is_any_caps(struct inode *inode)
return ret;
}
+static void drop_inode_snap_realm(struct ceph_inode_info *ci)
+{
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
+ realm);
+}
+
/*
* Remove a cap. Take steps to deal with a racing iterate_session_caps.
*
@@ -914,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
/* remove from session list */
spin_lock(&session->s_cap_lock);
- /*
- * s_cap_reconnect is protected by s_cap_lock. no one changes
- * s_cap_gen while session is in the reconnect state.
- */
- if (queue_release &&
- (!session->s_cap_reconnect ||
- cap->cap_gen == session->s_cap_gen))
- __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
- cap->mseq, cap->issue_seq);
-
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -936,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
}
/* protect backpointer with s_cap_lock: see iterate_session_caps */
cap->ci = NULL;
+
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
+ cap->queue_release = 1;
+ if (removed) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ removed = 0;
+ }
+ } else {
+ cap->queue_release = 0;
+ }
+ cap->cap_ino = ci->i_vino.ino;
+
spin_unlock(&session->s_cap_lock);
/* remove from inode list */
@@ -946,15 +969,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
if (removed)
ceph_put_cap(mdsc, cap);
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
- struct ceph_snap_realm *realm = ci->i_snap_realm;
- spin_lock(&realm->inodes_with_caps_lock);
- list_del_init(&ci->i_snap_realm_item);
- ci->i_snap_realm_counter++;
- ci->i_snap_realm = NULL;
- spin_unlock(&realm->inodes_with_caps_lock);
- ceph_put_snap_realm(mdsc, realm);
- }
+ /* when reconnect denied, we remove session caps forcibly,
+ * i_wr_ref can be non-zero. If there are ongoing write,
+ * keep i_snap_realm.
+ */
+ if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
+ drop_inode_snap_realm(ci);
+
if (!__ceph_is_any_real_caps(ci))
__cap_delay_cancel(mdsc, ci);
}
@@ -967,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
static int send_cap_msg(struct ceph_mds_session *session,
u64 ino, u64 cid, int op,
int caps, int wanted, int dirty,
- u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
- u64 size, u64 max_size,
+ u32 seq, u64 flush_tid, u64 oldest_flush_tid,
+ u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
@@ -982,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session,
size_t extra_len;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
- " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
ceph_cap_string(dirty),
- seq, issue_seq, mseq, follows, size, max_size,
+ seq, issue_seq, flush_tid, oldest_flush_tid,
+ mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- /* flock buffer size + inline version + inline data size */
- extra_len = 4 + 8 + 4;
+ /* flock buffer size + inline version + inline data size +
+ * osd_epoch_barrier + oldest_flush_tid */
+ extra_len = 4 + 8 + 4 + 4 + 8;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
GFP_NOFS, false);
if (!msg)
return -ENOMEM;
+ msg->hdr.version = cpu_to_le16(6);
msg->hdr.tid = cpu_to_le64(flush_tid);
fc = msg->front.iov_base;
@@ -1031,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */
ceph_encode_32(&p, 0);
+ /* osd_epoch_barrier */
+ ceph_encode_32(&p, 0);
+ /* oldest_flush_tid */
+ ceph_encode_64(&p, oldest_flush_tid);
fc->xattr_version = cpu_to_le64(xattr_version);
if (xattrs_buf) {
@@ -1043,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session,
return 0;
}
-void __queue_cap_release(struct ceph_mds_session *session,
- u64 ino, u64 cap_id, u32 migrate_seq,
- u32 issue_seq)
-{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- struct ceph_mds_cap_item *item;
-
- BUG_ON(!session->s_num_cap_releases);
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
-
- dout(" adding %llx release to mds%d msg %p (%d left)\n",
- ino, session->s_mds, msg, session->s_num_cap_releases);
-
- BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
- head = msg->front.iov_base;
- le32_add_cpu(&head->num, 1);
- item = msg->front.iov_base + msg->front.iov_len;
- item->ino = cpu_to_le64(ino);
- item->cap_id = cpu_to_le64(cap_id);
- item->migrate_seq = cpu_to_le32(migrate_seq);
- item->seq = cpu_to_le32(issue_seq);
-
- session->s_num_cap_releases--;
-
- msg->front.iov_len += sizeof(*item);
- if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
- dout(" release msg %p full\n", msg);
- list_move_tail(&msg->list_head, &session->s_cap_releases_done);
- } else {
- dout(" release msg %p at %d/%d (%d)\n", msg,
- (int)le32_to_cpu(head->num),
- (int)CEPH_CAPS_PER_RELEASE,
- (int)msg->front.iov_len);
- }
-}
-
/*
* Queue cap releases when an inode is dropped from our cache. Since
* inode is about to be destroyed, there is no need for i_ceph_lock.
@@ -1117,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode)
*/
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int op, int used, int want, int retain, int flushing,
- unsigned *pflush_tid)
+ u64 flush_tid, u64 oldest_flush_tid)
__releases(cap->ci->i_ceph_lock)
{
struct ceph_inode_info *ci = cap->ci;
@@ -1135,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
u64 xattr_version = 0;
struct ceph_buffer *xattr_blob = NULL;
int delayed = 0;
- u64 flush_tid = 0;
- int i;
int ret;
bool inline_data;
@@ -1180,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
- if (flushing) {
- /*
- * assign a tid for flush operations so we can avoid
- * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
- * clean type races. track latest tid for every bit
- * so we can handle flush AxFw, flush Fw, and have the
- * first ack clean Ax.
- */
- flush_tid = ++ci->i_cap_flush_last_tid;
- if (pflush_tid)
- *pflush_tid = flush_tid;
- dout(" cap_flush_tid %d\n", (int)flush_tid);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if (flushing & (1 << i))
- ci->i_cap_flush_tid[i] = flush_tid;
-
- follows = ci->i_head_snapc->seq;
- } else {
- follows = 0;
- }
+ follows = flushing ? ci->i_head_snapc->seq : 0;
keep = cap->implemented;
seq = cap->seq;
@@ -1227,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
spin_unlock(&ci->i_ceph_lock);
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
- op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ op, keep, want, flushing, seq,
+ flush_tid, oldest_flush_tid, issue_seq, mseq,
size, max_size, &mtime, &atime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob,
follows, inline_data);
@@ -1249,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @again is true, skip cap_snaps that were already sent to
+ * Unless @kick is true, skip cap_snaps that were already sent to
* the MDS (i.e., during this session).
*
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
void __ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession,
- int again)
+ int kick)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
@@ -1287,11 +1257,8 @@ retry:
if (capsnap->dirty_pages || capsnap->writing)
break;
- /*
- * if cap writeback already occurred, we should have dropped
- * the capsnap in ceph_put_wrbuffer_cap_refs.
- */
- BUG_ON(capsnap->dirty == 0);
+ /* should be removed by ceph_try_drop_cap_snap() */
+ BUG_ON(!capsnap->need_flush);
/* pick mds, take s_mutex */
if (ci->i_auth_cap == NULL) {
@@ -1300,7 +1267,7 @@ retry:
}
/* only flush each capsnap once */
- if (!again && !list_empty(&capsnap->flushing_item)) {
+ if (!kick && !list_empty(&capsnap->flushing_item)) {
dout("already flushed %p, skipping\n", capsnap);
continue;
}
@@ -1310,6 +1277,9 @@ retry:
if (session && session->s_mds != mds) {
dout("oops, wrong session %p mutex\n", session);
+ if (kick)
+ goto out;
+
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
session = NULL;
@@ -1333,20 +1303,22 @@ retry:
goto retry;
}
- capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
+ spin_unlock(&mdsc->cap_dirty_lock);
+
atomic_inc(&capsnap->nref);
- if (!list_empty(&capsnap->flushing_item))
- list_del_init(&capsnap->flushing_item);
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
+ if (list_empty(&capsnap->flushing_item))
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
inode, capsnap, capsnap->follows, capsnap->flush_tid);
send_cap_msg(session, ceph_vino(inode).ino, 0,
CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
- capsnap->size, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0,
+ 0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
@@ -1386,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
* Caller is then responsible for calling __mark_inode_dirty with the
* returned flags value.
*/
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1394,17 +1367,28 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
int was = ci->i_dirty_caps;
int dirty = 0;
+ if (!ci->i_auth_cap) {
+ pr_warn("__mark_dirty_caps %p %llx mask %s, "
+ "but no auth cap (session was closed?)\n",
+ inode, ceph_ino(inode), ceph_cap_string(mask));
+ return 0;
+ }
+
dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
if (was == 0) {
- if (!ci->i_head_snapc)
+ WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+ swap(ci->i_prealloc_cap_flush, *pcf);
+
+ if (!ci->i_head_snapc) {
+ WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
+ }
dout(" inode %p now dirty snapc %p auth cap %p\n",
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
- WARN_ON(!ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1413,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ihold(inode);
dirty |= I_DIRTY_SYNC;
}
+ } else {
+ WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
}
BUG_ON(list_empty(&ci->i_dirty_item));
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1422,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
return dirty;
}
+static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap_flush *other = NULL;
+
+ while (*p) {
+ parent = *p;
+ other = rb_entry(parent, struct ceph_cap_flush, i_node);
+
+ if (cf->tid < other->tid)
+ p = &(*p)->rb_left;
+ else if (cf->tid > other->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&cf->i_node, parent, p);
+ rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
+}
+
+static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
+ struct ceph_cap_flush *cf)
+{
+ struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap_flush *other = NULL;
+
+ while (*p) {
+ parent = *p;
+ other = rb_entry(parent, struct ceph_cap_flush, g_node);
+
+ if (cf->tid < other->tid)
+ p = &(*p)->rb_left;
+ else if (cf->tid > other->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&cf->g_node, parent, p);
+ rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
+}
+
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+ return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+ if (cf)
+ kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
+{
+ struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
+ if (n) {
+ struct ceph_cap_flush *cf =
+ rb_entry(n, struct ceph_cap_flush, g_node);
+ return cf->tid;
+ }
+ return 0;
+}
+
/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
@@ -1429,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session)
+ struct ceph_mds_session *session,
+ u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap_flush *cf = NULL;
int flushing;
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
+ BUG_ON(!ci->i_prealloc_cap_flush);
flushing = ci->i_dirty_caps;
dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1447,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode,
ci->i_dirty_caps = 0;
dout(" inode %p now !dirty\n", inode);
+ swap(cf, ci->i_prealloc_cap_flush);
+ cf->caps = flushing;
+
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
+ cf->tid = ++mdsc->last_cap_flush_tid;
+ __add_cap_flushing_to_mdsc(mdsc, cf);
+ *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+
if (list_empty(&ci->i_flushing_item)) {
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
} else {
list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) seq %lld\n", inode,
- ci->i_cap_flush_seq);
+ dout(" inode %p now flushing (more) tid %llu\n",
+ inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
+ __add_cap_flushing_to_inode(ci, cf);
+
+ *flush_tid = cf->tid;
return flushing;
}
@@ -1508,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
+ u64 flush_tid, oldest_flush_tid;
int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
@@ -1537,15 +1603,27 @@ retry:
retry_locked:
file_wanted = __ceph_caps_file_wanted(ci);
used = __ceph_caps_used(ci);
- want = file_wanted | used;
issued = __ceph_caps_issued(ci, &implemented);
revoking = implemented & ~issued;
- retain = want | CEPH_CAP_PIN;
+ want = file_wanted;
+ retain = file_wanted | used | CEPH_CAP_PIN;
if (!mdsc->stopping && inode->i_nlink > 0) {
- if (want) {
+ if (file_wanted) {
retain |= CEPH_CAP_ANY; /* be greedy */
+ } else if (S_ISDIR(inode->i_mode) &&
+ (issued & CEPH_CAP_FILE_SHARED) &&
+ __ceph_dir_is_complete(ci)) {
+ /*
+ * If a directory is complete, we want to keep
+ * the exclusive cap. So that MDS does not end up
+ * revoking the shared cap on every create/unlink
+ * operation.
+ */
+ want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
+ retain |= want;
} else {
+
retain |= CEPH_CAP_ANY_SHARED;
/*
* keep RD only if we didn't have the file open RW,
@@ -1574,9 +1652,10 @@ retry_locked:
* If we fail, it's because pages are locked.... try again later.
*/
if ((!is_delayed || mdsc->stopping) &&
- ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
+ !S_ISDIR(inode->i_mode) && /* ignore readdir cache */
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ inode->i_data.nrpages && /* have cached pages */
+ (file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
!tried_invalidate) {
@@ -1714,17 +1793,25 @@ ack:
took_snap_rwsem = 1;
}
- if (cap == ci->i_auth_cap && ci->i_dirty_caps)
- flushing = __mark_caps_flushing(inode, session);
- else
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+ flushing = __mark_caps_flushing(inode, session,
+ &flush_tid,
+ &oldest_flush_tid);
+ } else {
flushing = 0;
+ flush_tid = 0;
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
/* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
- want, retain, flushing, NULL);
+ want, retain, flushing,
+ flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
@@ -1753,12 +1840,13 @@ ack:
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, u64 *ptid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int flushing = 0;
struct ceph_mds_session *session = NULL;
+ int flushing = 0;
+ u64 flush_tid = 0, oldest_flush_tid = 0;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1783,42 +1871,54 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session);
+ flushing = __mark_caps_flushing(inode, session, &flush_tid,
+ &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
- cap->issued | cap->implemented, flushing,
- flush_tid);
- if (!delayed)
- goto out_unlocked;
+ (cap->issued | cap->implemented),
+ flushing, flush_tid, oldest_flush_tid);
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ if (delayed) {
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&ci->i_ceph_lock);
+ }
+ } else {
+ struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
+ if (n) {
+ struct ceph_cap_flush *cf =
+ rb_entry(n, struct ceph_cap_flush, i_node);
+ flush_tid = cf->tid;
+ }
+ flushing = ci->i_flushing_caps;
+ spin_unlock(&ci->i_ceph_lock);
}
out:
- spin_unlock(&ci->i_ceph_lock);
-out_unlocked:
if (session)
mutex_unlock(&session->s_mutex);
+
+ *ptid = flush_tid;
return flushing;
}
/*
* Return true if we've flushed caps through the given flush_tid.
*/
-static int caps_are_flushed(struct inode *inode, unsigned tid)
+static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int i, ret = 1;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ int ret = 1;
spin_lock(&ci->i_ceph_lock);
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((ci->i_flushing_caps & (1 << i)) &&
- ci->i_cap_flush_tid[i] <= tid) {
- /* still flushing this bit */
+ n = rb_first(&ci->i_cap_flush_tree);
+ if (n) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (cf->tid <= flush_tid)
ret = 0;
- break;
- }
+ }
spin_unlock(&ci->i_ceph_lock);
return ret;
}
@@ -1836,13 +1936,16 @@ static void sync_write_wait(struct inode *inode)
struct ceph_osd_request *req;
u64 last_tid;
+ if (!S_ISREG(inode->i_mode))
+ return;
+
spin_lock(&ci->i_unsafe_lock);
if (list_empty(head))
goto out;
/* set upper bound as _last_ entry in chain */
- req = list_entry(head->prev, struct ceph_osd_request,
- r_unsafe_item);
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
last_tid = req->r_tid;
do {
@@ -1860,18 +1963,64 @@ static void sync_write_wait(struct inode *inode)
*/
if (list_empty(head))
break;
- req = list_entry(head->next, struct ceph_osd_request,
- r_unsafe_item);
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
+
+/*
+ * wait for any uncommitted directory operations to commit.
+ */
+static int unsafe_dirop_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_dirops;
+ struct ceph_mds_request *req;
+ u64 last_tid;
+ int ret = 0;
+
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ req = list_last_entry(head, struct ceph_mds_request,
+ r_unsafe_dir_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_mdsc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
+ inode, req->r_tid, last_tid);
+ ret = !wait_for_completion_timeout(&req->r_safe_completion,
+ ceph_timeout_jiffies(req->r_timeout));
+ if (ret)
+ ret = -EIO; /* timed out */
+
+ ceph_mdsc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (ret || list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_mds_request,
+ r_unsafe_dir_item);
} while (req->r_tid < last_tid);
out:
spin_unlock(&ci->i_unsafe_lock);
+ return ret;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
+ u64 flush_tid;
int ret;
int dirty;
@@ -1880,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
- return ret;
+ goto out;
+
+ if (datasync)
+ goto out;
+
mutex_lock(&inode->i_mutex);
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+ ret = unsafe_dirop_wait(inode);
+
/*
* only wait on non-file metadata writeback (the mds
* can recover size and mtime, so we don't need to
* wait for that)
*/
- if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
- dout("fsync waiting for flush_tid %u\n", flush_tid);
+ if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
ret = wait_event_interruptible(ci->i_cap_wq,
- caps_are_flushed(inode, flush_tid));
+ caps_are_flushed(inode, flush_tid));
}
-
- dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
mutex_unlock(&inode->i_mutex);
+out:
+ dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
}
@@ -1911,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- unsigned flush_tid;
+ u64 flush_tid;
int err = 0;
int dirty;
int wait = wbc->sync_mode == WB_SYNC_ALL;
@@ -1966,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
}
}
+static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ int delayed = 0;
+ u64 first_tid = 0;
+ u64 oldest_flush_tid;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ while (true) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ break;
+ }
+
+ for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (cf->tid >= first_tid)
+ break;
+ }
+ if (!n) {
+ spin_unlock(&ci->i_ceph_lock);
+ break;
+ }
+
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+
+ first_tid = cf->tid + 1;
+
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
+ cap, cf->tid, ceph_cap_string(cf->caps));
+ delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ }
+ return delayed;
+}
+
+void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+
+ dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
+ spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+
+
+ /*
+ * if flushing caps were revoked, we re-send the cap flush
+ * in client reconnect stage. This guarantees MDS * processes
+ * the cap flush message before issuing the flushing caps to
+ * other client.
+ */
+ if ((cap->issued & ci->i_flushing_caps) !=
+ ci->i_flushing_caps) {
+ spin_unlock(&ci->i_ceph_lock);
+ if (!__kick_flushing_caps(mdsc, session, ci))
+ continue;
+ spin_lock(&ci->i_ceph_lock);
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ }
+}
+
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
@@ -1975,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
dout("kick_flushing_caps mds%d\n", session->s_mds);
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
- int delayed = 0;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p %s\n", inode,
- cap, ceph_cap_string(ci->i_flushing_caps));
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
+ int delayed = __kick_flushing_caps(mdsc, session, ci);
+ if (delayed) {
+ spin_lock(&ci->i_ceph_lock);
+ __cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
}
}
@@ -2008,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- int delayed = 0;
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
- dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
- ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ dout("kick_flushing_inode_caps %p flushing %s\n", inode,
+ ceph_cap_string(ci->i_flushing_caps));
__ceph_flush_snaps(ci, &session, 1);
if (ci->i_flushing_caps) {
+ int delayed;
+
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
spin_unlock(&mdsc->cap_dirty_lock);
- delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- ci->i_flushing_caps, NULL);
+ spin_unlock(&ci->i_ceph_lock);
+
+ delayed = __kick_flushing_caps(mdsc, session, ci);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
@@ -2045,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
*
* Protected by i_ceph_lock.
*/
-static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+static void __take_cap_refs(struct ceph_inode_info *ci, int got,
+ bool snap_rwsem_locked)
{
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
@@ -2053,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
- if (got & CEPH_CAP_FILE_WR)
+ if (got & CEPH_CAP_FILE_WR) {
+ if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
+ BUG_ON(!snap_rwsem_locked);
+ ci->i_head_snapc = ceph_get_snap_context(
+ ci->i_snap_realm->cached_context);
+ }
ci->i_wr_ref++;
+ }
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
@@ -2072,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, int *check_max, int *err)
+ loff_t endoff, bool nonblock, int *got, int *err)
{
struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
int file_wanted;
+ bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
+again:
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2097,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked) {
+ up_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = false;
+ }
__ceph_do_pending_vmtruncate(inode);
spin_lock(&ci->i_ceph_lock);
}
@@ -2108,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size) {
- *check_max = 1;
+ *err = -EAGAIN;
ret = 1;
}
goto out_unlock;
@@ -2136,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
+ if (!snap_rwsem_locked &&
+ !ci->i_head_snapc &&
+ (need & CEPH_CAP_FILE_WR)) {
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ /*
+ * we can not call down_read() when
+ * task isn't in TASK_RUNNING state
+ */
+ if (nonblock) {
+ *err = -EAGAIN;
+ ret = 1;
+ goto out_unlock;
+ }
+
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ snap_rwsem_locked = true;
+ goto again;
+ }
+ snap_rwsem_locked = true;
+ }
*got = need | (have & want);
- __take_cap_refs(ci, *got);
+ __take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
@@ -2161,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
+ if (snap_rwsem_locked)
+ up_read(&mdsc->snap_rwsem);
dout("get_cap_refs %p ret %d got %s\n", inode,
ret, ceph_cap_string(*got));
@@ -2203,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int _got, check_max, ret, err = 0;
+ int _got, ret, err = 0;
-retry:
- if (endoff > 0)
- check_max_size(&ci->vfs_inode, endoff);
- _got = 0;
- check_max = 0;
- ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- &_got, &check_max, &err));
- if (err)
- ret = err;
+ ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
- if (check_max)
- goto retry;
+ while (true) {
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(&ci->vfs_inode) > 0) {
- struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
+ err = 0;
_got = 0;
+ ret = try_get_cap_refs(ci, need, want, endoff,
+ false, &_got, &err);
+ if (ret) {
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ return err;
+ } else {
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want, endoff,
+ true, &_got, &err));
+ if (err == -EAGAIN)
+ continue;
+ if (err < 0)
+ ret = err;
+ if (ret < 0)
+ return ret;
+ }
- /* getattr request will bring inline data into page cache */
- ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret < 0)
- return ret;
- goto retry;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page =
+ find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ break;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while
+ * holding * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /*
+ * getattr request will bring inline data into
+ * page cache
+ */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA,
+ true);
+ if (ret < 0)
+ return ret;
+ continue;
+ }
+ break;
}
-out:
+
*got = _got;
return 0;
}
@@ -2258,10 +2537,31 @@ out:
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
- __take_cap_refs(ci, caps);
+ __take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
+
+/*
+ * drop cap_snap that is not associated with any snapshot.
+ * we don't need to send FLUSHSNAP message for it.
+ */
+static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (!capsnap->need_flush &&
+ !capsnap->writing && !capsnap->dirty_pages) {
+
+ dout("dropping cap_snap %p follows %llu\n",
+ capsnap, capsnap->follows);
+ ceph_put_snap_context(capsnap->context);
+ list_del(&capsnap->ci_item);
+ list_del(&capsnap->flushing_item);
+ ceph_put_cap_snap(capsnap);
+ return 1;
+ }
+ return 0;
+}
+
/*
* Release cap refs.
*
@@ -2275,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
struct inode *inode = &ci->vfs_inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
- struct ceph_cap_snap *capsnap;
spin_lock(&ci->i_ceph_lock);
if (had & CEPH_CAP_PIN)
@@ -2297,18 +2596,28 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_WR)
if (--ci->i_wr_ref == 0) {
last++;
- if (!list_empty(&ci->i_cap_snaps)) {
- capsnap = list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap,
- ci_item);
- if (capsnap->writing) {
- capsnap->writing = 0;
- flushsnaps =
- __ceph_finish_cap_snap(ci,
- capsnap);
- wake = 1;
- }
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ capsnap->writing = 0;
+ if (ceph_try_drop_cap_snap(capsnap))
+ put++;
+ else if (__ceph_finish_cap_snap(ci, capsnap))
+ flushsnaps = 1;
+ wake = 1;
}
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ BUG_ON(!ci->i_head_snapc);
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ /* see comment in __ceph_remove_cap() */
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
+ drop_inode_snap_realm(ci);
}
spin_unlock(&ci->i_ceph_lock);
@@ -2321,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
ceph_flush_snaps(ci);
if (wake)
wake_up_all(&ci->i_cap_wq);
- if (put)
+ while (put-- > 0)
iput(inode);
}
@@ -2349,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2370,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
complete_capsnap = 1;
- if (capsnap->dirty == 0)
- /* cap writeback completed before we created
- * the cap_snap; no FLUSHSNAP is needed */
- drop_capsnap = 1;
+ drop_capsnap = ceph_try_drop_cap_snap(capsnap);
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
- " snap %lld %d/%d -> %d/%d %s%s%s\n",
+ " snap %lld %d/%d -> %d/%d %s%s\n",
inode, capsnap, capsnap->context->seq,
ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
ci->i_wrbuffer_ref, capsnap->dirty_pages,
last ? " (wrbuffer last)" : "",
- complete_capsnap ? " (complete capsnap)" : "",
- drop_capsnap ? " (drop capsnap)" : "");
- if (drop_capsnap) {
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- }
+ complete_capsnap ? " (complete capsnap)" : "");
}
spin_unlock(&ci->i_ceph_lock);
@@ -2495,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
- if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
+ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) {
if (try_nonblocking_invalidate(inode)) {
@@ -2701,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_cap_flush *cf;
+ struct rb_node *n;
+ LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
int drop = 0;
- int i;
- for (i = 0; i < CEPH_CAP_BITS; i++)
- if ((dirty & (1 << i)) &&
- (u16)flush_tid == ci->i_cap_flush_tid[i])
- cleaned |= 1 << i;
+ n = rb_first(&ci->i_cap_flush_tree);
+ while (n) {
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ n = rb_next(&cf->i_node);
+ if (cf->tid == flush_tid)
+ cleaned = cf->caps;
+ if (cf->tid <= flush_tid) {
+ rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+ list_add_tail(&cf->list, &to_remove);
+ } else {
+ cleaned &= ~cf->caps;
+ if (!cleaned)
+ break;
+ }
+ }
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
" flushing %s -> %s\n",
@@ -2718,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(ci->i_flushing_caps & ~cleaned));
- if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ if (list_empty(&to_remove) && !cleaned)
goto out;
ci->i_flushing_caps &= ~cleaned;
spin_lock(&mdsc->cap_dirty_lock);
+
+ if (!list_empty(&to_remove)) {
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
+ n = rb_first(&mdsc->cap_flush_tree);
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+ if (!cf || cf->tid > flush_tid)
+ wake_up_all(&mdsc->cap_flushing_wq);
+ }
+
if (ci->i_flushing_caps == 0) {
list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing))
@@ -2733,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info,
i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--;
- wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
drop = 1;
- if (ci->i_wrbuffer_ref_head == 0) {
+ if (ci->i_wr_ref == 0 &&
+ ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
@@ -2754,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
out:
spin_unlock(&ci->i_ceph_lock);
+
+ while (!list_empty(&to_remove)) {
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, list);
+ list_del(&cf->list);
+ ceph_free_cap_flush(cf);
+ }
if (drop)
iput(inode);
}
@@ -2769,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
int drop = 0;
@@ -2792,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
list_del(&capsnap->ci_item);
list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
+ wake_up_all(&mdsc->cap_flushing_wq);
drop = 1;
break;
} else {
@@ -2940,7 +3275,6 @@ retry:
mutex_lock_nested(&session->s_mutex,
SINGLE_DEPTH_NESTING);
}
- ceph_add_cap_releases(mdsc, tsession);
new_cap = ceph_get_cap(mdsc, NULL);
} else {
WARN_ON(1);
@@ -3136,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
- if (op == CEPH_CAP_OP_IMPORT)
- ceph_add_cap_releases(mdsc, session);
-
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
if (op == CEPH_CAP_OP_IMPORT) {
+ cap = ceph_get_cap(mdsc, NULL);
+ cap->cap_ino = vino.ino;
+ cap->queue_release = 1;
+ cap->cap_id = cap_id;
+ cap->mseq = mseq;
+ cap->seq = seq;
spin_lock(&session->s_cap_lock);
- __queue_cap_release(session, vino.ino, cap_id,
- mseq, seq);
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
spin_unlock(&session->s_cap_lock);
}
goto flush_cap_releases;
@@ -3221,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
flush_cap_releases:
/*
- * send any full release message to try to move things
+ * send any cap release message to try to move things
* along for the mds (who clearly thinks we still have this
* cap).
*/
- ceph_add_cap_releases(mdsc, session);
ceph_send_cap_releases(mdsc, session);
done:
@@ -3391,7 +3728,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
int mds, int drop, int unless)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int force = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 1b2355109b9f..31f831471ed2 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -84,7 +84,7 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_dentry->d_lock);
seq_printf(s, " #%llx/%pd (%s)",
- ceph_ino(req->r_dentry->d_parent->d_inode),
+ ceph_ino(d_inode(req->r_dentry->d_parent)),
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 83e9976f7189..9314b4ea2375 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
if (!di)
return -ENOMEM; /* oh well */
@@ -49,9 +49,9 @@ int ceph_init_dentry(struct dentry *dentry)
goto out_unlock;
}
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP)
d_set_d_op(dentry, &ceph_dentry_ops);
- else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+ else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR)
d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
else
d_set_d_op(dentry, &ceph_snap_dentry_ops);
@@ -77,7 +77,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
spin_lock(&dentry->d_lock);
if (!IS_ROOT(dentry)) {
- inode = dentry->d_parent->d_inode;
+ inode = d_inode(dentry->d_parent);
ihold(inode);
}
spin_unlock(&dentry->d_lock);
@@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r)
}
/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+ int len, unsigned next_offset)
+{
+ char *buf = kmalloc(len+1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ kfree(fi->last_name);
+ fi->last_name = buf;
+ memcpy(fi->last_name, name, len);
+ fi->last_name[len] = 0;
+ fi->next_offset = next_offset;
+ dout("note_last_dentry '%s'\n", fi->last_name);
+ return 0;
+}
+
+/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
* d_child when we initially get results back from the MDS, and
@@ -122,124 +143,114 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
{
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_path.dentry;
- struct inode *dir = parent->d_inode;
- struct list_head *p;
- struct dentry *dentry, *last;
+ struct inode *dir = d_inode(parent);
+ struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
+ unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
int err = 0;
+ loff_t ptr_pos = 0;
+ struct ceph_readdir_cache_control cache_ctl = {};
- /* claim ref on last dentry we returned */
- last = fi->dentry;
- fi->dentry = NULL;
-
- dout("__dcache_readdir %p v%u at %llu (last %p)\n",
- dir, shared_gen, ctx->pos, last);
+ dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
- spin_lock(&parent->d_lock);
-
- /* start at beginning? */
- if (ctx->pos == 2 || last == NULL ||
- fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
- if (list_empty(&parent->d_subdirs))
- goto out_unlock;
- p = parent->d_subdirs.prev;
- dout(" initial p %p/%p\n", p->prev, p->next);
- } else {
- p = last->d_child.prev;
+ /* we can calculate cache index for the first dirfrag */
+ if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
+ cache_ctl.index = fpos_off(ctx->pos) - 2;
+ BUG_ON(cache_ctl.index < 0);
+ ptr_pos = cache_ctl.index * sizeof(struct dentry *);
}
-more:
- dentry = list_entry(p, struct dentry, d_child);
- di = ceph_dentry(dentry);
- while (1) {
- dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
- d_unhashed(dentry) ? "!hashed" : "hashed",
- parent->d_subdirs.prev, parent->d_subdirs.next);
- if (p == &parent->d_subdirs) {
+ while (true) {
+ pgoff_t pgoff;
+ bool emit_dentry;
+
+ if (ptr_pos >= i_size_read(dir)) {
fi->flags |= CEPH_F_ATEND;
- goto out_unlock;
- }
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (di->lease_shared_gen == shared_gen &&
- !d_unhashed(dentry) && dentry->d_inode &&
- ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
- ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- fpos_cmp(ctx->pos, di->offset) <= 0)
+ err = 0;
break;
- dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
- dentry, di->offset,
- ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
- !dentry->d_inode ? " null" : "");
- spin_unlock(&dentry->d_lock);
- p = p->prev;
- dentry = list_entry(p, struct dentry, d_child);
- di = ceph_dentry(dentry);
- }
-
- dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&parent->d_lock);
+ }
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete_ordered(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- dput(dentry);
err = -EAGAIN;
- goto out;
- }
-
- dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
- dentry, dentry, dentry->d_inode);
- if (!dir_emit(ctx, dentry->d_name.name,
- dentry->d_name.len,
- ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
- dentry->d_inode->i_mode >> 12)) {
- if (last) {
- /* remember our position */
- fi->dentry = last;
- fi->next_offset = fpos_off(di->offset);
+ pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+ if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
+ ceph_readdir_cache_release(&cache_ctl);
+ cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
+ if (!cache_ctl.page) {
+ dout(" page %lu not found\n", pgoff);
+ break;
+ }
+ /* reading/filling the cache are serialized by
+ * i_mutex, no need to use page lock */
+ unlock_page(cache_ctl.page);
+ cache_ctl.dentries = kmap(cache_ctl.page);
}
- dput(dentry);
- return 0;
- }
- ctx->pos = di->offset + 1;
+ rcu_read_lock();
+ spin_lock(&parent->d_lock);
+ /* check i_size again here, because empty directory can be
+ * marked as complete while not holding the i_mutex. */
+ if (ceph_dir_is_complete_ordered(dir) &&
+ ptr_pos < i_size_read(dir))
+ dentry = cache_ctl.dentries[cache_ctl.index % nsize];
+ else
+ dentry = NULL;
+ spin_unlock(&parent->d_lock);
+ if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+ dentry = NULL;
+ rcu_read_unlock();
+ if (!dentry)
+ break;
+
+ emit_dentry = false;
+ di = ceph_dentry(dentry);
+ spin_lock(&dentry->d_lock);
+ if (di->lease_shared_gen == shared_gen &&
+ d_really_is_positive(dentry) &&
+ ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
+ ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
+ fpos_cmp(ctx->pos, di->offset) <= 0) {
+ emit_dentry = true;
+ }
+ spin_unlock(&dentry->d_lock);
- if (last)
- dput(last);
- last = dentry;
+ if (emit_dentry) {
+ dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+ dentry, dentry, d_inode(dentry));
+ ctx->pos = di->offset;
+ if (!dir_emit(ctx, dentry->d_name.name,
+ dentry->d_name.len,
+ ceph_translate_ino(dentry->d_sb,
+ d_inode(dentry)->i_ino),
+ d_inode(dentry)->i_mode >> 12)) {
+ dput(dentry);
+ err = 0;
+ break;
+ }
+ ctx->pos++;
- spin_lock(&parent->d_lock);
- p = p->prev; /* advance to next dentry */
- goto more;
+ if (last)
+ dput(last);
+ last = dentry;
+ } else {
+ dput(dentry);
+ }
-out_unlock:
- spin_unlock(&parent->d_lock);
-out:
- if (last)
+ cache_ctl.index++;
+ ptr_pos += sizeof(struct dentry *);
+ }
+ ceph_readdir_cache_release(&cache_ctl);
+ if (last) {
+ int ret;
+ di = ceph_dentry(last);
+ ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+ fpos_off(di->offset) + 1);
+ if (ret < 0)
+ err = ret;
dput(last);
+ }
return err;
}
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
- int len)
-{
- kfree(fi->last_name);
- fi->last_name = kmalloc(len+1, GFP_NOFS);
- if (!fi->last_name)
- return -ENOMEM;
- memcpy(fi->last_name, name, len);
- fi->last_name[len] = 0;
- dout("note_last_dentry '%s'\n", fi->last_name);
- return 0;
-}
-
static int ceph_readdir(struct file *file, struct dir_context *ctx)
{
struct ceph_file_info *fi = file->private_data;
@@ -280,7 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
- if ((ctx->pos == 2 || fi->dentry) &&
+ if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete_ordered(ci) &&
@@ -295,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
} else {
spin_unlock(&ci->i_ceph_lock);
}
- if (fi->dentry) {
- err = note_last_dentry(fi, fi->dentry->d_name.name,
- fi->dentry->d_name.len);
- if (err)
- return err;
- dput(fi->dentry);
- fi->dentry = NULL;
- }
/* proceed with a normal readdir */
-
- if (ctx->pos == 2) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- fi->dir_release_count = atomic_read(&ci->i_release_count);
- fi->dir_ordered_count = ci->i_ordered_count;
- }
-
more:
/* do we have the correct frag content buffered? */
if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -336,16 +331,26 @@ more:
ceph_mdsc_put_request(req);
return err;
}
- req->r_inode = inode;
- ihold(inode);
- req->r_dentry = dget(file->f_path.dentry);
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
req->r_direct_hash = ceph_frag_value(frag);
req->r_direct_is_hash = true;
- req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+ if (fi->last_name) {
+ req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ return -ENOMEM;
+ }
+ }
+ req->r_dir_release_cnt = fi->dir_release_count;
+ req->r_dir_ordered_cnt = fi->dir_ordered_count;
+ req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
+
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_dentry = dget(file->f_path.dentry);
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -356,26 +361,38 @@ more:
(int)req->r_reply_info.dir_end,
(int)req->r_reply_info.dir_complete);
- if (!req->r_did_prepopulate) {
- dout("readdir !did_prepopulate");
- /* preclude from marking dir complete */
- fi->dir_release_count--;
- }
/* note next offset and last dentry name */
rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag);
- if (ceph_frag_is_leftmost(frag))
- fi->next_offset = 2;
- else
- fi->next_offset = 0;
- off = fi->next_offset;
+ off = req->r_readdir_offset;
+ fi->next_offset = off;
}
+
fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
+ if (req->r_did_prepopulate) {
+ fi->readdir_cache_idx = req->r_readdir_cache_idx;
+ if (fi->readdir_cache_idx < 0) {
+ /* preclude from marking dir ordered */
+ fi->dir_ordered_count = 0;
+ } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+ /* note dir version at start of readdir so
+ * we can tell if any dentries get dropped */
+ fi->dir_release_count = req->r_dir_release_cnt;
+ fi->dir_ordered_count = req->r_dir_ordered_cnt;
+ }
+ } else {
+ dout("readdir !did_prepopulate");
+ /* disable readdir cache */
+ fi->readdir_cache_idx = -1;
+ /* preclude from marking dir complete */
+ fi->dir_release_count = 0;
+ }
+
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
fi->last_name = NULL;
@@ -386,10 +403,10 @@ more:
} else {
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
- rinfo->dir_dname_len[rinfo->dir_nr-1]);
+ rinfo->dir_dname_len[rinfo->dir_nr-1],
+ fi->next_offset + rinfo->dir_nr);
if (err)
return err;
- fi->next_offset += rinfo->dir_nr;
}
}
@@ -445,16 +462,22 @@ more:
* were released during the whole readdir, and we should have
* the complete dir contents in our cache.
*/
- spin_lock(&ci->i_ceph_lock);
- if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
- if (ci->i_ordered_count == fi->dir_ordered_count)
+ if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+ spin_lock(&ci->i_ceph_lock);
+ if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
dout(" marking %p complete and ordered\n", inode);
- else
+ /* use i_size to track number of entries in
+ * readdir cache */
+ BUG_ON(fi->readdir_cache_idx < 0);
+ i_size_write(inode, fi->readdir_cache_idx *
+ sizeof(struct dentry*));
+ } else {
dout(" marking %p complete\n", inode);
+ }
__ceph_dir_set_complete(ci, fi->dir_release_count,
fi->dir_ordered_count);
+ spin_unlock(&ci->i_ceph_lock);
}
- spin_unlock(&ci->i_ceph_lock);
dout("readdir %p file %p done.\n", inode, file);
return 0;
@@ -468,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
}
kfree(fi->last_name);
fi->last_name = NULL;
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
if (ceph_frag_is_leftmost(frag))
fi->next_offset = 2; /* compensate for . and .. */
else
fi->next_offset = 0;
- if (fi->dentry) {
- dput(fi->dentry);
- fi->dentry = NULL;
- }
fi->flags &= ~CEPH_F_ATEND;
}
@@ -489,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
mutex_lock(&inode->i_mutex);
retval = -EINVAL;
switch (whence) {
- case SEEK_END:
- offset += inode->i_size + 2; /* FIXME */
- break;
case SEEK_CUR:
offset += file->f_pos;
case SEEK_SET:
break;
+ case SEEK_END:
+ retval = -EOPNOTSUPP;
default:
goto out;
}
@@ -508,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
retval = offset;
- /*
- * discard buffered readdir content on seekdir(0), or
- * seek to new frag, or seek prior to current chunk.
- */
if (offset == 0 ||
fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
+ /* discard buffered readdir content on seekdir(0), or
+ * seek to new frag, or seek prior to current chunk */
dout("dir_llseek dropping %p content\n", file);
reset_readdir(fi, fpos_frag(offset));
+ } else if (fpos_cmp(offset, old_offset) > 0) {
+ /* reset dir_release_count if we did a forward seek */
+ fi->dir_release_count = 0;
+ fi->readdir_cache_idx = -1;
}
-
- /* bump dir_release_count if we did a forward seek */
- if (fpos_cmp(offset, old_offset) > 0)
- fi->dir_release_count--;
}
out:
mutex_unlock(&inode->i_mutex);
@@ -535,7 +553,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
/* .snap dir? */
if (err == -ENOENT &&
@@ -571,8 +589,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
err = 0;
if (!req->r_reply_info.head->is_dentry) {
dout("ENOENT and no trace, dentry %p inode %p\n",
- dentry, dentry->d_inode);
- if (dentry->d_inode) {
+ dentry, d_inode(dentry));
+ if (d_really_is_positive(dentry)) {
d_drop(dentry);
err = -ENOENT;
} else {
@@ -619,7 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(err);
/* can we conclude ENOENT locally? */
- if (dentry->d_inode == NULL) {
+ if (d_really_is_negative(dentry)) {
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
@@ -629,6 +647,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
+ ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
@@ -725,7 +744,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
ceph_mdsc_put_request(req);
out:
if (!err)
- ceph_init_inode_acls(dentry->d_inode, &acls);
+ ceph_init_inode_acls(d_inode(dentry), &acls);
else
d_drop(dentry);
ceph_release_acls_info(&acls);
@@ -755,10 +774,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(req);
goto out;
}
+ req->r_path2 = kstrdup(dest, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ ceph_mdsc_put_request(req);
+ goto out;
+ }
+ req->r_locked_dir = dir;
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_path2 = kstrdup(dest, GFP_NOFS);
- req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
err = ceph_mdsc_do_request(mdsc, dir, req);
@@ -821,7 +845,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ceph_mdsc_put_request(req);
out:
if (!err)
- ceph_init_inode_acls(dentry->d_inode, &acls);
+ ceph_init_inode_acls(d_inode(dentry), &acls);
else
d_drop(dentry);
ceph_release_acls_info(&acls);
@@ -858,8 +882,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (err) {
d_drop(dentry);
} else if (!req->r_reply_info.head->is_dentry) {
- ihold(old_dentry->d_inode);
- d_instantiate(dentry, old_dentry->d_inode);
+ ihold(d_inode(old_dentry));
+ d_instantiate(dentry, d_inode(old_dentry));
}
ceph_mdsc_put_request(req);
return err;
@@ -892,7 +916,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
int err = -EROFS;
int op;
@@ -933,16 +957,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ int op = CEPH_MDS_OP_RENAME;
int err;
if (ceph_snap(old_dir) != ceph_snap(new_dir))
return -EXDEV;
- if (ceph_snap(old_dir) != CEPH_NOSNAP ||
- ceph_snap(new_dir) != CEPH_NOSNAP)
- return -EROFS;
+ if (ceph_snap(old_dir) != CEPH_NOSNAP) {
+ if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
+ op = CEPH_MDS_OP_RENAMESNAP;
+ else
+ return -EROFS;
+ }
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
ihold(old_dir);
@@ -957,8 +985,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
/* release LINK_RDCACHE on source inode (mds will lock it) */
req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
- if (new_dentry->d_inode)
- req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+ if (d_really_is_positive(new_dentry))
+ req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry));
err = ceph_mdsc_do_request(mdsc, old_dir, req);
if (!err && !req->r_reply_info.head->is_dentry) {
/*
@@ -967,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
-
- /* d_move screws up sibling dentries' offsets */
- ceph_dir_clear_complete(old_dir);
- ceph_dir_clear_complete(new_dir);
-
}
ceph_mdsc_put_request(req);
return err;
@@ -1024,7 +1051,7 @@ static int dentry_lease_is_valid(struct dentry *dentry)
if (di->lease_renew_after &&
time_after(jiffies, di->lease_renew_after)) {
/* we should renew */
- dir = dentry->d_parent->d_inode;
+ dir = d_inode(dentry->d_parent);
session = ceph_get_mds_session(s);
seq = di->lease_seq;
di->lease_renew_after = 0;
@@ -1074,22 +1101,22 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
- dentry, dentry->d_inode, ceph_dentry(dentry)->offset);
+ dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
dir = ceph_get_dentry_parent_inode(dentry);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, dentry->d_inode);
+ dentry, d_inode(dentry));
valid = 1;
- } else if (dentry->d_inode &&
- ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) {
+ } else if (d_really_is_positive(dentry) &&
+ ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
} else if (dentry_lease_is_valid(dentry) ||
dir_lease_is_valid(dir, dentry)) {
- if (dentry->d_inode)
- valid = ceph_is_any_caps(dentry->d_inode);
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
else
valid = 1;
}
@@ -1151,7 +1178,7 @@ static void ceph_d_prune(struct dentry *dentry)
* we hold d_lock, so d_parent is stable, and d_fsdata is never
* cleared until d_release
*/
- ceph_dir_clear_complete(dentry->d_parent->d_inode);
+ ceph_dir_clear_complete(d_inode(dentry->d_parent));
}
/*
@@ -1171,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
return -EISDIR;
if (!cf->dir_info) {
- cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+ cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
if (!cf->dir_info)
return -ENOMEM;
cf->dir_info_len =
@@ -1206,65 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
}
/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
-{
- struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- dout("dir_fsync %p\n", inode);
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret)
- return ret;
- mutex_lock(&inode->i_mutex);
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_entry(head->prev,
- struct ceph_mds_request, r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
-
- dout("dir_fsync %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- if (req->r_timeout) {
- ret = wait_for_completion_timeout(
- &req->r_safe_completion, req->r_timeout);
- if (ret > 0)
- ret = 0;
- else if (ret == 0)
- ret = -EIO; /* timed out */
- } else {
- wait_for_completion(&req->r_safe_completion);
- }
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_entry(head->next,
- struct ceph_mds_request, r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- mutex_unlock(&inode->i_mutex);
-
- return ret;
-}
-
-/*
* We maintain a private dentry LRU.
*
* FIXME: this needs to be changed to a per-mds lru to be useful.
@@ -1334,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
.open = ceph_open,
.release = ceph_release,
.unlocked_ioctl = ceph_ioctl,
- .fsync = ceph_dir_fsync,
+ .fsync = ceph_fsync,
};
const struct file_operations ceph_snapdir_fops = {
@@ -1372,6 +1340,7 @@ const struct inode_operations ceph_snapdir_iops = {
.getattr = ceph_getattr,
.mkdir = ceph_mkdir,
.rmdir = ceph_unlink,
+ .rename = ceph_rename,
};
const struct dentry_operations ceph_dentry_ops = {
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 8d7d782f4382..fe02ae7f056a 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -136,8 +136,8 @@ static struct dentry *__get_parent(struct super_block *sb,
return ERR_CAST(req);
if (child) {
- req->r_inode = child->d_inode;
- ihold(child->d_inode);
+ req->r_inode = d_inode(child);
+ ihold(d_inode(child));
} else {
req->r_ino1 = (struct ceph_vino) {
.ino = ino,
@@ -164,7 +164,7 @@ static struct dentry *__get_parent(struct super_block *sb,
return ERR_PTR(err);
}
dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
- child ? ceph_ino(child->d_inode) : ino,
+ child ? ceph_ino(d_inode(child)) : ino,
dentry, ceph_vinop(inode));
return dentry;
}
@@ -172,11 +172,11 @@ static struct dentry *__get_parent(struct super_block *sb,
static struct dentry *ceph_get_parent(struct dentry *child)
{
/* don't re-export snaps */
- if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ if (ceph_snap(d_inode(child)) != CEPH_NOSNAP)
return ERR_PTR(-EINVAL);
dout("get_parent %p ino %llx.%llx\n",
- child, ceph_vinop(child->d_inode));
+ child, ceph_vinop(d_inode(child)));
return __get_parent(child->d_sb, child, 0);
}
@@ -209,32 +209,32 @@ static int ceph_get_name(struct dentry *parent, char *name,
struct ceph_mds_request *req;
int err;
- mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ mdsc = ceph_inode_to_client(d_inode(child))->mdsc;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
USE_ANY_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
- req->r_inode = child->d_inode;
- ihold(child->d_inode);
- req->r_ino2 = ceph_vino(parent->d_inode);
- req->r_locked_dir = parent->d_inode;
+ req->r_inode = d_inode(child);
+ ihold(d_inode(child));
+ req->r_ino2 = ceph_vino(d_inode(parent));
+ req->r_locked_dir = d_inode(parent);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
if (!err) {
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
memcpy(name, rinfo->dname, rinfo->dname_len);
name[rinfo->dname_len] = 0;
dout("get_name %p ino %llx.%llx name %s\n",
- child, ceph_vinop(child->d_inode), name);
+ child, ceph_vinop(d_inode(child)), name);
} else {
dout("get_name %p ino %llx.%llx err %d\n",
- child, ceph_vinop(child->d_inode), err);
+ child, ceph_vinop(d_inode(child)), err);
}
ceph_mdsc_put_request(req);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d533075a823d..8b79d87eaf46 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,7 +7,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
#include "super.h"
@@ -90,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+ cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
}
cf->fmode = fmode;
cf->next_offset = 2;
+ cf->readdir_cache_idx = -1;
file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release);
break;
@@ -292,14 +292,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
if (err)
goto out_req;
- if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) {
+ if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
/* make vfs retry on splice, ENOENT, or symlink */
dout("atomic_open finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
- ceph_init_inode_acls(dentry->d_inode, &acls);
+ ceph_init_inode_acls(d_inode(dentry), &acls);
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened);
@@ -325,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name);
kfree(cf->dir_info);
- dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */
@@ -458,7 +457,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
if (ret < 0)
return ret;
- if (file->f_flags & O_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
while (iov_iter_count(i)) {
size_t start;
ssize_t n;
@@ -484,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
}
} else {
num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages))
return PTR_ERR(pages);
ret = striped_read(inode, off, len, pages,
@@ -558,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
@@ -601,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t start;
ssize_t n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0,
@@ -615,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
break;
}
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
if (unlikely(n < 0)) {
@@ -675,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
* objects, rollback on failure, etc.)
*/
static ssize_t
-ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
+ struct ceph_snap_context *snapc)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
@@ -718,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
size_t left;
int n;
- snapc = ci->i_snap_realm->cached_context;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &len, 0, 1,
@@ -737,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -808,7 +805,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- size_t len = iocb->ki_nbytes;
+ size_t len = iov_iter_count(to);
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *pinned_page = NULL;
@@ -829,7 +826,7 @@ again:
return ret;
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (iocb->ki_flags & IOCB_DIRECT) ||
(fi->flags & CEPH_F_SYNC)) {
dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
@@ -861,7 +858,7 @@ again:
struct page *page = NULL;
loff_t i_size;
if (retry_op == READ_INLINE) {
- page = __page_cache_alloc(GFP_NOFS);
+ page = __page_cache_alloc(GFP_KERNEL);
if (!page)
return -ENOMEM;
}
@@ -942,27 +939,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- ssize_t count = iov_iter_count(from), written = 0;
+ struct ceph_cap_flush *prealloc_cf;
+ ssize_t count, written = 0;
int err, want, got;
- loff_t pos = iocb->ki_pos;
+ loff_t pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
-
- if (count == 0)
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
goto out;
- iov_iter_truncate(from, count);
- err = file_remove_suid(file);
+ pos = iocb->ki_pos;
+ count = iov_iter_count(from);
+ err = file_remove_privs(file);
if (err)
goto out;
@@ -998,15 +998,31 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct ceph_snap_context *snapc;
struct iov_iter data;
mutex_unlock(&inode->i_mutex);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
/* we might need to revert back to that point */
data = *from;
- if (file->f_flags & O_DIRECT)
- written = ceph_sync_direct_write(iocb, &data, pos);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ written = ceph_sync_direct_write(iocb, &data, pos,
+ snapc);
else
- written = ceph_sync_write(iocb, &data, pos);
+ written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
@@ -1017,6 +1033,7 @@ retry_snap:
}
if (written > 0)
iov_iter_advance(from, written);
+ ceph_put_snap_context(snapc);
} else {
loff_t old_size = inode->i_size;
/*
@@ -1038,7 +1055,8 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1062,6 +1080,7 @@ retry_snap:
out:
mutex_unlock(&inode->i_mutex);
out_unlocked:
+ ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
return written ? written : err;
}
@@ -1258,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
+ struct ceph_cap_flush *prealloc_cf;
int want, got = 0;
int dirty;
int ret = 0;
@@ -1270,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1316,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
if (dirty)
__mark_inode_dirty(inode, dirty);
@@ -1325,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got);
unlock:
mutex_unlock(&inode->i_mutex);
+ ceph_free_cap_flush(prealloc_cf);
return ret;
}
@@ -1332,8 +1358,6 @@ const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = ceph_read_iter,
.write_iter = ceph_write_iter,
.mmap = ceph_mmap,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 119c43c80638..96d2bd829902 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -6,7 +6,6 @@
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
-#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl.h>
@@ -390,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0;
ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0;
- ci->i_ordered_count = 0;
- atomic_set(&ci->i_release_count, 1);
- atomic_set(&ci->i_complete_count, 0);
+ atomic64_set(&ci->i_ordered_count, 1);
+ atomic64_set(&ci->i_release_count, 1);
+ atomic64_set(&ci->i_complete_seq[0], 0);
+ atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -416,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
- ci->i_cap_flush_seq = 0;
- ci->i_cap_flush_last_tid = 0;
- memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+ ci->i_prealloc_cap_flush = NULL;
+ ci->i_cap_flush_tree = RB_ROOT;
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -753,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size),
@@ -819,6 +821,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
else
kfree(sym); /* lost a race */
}
+ inode->i_link = ci->i_symlink;
break;
case S_IFDIR:
inode->i_op = &ceph_dir_iops;
@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
+ i_size_write(inode, 0);
__ceph_dir_set_complete(ci,
- atomic_read(&ci->i_release_count),
- ci->i_ordered_count);
+ atomic64_read(&ci->i_release_count),
+ atomic64_read(&ci->i_ordered_count));
}
wake = true;
@@ -940,7 +944,7 @@ static void update_dentry_lease(struct dentry *dentry,
dentry, duration, ttl);
/* make lease_rdcache_gen match directory */
- dir = dentry->d_parent->d_inode;
+ dir = d_inode(dentry->d_parent);
di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
if (duration == 0)
@@ -980,7 +984,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
{
struct dentry *realdn;
- BUG_ON(dn->d_inode);
+ BUG_ON(d_inode(dn));
/* dn must be unhashed */
if (!d_unhashed(dn))
@@ -998,13 +1002,13 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
"inode %p ino %llx.%llx\n",
dn, d_count(dn),
realdn, d_count(realdn),
- realdn->d_inode, ceph_vinop(realdn->d_inode));
+ d_inode(realdn), ceph_vinop(d_inode(realdn)));
dput(dn);
dn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
dout("dn %p attached to %p ino %llx.%llx\n",
- dn, dn->d_inode, ceph_vinop(dn->d_inode));
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)));
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
@@ -1125,11 +1129,11 @@ retry_lookup:
dput(parent);
goto done;
}
- } else if (dn->d_inode &&
- (ceph_ino(dn->d_inode) != vino.ino ||
- ceph_snap(dn->d_inode) != vino.snap)) {
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != vino.ino ||
+ ceph_snap(d_inode(dn)) != vino.snap)) {
dout(" dn %p points to wrong inode %p\n",
- dn, dn->d_inode);
+ dn, d_inode(dn));
d_delete(dn);
dput(dn);
goto retry_lookup;
@@ -1183,7 +1187,7 @@ retry_lookup:
BUG_ON(!dn);
BUG_ON(!dir);
- BUG_ON(dn->d_parent->d_inode != dir);
+ BUG_ON(d_inode(dn->d_parent) != dir);
BUG_ON(ceph_ino(dir) !=
le64_to_cpu(rinfo->diri.in->ino));
BUG_ON(ceph_snap(dir) !=
@@ -1212,6 +1216,10 @@ retry_lookup:
dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn);
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_ordered(dir);
+ ceph_dir_clear_ordered(olddir);
+
d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry,
@@ -1222,10 +1230,6 @@ retry_lookup:
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /* d_move screws up sibling dentries' offsets */
- ceph_dir_clear_ordered(dir);
- ceph_dir_clear_ordered(olddir);
-
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
@@ -1235,7 +1239,7 @@ retry_lookup:
/* null dentry? */
if (!rinfo->head->is_target) {
dout("fill_trace null dentry\n");
- if (dn->d_inode) {
+ if (d_really_is_positive(dn)) {
ceph_dir_clear_ordered(dir);
dout("d_delete %p\n", dn);
d_delete(dn);
@@ -1252,7 +1256,7 @@ retry_lookup:
}
/* attach proper inode */
- if (!dn->d_inode) {
+ if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
ihold(in);
dn = splice_dentry(dn, in, &have_lease);
@@ -1261,9 +1265,9 @@ retry_lookup:
goto done;
}
req->r_dentry = dn; /* may have spliced */
- } else if (dn->d_inode && dn->d_inode != in) {
+ } else if (d_really_is_positive(dn) && d_inode(dn) != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, dn->d_inode, ceph_vinop(dn->d_inode),
+ dn, d_inode(dn), ceph_vinop(d_inode(dn)),
ceph_vinop(in));
have_lease = false;
}
@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err;
}
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{
+ if (ctl->page) {
+ kunmap(ctl->page);
+ page_cache_release(ctl->page);
+ ctl->page = NULL;
+ }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+ struct ceph_readdir_cache_control *ctl,
+ struct ceph_mds_request *req)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+ unsigned idx = ctl->index % nsize;
+ pgoff_t pgoff = ctl->index / nsize;
+
+ if (!ctl->page || pgoff != page_index(ctl->page)) {
+ ceph_readdir_cache_release(ctl);
+ ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ if (!ctl->page) {
+ ctl->index = -1;
+ return -ENOMEM;
+ }
+ /* reading/filling the cache are serialized by
+ * i_mutex, no need to use page lock */
+ unlock_page(ctl->page);
+ ctl->dentries = kmap(ctl->page);
+ }
+
+ if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
+ req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+ dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+ ctl->dentries[idx] = dn;
+ ctl->index++;
+ } else {
+ dout("disable readdir cache\n");
+ ctl->index = -1;
+ }
+ return 0;
+}
+
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
- u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+ struct ceph_readdir_cache_control cache_ctl = {};
+
+ if (req->r_aborted)
+ return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -1354,16 +1404,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag))
- r_readdir_offset = 2;
+ req->r_readdir_offset = 2;
else
- r_readdir_offset = 0;
+ req->r_readdir_offset = 0;
}
- if (req->r_aborted)
- return readdir_prepopulate_inodes_only(req, session);
-
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- snapdir = ceph_get_snapdir(parent->d_inode);
+ snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir);
dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
rinfo->dir_nr, parent);
@@ -1371,9 +1418,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
dout("readdir_prepopulate %d items under dn %p\n",
rinfo->dir_nr, parent);
if (rinfo->dir_dir)
- ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+ ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
+ }
+
+ if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
+ req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
+ req->r_readdir_cache_idx = 0;
}
+ cache_ctl.index = req->r_readdir_cache_idx;
+
/* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
@@ -1405,26 +1463,19 @@ retry_lookup:
err = ret;
goto out;
}
- } else if (dn->d_inode &&
- (ceph_ino(dn->d_inode) != vino.ino ||
- ceph_snap(dn->d_inode) != vino.snap)) {
+ } else if (d_really_is_positive(dn) &&
+ (ceph_ino(d_inode(dn)) != vino.ino ||
+ ceph_snap(d_inode(dn)) != vino.snap)) {
dout(" dn %p points to wrong inode %p\n",
- dn, dn->d_inode);
+ dn, d_inode(dn));
d_delete(dn);
dput(dn);
goto retry_lookup;
- } else {
- /* reorder parent's d_subdirs */
- spin_lock(&parent->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_child, &parent->d_subdirs);
- spin_unlock(&dn->d_lock);
- spin_unlock(&parent->d_lock);
}
/* inode */
- if (dn->d_inode) {
- in = dn->d_inode;
+ if (d_really_is_positive(dn)) {
+ in = d_inode(dn);
} else {
in = ceph_get_inode(parent->d_sb, vino);
if (IS_ERR(in)) {
@@ -1436,17 +1487,19 @@ retry_lookup:
}
}
- if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
- req->r_request_started, -1,
- &req->r_caps_reservation) < 0) {
+ ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
- if (!dn->d_inode)
+ if (d_really_is_negative(dn))
iput(in);
d_drop(dn);
+ err = ret;
goto next_item;
}
- if (!dn->d_inode) {
+ if (d_really_is_negative(dn)) {
struct dentry *realdn = splice_dentry(dn, in, NULL);
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
@@ -1458,19 +1511,28 @@ retry_lookup:
}
di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+ di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session,
req->r_request_started);
+
+ if (err == 0 && cache_ctl.index >= 0) {
+ ret = fill_readdir_cache(d_inode(parent), dn,
+ &cache_ctl, req);
+ if (ret < 0)
+ err = ret;
+ }
next_item:
if (dn)
dput(dn);
}
- if (err == 0)
- req->r_did_prepopulate = true;
-
out:
+ if (err == 0) {
+ req->r_did_prepopulate = true;
+ req->r_readdir_cache_idx = cache_ctl.index;
+ }
+ ceph_readdir_cache_release(&cache_ctl);
if (snapdir) {
iput(snapdir);
dput(parent);
@@ -1691,16 +1753,9 @@ retry:
/*
* symlinks
*/
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
- nd_set_link(nd, ci->i_symlink);
- return NULL;
-}
-
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = ceph_sym_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.setxattr = ceph_setxattr,
@@ -1714,16 +1769,18 @@ static const struct inode_operations ceph_symlink_iops = {
*/
int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
+ bool lock_snap_rwsem = false;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
@@ -1732,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (err != 0)
return err;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
USE_AUTH_MDS);
- if (IS_ERR(req))
+ if (IS_ERR(req)) {
+ ceph_free_cap_flush(prealloc_cf);
return PTR_ERR(req);
+ }
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
+
+ if (!ci->i_head_snapc &&
+ (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ }
+ }
+
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ia_valid & ATTR_UID) {
@@ -1881,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
if (dirtied) {
- inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+ inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+ &prealloc_cf);
inode->i_ctime = CURRENT_TIME;
}
release &= issued;
spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -1911,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
ceph_mdsc_put_request(req);
if (mask & CEPH_SETATTR_SIZE)
__ceph_do_pending_vmtruncate(inode);
+ ceph_free_cap_flush(prealloc_cf);
return err;
out_put:
ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
return err;
}
@@ -1990,7 +2070,7 @@ int ceph_permission(struct inode *inode, int mask)
int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
int err;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 4347039ecc18..6706bde9ad1b 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
return 0;
spin_lock(&ctx->flc_lock);
- list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
++seen_fcntl;
if (seen_fcntl > num_fcntl_locks) {
err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 71c073f38e54..6aa07af67603 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/utsname.h>
+#include <linux/ratelimit.h>
#include "super.h"
#include "mds_client.h"
@@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
- INIT_LIST_HEAD(&s->s_cap_releases_done);
INIT_LIST_HEAD(&s->s_cap_flushing);
INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
@@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
req->r_uid = current_fsuid();
req->r_gid = current_fsgid();
+ if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
+ mdsc->oldest_tid = req->r_tid;
+
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+
+ if (req->r_tid == mdsc->oldest_tid) {
+ struct rb_node *p = rb_next(&req->r_node);
+ mdsc->oldest_tid = 0;
+ while (p) {
+ struct ceph_mds_request *next_req =
+ rb_entry(p, struct ceph_mds_request, r_node);
+ if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
+ mdsc->oldest_tid = next_req->r_tid;
+ break;
+ }
+ p = rb_next(p);
+ }
+ }
+
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
@@ -679,7 +697,7 @@ static struct dentry *get_nonsnap_parent(struct dentry *dentry)
* except to resplice to another snapdir, and either the old or new
* result is a valid result.
*/
- while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
dentry = dentry->d_parent;
return dentry;
}
@@ -716,20 +734,20 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else if (req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
struct dentry *parent = req->r_dentry->d_parent;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
if (dir->i_sb != mdsc->fsc->sb) {
/* not this fs! */
- inode = req->r_dentry->d_inode;
+ inode = d_inode(req->r_dentry);
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
/* direct snapped/virtual snapdir requests
* based on parent dir inode */
struct dentry *dn = get_nonsnap_parent(parent);
- inode = dn->d_inode;
+ inode = d_inode(dn);
dout("__choose_mds using nonsnap parent %p\n", inode);
} else {
/* dentry target */
- inode = req->r_dentry->d_inode;
+ inode = d_inode(req->r_dentry);
if (!inode || mode == USE_AUTH_MDS) {
/* dir + name */
inode = dir;
@@ -998,27 +1016,53 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
* session caps
*/
-/*
- * Free preallocated cap messages assigned to this session
- */
-static void cleanup_cap_releases(struct ceph_mds_session *session)
+/* caller holds s_cap_lock, we drop it */
+static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+ __releases(session->s_cap_lock)
{
- struct ceph_msg *msg;
+ LIST_HEAD(tmp_list);
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ session->s_num_cap_releases = 0;
+ spin_unlock(&session->s_cap_lock);
- spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
- }
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
+ dout("cleanup_cap_releases mds%d\n", session->s_mds);
+ while (!list_empty(&tmp_list)) {
+ struct ceph_cap *cap;
+ /* zero out the in-progress message */
+ cap = list_first_entry(&tmp_list,
+ struct ceph_cap, session_caps);
+ list_del(&cap->session_caps);
+ ceph_put_cap(mdsc, cap);
}
- spin_unlock(&session->s_cap_lock);
+}
+
+static void cleanup_session_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *p;
+
+ dout("cleanup_session_requests mds%d\n", session->s_mds);
+ mutex_lock(&mdsc->mutex);
+ while (!list_empty(&session->s_unsafe)) {
+ req = list_first_entry(&session->s_unsafe,
+ struct ceph_mds_request, r_unsafe_item);
+ list_del_init(&req->r_unsafe_item);
+ pr_warn_ratelimited(" dropping unsafe request %llu\n",
+ req->r_tid);
+ __unregister_request(mdsc, req);
+ }
+ /* zero r_attempts, so kick_requests() will re-send requests */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds)
+ req->r_attempts = 0;
+ }
+ mutex_unlock(&mdsc->mutex);
}
/*
@@ -1068,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session,
dout("iterate_session_caps finishing cap %p removal\n",
cap);
BUG_ON(cap->session != session);
+ cap->session = NULL;
list_del_init(&cap->session_caps);
session->s_nr_caps--;
- cap->session = NULL;
- old_cap = cap; /* put_cap it w/o locks held */
+ if (cap->queue_release) {
+ list_add_tail(&cap->session_caps,
+ &session->s_cap_releases);
+ session->s_num_cap_releases++;
+ } else {
+ old_cap = cap; /* put_cap it w/o locks held */
+ }
}
if (ret < 0)
goto out;
@@ -1092,19 +1142,35 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ LIST_HEAD(to_remove);
int drop = 0;
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
__ceph_remove_cap(cap, false);
- if (!__ceph_is_any_real_caps(ci)) {
+ if (!ci->i_auth_cap) {
+ struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
+ while (true) {
+ struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
+ if (!n)
+ break;
+ cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+ list_add(&cf->list, &to_remove);
+ }
+
spin_lock(&mdsc->cap_dirty_lock);
+
+ list_for_each_entry(cf, &to_remove, list)
+ rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+
if (!list_empty(&ci->i_dirty_item)) {
- pr_info(" dropping dirty %s state for %p %lld\n",
+ pr_warn_ratelimited(
+ " dropping dirty %s state for %p %lld\n",
ceph_cap_string(ci->i_dirty_caps),
inode, ceph_ino(inode));
ci->i_dirty_caps = 0;
@@ -1112,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
drop = 1;
}
if (!list_empty(&ci->i_flushing_item)) {
- pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+ pr_warn_ratelimited(
+ " dropping dirty+flushing %s state for %p %lld\n",
ceph_cap_string(ci->i_flushing_caps),
inode, ceph_ino(inode));
ci->i_flushing_caps = 0;
@@ -1120,16 +1187,21 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
mdsc->num_cap_flushing--;
drop = 1;
}
- if (drop && ci->i_wrbuffer_ref) {
- pr_info(" dropping dirty data for %p %lld\n",
- inode, ceph_ino(inode));
- ci->i_wrbuffer_ref = 0;
- ci->i_wrbuffer_ref_head = 0;
- drop++;
- }
spin_unlock(&mdsc->cap_dirty_lock);
+
+ if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+ list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ ci->i_prealloc_cap_flush = NULL;
+ }
}
spin_unlock(&ci->i_ceph_lock);
+ while (!list_empty(&to_remove)) {
+ struct ceph_cap_flush *cf;
+ cf = list_first_entry(&to_remove,
+ struct ceph_cap_flush, list);
+ list_del(&cf->list);
+ ceph_free_cap_flush(cf);
+ }
while (drop--)
iput(inode);
return 0;
@@ -1171,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
spin_lock(&session->s_cap_lock);
}
}
- spin_unlock(&session->s_cap_lock);
+
+ // drop cap expires and unlock s_cap_lock
+ cleanup_cap_releases(session->s_mdsc, session);
BUG_ON(session->s_nr_caps > 0);
BUG_ON(!list_empty(&session->s_cap_flushing));
- cleanup_cap_releases(session);
}
/*
@@ -1351,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
ceph_cap_string(used), ceph_cap_string(wanted));
if (cap == ci->i_auth_cap) {
- if (ci->i_dirty_caps | ci->i_flushing_caps)
+ if (ci->i_dirty_caps || ci->i_flushing_caps ||
+ !list_empty(&ci->i_cap_snaps))
goto out;
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
@@ -1397,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc,
session->s_trim_caps = 0;
}
- ceph_add_cap_releases(mdsc, session);
ceph_send_cap_releases(mdsc, session);
return 0;
}
-/*
- * Allocate cap_release messages. If there is a partially full message
- * in the queue, try to allocate enough to cover it's remainder, so that
- * we can send it immediately.
- *
- * Called under s_mutex.
- */
-int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int check_capsnap_flush(struct ceph_inode_info *ci,
+ u64 want_snap_seq)
{
- struct ceph_msg *msg, *partial = NULL;
- struct ceph_mds_cap_release *head;
- int err = -ENOMEM;
- int extra = mdsc->fsc->mount_options->cap_release_safety;
- int num;
-
- dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
- extra);
-
- spin_lock(&session->s_cap_lock);
-
- if (!list_empty(&session->s_cap_releases)) {
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg,
- list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- if (num) {
- dout(" partial %p with (%d/%d)\n", msg, num,
- (int)CEPH_CAPS_PER_RELEASE);
- extra += CEPH_CAPS_PER_RELEASE - num;
- partial = msg;
- }
- }
- while (session->s_num_cap_releases < session->s_nr_caps + extra) {
- spin_unlock(&session->s_cap_lock);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- GFP_NOFS, false);
- if (!msg)
- goto out_unlocked;
- dout("add_cap_releases %p msg %p now %d\n", session, msg,
- (int)msg->front.iov_len);
- head = msg->front.iov_base;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- spin_lock(&session->s_cap_lock);
- list_add(&msg->list_head, &session->s_cap_releases);
- session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
- }
-
- if (partial) {
- head = partial->front.iov_base;
- num = le32_to_cpu(head->num);
- dout(" queueing partial %p with %d/%d\n", partial, num,
- (int)CEPH_CAPS_PER_RELEASE);
- list_move_tail(&partial->list_head,
- &session->s_cap_releases_done);
- session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
+ int ret = 1;
+ spin_lock(&ci->i_ceph_lock);
+ if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ ret = capsnap->follows >= want_snap_seq;
}
- err = 0;
- spin_unlock(&session->s_cap_lock);
-out_unlocked:
- return err;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
}
-static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+static int check_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- int ret;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_flushing_caps)
- ret = ci->i_cap_flush_seq >= want_flush_seq;
- else
- ret = 1;
- spin_unlock(&ci->i_ceph_lock);
+ struct rb_node *n;
+ struct ceph_cap_flush *cf;
+ int ret = 1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ n = rb_first(&mdsc->cap_flush_tree);
+ cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
+ if (cf && cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid %llu <= %llu\n",
+ cf->tid, want_flush_tid);
+ ret = 0;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
return ret;
}
/*
* flush all dirty inode data to disk.
*
- * returns true if we've flushed through want_flush_seq
+ * returns true if we've flushed through want_flush_tid
*/
-static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc,
+ u64 want_flush_tid, u64 want_snap_seq)
{
int mds;
- dout("check_cap_flush want %lld\n", want_flush_seq);
+ dout("check_caps_flush want %llu snap want %llu\n",
+ want_flush_tid, want_snap_seq);
mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ for (mds = 0; mds < mdsc->max_sessions; ) {
struct ceph_mds_session *session = mdsc->sessions[mds];
struct inode *inode = NULL;
- if (!session)
+ if (!session) {
+ mds++;
continue;
+ }
get_session(session);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_flushing)) {
- struct ceph_inode_info *ci =
- list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item);
-
- if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
- dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n",
- &ci->vfs_inode, ci->i_cap_flush_seq,
- want_flush_seq, session->s_mds);
+ if (!list_empty(&session->s_cap_snaps_flushing)) {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&session->s_cap_snaps_flushing,
+ struct ceph_cap_snap,
+ flushing_item);
+ struct ceph_inode_info *ci = capsnap->ci;
+ if (!check_capsnap_flush(ci, want_snap_seq)) {
+ dout("check_cap_flush still flushing snap %p "
+ "follows %lld <= %lld to mds%d\n",
+ &ci->vfs_inode, capsnap->follows,
+ want_snap_seq, mds);
inode = igrab(&ci->vfs_inode);
}
}
@@ -1520,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
if (inode) {
wait_event(mdsc->cap_flushing_wq,
- check_cap_flush(inode, want_flush_seq));
+ check_capsnap_flush(ceph_inode(inode),
+ want_snap_seq));
iput(inode);
+ } else {
+ mds++;
}
mutex_lock(&mdsc->mutex);
}
-
mutex_unlock(&mdsc->mutex);
- dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+
+ wait_event(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid));
+
+ dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
}
/*
@@ -1537,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
- struct ceph_msg *msg;
+ struct ceph_msg *msg = NULL;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+ struct ceph_cap *cap;
+ LIST_HEAD(tmp_list);
+ int num_cap_releases;
- dout("send_cap_releases mds%d\n", session->s_mds);
spin_lock(&session->s_cap_lock);
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
- spin_unlock(&session->s_cap_lock);
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
- ceph_con_send(&session->s_con, msg);
- spin_lock(&session->s_cap_lock);
- }
+again:
+ list_splice_init(&session->s_cap_releases, &tmp_list);
+ num_cap_releases = session->s_num_cap_releases;
+ session->s_num_cap_releases = 0;
spin_unlock(&session->s_cap_lock);
-}
-
-static void discard_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_msg *msg;
- struct ceph_mds_cap_release *head;
- unsigned num;
- dout("discard_cap_releases mds%d\n", session->s_mds);
+ while (!list_empty(&tmp_list)) {
+ if (!msg) {
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
+ PAGE_CACHE_SIZE, GFP_NOFS, false);
+ if (!msg)
+ goto out_err;
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ }
+ cap = list_first_entry(&tmp_list, struct ceph_cap,
+ session_caps);
+ list_del(&cap->session_caps);
+ num_cap_releases--;
- if (!list_empty(&session->s_cap_releases)) {
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n",
- session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
+ le32_add_cpu(&head->num, 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(cap->cap_ino);
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+ msg->front.iov_len += sizeof(*item);
+
+ ceph_put_cap(mdsc, cap);
+
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ msg = NULL;
+ }
}
- /* requeue completed messages */
- while (!list_empty(&session->s_cap_releases_done)) {
- msg = list_first_entry(&session->s_cap_releases_done,
- struct ceph_msg, list_head);
- list_del_init(&msg->list_head);
+ BUG_ON(num_cap_releases != 0);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
- num);
- session->s_num_cap_releases += num;
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- list_add(&msg->list_head, &session->s_cap_releases);
+ spin_lock(&session->s_cap_lock);
+ if (!list_empty(&session->s_cap_releases))
+ goto again;
+ spin_unlock(&session->s_cap_lock);
+
+ if (msg) {
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
}
+ return;
+out_err:
+ pr_err("send_cap_releases mds%d, failed to allocate message\n",
+ session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ list_splice(&tmp_list, &session->s_cap_releases);
+ session->s_num_cap_releases += num_cap_releases;
+ spin_unlock(&session->s_cap_lock);
}
/*
@@ -1615,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
- rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+ rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
+ __GFP_NOWARN,
order);
if (rinfo->dir_in)
break;
@@ -1677,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
struct ceph_mds_request, r_node);
}
-static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
{
- struct ceph_mds_request *req = __get_oldest_req(mdsc);
-
- if (req)
- return req->r_tid;
- return 0;
+ return mdsc->oldest_tid;
}
/*
@@ -1712,7 +1762,7 @@ retry:
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
for (temp = dentry; !IS_ROOT(temp);) {
- struct inode *inode = temp->d_inode;
+ struct inode *inode = d_inode(temp);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
len++; /* slash only */
else if (stop_on_nosnap && inode &&
@@ -1736,7 +1786,7 @@ retry:
struct inode *inode;
spin_lock(&temp->d_lock);
- inode = temp->d_inode;
+ inode = d_inode(temp);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
dout("build_path path+%d: %p SNAPDIR\n",
pos, temp);
@@ -1770,7 +1820,7 @@ retry:
goto retry;
}
- *base = ceph_ino(temp->d_inode);
+ *base = ceph_ino(d_inode(temp));
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
dentry, d_count(dentry), *base, len, path);
@@ -1783,8 +1833,8 @@ static int build_dentry_path(struct dentry *dentry,
{
char *path;
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(dentry->d_parent->d_inode);
+ if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
+ *pino = ceph_ino(d_inode(dentry->d_parent));
*ppath = dentry->d_name.name;
*ppathlen = dentry->d_name.len;
return 0;
@@ -1853,7 +1903,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
*/
static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
- int mds)
+ int mds, bool drop_cap_releases)
{
struct ceph_msg *msg;
struct ceph_mds_request_head *head;
@@ -1925,7 +1975,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
releases = 0;
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
- req->r_inode ? req->r_inode : req->r_dentry->d_inode,
+ req->r_inode ? req->r_inode : d_inode(req->r_dentry),
mds, req->r_inode_drop, req->r_inode_unless, 0);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
@@ -1935,8 +1985,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
if (req->r_old_inode_drop)
releases += ceph_encode_inode_release(&p,
- req->r_old_dentry->d_inode,
+ d_inode(req->r_old_dentry),
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+
+ if (drop_cap_releases) {
+ releases = 0;
+ p = msg->front.iov_base + req->r_request_release_offset;
+ }
+
head->num_releases = cpu_to_le16(releases);
/* time stamp */
@@ -1989,7 +2045,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
*/
static int __prepare_send_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
- int mds)
+ int mds, bool drop_cap_releases)
{
struct ceph_mds_request_head *rhead;
struct ceph_msg *msg;
@@ -2048,7 +2104,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
ceph_msg_put(req->r_request);
req->r_request = NULL;
}
- msg = create_request_message(mdsc, req, mds);
+ msg = create_request_message(mdsc, req, mds, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
complete_request(mdsc, req);
@@ -2132,7 +2188,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
if (req->r_request_started == 0) /* note request start time */
req->r_request_started = jiffies;
- err = __prepare_send_request(mdsc, req, mds);
+ err = __prepare_send_request(mdsc, req, mds, false);
if (!err) {
ceph_msg_get(req->r_request);
ceph_con_send(&session->s_con, req->r_request);
@@ -2241,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* wait */
mutex_unlock(&mdsc->mutex);
dout("do_request waiting\n");
- if (req->r_timeout) {
- err = (long)wait_for_completion_killable_timeout(
- &req->r_completion, req->r_timeout);
- if (err == 0)
- err = -EIO;
- } else if (req->r_wait_for_completion) {
+ if (!req->r_timeout && req->r_wait_for_completion) {
err = req->r_wait_for_completion(mdsc, req);
} else {
- err = wait_for_completion_killable(&req->r_completion);
+ long timeleft = wait_for_completion_killable_timeout(
+ &req->r_completion,
+ ceph_timeout_jiffies(req->r_timeout));
+ if (timeleft > 0)
+ err = 0;
+ else if (!timeleft)
+ err = -EIO; /* timed out */
+ else
+ err = timeleft; /* killed */
}
dout("do_request waited, got %d\n", err);
mutex_lock(&mdsc->mutex);
@@ -2470,7 +2529,6 @@ out_err:
}
mutex_unlock(&mdsc->mutex);
- ceph_add_cap_releases(mdsc, req->r_session);
mutex_unlock(&session->s_mutex);
/* kick calling process */
@@ -2590,6 +2648,7 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect denied\n", session->s_mds);
+ cleanup_session_requests(mdsc, session);
remove_session_caps(session);
wake = 2; /* for good measure */
wake_up_all(&mdsc->session_close_wq);
@@ -2658,7 +2717,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
mutex_lock(&mdsc->mutex);
list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
- err = __prepare_send_request(mdsc, req, session->s_mds);
+ err = __prepare_send_request(mdsc, req, session->s_mds, true);
if (!err) {
ceph_msg_get(req->r_request);
ceph_con_send(&session->s_con, req->r_request);
@@ -2679,7 +2738,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue; /* only old requests */
if (req->r_session &&
req->r_session->s_mds == session->s_mds) {
- err = __prepare_send_request(mdsc, req, session->s_mds);
+ err = __prepare_send_request(mdsc, req,
+ session->s_mds, true);
if (!err) {
ceph_msg_get(req->r_request);
ceph_con_send(&session->s_con, req->r_request);
@@ -2860,11 +2920,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
*/
session->s_cap_reconnect = 1;
/* drop old cap expires; we're about to reestablish that state */
- discard_cap_releases(mdsc, session);
- spin_unlock(&session->s_cap_lock);
+ cleanup_cap_releases(mdsc, session);
/* trim unused caps to reduce MDS's cache rejoin time */
- shrink_dcache_parent(mdsc->fsc->sb->s_root);
+ if (mdsc->fsc->sb->s_root)
+ shrink_dcache_parent(mdsc->fsc->sb->s_root);
ceph_con_close(&session->s_con);
ceph_con_open(&session->s_con,
@@ -2927,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
reply->hdr.data_len = cpu_to_le32(pagelist->length);
ceph_msg_data_add_pagelist(reply, pagelist);
+
+ ceph_early_kick_flushing_caps(mdsc, session);
+
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -3133,7 +3196,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
di->lease_renew_from &&
di->lease_renew_after == 0) {
unsigned long duration =
- le32_to_cpu(h->duration_ms) * HZ / 1000;
+ msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
dentry->d_time = di->lease_renew_from + duration;
@@ -3323,7 +3386,6 @@ static void delayed_work(struct work_struct *work)
send_renew_caps(mdsc, s);
else
ceph_con_keepalive(&s->s_con);
- ceph_add_cap_releases(mdsc, s);
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
s->s_state == CEPH_MDS_SESSION_HUNG)
ceph_send_cap_releases(mdsc, s);
@@ -3361,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
atomic_set(&mdsc->num_sessions, 0);
mdsc->max_sessions = 0;
mdsc->stopping = 0;
+ mdsc->last_snap_seq = 0;
init_rwsem(&mdsc->snap_rwsem);
mdsc->snap_realms = RB_ROOT;
INIT_LIST_HEAD(&mdsc->snap_empty);
spin_lock_init(&mdsc->snap_empty_lock);
mdsc->last_tid = 0;
+ mdsc->oldest_tid = 0;
mdsc->request_tree = RB_ROOT;
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
@@ -3373,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
- mdsc->cap_flush_seq = 0;
+ mdsc->last_cap_flush_tid = 1;
+ mdsc->cap_flush_tree = RB_ROOT;
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3385,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
ceph_caps_init(mdsc);
ceph_adjust_min_caps(mdsc, fsc->min_caps);
+ init_rwsem(&mdsc->pool_perm_rwsem);
+ mdsc->pool_perm_tree = RB_ROOT;
+
return 0;
}
@@ -3394,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
*/
static void wait_requests(struct ceph_mds_client *mdsc)
{
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_request *req;
- struct ceph_fs_client *fsc = mdsc->fsc;
mutex_lock(&mdsc->mutex);
if (__get_oldest_req(mdsc)) {
@@ -3403,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
- fsc->client->options->mount_timeout * HZ);
+ ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
@@ -3456,7 +3524,8 @@ restart:
nextreq = rb_entry(n, struct ceph_mds_request, r_node);
else
nextreq = NULL;
- if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+ if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
+ (req->r_op & CEPH_MDS_OP_WRITE)) {
/* write op */
ceph_mdsc_get_request(req);
if (nextreq)
@@ -3484,7 +3553,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
- u64 want_tid, want_flush;
+ u64 want_tid, want_flush, want_snap;
if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return;
@@ -3496,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
- want_flush = mdsc->cap_flush_seq;
+ want_flush = mdsc->last_cap_flush_tid;
spin_unlock(&mdsc->cap_dirty_lock);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+ down_read(&mdsc->snap_rwsem);
+ want_snap = mdsc->last_snap_seq;
+ up_read(&mdsc->snap_rwsem);
+
+ dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
+ want_tid, want_flush, want_snap);
wait_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush);
+ wait_caps_flush(mdsc, want_flush, want_snap);
}
/*
@@ -3520,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
*/
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
+ struct ceph_options *opts = mdsc->fsc->client->options;
struct ceph_mds_session *session;
int i;
- struct ceph_fs_client *fsc = mdsc->fsc;
- unsigned long timeout = fsc->client->options->mount_timeout * HZ;
dout("close_sessions\n");
@@ -3544,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("waiting for sessions to close\n");
wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
- timeout);
+ ceph_timeout_jiffies(opts->mount_timeout));
/* tear down remaining sessions */
mutex_lock(&mdsc->mutex);
@@ -3578,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
ceph_caps_finalize(mdsc);
+ ceph_pool_perm_destroy(mdsc);
}
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 1875b5d985c6..762757e6cebf 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -139,7 +139,6 @@ struct ceph_mds_session {
int s_cap_reconnect;
int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
- struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
/* protected by mutex */
@@ -228,7 +227,7 @@ struct ceph_mds_request {
int r_err;
bool r_aborted;
- unsigned long r_timeout; /* optional. jiffies */
+ unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
used to measure lease durations */
@@ -254,12 +253,21 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
+ long long r_dir_release_cnt;
+ long long r_dir_ordered_cnt;
+ int r_readdir_cache_idx;
u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
int r_num_caps;
};
+struct ceph_pool_perm {
+ struct rb_node node;
+ u32 pool;
+ int perm;
+};
+
/*
* mds client state
*/
@@ -284,12 +292,15 @@ struct ceph_mds_client {
* references (implying they contain no inodes with caps) that
* should be destroyed.
*/
+ u64 last_snap_seq;
struct rw_semaphore snap_rwsem;
struct rb_root snap_realms;
struct list_head snap_empty;
spinlock_t snap_empty_lock; /* protect snap_empty */
u64 last_tid; /* most recent mds request */
+ u64 oldest_tid; /* oldest incomplete mds request,
+ excluding setfilelock requests */
struct rb_root request_tree; /* pending mds requests */
struct delayed_work delayed_work; /* delayed work */
unsigned long last_renew_caps; /* last time we renewed our caps */
@@ -298,7 +309,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
- u64 cap_flush_seq;
+ u64 last_cap_flush_tid;
+ struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
@@ -328,6 +340,9 @@ struct ceph_mds_client {
spinlock_t dentry_lru_lock;
struct list_head dentry_lru;
int num_dentry;
+
+ struct rw_semaphore pool_perm_rwsem;
+ struct rb_root pool_perm_tree;
};
extern const char *ceph_mds_op_name(int op);
@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
-extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index a97e39f09ba6..233d906aec02 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
}
-static struct ceph_snap_context *empty_snapc;
+struct ceph_snap_context *ceph_empty_snapc;
/*
* build the snap context for a given realm.
@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
- if (num == 0 && realm->seq == empty_snapc->seq) {
- ceph_get_snap_context(empty_snapc);
- snapc = empty_snapc;
+ if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
+ ceph_get_snap_context(ceph_empty_snapc);
+ snapc = ceph_empty_snapc;
goto done;
}
@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
return 0;
}
+static bool has_new_snaps(struct ceph_snap_context *o,
+ struct ceph_snap_context *n)
+{
+ if (n->num_snaps == 0)
+ return false;
+ /* snaps are in descending order */
+ return n->snaps[0] > o->seq;
+}
/*
* When a snapshot is applied, the size/mtime inode metadata is queued
@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap_snap *capsnap;
+ struct ceph_snap_context *old_snapc, *new_snapc;
int used, dirty;
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
+ old_snapc = ci->i_head_snapc;
+ new_snapc = ci->i_snap_realm->cached_context;
+
/*
* If there is a write in progress, treat that as a dirty Fw,
* even though it hasn't completed yet; by the time we finish
@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
writes in progress now were started before the previous
cap_snap. lucky us. */
dout("queue_cap_snap %p already pending\n", inode);
- kfree(capsnap);
- } else if (ci->i_snap_realm->cached_context == empty_snapc) {
- dout("queue_cap_snap %p empty snapc\n", inode);
- kfree(capsnap);
- } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
- CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
- struct ceph_snap_context *snapc = ci->i_head_snapc;
-
- /*
- * if we are a sync write, we may need to go to the snaprealm
- * to get the current snapc.
- */
- if (!snapc)
- snapc = ci->i_snap_realm->cached_context;
+ goto update_snapc;
+ }
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+ dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ goto update_snapc;
+ }
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
- inode, capsnap, snapc, ceph_cap_string(dirty));
- ihold(inode);
+ BUG_ON(!old_snapc);
- atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
- INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
-
- capsnap->follows = snapc->seq;
- capsnap->issued = __ceph_caps_issued(ci, NULL);
- capsnap->dirty = dirty;
-
- capsnap->mode = inode->i_mode;
- capsnap->uid = inode->i_uid;
- capsnap->gid = inode->i_gid;
-
- if (dirty & CEPH_CAP_XATTR_EXCL) {
- __ceph_build_xattrs_blob(ci);
- capsnap->xattr_blob =
- ceph_buffer_get(ci->i_xattrs.blob);
- capsnap->xattr_version = ci->i_xattrs.version;
- } else {
- capsnap->xattr_blob = NULL;
- capsnap->xattr_version = 0;
+ /*
+ * There is no need to send FLUSHSNAP message to MDS if there is
+ * no new snapshot. But when there is dirty pages or on-going
+ * writes, we still need to create cap_snap. cap_snap is needed
+ * by the write path and page writeback path.
+ *
+ * also see ceph_try_drop_cap_snap()
+ */
+ if (has_new_snaps(old_snapc, new_snapc)) {
+ if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+ capsnap->need_flush = true;
+ } else {
+ if (!(used & CEPH_CAP_FILE_WR) &&
+ ci->i_wrbuffer_ref_head == 0) {
+ dout("queue_cap_snap %p "
+ "no new_snap|dirty_page|writing\n", inode);
+ goto update_snapc;
}
+ }
- capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-
- /* dirty page count moved from _head to this cap_snap;
- all subsequent writes page dirties occur _after_ this
- snapshot. */
- capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
- ci->i_wrbuffer_ref_head = 0;
- capsnap->context = snapc;
- ci->i_head_snapc =
- ceph_get_snap_context(ci->i_snap_realm->cached_context);
- dout(" new snapc is %p\n", ci->i_head_snapc);
- list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
- if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
- capsnap, snapc, snapc->seq);
- capsnap->writing = 1;
- } else {
- /* note mtime, size NOW. */
- __ceph_finish_cap_snap(ci, capsnap);
- }
+ dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
+ inode, capsnap, old_snapc, ceph_cap_string(dirty),
+ capsnap->need_flush ? "" : "no_flush");
+ ihold(inode);
+
+ atomic_set(&capsnap->nref, 1);
+ capsnap->ci = ci;
+ INIT_LIST_HEAD(&capsnap->ci_item);
+ INIT_LIST_HEAD(&capsnap->flushing_item);
+
+ capsnap->follows = old_snapc->seq;
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = dirty;
+
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
+
+ if (dirty & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ capsnap->xattr_blob =
+ ceph_buffer_get(ci->i_xattrs.blob);
+ capsnap->xattr_version = ci->i_xattrs.version;
} else {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
- kfree(capsnap);
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_version = 0;
}
+ capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent writes page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;
+ capsnap->context = old_snapc;
+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+ old_snapc = NULL;
+
+ if (used & CEPH_CAP_FILE_WR) {
+ dout("queue_cap_snap %p cap_snap %p snapc %p"
+ " seq %llu used WR, now pending\n", inode,
+ capsnap, old_snapc, old_snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ capsnap = NULL;
+
+update_snapc:
+ if (ci->i_head_snapc) {
+ ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+ dout(" new snapc is %p\n", new_snapc);
+ }
spin_unlock(&ci->i_ceph_lock);
+
+ kfree(capsnap);
+ ceph_put_snap_context(old_snapc);
}
/*
@@ -699,6 +730,8 @@ more:
/* queue realm for cap_snap creation */
list_add(&realm->dirty_item, &dirty_realms);
+ if (realm->seq > mdsc->last_snap_seq)
+ mdsc->last_snap_seq = realm->seq;
invalidate = 1;
} else if (!realm->cached_context) {
@@ -964,14 +997,14 @@ out:
int __init ceph_snap_init(void)
{
- empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
- if (!empty_snapc)
+ ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+ if (!ceph_empty_snapc)
return -ENOMEM;
- empty_snapc->seq = 1;
+ ceph_empty_snapc->seq = 1;
return 0;
}
void ceph_snap_exit(void)
{
- ceph_put_snap_context(empty_snapc);
+ ceph_put_snap_context(ceph_empty_snapc);
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 51cc23e48111..89e6bc321df3 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LSSNAP: return "lssnap";
case CEPH_MDS_OP_MKSNAP: return "mksnap";
case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a63997b8bcff..d1c833c321b9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -44,7 +44,7 @@ static void ceph_put_super(struct super_block *s)
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry));
struct ceph_monmap *monmap = fsc->client->monc.monmap;
struct ceph_statfs st;
u64 fsid;
@@ -134,10 +134,12 @@ enum {
Opt_noino32,
Opt_fscache,
Opt_nofscache,
+ Opt_poolperm,
+ Opt_nopoolperm,
#ifdef CONFIG_CEPH_FS_POSIX_ACL
Opt_acl,
#endif
- Opt_noacl
+ Opt_noacl,
};
static match_table_t fsopt_tokens = {
@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"},
{Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"},
+ {Opt_poolperm, "poolperm"},
+ {Opt_nopoolperm, "nopoolperm"},
#ifdef CONFIG_CEPH_FS_POSIX_ACL
{Opt_acl, "acl"},
#endif
@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break;
+ case Opt_poolperm:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
+ printk ("pool perm");
+ break;
+ case Opt_nopoolperm:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
+ break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= MS_POSIXACL;
@@ -345,6 +356,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->rsize = CEPH_RSIZE_DEFAULT;
fsopt->rasize = CEPH_RASIZE_DEFAULT;
fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ if (!fsopt->snapdir_name) {
+ err = -ENOMEM;
+ goto out;
+ }
+
fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
@@ -406,31 +422,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
struct ceph_mount_options *fsopt = fsc->mount_options;
- struct ceph_options *opt = fsc->client->options;
-
- if (opt->flags & CEPH_OPT_FSID)
- seq_printf(m, ",fsid=%pU", &opt->fsid);
- if (opt->flags & CEPH_OPT_NOSHARE)
- seq_puts(m, ",noshare");
- if (opt->flags & CEPH_OPT_NOCRC)
- seq_puts(m, ",nocrc");
- if (opt->flags & CEPH_OPT_NOMSGAUTH)
- seq_puts(m, ",nocephx_require_signatures");
- if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
- seq_puts(m, ",notcp_nodelay");
-
- if (opt->name)
- seq_printf(m, ",name=%s", opt->name);
- if (opt->key)
- seq_puts(m, ",secret=<hidden>");
-
- if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
- if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
- seq_printf(m, ",osdkeepalivetimeout=%d",
- opt->osd_keepalive_timeout);
+ size_t pos;
+ int ret;
+
+ /* a comma between MNT/MS and client options */
+ seq_putc(m, ',');
+ pos = m->count;
+
+ ret = ceph_print_client_options(m, fsc->client);
+ if (ret)
+ return ret;
+
+ /* retract our comma if no client options */
+ if (m->count == pos)
+ m->count--;
if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
seq_puts(m, ",dirstat");
@@ -438,14 +443,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",norbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
- if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
- seq_puts(m, ",dcache");
- else
+ if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
seq_puts(m, ",fsc");
- else
- seq_puts(m, ",nofsc");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
+ seq_puts(m, ",nopoolperm");
#ifdef CONFIG_CEPH_FS_POSIX_ACL
if (fsopt->sb_flags & MS_POSIXACL)
@@ -477,6 +480,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+
return 0;
}
@@ -618,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
@@ -643,6 +648,10 @@ static int __init init_caches(void)
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (ceph_cap_cachep == NULL)
goto bad_cap;
+ ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_flush_cachep == NULL)
+ goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
@@ -661,6 +670,8 @@ static int __init init_caches(void)
bad_file:
kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
+ kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -677,6 +688,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
@@ -730,10 +742,15 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
if (IS_ERR(req))
return ERR_CAST(req);
req->r_path1 = kstrdup(path, GFP_NOFS);
+ if (!req->r_path1) {
+ root = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started;
- req->r_timeout = fsc->client->options->mount_timeout * HZ;
+ req->r_timeout = fsc->client->options->mount_timeout;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -976,7 +993,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
if (IS_ERR(res))
goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", res,
- res->d_inode, ceph_vinop(res->d_inode));
+ d_inode(res), ceph_vinop(d_inode(res)));
return res;
out_splat:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 04c8124ed30e..2f2460d23a06 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -35,8 +35,10 @@
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
+#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
-#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
+#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
+ CEPH_MOUNT_OPT_DCACHE)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -120,11 +122,21 @@ struct ceph_cap {
struct rb_node ci_node; /* per-ci cap tree */
struct ceph_mds_session *session;
struct list_head session_caps; /* per-session caplist */
- int mds;
u64 cap_id; /* unique cap id (mds provided) */
- int issued; /* latest, from the mds */
- int implemented; /* implemented superset of issued (for revocation) */
- int mds_wanted;
+ union {
+ /* in-use caps */
+ struct {
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of
+ issued (for revocation) */
+ int mds, mds_wanted;
+ };
+ /* caps to release */
+ struct {
+ u64 cap_ino;
+ int queue_release;
+ };
+ };
u32 seq, issue_seq, mseq;
u32 cap_gen; /* active/stale cycle */
unsigned long last_used;
@@ -162,6 +174,7 @@ struct ceph_cap_snap {
int writing; /* a sync write is still in progress */
int dirty_pages; /* dirty pages awaiting writeback */
bool inline_data;
+ bool need_flush;
};
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
@@ -173,6 +186,16 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
}
}
+struct ceph_cap_flush {
+ u64 tid;
+ int caps;
+ struct rb_node g_node; // global
+ union {
+ struct rb_node i_node; // inode
+ struct list_head list;
+ };
+};
+
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
@@ -258,9 +281,9 @@ struct ceph_inode_info {
u32 i_time_warp_seq;
unsigned i_ceph_flags;
- int i_ordered_count;
- atomic_t i_release_count;
- atomic_t i_complete_count;
+ atomic64_t i_release_count;
+ atomic64_t i_ordered_count;
+ atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
@@ -282,11 +305,11 @@ struct ceph_inode_info {
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
struct list_head i_dirty_item, i_flushing_item;
- u64 i_cap_flush_seq;
/* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
- u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+ struct ceph_cap_flush *i_prealloc_cap_flush;
+ struct rb_root i_cap_flush_tree;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
@@ -437,36 +460,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Ceph inode.
*/
-#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
+#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
+#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */
+#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
+#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
+#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
+
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
- int release_count, int ordered_count)
+ long long release_count,
+ long long ordered_count)
{
- atomic_set(&ci->i_complete_count, release_count);
- if (ci->i_ordered_count == ordered_count)
- ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
- else
- ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
+ smp_mb__before_atomic();
+ atomic64_set(&ci->i_complete_seq[0], release_count);
+ atomic64_set(&ci->i_complete_seq[1], ordered_count);
}
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{
- atomic_inc(&ci->i_release_count);
+ atomic64_inc(&ci->i_release_count);
+}
+
+static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
+{
+ atomic64_inc(&ci->i_ordered_count);
}
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{
- return atomic_read(&ci->i_complete_count) ==
- atomic_read(&ci->i_release_count);
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count);
}
static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
{
- return __ceph_dir_is_complete(ci) &&
- (ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
+ return atomic64_read(&ci->i_complete_seq[0]) ==
+ atomic64_read(&ci->i_release_count) &&
+ atomic64_read(&ci->i_complete_seq[1]) ==
+ atomic64_read(&ci->i_ordered_count);
}
static inline void ceph_dir_clear_complete(struct inode *inode)
@@ -476,20 +509,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
static inline void ceph_dir_clear_ordered(struct inode *inode)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- spin_lock(&ci->i_ceph_lock);
- ci->i_ordered_count++;
- ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
- spin_unlock(&ci->i_ceph_lock);
+ __ceph_dir_clear_ordered(ceph_inode(inode));
}
static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
{
- struct ceph_inode_info *ci = ceph_inode(inode);
- bool ret;
- spin_lock(&ci->i_ceph_lock);
- ret = __ceph_dir_is_complete_ordered(ci);
- spin_unlock(&ci->i_ceph_lock);
+ bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
+ smp_rmb();
return ret;
}
@@ -551,7 +577,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
return ci->i_dirty_caps | ci->i_flushing_caps;
}
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+ struct ceph_cap_flush **pcf);
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
struct ceph_cap *ocap, int mask);
@@ -605,16 +634,20 @@ struct ceph_file_info {
unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
- struct dentry *dentry; /* next dentry (for dcache readdir) */
- int dir_release_count;
- int dir_ordered_count;
+ long long dir_release_count;
+ long long dir_ordered_count;
+ int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */
char *dir_info;
int dir_info_len;
};
-
+struct ceph_readdir_cache_control {
+ struct page *page;
+ struct dentry **dentries;
+ int index;
+};
/*
* A "snap realm" describes a subset of the file hierarchy sharing
@@ -686,6 +719,7 @@ static inline int default_congestion_kb(void)
/* snap.c */
+extern struct ceph_snap_context *ceph_empty_snapc;
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -712,8 +746,8 @@ extern void ceph_snap_exit(void);
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{
return !list_empty(&ci->i_cap_snaps) &&
- list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
- ci_item)->writing;
+ list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
+ ci_item)->writing;
}
/* inode.c */
@@ -837,12 +871,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
extern int ceph_is_any_caps(struct inode *inode);
-extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
- u64 cap_id, u32 migrate_seq, u32 issue_seq);
extern void ceph_queue_caps_release(struct inode *inode);
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
+extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
@@ -878,10 +912,12 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
+extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
-extern const struct address_space_operations ceph_aops;
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
@@ -890,7 +926,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
-int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
@@ -911,6 +946,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
+extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/*
* our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5a492caf34cb..819163d8313b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -776,12 +776,12 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
return generic_getxattr(dentry, name, value, size);
- return __ceph_getxattr(dentry->d_inode, name, value, size);
+ return __ceph_getxattr(d_inode(dentry), name, value, size);
}
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
u32 vir_namelen = 0;
@@ -847,7 +847,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
const char *value, size_t size, int flags)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
err = PTR_ERR(req);
goto out;
}
- req->r_inode = inode;
- ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
- req->r_num_caps = 1;
+
req->r_args.setxattr.flags = cpu_to_le32(flags);
req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ err = -ENOMEM;
+ goto out;
+ }
req->r_pagelist = pagelist;
pagelist = NULL;
+ req->r_inode = inode;
+ ihold(inode);
+ req->r_num_caps = 1;
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
@@ -901,9 +908,11 @@ out:
int __ceph_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf = NULL;
int issued;
int err;
int dirty = 0;
@@ -913,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
char *newval = NULL;
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
+ bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -941,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
if (!xattr)
goto out;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ goto out;
+
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+
+ if (!lock_snap_rwsem && !ci->i_head_snapc) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ goto retry;
+ }
+ }
+
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
__build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
@@ -959,7 +984,7 @@ retry:
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
- goto out;
+ goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -971,21 +996,28 @@ retry:
flags, value ? 1 : -1, &xattr);
if (!err) {
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+ &prealloc_cf);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
}
spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ ceph_free_cap_flush(prealloc_cf);
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
err = ceph_sync_setxattr(dentry, name, value, size, flags);
out:
+ ceph_free_cap_flush(prealloc_cf);
kfree(newname);
kfree(newval);
kfree(xattr);
@@ -995,7 +1027,7 @@ out:
int ceph_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
return -EROFS;
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
@@ -1011,7 +1043,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
int err;
@@ -1019,12 +1051,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2)
+ return -ENOMEM;
+
req->r_inode = inode;
ihold(inode);
- req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
req->r_num_caps = 1;
- req->r_path2 = kstrdup(name, GFP_NOFS);
-
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
@@ -1032,13 +1066,16 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
int __ceph_removexattr(struct dentry *dentry, const char *name)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_cap_flush *prealloc_cf = NULL;
int issued;
int err;
int required_blob_size;
int dirty;
+ bool lock_snap_rwsem = false;
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -1051,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked;
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
err = -ENOMEM;
spin_lock(&ci->i_ceph_lock);
retry:
issued = __ceph_caps_issued(ci, NULL);
- dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
-
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
goto do_sync;
+
+ if (!lock_snap_rwsem && !ci->i_head_snapc) {
+ lock_snap_rwsem = true;
+ if (!down_read_trylock(&mdsc->snap_rwsem)) {
+ spin_unlock(&ci->i_ceph_lock);
+ down_read(&mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ goto retry;
+ }
+ }
+
+ dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
__build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, 0, 0);
@@ -1071,7 +1123,7 @@ retry:
dout(" preaallocating new blob size=%d\n", required_blob_size);
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
if (!blob)
- goto out;
+ goto do_sync_unlocked;
spin_lock(&ci->i_ceph_lock);
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
@@ -1081,24 +1133,30 @@ retry:
err = __remove_xattr_by_name(ceph_inode(inode), name);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+ &prealloc_cf);
ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME;
spin_unlock(&ci->i_ceph_lock);
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
+ ceph_free_cap_flush(prealloc_cf);
return err;
do_sync:
spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
+ if (lock_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+ ceph_free_cap_flush(prealloc_cf);
err = ceph_send_removexattr(dentry, name);
-out:
return err;
}
int ceph_removexattr(struct dentry *dentry, const char *name)
{
- if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
return -EROFS;
if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index a2172f3f69e3..e7b478b49985 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -192,6 +192,15 @@ config CIFS_SMB2
options are also slightly simpler (compared to CIFS) due
to protocol improvements.
+config CIFS_SMB311
+ bool "SMB3.1.1 network file system support (Experimental)"
+ depends on CIFS_SMB2 && INET
+
+ help
+ This enables experimental support for the newest, SMB3.1.1, dialect.
+ This dialect includes improved security negotiation features.
+ If unsure, say N
+
config CIFS_FSCACHE
bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b8602f199815..7dc886c9a78f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -24,6 +24,7 @@
#include "cifsfs.h"
#include "dns_resolve.h"
#include "cifs_debug.h"
+#include "cifs_unicode.h"
static LIST_HEAD(cifs_dfs_automount_list);
@@ -301,7 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
- cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
+ cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
mnt = ERR_CAST(tlink);
@@ -312,7 +313,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
xid = get_xid();
rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
&num_referrals, &referrals,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
free_xid(xid);
cifs_put_tlink(tlink);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 0303c6793d90..5a53ac6b1e02 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,41 +27,6 @@
#include "cifsglob.h"
#include "cifs_debug.h"
-/*
- * cifs_utf16_bytes - how long will a string be after conversion?
- * @utf16 - pointer to input string
- * @maxbytes - don't go past this many bytes of input string
- * @codepage - destination codepage
- *
- * Walk a utf16le string and return the number of bytes that the string will
- * be after being converted to the given charset, not including any null
- * termination required. Don't walk past maxbytes in the source buffer.
- */
-int
-cifs_utf16_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage)
-{
- int i;
- int charlen, outlen = 0;
- int maxwords = maxbytes / 2;
- char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
-
- for (i = 0; i < maxwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
- break;
-
- charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
- if (charlen > 0)
- outlen += charlen;
- else
- outlen++;
- }
-
- return outlen;
-}
-
int cifs_remap(struct cifs_sb_info *cifs_sb)
{
int map_type;
@@ -155,10 +120,13 @@ convert_sfm_char(const __u16 src_char, char *target)
* enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
*/
static int
-cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
+cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
int maptype)
{
int len = 1;
+ __u16 src_char;
+
+ src_char = *from;
if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
return len;
@@ -168,10 +136,23 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
/* if character not one of seven in special remap set */
len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
- if (len <= 0) {
- *target = '?';
- len = 1;
- }
+ if (len <= 0)
+ goto surrogate_pair;
+
+ return len;
+
+surrogate_pair:
+ /* convert SURROGATE_PAIR and IVS */
+ if (strcmp(cp->charset, "utf8"))
+ goto unknown;
+ len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+ if (len <= 0)
+ goto unknown;
+ return len;
+
+unknown:
+ *target = '?';
+ len = 1;
return len;
}
@@ -206,7 +187,7 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
int nullsize = nls_nullsize(codepage);
int fromwords = fromlen / 2;
char tmp[NLS_MAX_CHARSET_SIZE];
- __u16 ftmp;
+ __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
/*
* because the chars can be of varying widths, we need to take care
@@ -217,9 +198,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
for (i = 0; i < fromwords; i++) {
- ftmp = get_unaligned_le16(&from[i]);
- if (ftmp == 0)
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
break;
+ if (i + 1 < fromwords)
+ ftmp[1] = get_unaligned_le16(&from[i + 1]);
+ else
+ ftmp[1] = 0;
+ if (i + 2 < fromwords)
+ ftmp[2] = get_unaligned_le16(&from[i + 2]);
+ else
+ ftmp[2] = 0;
/*
* check to see if converting this character might make the
@@ -234,6 +223,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
/* put converted char into 'to' buffer */
charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
outlen += charlen;
+
+ /* charlen (=bytes of UTF-8 for 1 character)
+ * 4bytes UTF-8(surrogate pair) is charlen=4
+ * (4bytes UTF-16 code)
+ * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
+ * (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
+ if (charlen == 4)
+ i++;
+ else if (charlen >= 5)
+ /* 5-6bytes UTF-8 */
+ i += 2;
}
/* properly null-terminate string */
@@ -296,6 +296,46 @@ success:
}
/*
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
+ * @maxbytes - don't go past this many bytes of input string
+ * @codepage - destination codepage
+ *
+ * Walk a utf16le string and return the number of bytes that the string will
+ * be after being converted to the given charset, not including any null
+ * termination required. Don't walk past maxbytes in the source buffer.
+ */
+int
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage)
+{
+ int i;
+ int charlen, outlen = 0;
+ int maxwords = maxbytes / 2;
+ char tmp[NLS_MAX_CHARSET_SIZE];
+ __u16 ftmp[3];
+
+ for (i = 0; i < maxwords; i++) {
+ ftmp[0] = get_unaligned_le16(&from[i]);
+ if (ftmp[0] == 0)
+ break;
+ if (i + 1 < maxwords)
+ ftmp[1] = get_unaligned_le16(&from[i + 1]);
+ else
+ ftmp[1] = 0;
+ if (i + 2 < maxwords)
+ ftmp[2] = get_unaligned_le16(&from[i + 2]);
+ else
+ ftmp[2] = 0;
+
+ charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
+ outlen += charlen;
+ }
+
+ return outlen;
+}
+
+/*
* cifs_strndup_from_utf16 - copy a string from wire format to the local
* codepage
* @src - source string
@@ -409,10 +449,15 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
char src_char;
__le16 dst_char;
wchar_t tmp;
+ wchar_t *wchar_to; /* UTF-16 */
+ int ret;
+ unicode_t u;
if (map_chars == NO_MAP_UNI_RSVD)
return cifs_strtoUTF16(target, source, PATH_MAX, cp);
+ wchar_to = kzalloc(6, GFP_KERNEL);
+
for (i = 0; i < srclen; j++) {
src_char = source[i];
charlen = 1;
@@ -441,11 +486,55 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
* if no match, use question mark, which at least in
* some cases serves as wild card
*/
- if (charlen < 1) {
- dst_char = cpu_to_le16(0x003f);
- charlen = 1;
+ if (charlen > 0)
+ goto ctoUTF16;
+
+ /* convert SURROGATE_PAIR */
+ if (strcmp(cp->charset, "utf8") || !wchar_to)
+ goto unknown;
+ if (*(source + i) & 0x80) {
+ charlen = utf8_to_utf32(source + i, 6, &u);
+ if (charlen < 0)
+ goto unknown;
+ } else
+ goto unknown;
+ ret = utf8s_to_utf16s(source + i, charlen,
+ UTF16_LITTLE_ENDIAN,
+ wchar_to, 6);
+ if (ret < 0)
+ goto unknown;
+
+ i += charlen;
+ dst_char = cpu_to_le16(*wchar_to);
+ if (charlen <= 3)
+ /* 1-3bytes UTF-8 to 2bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ else if (charlen == 4) {
+ /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
+ * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
+ * (charlen=3+4 or 4+4) */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ } else if (charlen >= 5) {
+ /* 5-6bytes UTF-8 to 6bytes UTF-16 */
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 1));
+ j++;
+ put_unaligned(dst_char, &target[j]);
+ dst_char = cpu_to_le16(*(wchar_to + 2));
+ j++;
+ put_unaligned(dst_char, &target[j]);
}
+ continue;
+
+unknown:
+ dst_char = cpu_to_le16(0x003f);
+ charlen = 1;
}
+
+ctoUTF16:
/*
* character may take more than one byte in the source string,
* but will take exactly two bytes in the target string
@@ -456,6 +545,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
ctoUTF16_out:
put_unaligned(0, &target[j]); /* Null terminate target unicode string */
+ kfree(wchar_to);
return j;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d72fe37f5420..0a9fb6b53126 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nouser_xattr");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
seq_puts(s, ",mapchars");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
+ seq_puts(s, ",mapposix");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
@@ -607,7 +609,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
p = s = full_path;
do {
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
struct dentry *child;
if (!dir) {
@@ -906,8 +908,6 @@ const struct inode_operations cifs_symlink_inode_ops = {
};
const struct file_operations cifs_file_ops = {
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
.open = cifs_open,
@@ -926,8 +926,6 @@ const struct file_operations cifs_file_ops = {
};
const struct file_operations cifs_file_strict_ops = {
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_strict_readv,
.write_iter = cifs_strict_writev,
.open = cifs_open,
@@ -947,8 +945,6 @@ const struct file_operations cifs_file_strict_ops = {
const struct file_operations cifs_file_direct_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_user_readv,
.write_iter = cifs_user_writev,
.open = cifs_open,
@@ -967,8 +963,6 @@ const struct file_operations cifs_file_direct_ops = {
};
const struct file_operations cifs_file_nobrl_ops = {
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
.open = cifs_open,
@@ -986,8 +980,6 @@ const struct file_operations cifs_file_nobrl_ops = {
};
const struct file_operations cifs_file_strict_nobrl_ops = {
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_strict_readv,
.write_iter = cifs_strict_writev,
.open = cifs_open,
@@ -1006,8 +998,6 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
const struct file_operations cifs_file_direct_nobrl_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = cifs_user_readv,
.write_iter = cifs_user_writev,
.open = cifs_open,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 252f5c15806b..a782b22904e4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
#endif
/* Functions related to symlinks */
-extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
int buflen);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 22b289a3b1c4..b406a32deb1f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -171,6 +171,10 @@ enum smb_version {
Smb_21,
Smb_30,
Smb_302,
+#ifdef CONFIG_CIFS_SMB311
+ Smb_311,
+#endif /* SMB311 */
+ Smb_version_err
};
struct mid_q_entry;
@@ -368,6 +372,8 @@ struct smb_version_operations {
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+ int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon,
+ struct cifsFileInfo *src_file);
int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const unsigned char *,
char *, unsigned int *);
@@ -386,6 +392,9 @@ struct smb_version_operations {
int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
struct cifsFileInfo *target_file, u64 src_off, u64 len,
u64 dest_off);
+ int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src,
+ struct cifsFileInfo *target_file, u64 src_off, u64 len,
+ u64 dest_off);
int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
const unsigned char *, const unsigned char *, char *,
@@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values;
#define SMB302_VERSION_STRING "3.02"
/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
extern struct smb_version_values smb302_values;
+#define SMB311_VERSION_STRING "3.1.1"
+#define ALT_SMB311_VERSION_STRING "3.11"
+extern struct smb_version_operations smb311_operations;
+extern struct smb_version_values smb311_values;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 5f9822ac0245..47b030da0781 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2255,6 +2255,8 @@ typedef struct {
/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
+#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */
+#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */
#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
@@ -2310,6 +2312,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
char FileName[1];
} __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */
+typedef struct {
+ __le64 AllocationSize;
+ __le64 EndOfFile; /* size ie offset to first free byte in file */
+ __le32 NumberOfLinks; /* hard links */
+ __u8 DeletePending;
+ __u8 Directory;
+ __u16 Pad;
+} __attribute__((packed)) FILE_STANDARD_INFO; /* level 0x102 QPathInfo */
+
+
/* defines for enumerating possible values of the Unix type field below */
#define UNIX_FILE 0
#define UNIX_DIR 1
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c31ce98c1704..c63fd1dde25b 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -361,11 +361,11 @@ extern int CIFSUnixCreateHardLink(const unsigned int xid,
extern int CIFSUnixCreateSymLink(const unsigned int xid,
struct cifs_tcon *tcon,
const char *fromName, const char *toName,
- const struct nls_table *nls_codepage);
+ const struct nls_table *nls_codepage, int remap);
extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
struct cifs_tcon *tcon,
const unsigned char *searchName, char **syminfo,
- const struct nls_table *nls_codepage);
+ const struct nls_table *nls_codepage, int remap);
extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, char **symlinkinfo,
const struct nls_table *nls_codepage);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index fa13d5e79f64..672ef35c9f73 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
- } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
- server->capabilities & CAP_EXTENDED_SECURITY) &&
- (pSMBr->EncryptionKeyLength == 0)) {
+ } else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
+ server->capabilities & CAP_EXTENDED_SECURITY) {
server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
rc = decode_ext_sec_blob(ses, pSMBr);
} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
@@ -1898,7 +1897,7 @@ static void
cifs_writev_requeue(struct cifs_writedata *wdata)
{
int i, rc = 0;
- struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct inode *inode = d_inode(wdata->cfile->dentry);
struct TCP_Server_Info *server;
unsigned int rest_len;
@@ -1981,7 +1980,7 @@ cifs_writev_complete(struct work_struct *work)
{
struct cifs_writedata *wdata = container_of(work,
struct cifs_writedata, work);
- struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct inode *inode = d_inode(wdata->cfile->dentry);
int i = 0;
if (wdata->result == 0) {
@@ -2784,7 +2783,7 @@ copyRetry:
int
CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon,
const char *fromName, const char *toName,
- const struct nls_table *nls_codepage)
+ const struct nls_table *nls_codepage, int remap)
{
TRANSACTION2_SPI_REQ *pSMB = NULL;
TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -2804,9 +2803,9 @@ createSymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
- /* find define for this maxpathcomponent */
- PATH_MAX, nls_codepage);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2828,9 +2827,9 @@ createSymLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifsConvertToUTF16((__le16 *) data_offset, toName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage, remap);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3034,7 +3033,7 @@ winCreateHardLinkRetry:
int
CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
const unsigned char *searchName, char **symlinkinfo,
- const struct nls_table *nls_codepage)
+ const struct nls_table *nls_codepage, int remap)
{
/* SMB_QUERY_FILE_UNIX_LINK */
TRANSACTION2_QPI_REQ *pSMB = NULL;
@@ -3055,8 +3054,9 @@ querySymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4917,7 +4917,7 @@ getDFSRetry:
strncpy(pSMB->RequestFileName, search_name, name_len);
}
- if (ses->server && ses->server->sign)
+ if (ses->server->sign)
pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
pSMB->hdr.Uid = ses->Suid;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 480cf9c81d50..773f4dc77630 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = {
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
{ Smb_302, SMB302_VERSION_STRING },
+#ifdef CONFIG_CIFS_SMB311
+ { Smb_311, SMB311_VERSION_STRING },
+ { Smb_311, ALT_SMB311_VERSION_STRING },
+#endif /* SMB311 */
+ { Smb_version_err, NULL }
};
static int ip_connect(struct TCP_Server_Info *server);
@@ -386,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
rc = generic_ip_connect(server);
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
+ mutex_unlock(&server->srv_mutex);
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -393,8 +399,8 @@ cifs_reconnect(struct TCP_Server_Info *server)
if (server->tcpStatus != CifsExiting)
server->tcpStatus = CifsNeedNegotiate;
spin_unlock(&GlobalMid_Lock);
+ mutex_unlock(&server->srv_mutex);
}
- mutex_unlock(&server->srv_mutex);
} while (server->tcpStatus == CifsNeedReconnect);
return rc;
@@ -773,8 +779,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
length = atomic_dec_return(&tcpSesAllocCount);
if (length > 0)
- mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
- GFP_KERNEL);
+ mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
}
static int
@@ -848,8 +853,7 @@ cifs_demultiplex_thread(void *p)
length = atomic_inc_return(&tcpSesAllocCount);
if (length > 1)
- mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
- GFP_KERNEL);
+ mempool_resize(cifs_req_poolp, length + cifs_min_rcv);
set_freezable();
while (server->tcpStatus != CifsExiting) {
@@ -1134,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->ops = &smb30_operations; /* currently identical with 3.0 */
vol->vals = &smb302_values;
break;
+#ifdef CONFIG_CIFS_SMB311
+ case Smb_311:
+ vol->ops = &smb311_operations;
+ vol->vals = &smb311_values;
+ break;
+#endif /* SMB311 */
#endif
default:
cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
@@ -3462,6 +3472,8 @@ try_mount_again:
else if (ses)
cifs_put_smb_ses(ses);
+ cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
+
free_xid(xid);
}
#endif
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index b72bc29cba23..c3eb998a99bd 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -620,8 +620,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
}
rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
if (rc)
goto mknod_out;
@@ -745,13 +744,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
goto lookup_out;
}
- if (direntry->d_inode != NULL) {
+ if (d_really_is_positive(direntry)) {
cifs_dbg(FYI, "non-NULL inode in lookup\n");
} else {
cifs_dbg(FYI, "NULL inode in lookup\n");
}
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n",
- full_path, direntry->d_inode);
+ full_path, d_inode(direntry));
if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newInode, full_path,
@@ -792,7 +791,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (direntry->d_inode) {
+ if (d_really_is_positive(direntry)) {
if (cifs_revalidate_dentry(direntry))
return 0;
else {
@@ -803,7 +802,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
* attributes will have been updated by
* cifs_revalidate_dentry().
*/
- if (IS_AUTOMOUNT(direntry->d_inode) &&
+ if (IS_AUTOMOUNT(d_inode(direntry)) &&
!(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) {
spin_lock(&direntry->d_lock);
direntry->d_flags |= DCACHE_NEED_AUTOMOUNT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca30c391a894..3f50cee79df9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -140,8 +140,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
posix_flags = cifs_posix_convert_flags(f_flags);
rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data,
poplock, full_path, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_put_tlink(tlink);
if (rc)
@@ -273,7 +272,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifsFileInfo *cfile;
struct cifs_fid_locks *fdlocks;
@@ -357,7 +356,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
- struct inode *inode = cifs_file->dentry->d_inode;
+ struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
@@ -386,7 +385,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
if (list_empty(&cifsi->openFileList)) {
cifs_dbg(FYI, "closing last open instance for inode %p\n",
- cifs_file->dentry->d_inode);
+ d_inode(cifs_file->dentry));
/*
* In strict cache mode we need invalidate mapping on the last
* close because it may cause a error when we open this file
@@ -572,7 +571,7 @@ static int
cifs_relock_file(struct cifsFileInfo *cfile)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
@@ -620,7 +619,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
return rc;
}
- inode = cfile->dentry->d_inode;
+ inode = d_inode(cfile->dentry);
cifs_sb = CIFS_SB(inode->i_sb);
tcon = tlink_tcon(cfile->tlink);
server = tcon->ses->server;
@@ -874,7 +873,7 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
{
bool rc = false;
struct cifs_fid_locks *cur;
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
list_for_each_entry(cur, &cinode->llist, llist) {
rc = cifs_find_fid_lock_conflict(cur, offset, length, type,
@@ -899,7 +898,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
{
int rc = 0;
struct cifsLockInfo *conf_lock;
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
bool exist;
@@ -927,7 +926,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
static void
cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
down_write(&cinode->lock_sem);
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
@@ -944,7 +943,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
bool wait)
{
struct cifsLockInfo *conf_lock;
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
bool exist;
int rc = 0;
@@ -1125,7 +1124,7 @@ struct lock_to_push {
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
- struct inode *inode = cfile->dentry->d_inode;
+ struct inode *inode = d_inode(cfile->dentry);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock;
struct file_lock_context *flctx = inode->i_flctx;
@@ -1214,7 +1213,7 @@ static int
cifs_push_locks(struct cifsFileInfo *cfile)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
@@ -1382,7 +1381,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
unsigned int max_num, num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
__u64 length = 1 + flock->fl_end - flock->fl_start;
struct list_head tmp_llist;
@@ -1488,7 +1487,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
- struct inode *inode = cfile->dentry->d_inode;
+ struct inode *inode = d_inode(cfile->dentry);
if (posix_lck) {
int posix_lock_type;
@@ -1553,8 +1552,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX)
- posix_lock_file_wait(file, flock);
+ if (flock->fl_flags & FL_POSIX && !rc)
+ rc = posix_lock_file_wait(file, flock);
return rc;
}
@@ -1643,7 +1642,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
struct TCP_Server_Info *server;
unsigned int xid;
struct dentry *dentry = open_file->dentry;
- struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode);
+ struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry));
struct cifs_io_parms io_parms;
cifs_sb = CIFS_SB(dentry->d_sb);
@@ -1676,7 +1675,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
break;
}
- len = min(server->ops->wp_retry_size(dentry->d_inode),
+ len = min(server->ops->wp_retry_size(d_inode(dentry)),
(unsigned int)write_size - total_written);
/* iov[0] is reserved for smb header */
iov[1].iov_base = (char *)write_data + total_written;
@@ -1696,9 +1695,9 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
return rc;
}
} else {
- spin_lock(&dentry->d_inode->i_lock);
+ spin_lock(&d_inode(dentry)->i_lock);
cifs_update_eof(cifsi, *offset, bytes_written);
- spin_unlock(&dentry->d_inode->i_lock);
+ spin_unlock(&d_inode(dentry)->i_lock);
*offset += bytes_written;
}
}
@@ -1706,12 +1705,12 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
cifs_stats_bytes_written(tcon, total_written);
if (total_written > 0) {
- spin_lock(&dentry->d_inode->i_lock);
- if (*offset > dentry->d_inode->i_size)
- i_size_write(dentry->d_inode, *offset);
- spin_unlock(&dentry->d_inode->i_lock);
+ spin_lock(&d_inode(dentry)->i_lock);
+ if (*offset > d_inode(dentry)->i_size)
+ i_size_write(d_inode(dentry), *offset);
+ spin_unlock(&d_inode(dentry)->i_lock);
}
- mark_inode_dirty_sync(dentry->d_inode);
+ mark_inode_dirty_sync(d_inode(dentry));
free_xid(xid);
return total_written;
}
@@ -2406,7 +2405,7 @@ cifs_uncached_writev_complete(struct work_struct *work)
{
struct cifs_writedata *wdata = container_of(work,
struct cifs_writedata, work);
- struct inode *inode = wdata->cfile->dentry->d_inode;
+ struct inode *inode = d_inode(wdata->cfile->dentry);
struct cifsInodeInfo *cifsi = CIFS_I(inode);
spin_lock(&inode->i_lock);
@@ -2560,10 +2559,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
return rc;
}
-static ssize_t
-cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
{
- size_t len;
+ struct file *file = iocb->ki_filp;
ssize_t total_written = 0;
struct cifsFileInfo *open_file;
struct cifs_tcon *tcon;
@@ -2573,15 +2571,15 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
struct iov_iter saved_from;
int rc;
- len = iov_iter_count(from);
- rc = generic_write_checks(file, poffset, &len, 0);
- if (rc)
- return rc;
-
- if (!len)
- return 0;
+ /*
+ * BB - optimize the way when signing is disabled. We can drop this
+ * extra memory-to-memory copying and use iovec buffers for constructing
+ * write request.
+ */
- iov_iter_truncate(from, len);
+ rc = generic_write_checks(iocb, from);
+ if (rc <= 0)
+ return rc;
INIT_LIST_HEAD(&wdata_list);
cifs_sb = CIFS_FILE_SB(file);
@@ -2593,8 +2591,8 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
memcpy(&saved_from, from, sizeof(struct iov_iter));
- rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
- &wdata_list);
+ rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
+ open_file, cifs_sb, &wdata_list);
/*
* If at least one write was successfully sent, then discard any rc
@@ -2633,7 +2631,7 @@ restart_loop:
memcpy(&tmp_from, &saved_from,
sizeof(struct iov_iter));
iov_iter_advance(&tmp_from,
- wdata->offset - *poffset);
+ wdata->offset - iocb->ki_pos);
rc = cifs_write_from_iter(wdata->offset,
wdata->bytes, &tmp_from,
@@ -2650,34 +2648,13 @@ restart_loop:
kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
- if (total_written > 0)
- *poffset += total_written;
+ if (unlikely(!total_written))
+ return rc;
+ iocb->ki_pos += total_written;
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags);
cifs_stats_bytes_written(tcon, total_written);
- return total_written ? total_written : (ssize_t)rc;
-}
-
-ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
-{
- ssize_t written;
- struct inode *inode;
- loff_t pos = iocb->ki_pos;
-
- inode = file_inode(iocb->ki_filp);
-
- /*
- * BB - optimize the way when signing is disabled. We can drop this
- * extra memory-to-memory copying and use iovec buffers for constructing
- * write request.
- */
-
- written = cifs_iovec_write(iocb->ki_filp, from, &pos);
- if (written > 0) {
- set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
- iocb->ki_pos = pos;
- }
-
- return written;
+ return total_written;
}
static ssize_t
@@ -2688,8 +2665,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file->f_mapping->host;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
- ssize_t rc = -EACCES;
- loff_t lock_pos = iocb->ki_pos;
+ ssize_t rc;
/*
* We need to hold the sem to be sure nobody modifies lock list
@@ -2697,23 +2673,24 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
*/
down_read(&cinode->lock_sem);
mutex_lock(&inode->i_mutex);
- if (file->f_flags & O_APPEND)
- lock_pos = i_size_read(inode);
- if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
+
+ rc = generic_write_checks(iocb, from);
+ if (rc <= 0)
+ goto out;
+
+ if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, NULL,
- CIFS_WRITE_OP)) {
+ CIFS_WRITE_OP))
rc = __generic_file_write_iter(iocb, from);
- mutex_unlock(&inode->i_mutex);
-
- if (rc > 0) {
- ssize_t err;
+ else
+ rc = -EACCES;
+out:
+ mutex_unlock(&inode->i_mutex);
- err = generic_write_sync(file, iocb->ki_pos - rc, rc);
- if (err < 0)
- rc = err;
- }
- } else {
- mutex_unlock(&inode->i_mutex);
+ if (rc > 0) {
+ ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+ if (err < 0)
+ rc = err;
}
up_read(&cinode->lock_sem);
return rc;
@@ -3816,7 +3793,7 @@ void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
- struct inode *inode = cfile->dentry->d_inode;
+ struct inode *inode = d_inode(cfile->dentry);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -3877,8 +3854,7 @@ void cifs_oplock_break(struct work_struct *work)
* Direct IO is not yet supported in the cached mode.
*/
static ssize_t
-cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos)
+cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
/*
* FIXME
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3e126d7bb2ea..f621b44cb800 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -373,8 +373,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* could have done a find first instead but this returns more info */
rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
cifs_put_tlink(tlink);
if (!rc) {
@@ -402,9 +401,25 @@ int cifs_get_inode_info_unix(struct inode **pinode,
rc = -ENOMEM;
} else {
/* we already have inode, update it */
+
+ /* if uniqueid is different, return error */
+ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+ CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
+ rc = -ESTALE;
+ goto cgiiu_exit;
+ }
+
+ /* if filetype is different, return error */
+ if (unlikely(((*pinode)->i_mode & S_IFMT) !=
+ (fattr.cf_mode & S_IFMT))) {
+ rc = -ESTALE;
+ goto cgiiu_exit;
+ }
+
cifs_fattr_to_inode(*pinode, &fattr);
}
+cgiiu_exit:
return rc;
}
@@ -839,6 +854,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
if (!*inode)
rc = -ENOMEM;
} else {
+ /* we already have inode, update it */
+
+ /* if filetype is different, return error */
+ if (unlikely(((*inode)->i_mode & S_IFMT) !=
+ (fattr.cf_mode & S_IFMT))) {
+ rc = -ESTALE;
+ goto cgii_exit;
+ }
+
cifs_fattr_to_inode(*inode, &fattr);
}
@@ -1067,7 +1091,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
int rc;
struct cifs_fid fid;
struct cifs_open_parms oparms;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
@@ -1196,7 +1220,7 @@ cifs_drop_nlink(struct inode *inode)
}
/*
- * If dentry->d_inode is null (usually meaning the cached dentry
+ * If d_inode(dentry) is null (usually meaning the cached dentry
* is a negative dentry) then we would attempt a standard SMB delete, but
* if that fails we can not attempt the fall back mechanisms on EACCESS
* but will return the EACCESS to the caller. Note that the VFS does not call
@@ -1207,7 +1231,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
int rc = 0;
unsigned int xid;
char *full_path = NULL;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct cifsInodeInfo *cifs_inode;
struct super_block *sb = dir->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1551,13 +1575,13 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
cifs_put_tlink(tlink);
if (!rc) {
- spin_lock(&direntry->d_inode->i_lock);
- i_size_write(direntry->d_inode, 0);
- clear_nlink(direntry->d_inode);
- spin_unlock(&direntry->d_inode->i_lock);
+ spin_lock(&d_inode(direntry)->i_lock);
+ i_size_write(d_inode(direntry), 0);
+ clear_nlink(d_inode(direntry));
+ spin_unlock(&d_inode(direntry)->i_lock);
}
- cifsInode = CIFS_I(direntry->d_inode);
+ cifsInode = CIFS_I(d_inode(direntry));
/* force revalidate to go get info when needed */
cifsInode->time = 0;
@@ -1568,7 +1592,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
*/
cifsInode->time = 0;
- direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
+ d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
current_fs_time(inode->i_sb);
rmdir_exit:
@@ -1727,7 +1751,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
unlink_target:
/* Try unlinking the target dentry if it's not negative */
- if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
+ if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) {
if (d_is_dir(target_dentry))
tmprc = cifs_rmdir(target_dir, target_dentry);
else
@@ -1867,7 +1891,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
{
unsigned int xid;
int rc = 0;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
char *full_path = NULL;
@@ -1919,7 +1943,7 @@ int cifs_revalidate_file(struct file *filp)
int cifs_revalidate_dentry(struct dentry *dentry)
{
int rc;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
rc = cifs_revalidate_dentry_attr(dentry);
if (rc)
@@ -1933,7 +1957,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int rc;
/*
@@ -2110,7 +2134,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
int rc;
unsigned int xid;
char *full_path = NULL;
- struct inode *inode = direntry->d_inode;
+ struct inode *inode = d_inode(direntry);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink;
@@ -2215,8 +2239,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
pTcon = tlink_tcon(tlink);
rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cifs_remap(cifs_sb));
cifs_put_tlink(tlink);
}
@@ -2251,7 +2274,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
unsigned int xid;
kuid_t uid = INVALID_UID;
kgid_t gid = INVALID_GID;
- struct inode *inode = direntry->d_inode;
+ struct inode *inode = d_inode(direntry);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
char *full_path = NULL;
@@ -2409,7 +2432,7 @@ cifs_setattr_exit:
int
cifs_setattr(struct dentry *direntry, struct iattr *attrs)
{
- struct inode *inode = direntry->d_inode;
+ struct inode *inode = d_inode(direntry);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 8b7898b7670f..49b8b6e41a18 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -31,12 +31,15 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#include <linux/btrfs.h>
#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
- unsigned long srcfd, u64 off, u64 len, u64 destoff)
+ unsigned long srcfd, u64 off, u64 len, u64 destoff,
+ bool dup_extents)
{
int rc;
struct cifsFileInfo *smb_file_target = dst_file->private_data;
@@ -109,9 +112,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
truncate_inode_pages_range(&target_inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len)-1);
- if (target_tcon->ses->server->ops->clone_range)
+ if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
+ rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+ smb_file_src, smb_file_target, off, len, destoff);
+ else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
rc = target_tcon->ses->server->ops->clone_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
+ else
+ rc = -EOPNOTSUPP;
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
@@ -205,7 +213,20 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
}
break;
case CIFS_IOC_COPYCHUNK_FILE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+ rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
+ break;
+ case BTRFS_IOC_CLONE:
+ rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+ break;
+ case CIFS_IOC_SET_INTEGRITY:
+ if (pSMBFile == NULL)
+ break;
+ tcon = tlink_tcon(pSMBFile->tlink);
+ if (tcon->ses->server->ops->set_integrity)
+ rc = tcon->ses->server->ops->set_integrity(xid,
+ tcon, pSMBFile);
+ else
+ rc = -EOPNOTSUPP;
break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2ec6037f61c7..e3548f73bdea 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -586,12 +586,12 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
* if source file is cached (oplocked) revalidate will not go to server
* until the file is closed or oplock broken so update nlinks locally
*/
- if (old_file->d_inode) {
- cifsInode = CIFS_I(old_file->d_inode);
+ if (d_really_is_positive(old_file)) {
+ cifsInode = CIFS_I(d_inode(old_file));
if (rc == 0) {
- spin_lock(&old_file->d_inode->i_lock);
- inc_nlink(old_file->d_inode);
- spin_unlock(&old_file->d_inode->i_lock);
+ spin_lock(&d_inode(old_file)->i_lock);
+ inc_nlink(d_inode(old_file));
+ spin_unlock(&d_inode(old_file)->i_lock);
/*
* parent dir timestamps will update from srv within a
@@ -626,10 +626,10 @@ cifs_hl_exit:
return rc;
}
-void *
-cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
+const char *
+cifs_follow_link(struct dentry *direntry, void **cookie)
{
- struct inode *inode = direntry->d_inode;
+ struct inode *inode = d_inode(direntry);
int rc = -ENOMEM;
unsigned int xid;
char *full_path = NULL;
@@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink)) {
- rc = PTR_ERR(tlink);
- tlink = NULL;
- goto out;
+ free_xid(xid);
+ return ERR_CAST(tlink);
}
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
full_path = build_path_from_dentry(direntry);
- if (!full_path)
- goto out;
+ if (!full_path) {
+ free_xid(xid);
+ cifs_put_tlink(tlink);
+ return ERR_PTR(-ENOMEM);
+ }
cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode);
@@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
&target_path, cifs_sb);
kfree(full_path);
-out:
+ free_xid(xid);
+ cifs_put_tlink(tlink);
if (rc != 0) {
kfree(target_path);
- target_path = ERR_PTR(rc);
+ return ERR_PTR(rc);
}
-
- free_xid(xid);
- if (tlink)
- cifs_put_tlink(tlink);
- nd_set_link(nd, target_path);
- return NULL;
+ return *cookie = target_path;
}
int
@@ -717,7 +715,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls);
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
/* else
rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
cifs_sb_target->local_nls); */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 337946355b29..8442b8b8e0be 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -473,7 +473,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
- pCifsInode = CIFS_I(netfile->dentry->d_inode);
+ pCifsInode = CIFS_I(d_inode(netfile->dentry));
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&pCifsInode->flags);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c295338e0a98..b1eede3678a9 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -78,7 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
{
struct dentry *dentry, *alias;
struct inode *inode;
- struct super_block *sb = parent->d_inode->i_sb;
+ struct super_block *sb = d_inode(parent)->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
@@ -88,8 +88,10 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
return;
if (dentry) {
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (inode) {
+ if (d_mountpoint(dentry))
+ goto out;
/*
* If we're generating inode numbers, then we don't
* want to clobber the existing one with the one that
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d2979036a4c7..fc537c29044e 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -722,7 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
static void
cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
{
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
cfile->fid.netfid = fid->netfid;
cifs_set_oplock_level(cinode, oplock);
cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
@@ -960,7 +960,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
/* Check for unix extensions */
if (cap_unix(tcon->ses)) {
rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
- cifs_sb->local_nls);
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
if (rc == -EREMOTE)
rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
target_path,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 7198eac5dddd..2ab297dae5a7 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -95,7 +95,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
unsigned int max_num, num = 0, max_buf;
struct smb2_lock_element *buf, *cur;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifsLockInfo *li, *tmp;
__u64 length = 1 + flock->fl_end - flock->fl_start;
struct list_head tmp_llist;
@@ -231,7 +231,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
unsigned int xid;
unsigned int max_num, max_buf;
struct smb2_lock_element *buf;
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct cifs_fid_locks *fdlocks;
xid = get_xid();
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 22dfdf17d065..1c5907019045 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -453,7 +453,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
list_for_each(tmp, &tcon->openFileList) {
cfile = list_entry(tmp, struct cifsFileInfo, tlist);
- cinode = CIFS_I(cfile->dentry->d_inode);
+ cinode = CIFS_I(d_inode(cfile->dentry));
if (memcmp(cinode->lease_key, rsp->LeaseKey,
SMB2_LEASE_KEY_SIZE))
@@ -590,7 +590,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
- cinode = CIFS_I(cfile->dentry->d_inode);
+ cinode = CIFS_I(d_inode(cfile->dentry));
if (!CIFS_CACHE_WRITE(cinode) &&
rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index eab05e1aa587..df91bcf56d67 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -524,7 +524,7 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
static void
smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
{
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
cfile->fid.persistent_fid = fid->persistent_fid;
@@ -793,7 +793,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
* If extending file more than one page make sparse. Many Linux fs
* make files sparse by default when extending via ftruncate
*/
- inode = cfile->dentry->d_inode;
+ inode = d_inode(cfile->dentry);
if (!set_alloc && (size > inode->i_size + 8192)) {
__u8 set_sparse = 1;
@@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
+#ifdef CONFIG_CIFS_SMB311
+static int
+smb2_duplicate_extents(const unsigned int xid,
+ struct cifsFileInfo *srcfile,
+ struct cifsFileInfo *trgtfile, u64 src_off,
+ u64 len, u64 dest_off)
+{
+ int rc;
+ unsigned int ret_data_len;
+ char *retbuf = NULL;
+ struct duplicate_extents_to_file dup_ext_buf;
+ struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink);
+
+ /* server fileays advertise duplicate extent support with this flag */
+ if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) &
+ FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0)
+ return -EOPNOTSUPP;
+
+ dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid;
+ dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid;
+ dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off);
+ dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off);
+ dup_ext_buf.ByteCount = cpu_to_le64(len);
+ cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld",
+ src_off, dest_off, len);
+
+ rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
+ if (rc)
+ goto duplicate_extents_out;
+
+ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+ trgtfile->fid.volatile_fid,
+ FSCTL_DUPLICATE_EXTENTS_TO_FILE,
+ true /* is_fsctl */, (char *)&dup_ext_buf,
+ sizeof(struct duplicate_extents_to_file),
+ (char **)&retbuf,
+ &ret_data_len);
+
+ if (ret_data_len > 0)
+ cifs_dbg(FYI, "non-zero response length in duplicate extents");
+
+duplicate_extents_out:
+ return rc;
+}
+#endif /* CONFIG_CIFS_SMB311 */
+
+
static int
smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile)
@@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
}
static int
+smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile)
+{
+ struct fsctl_set_integrity_information_req integr_info;
+ char *retbuf = NULL;
+ unsigned int ret_data_len;
+
+ integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED);
+ integr_info.Flags = 0;
+ integr_info.Reserved = 0;
+
+ return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FSCTL_SET_INTEGRITY_INFORMATION,
+ true /* is_fsctl */, (char *)&integr_info,
+ sizeof(struct fsctl_set_integrity_information_req),
+ (char **)&retbuf,
+ &ret_data_len);
+
+}
+
+static int
smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
const char *path, struct cifs_sb_info *cifs_sb,
struct cifs_fid *fid, __u16 search_flags,
@@ -1032,7 +1101,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = cfile->dentry->d_inode;
+ inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
/* if file not oplocked can't be sure whether asking to extend size */
@@ -1083,7 +1152,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = cfile->dentry->d_inode;
+ inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
/* Need to make file sparse, if not already, before freeing range. */
@@ -1115,7 +1184,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = cfile->dentry->d_inode;
+ inode = d_inode(cfile->dentry);
cifsi = CIFS_I(inode);
/* if file not oplocked can't be sure whether asking to extend size */
@@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = {
.new_lease_key = smb2_new_lease_key,
.generate_signingkey = generate_smb3signingkey,
.calc_signature = smb3_calc_signature,
+ .set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
.set_oplock_level = smb3_set_oplock_level,
.create_lease_buf = smb3_create_lease_buf,
@@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = {
.fallocate = smb3_fallocate,
};
+#ifdef CONFIG_CIFS_SMB311
+struct smb_version_operations smb311_operations = {
+ .compare_fids = smb2_compare_fids,
+ .setup_request = smb2_setup_request,
+ .setup_async_request = smb2_setup_async_request,
+ .check_receive = smb2_check_receive,
+ .add_credits = smb2_add_credits,
+ .set_credits = smb2_set_credits,
+ .get_credits_field = smb2_get_credits_field,
+ .get_credits = smb2_get_credits,
+ .wait_mtu_credits = smb2_wait_mtu_credits,
+ .get_next_mid = smb2_get_next_mid,
+ .read_data_offset = smb2_read_data_offset,
+ .read_data_length = smb2_read_data_length,
+ .map_error = map_smb2_to_linux_error,
+ .find_mid = smb2_find_mid,
+ .check_message = smb2_check_message,
+ .dump_detail = smb2_dump_detail,
+ .clear_stats = smb2_clear_stats,
+ .print_stats = smb2_print_stats,
+ .dump_share_caps = smb2_dump_share_caps,
+ .is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
+ .need_neg = smb2_need_neg,
+ .negotiate = smb2_negotiate,
+ .negotiate_wsize = smb2_negotiate_wsize,
+ .negotiate_rsize = smb2_negotiate_rsize,
+ .sess_setup = SMB2_sess_setup,
+ .logoff = SMB2_logoff,
+ .tree_connect = SMB2_tcon,
+ .tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb3_qfs_tcon,
+ .is_path_accessible = smb2_is_path_accessible,
+ .can_echo = smb2_can_echo,
+ .echo = SMB2_echo,
+ .query_path_info = smb2_query_path_info,
+ .get_srv_inum = smb2_get_srv_inum,
+ .query_file_info = smb2_query_file_info,
+ .set_path_size = smb2_set_path_size,
+ .set_file_size = smb2_set_file_size,
+ .set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
+ .mkdir = smb2_mkdir,
+ .mkdir_setinfo = smb2_mkdir_setinfo,
+ .rmdir = smb2_rmdir,
+ .unlink = smb2_unlink,
+ .rename = smb2_rename_path,
+ .create_hardlink = smb2_create_hardlink,
+ .query_symlink = smb2_query_symlink,
+ .query_mf_symlink = smb3_query_mf_symlink,
+ .create_mf_symlink = smb3_create_mf_symlink,
+ .open = smb2_open_file,
+ .set_fid = smb2_set_fid,
+ .close = smb2_close_file,
+ .flush = smb2_flush_file,
+ .async_readv = smb2_async_readv,
+ .async_writev = smb2_async_writev,
+ .sync_read = smb2_sync_read,
+ .sync_write = smb2_sync_write,
+ .query_dir_first = smb2_query_dir_first,
+ .query_dir_next = smb2_query_dir_next,
+ .close_dir = smb2_close_dir,
+ .calc_smb_size = smb2_calc_size,
+ .is_status_pending = smb2_is_status_pending,
+ .oplock_response = smb2_oplock_response,
+ .queryfs = smb2_queryfs,
+ .mand_lock = smb2_mand_lock,
+ .mand_unlock_range = smb2_unlock_range,
+ .push_mand_locks = smb2_push_mandatory_locks,
+ .get_lease_key = smb2_get_lease_key,
+ .set_lease_key = smb2_set_lease_key,
+ .new_lease_key = smb2_new_lease_key,
+ .generate_signingkey = generate_smb3signingkey,
+ .calc_signature = smb3_calc_signature,
+ .set_integrity = smb3_set_integrity,
+ .is_read_op = smb21_is_read_op,
+ .set_oplock_level = smb3_set_oplock_level,
+ .create_lease_buf = smb3_create_lease_buf,
+ .parse_lease_buf = smb3_parse_lease_buf,
+ .clone_range = smb2_clone_range,
+ .duplicate_extents = smb2_duplicate_extents,
+/* .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */
+ .wp_retry_size = smb2_wp_retry_size,
+ .dir_needs_close = smb2_dir_needs_close,
+ .fallocate = smb3_fallocate,
+};
+#endif /* CIFS_SMB311 */
+
struct smb_version_values smb20_values = {
.version_string = SMB20_VERSION_STRING,
.protocol_id = SMB20_PROT_ID,
@@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = {
.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
.create_lease_size = sizeof(struct create_lease_v2),
};
+
+#ifdef CONFIG_CIFS_SMB311
+struct smb_version_values smb311_values = {
+ .version_string = SMB311_VERSION_STRING,
+ .protocol_id = SMB311_PROT_ID,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
+ .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+ .create_lease_size = sizeof(struct create_lease_v2),
+};
+#endif /* SMB311 */
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 65cd7a84c8bc..b8b4f08ee094 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -110,7 +110,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
- if ((tcon->ses) &&
+ if ((tcon->ses) && (tcon->ses->server) &&
(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
hdr->CreditCharge = cpu_to_le16(1);
/* else CreditCharge MBZ */
@@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
return rc;
}
+#ifdef CONFIG_CIFS_SMB311
+/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */
+#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */
+
+
+#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1)
+#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2)
+
+static void
+build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt)
+{
+ pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES;
+ pneg_ctxt->DataLength = cpu_to_le16(38);
+ pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1);
+ pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE);
+ get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE);
+ pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512;
+}
+
+static void
+build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt)
+{
+ pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES;
+ pneg_ctxt->DataLength = cpu_to_le16(6);
+ pneg_ctxt->CipherCount = cpu_to_le16(2);
+ pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM;
+ pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM;
+}
+
+static void
+assemble_neg_contexts(struct smb2_negotiate_req *req)
+{
+
+ /* +4 is to account for the RFC1001 len field */
+ char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4;
+
+ build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
+ /* Add 2 to size to round to 8 byte boundary */
+ pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context);
+ build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
+ req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
+ req->NegotiateContextCount = cpu_to_le16(2);
+ inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2
+ + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */
+}
+#else
+static void assemble_neg_contexts(struct smb2_negotiate_req *req)
+{
+ return;
+}
+#endif /* SMB311 */
+
+
/*
*
* SMB2 Worker functions follow:
@@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* ClientGUID must be zero for SMB2.02 dialect */
if (ses->server->vals->protocol_id == SMB20_PROT_ID)
memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
- else
+ else {
memcpy(req->ClientGUID, server->client_guid,
SMB2_CLIENT_GUID_SIZE);
-
+ if (ses->server->vals->protocol_id == SMB311_PROT_ID)
+ assemble_neg_contexts(req);
+ }
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field */
iov[0].iov_len = get_rfc1002_length(req) + 4;
@@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
+#ifdef CONFIG_CIFS_SMB311
+ else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n");
+#endif /* SMB311 */
else {
- cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
+ cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n",
le16_to_cpu(rsp->DialectRevision));
rc = -EIO;
goto neg_exit;
@@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate:
return rc;
req->hdr.SessionId = 0; /* First session, not a reauthenticate */
- req->VcNumber = 0; /* MBZ */
+ req->Flags = 0; /* MBZ */
/* to enable echos and oplocks */
req->hdr.CreditRequest = cpu_to_le16(3);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 70867d54fb8b..451108284a2f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -136,9 +136,6 @@ struct smb2_transform_hdr {
__u64 SessionId;
} __packed;
-/* Encryption Algorithms */
-#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
-
/*
* SMB2 flag definitions
*/
@@ -191,7 +188,10 @@ struct smb2_negotiate_req {
__le16 Reserved; /* MBZ */
__le32 Capabilities;
__u8 ClientGUID[SMB2_CLIENT_GUID_SIZE];
- __le64 ClientStartTime; /* MBZ */
+ /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */
+ __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */
+ __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */
+ __le16 Reserved2;
__le16 Dialects[1]; /* One dialect (vers=) at a time for now */
} __packed;
@@ -200,6 +200,7 @@ struct smb2_negotiate_req {
#define SMB21_PROT_ID 0x0210
#define SMB30_PROT_ID 0x0300
#define SMB302_PROT_ID 0x0302
+#define SMB311_PROT_ID 0x0311
#define BAD_PROT_ID 0xFFFF
/* SecurityMode flags */
@@ -217,12 +218,38 @@ struct smb2_negotiate_req {
#define SMB2_NT_FIND 0x00100000
#define SMB2_LARGE_FILES 0x00200000
+#define SMB311_SALT_SIZE 32
+/* Hash Algorithm Types */
+#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001)
+
+struct smb2_preauth_neg_context {
+ __le16 ContextType; /* 1 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 HashAlgorithmCount; /* 1 */
+ __le16 SaltLength;
+ __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */
+ __u8 Salt[SMB311_SALT_SIZE];
+} __packed;
+
+/* Encryption Algorithms Ciphers */
+#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001)
+#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002)
+
+struct smb2_encryption_neg_context {
+ __le16 ContextType; /* 2 */
+ __le16 DataLength;
+ __le32 Reserved;
+ __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */
+ __le16 Ciphers[2]; /* Ciphers[0] since only one used now */
+} __packed;
+
struct smb2_negotiate_rsp {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 65 */
__le16 SecurityMode;
__le16 DialectRevision;
- __le16 Reserved; /* MBZ */
+ __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */
__u8 ServerGUID[16];
__le32 Capabilities;
__le32 MaxTransactSize;
@@ -232,14 +259,18 @@ struct smb2_negotiate_rsp {
__le64 ServerStartTime;
__le16 SecurityBufferOffset;
__le16 SecurityBufferLength;
- __le32 Reserved2; /* may be any value, ignore */
+ __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */
__u8 Buffer[1]; /* variable length GSS security buffer */
} __packed;
+/* Flags */
+#define SMB2_SESSION_REQ_FLAG_BINDING 0x01
+#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04
+
struct smb2_sess_setup_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 25 */
- __u8 VcNumber;
+ __u8 Flags;
__u8 SecurityMode;
__le32 Capabilities;
__le32 Channel;
@@ -274,10 +305,13 @@ struct smb2_logoff_rsp {
__le16 Reserved;
} __packed;
+/* Flags/Reserved for SMB3.1.1 */
+#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001
+
struct smb2_tree_connect_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
- __le16 Reserved;
+ __le16 Reserved; /* Flags in SMB3.1.1 */
__le16 PathOffset;
__le16 PathLength;
__u8 Buffer[1]; /* variable length */
@@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp {
__le32 TotalBytesWritten;
} __packed;
+struct fsctl_set_integrity_information_req {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+} __packed;
+
+struct fsctl_get_integrity_information_rsp {
+ __le16 ChecksumAlgorithm;
+ __le16 Reserved;
+ __le32 Flags;
+ __le32 ChecksumChunkSizeInBytes;
+ __le32 ClusterSizeInBytes;
+} __packed;
+
+/* Integrity ChecksumAlgorithm choices for above */
+#define CHECKSUM_TYPE_NONE 0x0000
+#define CHECKSUM_TYPE_CRC64 0x0002
+#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */
+
+/* Integrity flags for above */
+#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+
+
struct validate_negotiate_info_req {
__le32 Capabilities;
__u8 Guid[SMB2_CLIENT_GUID_SIZE];
@@ -620,6 +677,14 @@ struct compress_ioctl {
__le16 CompressionState; /* See cifspdu.h for possible flag values */
} __packed;
+struct duplicate_extents_to_file {
+ __u64 PersistentFileHandle; /* source file handle, opaque endianness */
+ __u64 VolatileFileHandle;
+ __le64 SourceFileOffset;
+ __le64 TargetFileOffset;
+ __le64 ByteCount; /* Bytes to be copied */
+} __packed;
+
struct smb2_ioctl_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 83efa59535be..a639d0dab453 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -75,10 +75,13 @@
#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */
#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */
#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C
#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
+#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344
#define FSCTL_SIS_LINK_FILES 0x0009C104
+#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280
#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
/* strange that the number for this op is not sequential with previous op */
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 72a4d10653d6..ff9e1f8b16a4 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -50,9 +50,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
if (direntry == NULL)
return -EIO;
- if (direntry->d_inode == NULL)
+ if (d_really_is_negative(direntry))
return -EIO;
- sb = direntry->d_inode->i_sb;
+ sb = d_inode(direntry)->i_sb;
if (sb == NULL)
return -EIO;
@@ -111,9 +111,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
if (direntry == NULL)
return -EIO;
- if (direntry->d_inode == NULL)
+ if (d_really_is_negative(direntry))
return -EIO;
- sb = direntry->d_inode->i_sb;
+ sb = d_inode(direntry)->i_sb;
if (sb == NULL)
return -EIO;
@@ -177,12 +177,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
memcpy(pacl, ea_value, value_size);
if (pTcon->ses->server->ops->set_acl)
rc = pTcon->ses->server->ops->set_acl(pacl,
- value_size, direntry->d_inode,
+ value_size, d_inode(direntry),
full_path, CIFS_ACL_DACL);
else
rc = -EOPNOTSUPP;
if (rc == 0) /* force revalidate of the inode */
- CIFS_I(direntry->d_inode)->time = 0;
+ CIFS_I(d_inode(direntry))->time = 0;
kfree(pacl);
}
#else
@@ -246,9 +246,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
if (direntry == NULL)
return -EIO;
- if (direntry->d_inode == NULL)
+ if (d_really_is_negative(direntry))
return -EIO;
- sb = direntry->d_inode->i_sb;
+ sb = d_inode(direntry)->i_sb;
if (sb == NULL)
return -EIO;
@@ -324,7 +324,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
goto get_ea_exit; /* rc already EOPNOTSUPP */
pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
- direntry->d_inode, full_path, &acllen);
+ d_inode(direntry), full_path, &acllen);
if (IS_ERR(pacl)) {
rc = PTR_ERR(pacl);
cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
@@ -382,9 +382,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
if (direntry == NULL)
return -EIO;
- if (direntry->d_inode == NULL)
+ if (d_really_is_negative(direntry))
return -EIO;
- sb = direntry->d_inode->i_sb;
+ sb = d_inode(direntry)->i_sb;
if (sb == NULL)
return -EIO;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 46ee6f238985..5bb630a769e0 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -94,8 +94,8 @@ static void coda_flag_children(struct dentry *parent, int flag)
spin_lock(&parent->d_lock);
list_for_each_entry(de, &parent->d_subdirs, d_child) {
/* don't know what to do with negative dentries */
- if (de->d_inode )
- coda_flag_inode(de->d_inode, flag);
+ if (d_inode(de) )
+ coda_flag_inode(d_inode(de), flag);
}
spin_unlock(&parent->d_lock);
return;
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index d6f7a76a1f5b..f829fe963f5b 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -79,7 +79,7 @@ void coda_sysctl_clean(void);
static inline struct coda_inode_info *ITOC(struct inode *inode)
{
- return list_entry(inode, struct coda_inode_info, vfs_inode);
+ return container_of(inode, struct coda_inode_info, vfs_inode);
}
static __inline__ struct CodaFid *coda_i2f(struct inode *inode)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 60cb88c1dd2b..fda9f4311212 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -201,7 +201,7 @@ err_out:
static int coda_link(struct dentry *source_de, struct inode *dir_inode,
struct dentry *de)
{
- struct inode *inode = source_de->d_inode;
+ struct inode *inode = d_inode(source_de);
const char * name = de->d_name.name;
int len = de->d_name.len;
int error;
@@ -266,7 +266,7 @@ static int coda_unlink(struct inode *dir, struct dentry *de)
return error;
coda_dir_update_mtime(dir);
- drop_nlink(de->d_inode);
+ drop_nlink(d_inode(de));
return 0;
}
@@ -279,8 +279,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
if (!error) {
/* VFS may delete the child */
- if (de->d_inode)
- clear_nlink(de->d_inode);
+ if (d_really_is_positive(de))
+ clear_nlink(d_inode(de));
/* fix the link count of the parent */
coda_dir_drop_nlink(dir);
@@ -303,14 +303,14 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
if (!error) {
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
if (d_is_dir(new_dentry)) {
coda_dir_drop_nlink(old_dir);
coda_dir_inc_nlink(new_dir);
}
coda_dir_update_mtime(old_dir);
coda_dir_update_mtime(new_dir);
- coda_flag_inode(new_dentry->d_inode, C_VATTR);
+ coda_flag_inode(d_inode(new_dentry), C_VATTR);
} else {
coda_flag_inode(old_dir, C_VATTR);
coda_flag_inode(new_dir, C_VATTR);
@@ -449,13 +449,13 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = de->d_inode;
+ inode = d_inode(de);
if (!inode || is_root_inode(inode))
goto out;
if (is_bad_inode(inode))
goto bad;
- cii = ITOC(de->d_inode);
+ cii = ITOC(d_inode(de));
if (!(cii->c_flags & (C_PURGE | C_FLUSH)))
goto out;
@@ -487,11 +487,11 @@ static int coda_dentry_delete(const struct dentry * dentry)
{
int flags;
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
return 0;
- flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE;
- if (is_bad_inode(dentry->d_inode) || flags) {
+ flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE;
+ if (is_bad_inode(d_inode(dentry)) || flags) {
return 1;
}
return 0;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index d244d743a232..1da3805f3ddc 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -27,19 +27,14 @@
#include "coda_int.h"
static ssize_t
-coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *ppos)
+coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct coda_file_info *cfi;
- struct file *host_file;
+ struct file *coda_file = iocb->ki_filp;
+ struct coda_file_info *cfi = CODA_FTOC(coda_file);
- cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- host_file = cfi->cfi_container;
- if (!host_file->f_op->read)
- return -EINVAL;
-
- return host_file->f_op->read(host_file, buf, count, ppos);
+ return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos);
}
static ssize_t
@@ -64,32 +59,25 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
}
static ssize_t
-coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos)
+coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct inode *host_inode, *coda_inode = file_inode(coda_file);
- struct coda_file_info *cfi;
+ struct file *coda_file = iocb->ki_filp;
+ struct inode *coda_inode = file_inode(coda_file);
+ struct coda_file_info *cfi = CODA_FTOC(coda_file);
struct file *host_file;
ssize_t ret;
- cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- host_file = cfi->cfi_container;
-
- if (!host_file->f_op->write)
- return -EINVAL;
- host_inode = file_inode(host_file);
+ host_file = cfi->cfi_container;
file_start_write(host_file);
mutex_lock(&coda_inode->i_mutex);
-
- ret = host_file->f_op->write(host_file, buf, count, ppos);
-
- coda_inode->i_size = host_inode->i_size;
+ ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
+ coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
mutex_unlock(&coda_inode->i_mutex);
file_end_write(host_file);
-
return ret;
}
@@ -231,8 +219,8 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
const struct file_operations coda_file_operations = {
.llseek = generic_file_llseek,
- .read = coda_file_read,
- .write = coda_file_write,
+ .read_iter = coda_file_read_iter,
+ .write_iter = coda_file_write_iter,
.mmap = coda_file_mmap,
.open = coda_open,
.release = coda_release,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 82ec68b59208..cac1390b87a3 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -257,15 +257,15 @@ static void coda_evict_inode(struct inode *inode)
int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- int err = coda_revalidate_inode(dentry->d_inode);
+ int err = coda_revalidate_inode(d_inode(dentry));
if (!err)
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
return err;
}
int coda_setattr(struct dentry *de, struct iattr *iattr)
{
- struct inode *inode = de->d_inode;
+ struct inode *inode = d_inode(de);
struct coda_vattr vattr;
int error;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 4326d172fc27..f36a4040afb8 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -72,7 +72,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd,
if (error)
return error;
- target_inode = path.dentry->d_inode;
+ target_inode = d_inode(path.dentry);
/* return if it is not a Coda inode */
if (target_inode->i_sb != inode->i_sb) {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 5bb6e27298a4..9b1ffaa0572e 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -820,8 +820,8 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out)
case CODA_FLUSH:
coda_cache_clear_all(sb);
shrink_dcache_sb(sb);
- if (sb->s_root->d_inode)
- coda_flag_inode(sb->s_root->d_inode, C_FLUSH);
+ if (d_really_is_positive(sb->s_root))
+ coda_flag_inode(d_inode(sb->s_root), C_FLUSH);
break;
case CODA_PURGEUSER:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index afec6450450f..48851f6ea6ec 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -570,6 +570,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
#define BNEPCONNDEL _IOW('B', 201, int)
#define BNEPGETCONNLIST _IOR('B', 210, int)
#define BNEPGETCONNINFO _IOR('B', 211, int)
+#define BNEPGETSUPPFEAT _IOR('B', 212, int)
#define CMTPCONNADD _IOW('C', 200, int)
#define CMTPCONNDEL _IOW('C', 201, int)
@@ -895,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ)
/* 'X' - originally XFS but some now in the VFS */
COMPATIBLE_IOCTL(FIFREEZE)
COMPATIBLE_IOCTL(FITHAW)
+COMPATIBLE_IOCTL(FITRIM)
COMPATIBLE_IOCTL(KDGETKEYCODE)
COMPATIBLE_IOCTL(KDSETKEYCODE)
COMPATIBLE_IOCTL(KDGKBTYPE)
@@ -1247,6 +1249,7 @@ COMPATIBLE_IOCTL(BNEPCONNADD)
COMPATIBLE_IOCTL(BNEPCONNDEL)
COMPATIBLE_IOCTL(BNEPGETCONNLIST)
COMPATIBLE_IOCTL(BNEPGETCONNINFO)
+COMPATIBLE_IOCTL(BNEPGETSUPPFEAT)
COMPATIBLE_IOCTL(CMTPCONNADD)
COMPATIBLE_IOCTL(CMTPCONNDEL)
COMPATIBLE_IOCTL(CMTPGETCONNLIST)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cf0db005d2f5..c81ce7f200a6 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -289,7 +289,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry)
configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata);
error = configfs_create(dentry, mode, init_dir);
if (!error) {
- inc_nlink(p->d_inode);
+ inc_nlink(d_inode(p));
item->ci_dentry = dentry;
} else {
struct configfs_dirent *sd = dentry->d_fsdata;
@@ -375,8 +375,8 @@ static void remove_dir(struct dentry * d)
list_del_init(&sd->s_sibling);
spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
- if (d->d_inode)
- simple_rmdir(parent->d_inode,d);
+ if (d_really_is_positive(d))
+ simple_rmdir(d_inode(parent),d);
pr_debug(" o %pd removing done (%d)\n", d, d_count(d));
@@ -513,7 +513,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
/* Abort if racing with mkdir() */
if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
if (wait_mutex)
- *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
+ *wait_mutex = &d_inode(sd->s_dentry)->i_mutex;
return -EAGAIN;
}
@@ -624,13 +624,13 @@ static void detach_groups(struct config_group *group)
child = sd->s_dentry;
- mutex_lock(&child->d_inode->i_mutex);
+ mutex_lock(&d_inode(child)->i_mutex);
configfs_detach_group(sd->s_element);
- child->d_inode->i_flags |= S_DEAD;
+ d_inode(child)->i_flags |= S_DEAD;
dont_mount(child);
- mutex_unlock(&child->d_inode->i_mutex);
+ mutex_unlock(&d_inode(child)->i_mutex);
d_delete(child);
dput(child);
@@ -672,7 +672,7 @@ static int create_default_group(struct config_group *parent_group,
sd = child->d_fsdata;
sd->s_type |= CONFIGFS_USET_DEFAULT;
} else {
- BUG_ON(child->d_inode);
+ BUG_ON(d_inode(child));
d_drop(child);
dput(child);
}
@@ -818,11 +818,11 @@ static int configfs_attach_item(struct config_item *parent_item,
* the VFS may already have hit and used them. Thus,
* we must lock them as rmdir() would.
*/
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
configfs_remove_dir(item);
- dentry->d_inode->i_flags |= S_DEAD;
+ d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
d_delete(dentry);
}
}
@@ -858,16 +858,16 @@ static int configfs_attach_group(struct config_item *parent_item,
* We must also lock the inode to remove it safely in case of
* error, as rmdir() would.
*/
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
configfs_adjust_dir_dirent_depth_before_populate(sd);
ret = populate_groups(to_config_group(item));
if (ret) {
configfs_detach_item(item);
- dentry->d_inode->i_flags |= S_DEAD;
+ d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
}
configfs_adjust_dir_dirent_depth_after_populate(sd);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
if (ret)
d_delete(dentry);
}
@@ -1075,7 +1075,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
* subsystem is really registered, and so we need to lock out
* configfs_[un]register_subsystem().
*/
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
root_sd = root->d_fsdata;
@@ -1111,7 +1111,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
out_unlock_dirent_lock:
spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
/*
* If we succeeded, the fs is pinned via other methods. If not,
@@ -1453,11 +1453,11 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
down_write(&configfs_rename_sem);
parent = item->parent->dentry;
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
if (!IS_ERR(new_dentry)) {
- if (!new_dentry->d_inode) {
+ if (d_really_is_negative(new_dentry)) {
error = config_item_set_name(item, "%s", new_name);
if (!error) {
d_add(new_dentry, NULL);
@@ -1469,7 +1469,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
error = -EEXIST;
dput(new_dentry);
}
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
up_write(&configfs_rename_sem);
return error;
@@ -1482,7 +1482,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
struct configfs_dirent * parent_sd = dentry->d_fsdata;
int err;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
* being attached
@@ -1495,7 +1495,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
else
err = 0;
}
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return err;
}
@@ -1505,11 +1505,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
struct dentry * dentry = file->f_path.dentry;
struct configfs_dirent * cursor = file->private_data;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
spin_lock(&configfs_dirent_lock);
list_del_init(&cursor->s_sibling);
spin_unlock(&configfs_dirent_lock);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
release_configfs_dirent(cursor);
@@ -1567,7 +1567,7 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx)
spin_lock(&configfs_dirent_lock);
dentry = next->s_dentry;
if (dentry)
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (inode)
ino = inode->i_ino;
spin_unlock(&configfs_dirent_lock);
@@ -1590,7 +1590,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry * dentry = file->f_path.dentry;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
switch (whence) {
case 1:
offset += file->f_pos;
@@ -1598,7 +1598,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&file_inode(file)->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -1624,7 +1624,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&configfs_dirent_lock);
}
}
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return offset;
}
@@ -1654,7 +1654,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
sd = root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
err = -ENOMEM;
dentry = d_alloc_name(root, group->cg_item.ci_name);
@@ -1664,7 +1664,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
err = configfs_attach_group(sd->s_element, &group->cg_item,
dentry);
if (err) {
- BUG_ON(dentry->d_inode);
+ BUG_ON(d_inode(dentry));
d_drop(dentry);
dput(dentry);
} else {
@@ -1674,7 +1674,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
}
}
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
if (err) {
unlink_group(group);
@@ -1695,9 +1695,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
return;
}
- mutex_lock_nested(&root->d_inode->i_mutex,
+ mutex_lock_nested(&d_inode(root)->i_mutex,
I_MUTEX_PARENT);
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
@@ -1706,13 +1706,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
spin_unlock(&configfs_dirent_lock);
mutex_unlock(&configfs_symlink_mutex);
configfs_detach_group(&group->cg_item);
- dentry->d_inode->i_flags |= S_DEAD;
+ d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
d_delete(dentry);
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
dput(dentry);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 56d2cdc9ae0a..403269ffcdf3 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -326,10 +326,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
CONFIGFS_ITEM_ATTR);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
return error;
}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5423a6a6ecc8..eae87575e681 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -56,7 +56,7 @@ static const struct inode_operations configfs_inode_operations ={
int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
struct configfs_dirent * sd = dentry->d_fsdata;
struct iattr * sd_iattr;
unsigned int ia_valid = iattr->ia_valid;
@@ -186,7 +186,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
if (!dentry)
return -ENOENT;
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
return -EEXIST;
sd = dentry->d_fsdata;
@@ -194,7 +194,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
if (!inode)
return -ENOMEM;
- p_inode = dentry->d_parent->d_inode;
+ p_inode = d_inode(dentry->d_parent);
p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
configfs_set_inode_lock_class(sd, inode);
@@ -236,11 +236,11 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
if (dentry) {
spin_lock(&dentry->d_lock);
- if (!d_unhashed(dentry) && dentry->d_inode) {
+ if (simple_positive(dentry)) {
dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
- simple_unlink(parent->d_inode, dentry);
+ simple_unlink(d_inode(parent), dentry);
} else
spin_unlock(&dentry->d_lock);
}
@@ -251,11 +251,11 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
struct configfs_dirent * sd;
struct configfs_dirent * parent_sd = dir->d_fsdata;
- if (dir->d_inode == NULL)
+ if (d_really_is_negative(dir))
/* no inode means this hasn't been made visible yet */
return;
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock(&d_inode(dir)->i_mutex);
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (!sd->s_element)
continue;
@@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
break;
}
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e65f9ffbb999..b863a09cd2f1 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref);
* config_item_init - initialize item.
* @item: item in question.
*/
-void config_item_init(struct config_item *item)
+static void config_item_init(struct config_item *item)
{
kref_init(&item->ci_kref);
INIT_LIST_HEAD(&item->ci_entry);
}
-EXPORT_SYMBOL(config_item_init);
/**
* config_item_set_name - Set the name of an item
@@ -116,7 +115,7 @@ void config_item_init_type_name(struct config_item *item,
const char *name,
struct config_item_type *type)
{
- config_item_set_name(item, name);
+ config_item_set_name(item, "%s", name);
item->ci_type = type;
config_item_init(item);
}
@@ -125,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name);
void config_group_init_type_name(struct config_group *group, const char *name,
struct config_item_type *type)
{
- config_item_set_name(&group->cg_item, name);
+ config_item_set_name(&group->cg_item, "%s", name);
group->cg_item.ci_type = type;
config_group_init(group);
}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index da94e41bdbf6..a8f3b589a2df 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -129,8 +129,6 @@ void configfs_release_fs(void)
}
-static struct kobject *config_kobj;
-
static int __init configfs_init(void)
{
int err = -ENOMEM;
@@ -141,8 +139,8 @@ static int __init configfs_init(void)
if (!configfs_dir_cachep)
goto out;
- config_kobj = kobject_create_and_add("config", kernel_kobj);
- if (!config_kobj)
+ err = sysfs_create_mount_point(kernel_kobj, "config");
+ if (err)
goto out2;
err = register_filesystem(&configfs_fs_type);
@@ -152,7 +150,7 @@ static int __init configfs_init(void)
return 0;
out3:
pr_err("Unable to register filesystem!\n");
- kobject_put(config_kobj);
+ sysfs_remove_mount_point(kernel_kobj, "config");
out2:
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
@@ -163,7 +161,7 @@ out:
static void __exit configfs_exit(void)
{
unregister_filesystem(&configfs_fs_type);
- kobject_put(config_kobj);
+ sysfs_remove_mount_point(kernel_kobj, "config");
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
}
@@ -173,5 +171,5 @@ MODULE_LICENSE("GPL");
MODULE_VERSION("0.0.2");
MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
-module_init(configfs_init);
+core_initcall(configfs_init);
module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index cc9f2546ea4a..ec5c8325b503 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,36 +279,27 @@ static int configfs_getlink(struct dentry *dentry, char * path)
}
-static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
{
- int error = -ENOMEM;
unsigned long page = get_zeroed_page(GFP_KERNEL);
+ int error;
- if (page) {
- error = configfs_getlink(dentry, (char *)page);
- if (!error) {
- nd_set_link(nd, (char *)page);
- return (void *)page;
- }
- }
-
- nd_set_link(nd, ERR_PTR(error));
- return NULL;
-}
+ if (!page)
+ return ERR_PTR(-ENOMEM);
-static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
-{
- if (cookie) {
- unsigned long page = (unsigned long)cookie;
- free_page(page);
+ error = configfs_getlink(dentry, (char *)page);
+ if (!error) {
+ return *cookie = (void *)page;
}
+
+ free_page(page);
+ return ERR_PTR(error);
}
const struct inode_operations configfs_symlink_inode_operations = {
.follow_link = configfs_follow_link,
.readlink = generic_readlink,
- .put_link = configfs_put_link,
+ .put_link = free_page_put_link,
.setattr = configfs_setattr,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index f319926ddf8c..c5ecde6f3eed 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -70,7 +70,8 @@ static int expand_corename(struct core_name *cn, int size)
return 0;
}
-static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
+static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
+ va_list arg)
{
int free, need;
va_list arg_copy;
@@ -93,7 +94,7 @@ again:
return -ENOMEM;
}
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
va_list arg;
int ret;
@@ -105,7 +106,8 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...)
return ret;
}
-static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
+static __printf(2, 3)
+int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
int cur = cn->used;
va_list arg;
@@ -138,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
goto put_exe_file;
}
- path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+ path = file_path(exe_file, pathbuf, PATH_MAX);
if (IS_ERR(path)) {
ret = PTR_ERR(path);
goto free_buf;
@@ -209,11 +211,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* uid */
case 'u':
- err = cn_printf(cn, "%d", cred->uid);
+ err = cn_printf(cn, "%u",
+ from_kuid(&init_user_ns,
+ cred->uid));
break;
/* gid */
case 'g':
- err = cn_printf(cn, "%d", cred->gid);
+ err = cn_printf(cn, "%u",
+ from_kgid(&init_user_ns,
+ cred->gid));
break;
case 'd':
err = cn_printf(cn, "%d",
@@ -221,7 +227,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* signal that caused the coredump */
case 's':
- err = cn_printf(cn, "%ld", cprm->siginfo->si_signo);
+ err = cn_printf(cn, "%d",
+ cprm->siginfo->si_signo);
break;
/* UNIX time of coredump */
case 't': {
@@ -657,7 +664,7 @@ void do_coredump(const siginfo_t *siginfo)
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
- if (!cprm.file->f_op->write)
+ if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
goto close_fail;
diff --git a/fs/dax.c b/fs/dax.c
index ed1619ec6537..a7f77e1fa18c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -98,9 +98,9 @@ static bool buffer_size_valid(struct buffer_head *bh)
return bh->b_state != 0;
}
-static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
- loff_t start, loff_t end, get_block_t get_block,
- struct buffer_head *bh)
+static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
+ loff_t start, loff_t end, get_block_t get_block,
+ struct buffer_head *bh)
{
ssize_t retval = 0;
loff_t pos = start;
@@ -109,7 +109,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
void *addr;
bool hole = false;
- if (rw != WRITE)
+ if (iov_iter_rw(iter) != WRITE)
end = min(end, i_size_read(inode));
while (pos < end) {
@@ -124,7 +124,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
bh->b_size = PAGE_ALIGN(end - pos);
bh->b_state = 0;
retval = get_block(inode, block, bh,
- rw == WRITE);
+ iov_iter_rw(iter) == WRITE);
if (retval)
break;
if (!buffer_size_valid(bh))
@@ -137,7 +137,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
bh->b_size -= done;
}
- hole = (rw != WRITE) && !buffer_written(bh);
+ hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
if (hole) {
addr = NULL;
size = bh->b_size - first;
@@ -154,8 +154,8 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
max = min(pos + size, end);
}
- if (rw == WRITE)
- len = copy_from_iter(addr, max - pos, iter);
+ if (iov_iter_rw(iter) == WRITE)
+ len = copy_from_iter_nocache(addr, max - pos, iter);
else if (!hole)
len = copy_to_iter(addr, max - pos, iter);
else
@@ -173,7 +173,6 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
/**
* dax_do_io - Perform I/O to a DAX file
- * @rw: READ to read or WRITE to write
* @iocb: The control block for this I/O
* @inode: The file which the I/O is directed at
* @iter: The addresses to do I/O from or to
@@ -189,9 +188,9 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
* As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
* is in progress.
*/
-ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
- struct iov_iter *iter, loff_t pos,
- get_block_t get_block, dio_iodone_t end_io, int flags)
+ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
+ struct iov_iter *iter, loff_t pos, get_block_t get_block,
+ dio_iodone_t end_io, int flags)
{
struct buffer_head bh;
ssize_t retval = -EINVAL;
@@ -199,7 +198,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
memset(&bh, 0, sizeof(bh));
- if ((flags & DIO_LOCKING) && (rw == READ)) {
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
struct address_space *mapping = inode->i_mapping;
mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, pos, end - 1);
@@ -210,17 +209,19 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
}
/* Protects against truncate */
- atomic_inc(&inode->i_dio_count);
+ if (!(flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_begin(inode);
- retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+ retval = dax_io(inode, iter, pos, end, get_block, &bh);
- if ((flags & DIO_LOCKING) && (rw == READ))
+ if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
mutex_unlock(&inode->i_mutex);
if ((retval > 0) && end_io)
end_io(iocb, pos, retval, bh.b_private);
- inode_dio_done(inode);
+ if (!(flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_end(inode);
out:
return retval;
}
@@ -310,14 +311,27 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
out:
i_mmap_unlock_read(mapping);
- if (bh->b_end_io)
- bh->b_end_io(bh, 1);
-
return error;
}
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @complete_unwritten: The filesystem method used to convert unwritten blocks
+ * to written so the data written to them is exposed. This is required for
+ * required by write faults for filesystems that will return unwritten
+ * extent mappings from @get_block, but it is optional for reads as
+ * dax_insert_mapping() will always zero unwritten blocks. If the fs does
+ * not support unwritten extents, the it should pass NULL.
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
@@ -418,7 +432,23 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
page_cache_release(page);
}
+ /*
+ * If we successfully insert the new mapping over an unwritten extent,
+ * we need to ensure we convert the unwritten extent. If there is an
+ * error inserting the mapping, the filesystem needs to leave it as
+ * unwritten to prevent exposure of the stale underlying data to
+ * userspace, but we still need to call the completion function so
+ * the private resources on the mapping buffer can be released. We
+ * indicate what the callback should do via the uptodate variable, same
+ * as for normal BH based IO completions.
+ */
error = dax_insert_mapping(inode, &bh, vma, vmf);
+ if (buffer_unwritten(&bh)) {
+ if (complete_unwritten)
+ complete_unwritten(&bh, !error);
+ else
+ WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
+ }
out:
if (error == -ENOMEM)
@@ -435,6 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
goto out;
}
+EXPORT_SYMBOL(__dax_fault);
/**
* dax_fault - handle a page fault on a DAX file
@@ -446,7 +477,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* fault handler for DAX files.
*/
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+ get_block_t get_block, dax_iodone_t complete_unwritten)
{
int result;
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -455,7 +486,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
}
- result = do_dax_fault(vma, vmf, get_block);
+ result = __dax_fault(vma, vmf, get_block, complete_unwritten);
if (vmf->flags & FAULT_FLAG_WRITE)
sb_end_pagefault(sb);
@@ -464,6 +495,23 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
EXPORT_SYMBOL_GPL(dax_fault);
/**
+ * dax_pfn_mkwrite - handle first write to DAX page
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ *
+ */
+int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ sb_end_pagefault(sb);
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+
+/**
* dax_zero_page_range - zero a range within a page of a DAX file
* @inode: The file being truncated
* @from: The file offset that is being truncated to
diff --git a/fs/dcache.c b/fs/dcache.c
index c71e3732e53b..9b5fe503f6cb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,6 +269,41 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
+/*
+ * Make sure other CPUs see the inode attached before the type is set.
+ */
+static inline void __d_set_inode_and_type(struct dentry *dentry,
+ struct inode *inode,
+ unsigned type_flags)
+{
+ unsigned flags;
+
+ dentry->d_inode = inode;
+ smp_wmb();
+ flags = READ_ONCE(dentry->d_flags);
+ flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ flags |= type_flags;
+ WRITE_ONCE(dentry->d_flags, flags);
+}
+
+/*
+ * Ideally, we want to make sure that other CPUs see the flags cleared before
+ * the inode is detached, but this is really a violation of RCU principles
+ * since the ordering suggests we should always set inode before flags.
+ *
+ * We should instead replace or discard the entire dentry - but that sucks
+ * performancewise on mass deletion/rename.
+ */
+static inline void __d_clear_type_and_inode(struct dentry *dentry)
+{
+ unsigned flags = READ_ONCE(dentry->d_flags);
+
+ flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ WRITE_ONCE(dentry->d_flags, flags);
+ smp_wmb();
+ dentry->d_inode = NULL;
+}
+
static void dentry_free(struct dentry *dentry)
{
WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));
@@ -287,17 +322,17 @@ static void dentry_free(struct dentry *dentry)
}
/**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
* @dentry: the target dentry
* After this call, in-progress rcu-walk path lookup will fail. This
* should be called after unhashing, and after changing d_inode (if
* the dentry has not already been unhashed).
*/
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
{
- assert_spin_locked(&dentry->d_lock);
- /* Go through a barrier */
- write_seqcount_barrier(&dentry->d_seq);
+ lockdep_assert_held(&dentry->d_lock);
+ /* Go through am invalidation barrier */
+ write_seqcount_invalidate(&dentry->d_seq);
}
/*
@@ -311,7 +346,7 @@ static void dentry_iput(struct dentry * dentry)
{
struct inode *inode = dentry->d_inode;
if (inode) {
- dentry->d_inode = NULL;
+ __d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
@@ -335,10 +370,9 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
- __d_clear_type(dentry);
- dentry->d_inode = NULL;
+ __d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_barrier(dentry);
+ dentry_rcuwalk_invalidate(dentry);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -460,7 +494,7 @@ void __d_drop(struct dentry *dentry)
__hlist_bl_del(&dentry->d_hash);
dentry->d_hash.pprev = NULL;
hlist_bl_unlock(b);
- dentry_rcuwalk_barrier(dentry);
+ dentry_rcuwalk_invalidate(dentry);
}
}
EXPORT_SYMBOL(__d_drop);
@@ -608,7 +642,7 @@ static inline bool fast_dput(struct dentry *dentry)
/*
* If we have a d_op->d_delete() operation, we sould not
- * let the dentry count go to zero, so use "put__or_lock".
+ * let the dentry count go to zero, so use "put_or_lock".
*/
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
return lockref_put_or_lock(&dentry->d_lockref);
@@ -663,7 +697,7 @@ static inline bool fast_dput(struct dentry *dentry)
*/
smp_rmb();
d_flags = ACCESS_ONCE(dentry->d_flags);
- d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+ d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
/* Nothing to do? Dropping the reference was all we needed? */
if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
@@ -742,6 +776,9 @@ repeat:
if (unlikely(d_unhashed(dentry)))
goto kill_it;
+ if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+ goto kill_it;
+
if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
if (dentry->d_op->d_delete(dentry))
goto kill_it;
@@ -1205,13 +1242,13 @@ ascend:
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
- next = child->d_child.next;
- while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) {
+ /* go into the first sibling still alive */
+ do {
+ next = child->d_child.next;
if (next == &this_parent->d_subdirs)
goto ascend;
child = list_entry(next, struct dentry, d_child);
- next = next->next;
- }
+ } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
rcu_read_unlock();
goto resume;
}
@@ -1639,7 +1676,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_DELETE ));
+ DCACHE_OP_DELETE |
+ DCACHE_OP_SELECT_INODE));
dentry->d_op = op;
if (!op)
return;
@@ -1655,6 +1693,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
+ if (op->d_select_inode)
+ dentry->d_flags |= DCACHE_OP_SELECT_INODE;
}
EXPORT_SYMBOL(d_set_d_op);
@@ -1715,12 +1755,10 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
- dentry->d_flags |= add_flags;
if (inode)
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
- dentry->d_inode = inode;
- dentry_rcuwalk_barrier(dentry);
+ __d_set_inode_and_type(dentry, inode, add_flags);
+ dentry_rcuwalk_invalidate(dentry);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
@@ -1937,8 +1975,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
add_flags |= DCACHE_DISCONNECTED;
spin_lock(&tmp->d_lock);
- tmp->d_inode = inode;
- tmp->d_flags |= add_flags;
+ __d_set_inode_and_type(tmp, inode, add_flags);
hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
hlist_bl_lock(&tmp->d_sb->s_anon);
hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
@@ -2690,7 +2727,7 @@ static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL, *m2 = NULL;
- int ret = -EBUSY;
+ int ret = -ESTALE;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
@@ -2896,17 +2933,6 @@ restart:
vfsmnt = &mnt->mnt;
continue;
}
- /*
- * Filesystems needing to implement special "root names"
- * should do so with ->d_dname()
- */
- if (IS_ROOT(dentry) &&
- (dentry->d_name.len != 1 ||
- dentry->d_name.name[0] != '/')) {
- WARN(1, "Root dentry has weird name <%.*s>\n",
- (int) dentry->d_name.len,
- dentry->d_name.name);
- }
if (!error)
error = is_mounted(vfsmnt) ? 1 : 2;
break;
@@ -3416,22 +3442,15 @@ void __init vfs_caches_init_early(void)
inode_init_early();
}
-void __init vfs_caches_init(unsigned long mempages)
+void __init vfs_caches_init(void)
{
- unsigned long reserve;
-
- /* Base hash sizes on available memory, with a reserve equal to
- 150% of current kernel size */
-
- reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1);
- mempages -= reserve;
-
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
dcache_init();
inode_init();
- files_init(mempages);
+ files_init();
+ files_maxfiles_init();
mnt_init();
bdev_cache_init();
chrdev_init();
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 517e64938438..284f9aa0028b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -17,7 +17,6 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/pagemap.h>
-#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
@@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = {
.llseek = noop_llseek,
};
-static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, dentry->d_inode->i_private);
- return NULL;
-}
-
-const struct inode_operations debugfs_link_operations = {
- .readlink = generic_readlink,
- .follow_link = debugfs_follow_link,
-};
-
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 96400ab42d13..c711be8d6a3c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
return inode;
}
-static inline int debugfs_positive(struct dentry *dentry)
-{
- return dentry->d_inode && !d_unhashed(dentry);
-}
-
struct debugfs_mount_opts {
kuid_t uid;
kgid_t gid;
@@ -124,7 +119,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
static int debugfs_apply_options(struct super_block *sb)
{
struct debugfs_fs_info *fsi = sb->s_fs_info;
- struct inode *inode = sb->s_root->d_inode;
+ struct inode *inode = d_inode(sb->s_root);
struct debugfs_mount_opts *opts = &fsi->mount_opts;
inode->i_mode &= ~S_IALLUGO;
@@ -174,7 +169,7 @@ static void debugfs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (S_ISLNK(inode->i_mode))
- kfree(inode->i_private);
+ kfree(inode->i_link);
}
static const struct super_operations debugfs_super_operations = {
@@ -188,7 +183,7 @@ static struct vfsmount *debugfs_automount(struct path *path)
{
struct vfsmount *(*f)(void *);
f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
- return f(path->dentry->d_inode->i_private);
+ return f(d_inode(path->dentry)->i_private);
}
static const struct dentry_operations debugfs_dops = {
@@ -254,6 +249,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
pr_debug("debugfs: creating file '%s'\n",name);
+ if (IS_ERR(parent))
+ return parent;
+
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error)
@@ -267,20 +265,20 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && dentry->d_inode) {
+ if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
if (IS_ERR(dentry))
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
return dentry;
}
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
return NULL;
@@ -288,7 +286,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
return dentry;
}
@@ -341,7 +339,7 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
inode->i_fop = fops ? fops : &debugfs_file_operations;
inode->i_private = data;
d_instantiate(dentry, inode);
- fsnotify_create(dentry->d_parent->d_inode, dentry);
+ fsnotify_create(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_file);
@@ -381,7 +379,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
if (de)
- de->d_inode->i_size = file_size;
+ d_inode(de)->i_size = file_size;
return de;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);
@@ -423,8 +421,8 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
- inc_nlink(dentry->d_parent->d_inode);
- fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ inc_nlink(d_inode(dentry->d_parent));
+ fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
@@ -508,8 +506,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
return failed_creating(dentry);
}
inode->i_mode = S_IFLNK | S_IRWXUGO;
- inode->i_op = &debugfs_link_operations;
- inode->i_private = link;
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = link;
d_instantiate(dentry, inode);
return end_creating(dentry);
}
@@ -519,12 +517,12 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
{
int ret = 0;
- if (debugfs_positive(dentry)) {
+ if (simple_positive(dentry)) {
dget(dentry);
- if (S_ISDIR(dentry->d_inode->i_mode))
- ret = simple_rmdir(parent->d_inode, dentry);
+ if (d_is_dir(dentry))
+ ret = simple_rmdir(d_inode(parent), dentry);
else
- simple_unlink(parent->d_inode, dentry);
+ simple_unlink(d_inode(parent), dentry);
if (!ret)
d_delete(dentry);
dput(dentry);
@@ -554,12 +552,12 @@ void debugfs_remove(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
+ if (!parent || d_really_is_negative(parent))
return;
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
ret = __debugfs_remove(dentry, parent);
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
if (!ret)
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
@@ -585,12 +583,12 @@ void debugfs_remove_recursive(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
+ if (!parent || d_really_is_negative(parent))
return;
parent = dentry;
down:
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
loop:
/*
* The parent->d_subdirs is protected by the d_lock. Outside that
@@ -599,13 +597,13 @@ void debugfs_remove_recursive(struct dentry *dentry)
*/
spin_lock(&parent->d_lock);
list_for_each_entry(child, &parent->d_subdirs, d_child) {
- if (!debugfs_positive(child))
+ if (!simple_positive(child))
continue;
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
parent = child;
goto down;
}
@@ -620,16 +618,16 @@ void debugfs_remove_recursive(struct dentry *dentry)
* from d_subdirs. When releasing the parent->d_lock we can
* no longer trust that the next pointer is valid.
* Restart the loop. We'll skip this one with the
- * debugfs_positive() check.
+ * simple_positive() check.
*/
goto loop;
}
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
child = parent;
parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
+ mutex_lock(&d_inode(parent)->i_mutex);
if (child != dentry)
/* go up */
@@ -637,7 +635,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- mutex_unlock(&parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(parent)->i_mutex);
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
@@ -669,27 +667,27 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
trap = lock_rename(new_dir, old_dir);
/* Source or destination directories don't exist? */
- if (!old_dir->d_inode || !new_dir->d_inode)
+ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
goto exit;
/* Source does not exist, cyclic rename, or mountpoint? */
- if (!old_dentry->d_inode || old_dentry == trap ||
+ if (d_really_is_negative(old_dentry) || old_dentry == trap ||
d_mountpoint(old_dentry))
goto exit;
dentry = lookup_one_len(new_name, new_dir, strlen(new_name));
/* Lookup failed, cyclic rename or target exists? */
- if (IS_ERR(dentry) || dentry == trap || dentry->d_inode)
+ if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry))
goto exit;
old_name = fsnotify_oldname_init(old_dentry->d_name.name);
- error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode,
+ error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir),
dentry);
if (error) {
fsnotify_oldname_free(old_name);
goto exit;
}
d_move(old_dentry, dentry);
- fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name,
+ fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name,
d_is_dir(old_dentry),
NULL, old_dentry);
fsnotify_oldname_free(old_name);
@@ -713,20 +711,17 @@ bool debugfs_initialized(void)
}
EXPORT_SYMBOL_GPL(debugfs_initialized);
-
-static struct kobject *debug_kobj;
-
static int __init debugfs_init(void)
{
int retval;
- debug_kobj = kobject_create_and_add("debug", kernel_kobj);
- if (!debug_kobj)
- return -EINVAL;
+ retval = sysfs_create_mount_point(kernel_kobj, "debug");
+ if (retval)
+ return retval;
retval = register_filesystem(&debug_fs_type);
if (retval)
- kobject_put(debug_kobj);
+ sysfs_remove_mount_point(kernel_kobj, "debug");
else
debugfs_registered = true;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index cfe8466f7fef..c35ffdc12bba 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
#endif
+ if (!devpts_mnt)
+ return NULL;
return devpts_mnt->mnt_sb;
}
@@ -253,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb)
if (!uid_valid(root_uid) || !gid_valid(root_gid))
return -EINVAL;
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
/* If we have already created ptmx node, return */
if (fsi->ptmx_dentry) {
@@ -290,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb)
fsi->ptmx_dentry = dentry;
rc = 0;
out:
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
return rc;
}
@@ -298,7 +300,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
{
struct inode *inode;
if (fsi->ptmx_dentry) {
- inode = fsi->ptmx_dentry->d_inode;
+ inode = d_inode(fsi->ptmx_dentry);
inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
}
}
@@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = {
int devpts_new_index(struct inode *ptmx_inode)
{
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ struct pts_fs_info *fsi;
int index;
int ida_ret;
+ if (!sb)
+ return -ENODEV;
+
+ fsi = DEVPTS_SB(sb);
retry:
if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
@@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
struct inode *inode;
- struct dentry *root = sb->s_root;
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
- struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct dentry *root;
+ struct pts_fs_info *fsi;
+ struct pts_mount_opts *opts;
char s[12];
+ if (!sb)
+ return ERR_PTR(-ENODEV);
+
+ root = sb->s_root;
+ fsi = DEVPTS_SB(sb);
+ opts = &fsi->mount_opts;
+
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -602,18 +615,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
sprintf(s, "%d", index);
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
dentry = d_alloc_name(root, s);
if (dentry) {
d_add(dentry, inode);
- fsnotify_create(root->d_inode, dentry);
+ fsnotify_create(d_inode(root), dentry);
} else {
iput(inode);
inode = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
return inode;
}
@@ -658,7 +671,7 @@ void devpts_pty_kill(struct inode *inode)
BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
dentry = d_find_alias(inode);
@@ -667,7 +680,7 @@ void devpts_pty_kill(struct inode *inode)
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
dput(dentry); /* d_find_alias above */
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
}
static int __init init_devpts_fs(void)
@@ -676,12 +689,16 @@ static int __init init_devpts_fs(void)
struct ctl_table_header *table;
if (!err) {
+ struct vfsmount *mnt;
+
table = register_sysctl_table(pty_root_table);
- devpts_mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(devpts_mnt)) {
- err = PTR_ERR(devpts_mnt);
+ mnt = kern_mount(&devpts_fs_type);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
unregister_filesystem(&devpts_fs_type);
unregister_sysctl_table(table);
+ } else {
+ devpts_mnt = mnt;
}
}
return err;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..745d2342651a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
-#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -254,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (dio->end_io && dio->result)
dio->end_io(dio->iocb, offset, transferred, dio->private);
- inode_dio_done(dio->inode);
+ if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_end(dio->inode);
+
if (is_async) {
if (dio->rw & WRITE) {
int err;
@@ -265,7 +266,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
ret = err;
}
- aio_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret, 0);
}
kmem_cache_free(dio_cache, dio);
@@ -1056,7 +1057,7 @@ static inline int drop_refcount(struct dio *dio)
* operation. AIO can if it was a broken operation described above or
* in fact if all the bios race to complete before we get here. In
* that case dio_complete() translates the EIOCBQUEUED into the proper
- * return code that the caller will hand to aio_complete().
+ * return code that the caller will hand to ->complete().
*
* This is managed by the bio_lock instead of being an atomic_t so that
* completion paths can drop their ref and use the remaining count to
@@ -1094,10 +1095,10 @@ static inline int drop_refcount(struct dio *dio)
* for the whole file.
*/
static inline ssize_t
-do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter, loff_t offset,
- get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, int flags)
+do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
@@ -1111,9 +1112,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct blk_plug plug;
unsigned long align = offset | iov_iter_alignment(iter);
- if (rw & WRITE)
- rw = WRITE_ODIRECT;
-
/*
* Avoid references to bdev if not absolutely needed to give
* the early prefetch in the caller enough time.
@@ -1128,7 +1126,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && !iov_iter_count(iter))
+ if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1144,7 +1142,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->flags = flags;
if (dio->flags & DIO_LOCKING) {
- if (rw == READ) {
+ if (iov_iter_rw(iter) == READ) {
struct address_space *mapping =
iocb->ki_filp->f_mapping;
@@ -1170,19 +1168,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (is_sync_kiocb(iocb))
dio->is_async = false;
else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
- (rw & WRITE) && end > i_size_read(inode))
+ iov_iter_rw(iter) == WRITE && end > i_size_read(inode))
dio->is_async = false;
else
dio->is_async = true;
dio->inode = inode;
- dio->rw = rw;
+ dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ;
/*
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue
* so that we can call ->fsync.
*/
- if (dio->is_async && (rw & WRITE) &&
+ if (dio->is_async && iov_iter_rw(iter) == WRITE &&
((iocb->ki_filp->f_flags & O_DSYNC) ||
IS_SYNC(iocb->ki_filp->f_mapping->host))) {
retval = dio_set_defer_completion(dio);
@@ -1199,7 +1197,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
/*
* Will be decremented at I/O completion time.
*/
- atomic_inc(&inode->i_dio_count);
+ if (!(dio->flags & DIO_SKIP_DIO_COUNT))
+ inode_dio_begin(inode);
retval = 0;
sdio.blkbits = blkbits;
@@ -1275,7 +1274,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* we can let i_mutex go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
- if (rw == READ && (dio->flags & DIO_LOCKING))
+ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
mutex_unlock(&dio->inode->i_mutex);
/*
@@ -1287,7 +1286,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
*/
BUG_ON(retval == -EIOCBQUEUED);
if (dio->is_async && retval == 0 && dio->result &&
- (rw == READ || dio->result == count))
+ (iov_iter_rw(iter) == READ || dio->result == count))
retval = -EIOCBQUEUED;
else
dio_await_completion(dio);
@@ -1301,11 +1300,11 @@ out:
return retval;
}
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, struct iov_iter *iter, loff_t offset,
- get_block_t get_block, dio_iodone_t end_io,
- dio_submit_t submit_io, int flags)
+ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ loff_t offset, get_block_t get_block,
+ dio_iodone_t end_io, dio_submit_t submit_io,
+ int flags)
{
/*
* The block device state is needed in the end to finally
@@ -1319,8 +1318,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
- return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset,
- get_block, end_io, submit_io, flags);
+ return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block,
+ end_io, submit_io, flags);
}
EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d08e079ea5d3..754fd6c0b747 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con)
mutex_unlock(&connections_lock);
memset(&peeraddr, 0, sizeof(peeraddr));
- result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
- IPPROTO_TCP, &newsock);
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &newsock);
if (result < 0)
return -ENOMEM;
@@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con)
goto out;
/* Create a socket to communicate with */
- result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
- IPPROTO_TCP, &sock);
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (result < 0)
goto out_err;
@@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
addr_len = sizeof(struct sockaddr_in6);
/* Create a socket to communicate with */
- result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM,
- IPPROTO_TCP, &sock);
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (result < 0) {
log_print("Can't create listening comms socket");
goto create_out;
@@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void)
log_print("Using SCTP for communications");
- result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
- IPPROTO_SCTP, &sock);
+ result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
+ SOCK_SEQPACKET, IPPROTO_SCTP, &sock);
if (result < 0) {
log_print("Can't create comms socket, check SCTP is loaded");
goto out;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 719e1ce1c609..97315f2f6816 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1326,7 +1326,7 @@ static int ecryptfs_read_headers_virt(char *page_virt,
if (rc)
goto out;
if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
- ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
+ ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry));
offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
&bytes_read);
@@ -1425,7 +1425,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
{
int rc;
char *page_virt;
- struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
+ struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry);
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 4000f6b3a750..8db0b464483f 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -54,11 +54,11 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
- if (dentry->d_inode) {
+ if (d_really_is_positive(dentry)) {
struct inode *lower_inode =
- ecryptfs_inode_to_lower(dentry->d_inode);
+ ecryptfs_inode_to_lower(d_inode(dentry));
- fsstack_copy_attr_all(dentry->d_inode, lower_inode);
+ fsstack_copy_attr_all(d_inode(dentry), lower_inode);
}
return rc;
}
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index fd39bad6f1bd..feef8a9c4de7 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
-#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
@@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
- /*
- * Even though this is a async interface, we need to wait
- * for IO to finish to update atime
- */
- if (-EIOCBQUEUED == rc)
- rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
touch_atime(path);
@@ -137,7 +130,7 @@ struct kmem_cache *ecryptfs_file_info_cache;
static int read_or_initialize_metadata(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct ecryptfs_crypt_stat *crypt_stat;
int rc;
@@ -332,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return rc;
switch (cmd) {
- case FITRIM:
case FS_IOC32_GETFLAGS:
case FS_IOC32_SETFLAGS:
case FS_IOC32_GETVERSION:
@@ -365,9 +357,7 @@ const struct file_operations ecryptfs_dir_fops = {
const struct file_operations ecryptfs_main_fops = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = ecryptfs_read_update_atime,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b08b5187f662..3c4db1172d22 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry)
struct dentry *dir;
dir = dget_parent(dentry);
- mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT);
+ mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
return dir;
}
static void unlock_dir(struct dentry *dir)
{
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
dput(dir);
}
@@ -131,7 +131,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode,
static int ecryptfs_interpose(struct dentry *lower_dentry,
struct dentry *dentry, struct super_block *sb)
{
- struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
+ struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -170,7 +170,6 @@ out_unlock:
* @directory_inode: inode of the new file's dentry's parent in ecryptfs
* @ecryptfs_dentry: New file's dentry in ecryptfs
* @mode: The mode of the new file
- * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
*
* Creates the underlying file and the eCryptfs inode which will link to
* it. It will also update the eCryptfs directory inode to mimic the
@@ -189,21 +188,21 @@ ecryptfs_do_create(struct inode *directory_inode,
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
+ rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
if (rc) {
printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
"rc = [%d]\n", __func__, rc);
inode = ERR_PTR(rc);
goto out_lock;
}
- inode = __ecryptfs_get_inode(lower_dentry->d_inode,
+ inode = __ecryptfs_get_inode(d_inode(lower_dentry),
directory_inode->i_sb);
if (IS_ERR(inode)) {
- vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL);
+ vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
goto out_lock;
}
- fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
- fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
+ fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry));
out_lock:
unlock_dir(lower_dir_dentry);
return inode;
@@ -332,7 +331,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry,
struct inode *dir_inode)
{
- struct inode *inode, *lower_inode = lower_dentry->d_inode;
+ struct inode *inode, *lower_inode = d_inode(lower_dentry);
struct ecryptfs_dentry_info *dentry_info;
struct vfsmount *lower_mnt;
int rc = 0;
@@ -347,14 +346,14 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
}
lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
- fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
+ fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
dentry_info->lower_path.mnt = lower_mnt;
dentry_info->lower_path.dentry = lower_dentry;
- if (!lower_dentry->d_inode) {
+ if (d_really_is_negative(lower_dentry)) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
return 0;
@@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
* ecryptfs_lookup
* @ecryptfs_dir_inode: The eCryptfs directory inode
* @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @ecryptfs_nd: nameidata; may be NULL
+ * @flags: lookup flags
*
* Find a file on disk. If the file does not exist, then we'll add it to the
* dentry cache and continue on to read it from the disk.
@@ -400,11 +399,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
int rc = 0;
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
- mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
lower_dir_dentry,
ecryptfs_dentry->d_name.len);
- mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -412,7 +411,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
ecryptfs_dentry);
goto out;
}
- if (lower_dentry->d_inode)
+ if (d_really_is_positive(lower_dentry))
goto interpose;
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
@@ -429,11 +428,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
"filename; rc = [%d]\n", __func__, rc);
goto out;
}
- mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
lower_dentry = lookup_one_len(encrypted_and_encoded_name,
lower_dir_dentry,
encrypted_and_encoded_name_size);
- mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -458,24 +457,24 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
u64 file_size_save;
int rc;
- file_size_save = i_size_read(old_dentry->d_inode);
+ file_size_save = i_size_read(d_inode(old_dentry));
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
dget(lower_new_dentry);
lower_dir_dentry = lock_parent(lower_new_dentry);
- rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
+ rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
lower_new_dentry, NULL);
- if (rc || !lower_new_dentry->d_inode)
+ if (rc || d_really_is_negative(lower_new_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
if (rc)
goto out_lock;
- fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
- set_nlink(old_dentry->d_inode,
- ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink);
- i_size_write(new_dentry->d_inode, file_size_save);
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ set_nlink(d_inode(old_dentry),
+ ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink);
+ i_size_write(d_inode(new_dentry), file_size_save);
out_lock:
unlock_dir(lower_dir_dentry);
dput(lower_new_dentry);
@@ -485,7 +484,7 @@ out_lock:
static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
{
- return ecryptfs_do_unlink(dir, dentry, dentry->d_inode);
+ return ecryptfs_do_unlink(dir, dentry, d_inode(dentry));
}
static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -510,20 +509,20 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
strlen(symname));
if (rc)
goto out_lock;
- rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
+ rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry,
encoded_symname);
kfree(encoded_symname);
- if (rc || !lower_dentry->d_inode)
+ if (rc || d_really_is_negative(lower_dentry))
goto out_lock;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out_lock;
- fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
out_lock:
unlock_dir(lower_dir_dentry);
dput(lower_dentry);
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
}
@@ -536,18 +535,18 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
lower_dentry = ecryptfs_dentry_to_lower(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
- if (rc || !lower_dentry->d_inode)
+ rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
+ if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
- fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
- set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+ set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
out:
unlock_dir(lower_dir_dentry);
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
}
@@ -562,12 +561,12 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
dget(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
dget(lower_dentry);
- rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
+ rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
dput(lower_dentry);
- if (!rc && dentry->d_inode)
- clear_nlink(dentry->d_inode);
- fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- set_nlink(dir, lower_dir_dentry->d_inode->i_nlink);
+ if (!rc && d_really_is_positive(dentry))
+ clear_nlink(d_inode(dentry));
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
unlock_dir(lower_dir_dentry);
if (!rc)
d_drop(dentry);
@@ -584,17 +583,17 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev
lower_dentry = ecryptfs_dentry_to_lower(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
- if (rc || !lower_dentry->d_inode)
+ rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
+ if (rc || d_really_is_negative(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
goto out;
- fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
- fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
+ fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+ fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
out:
unlock_dir(lower_dir_dentry);
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
d_drop(dentry);
return rc;
}
@@ -617,7 +616,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dget(lower_new_dentry);
lower_old_dir_dentry = dget_parent(lower_old_dentry);
lower_new_dir_dentry = dget_parent(lower_new_dentry);
- target_inode = new_dentry->d_inode;
+ target_inode = d_inode(new_dentry);
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
/* source should not be ancestor of target */
if (trap == lower_old_dentry) {
@@ -629,17 +628,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rc = -ENOTEMPTY;
goto out_lock;
}
- rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
- lower_new_dir_dentry->d_inode, lower_new_dentry,
+ rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
+ d_inode(lower_new_dir_dentry), lower_new_dentry,
NULL, 0);
if (rc)
goto out_lock;
if (target_inode)
fsstack_copy_attr_all(target_inode,
ecryptfs_inode_to_lower(target_inode));
- fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
+ fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry));
if (new_dir != old_dir)
- fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
+ fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
dput(lower_new_dir_dentry);
@@ -662,7 +661,7 @@ static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
return ERR_PTR(-ENOMEM);
old_fs = get_fs();
set_fs(get_ds());
- rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
+ rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
(char __user *)lower_buf,
PATH_MAX);
set_fs(old_fs);
@@ -675,18 +674,16 @@ out:
return rc ? ERR_PTR(rc) : buf;
}
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
{
size_t len;
char *buf = ecryptfs_readlink_lower(dentry, &len);
if (IS_ERR(buf))
- goto out;
- fsstack_copy_attr_atime(dentry->d_inode,
- ecryptfs_dentry_to_lower(dentry)->d_inode);
+ return buf;
+ fsstack_copy_attr_atime(d_inode(dentry),
+ d_inode(ecryptfs_dentry_to_lower(dentry)));
buf[len] = '\0';
-out:
- nd_set_link(nd, buf);
- return NULL;
+ return *cookie = buf;
}
/**
@@ -738,7 +735,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
struct iattr *lower_ia)
{
int rc = 0;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ecryptfs_crypt_stat *crypt_stat;
loff_t i_size = i_size_read(inode);
loff_t lower_size_before_truncate;
@@ -751,7 +748,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
rc = ecryptfs_get_lower_file(dentry, inode);
if (rc)
return rc;
- crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+ crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
/* Switch on growing or shrinking file */
if (ia->ia_size > i_size) {
char zero[] = { 0x00 };
@@ -858,7 +855,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
struct iattr lower_ia = { .ia_valid = 0 };
int rc;
- rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
+ rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length);
if (rc)
return rc;
@@ -866,9 +863,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- mutex_lock(&lower_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dentry)->i_mutex);
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(lower_dentry)->i_mutex);
}
return rc;
}
@@ -900,10 +897,10 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
struct inode *lower_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+ crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
ecryptfs_init_crypt_stat(crypt_stat);
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
lower_inode = ecryptfs_inode_to_lower(inode);
lower_dentry = ecryptfs_dentry_to_lower(dentry);
mutex_lock(&crypt_stat->cs_mutex);
@@ -967,9 +964,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
lower_ia.ia_valid &= ~ATTR_MODE;
- mutex_lock(&lower_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dentry)->i_mutex);
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(lower_dentry)->i_mutex);
out:
fsstack_copy_attr_all(inode, lower_inode);
return rc;
@@ -983,7 +980,7 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
char *target;
size_t targetsiz;
@@ -1007,9 +1004,9 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat);
if (!rc) {
- fsstack_copy_attr_all(dentry->d_inode,
- ecryptfs_inode_to_lower(dentry->d_inode));
- generic_fillattr(dentry->d_inode, stat);
+ fsstack_copy_attr_all(d_inode(dentry),
+ ecryptfs_inode_to_lower(d_inode(dentry)));
+ generic_fillattr(d_inode(dentry), stat);
stat->blocks = lower_stat.blocks;
}
return rc;
@@ -1023,14 +1020,14 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op->setxattr) {
+ if (!d_inode(lower_dentry)->i_op->setxattr) {
rc = -EOPNOTSUPP;
goto out;
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
- if (!rc && dentry->d_inode)
- fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
+ if (!rc && d_really_is_positive(dentry))
+ fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry));
out:
return rc;
}
@@ -1041,14 +1038,14 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
{
int rc = 0;
- if (!lower_dentry->d_inode->i_op->getxattr) {
+ if (!d_inode(lower_dentry)->i_op->getxattr) {
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
+ mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
size);
- mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(lower_dentry)->i_mutex);
out:
return rc;
}
@@ -1068,13 +1065,13 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op->listxattr) {
+ if (!d_inode(lower_dentry)->i_op->listxattr) {
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size);
- mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
+ mutex_unlock(&d_inode(lower_dentry)->i_mutex);
out:
return rc;
}
@@ -1085,13 +1082,13 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
struct dentry *lower_dentry;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op->removexattr) {
+ if (!d_inode(lower_dentry)->i_op->removexattr) {
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name);
- mutex_unlock(&lower_dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
+ mutex_unlock(&d_inode(lower_dentry)->i_mutex);
out:
return rc;
}
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index f1ea610362c6..866bb18efefe 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -144,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
/* Corresponding dput() and mntput() are done when the
* lower file is fput() when all eCryptfs files for the inode are
* released. */
- flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
+ flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR;
(*lower_file) = dentry_open(&req.path, flags, cred);
if (!IS_ERR(*lower_file))
goto out;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index c095d3264259..4f4d0474bee9 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -546,11 +546,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out_free;
}
- if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) {
+ if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) {
rc = -EPERM;
printk(KERN_ERR "Mount of device (uid: %d) not owned by "
"requested user (uid: %d)\n",
- i_uid_read(path.dentry->d_inode),
+ i_uid_read(d_inode(path.dentry)),
from_kuid(&init_user_ns, current_uid()));
goto out_free;
}
@@ -584,7 +584,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out_free;
}
- inode = ecryptfs_get_inode(path.dentry->d_inode, s);
+ inode = ecryptfs_get_inode(d_inode(path.dentry), s);
rc = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 4626976794e7..cf208522998e 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -420,7 +420,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
void *xattr_virt;
struct dentry *lower_dentry =
ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry;
- struct inode *lower_inode = lower_dentry->d_inode;
+ struct inode *lower_inode = d_inode(lower_dentry);
int rc;
if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 07ab49745e31..3381b9da9ee6 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -145,12 +145,12 @@ out:
static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct efivar_entry *var = dentry->d_inode->i_private;
+ struct efivar_entry *var = d_inode(dentry)->i_private;
if (efivar_entry_delete(var))
return -EINVAL;
- drop_nlink(dentry->d_inode);
+ drop_nlink(d_inode(dentry));
dput(dentry);
return 0;
};
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index ddbce42548c9..86a2121828c3 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
int len, i;
int err = -ENOMEM;
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
return err;
@@ -144,7 +144,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
- inode = efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0);
+ inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
if (!inode)
goto fail_name;
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index bbee8f063dfa..40ba9cc41bf7 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -111,9 +111,9 @@ struct dentry *efs_get_parent(struct dentry *child)
struct dentry *parent = ERR_PTR(-ENOENT);
efs_ino_t ino;
- ino = efs_find_entry(child->d_inode, "..", 2);
+ ino = efs_find_entry(d_inode(child), "..", 2);
if (ino)
- parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
+ parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino));
return parent;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 7fca462ea4e3..c8411a30f7da 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
- ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/exec.c b/fs/exec.c
index c7f9b733406d..1977c2a553ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -659,6 +659,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
if (stack_base > STACK_SIZE_MAX)
stack_base = STACK_SIZE_MAX;
+ /* Add space for stack randomization. */
+ stack_base += (STACK_RND_MASK << PAGE_SHIFT);
+
/* Make sure we didn't let the argument array grow too large. */
if (vma->vm_end - vma->vm_start > stack_base)
return -ENOMEM;
@@ -926,10 +929,14 @@ static int de_thread(struct task_struct *tsk)
if (!thread_group_leader(tsk)) {
struct task_struct *leader = tsk->group_leader;
- sig->notify_count = -1; /* for exit_notify() */
for (;;) {
threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock);
+ /*
+ * Do this under tasklist_lock to ensure that
+ * exit_notify() can't miss ->group_exit_task
+ */
+ sig->notify_count = -1;
if (likely(leader->exit_state))
break;
__set_current_state(TASK_KILLABLE);
@@ -1078,7 +1085,13 @@ int flush_old_exec(struct linux_binprm * bprm)
if (retval)
goto out;
+ /*
+ * Must be called _before_ exec_mmap() as bprm->mm is
+ * not visibile until then. This also enables the update
+ * to be lockless.
+ */
set_mm_exe_file(bprm->mm, bprm->file);
+
/*
* Release all of the old mmap stuff
*/
@@ -1265,6 +1278,53 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
spin_unlock(&p->fs->lock);
}
+static void bprm_fill_uid(struct linux_binprm *bprm)
+{
+ struct inode *inode;
+ unsigned int mode;
+ kuid_t uid;
+ kgid_t gid;
+
+ /* clear any previous set[ug]id data from a previous binary */
+ bprm->cred->euid = current_euid();
+ bprm->cred->egid = current_egid();
+
+ if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ return;
+
+ if (task_no_new_privs(current))
+ return;
+
+ inode = file_inode(bprm->file);
+ mode = READ_ONCE(inode->i_mode);
+ if (!(mode & (S_ISUID|S_ISGID)))
+ return;
+
+ /* Be careful if suid/sgid is set */
+ mutex_lock(&inode->i_mutex);
+
+ /* reload atomically mode/uid/gid now that lock held */
+ mode = inode->i_mode;
+ uid = inode->i_uid;
+ gid = inode->i_gid;
+ mutex_unlock(&inode->i_mutex);
+
+ /* We ignore suid/sgid if there are no mappings for them in the ns */
+ if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
+ !kgid_has_mapping(bprm->cred->user_ns, gid))
+ return;
+
+ if (mode & S_ISUID) {
+ bprm->per_clear |= PER_CLEAR_ON_SETID;
+ bprm->cred->euid = uid;
+ }
+
+ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ bprm->per_clear |= PER_CLEAR_ON_SETID;
+ bprm->cred->egid = gid;
+ }
+}
+
/*
* Fill the binprm structure from the inode.
* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
@@ -1273,36 +1333,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
*/
int prepare_binprm(struct linux_binprm *bprm)
{
- struct inode *inode = file_inode(bprm->file);
- umode_t mode = inode->i_mode;
int retval;
-
- /* clear any previous set[ug]id data from a previous binary */
- bprm->cred->euid = current_euid();
- bprm->cred->egid = current_egid();
-
- if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
- !task_no_new_privs(current) &&
- kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
- kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
- /* Set-uid? */
- if (mode & S_ISUID) {
- bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->euid = inode->i_uid;
- }
-
- /* Set-gid? */
- /*
- * If setgid is set but no group execute bit then this
- * is a candidate for mandatory locking, not a setgid
- * executable.
- */
- if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
- bprm->per_clear |= PER_CLEAR_ON_SETID;
- bprm->cred->egid = inode->i_gid;
- }
- }
+ bprm_fill_uid(bprm);
/* fill in binprm security blob */
retval = security_bprm_set_creds(bprm);
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index b47c7b8dc275..a364fd0965ec 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -16,5 +16,5 @@
libore-y := ore.o ore_raid.o
obj-$(CONFIG_ORE) += libore.o
-exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o
+exofs-y := inode.o file.o namei.o dir.o super.o sys.o
obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index d7defd557601..e5bb2abf77f9 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page)
page_cache_release(page);
}
-/* Accesses dir's inode->i_size must be called under inode lock */
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
{
loff_t last_byte = inode->i_size;
@@ -379,7 +373,7 @@ ino_t exofs_parent_ino(struct dentry *child)
struct exofs_dir_entry *de;
ino_t ino;
- de = exofs_dotdot(child->d_inode, &page);
+ de = exofs_dotdot(d_inode(child), &page);
if (!de)
return 0;
@@ -429,7 +423,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
int exofs_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned chunk_size = exofs_chunk_size(dir);
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index ad9cac670a47..2e86086bc940 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -207,10 +207,6 @@ extern const struct address_space_operations exofs_aops;
extern const struct inode_operations exofs_dir_inode_operations;
extern const struct inode_operations exofs_special_inode_operations;
-/* symlink.c */
-extern const struct inode_operations exofs_symlink_inode_operations;
-extern const struct inode_operations exofs_fast_symlink_inode_operations;
-
/* exofs_init_comps will initialize an ore_components device array
* pointing to a single ore_comp struct, and a round-robin view
* of the device table.
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 1a376b42d305..906de66e8e7e 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,8 +67,6 @@ static int exofs_flush(struct file *file, fl_owner_t id)
const struct file_operations exofs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a198e94813fe..73c64daa0f55 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -963,8 +963,8 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
/* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
return 0;
}
@@ -1028,7 +1028,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
*/
int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
/* if we are about to modify an object, and it hasn't been
@@ -1222,10 +1222,11 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &exofs_dir_operations;
inode->i_mapping->a_ops = &exofs_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (exofs_inode_is_fast_symlink(inode))
- inode->i_op = &exofs_fast_symlink_inode_operations;
- else {
- inode->i_op = &exofs_symlink_inode_operations;
+ if (exofs_inode_is_fast_symlink(inode)) {
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = (char *)oi->i_data;
+ } else {
+ inode->i_op = &page_symlink_inode_operations;
inode->i_mapping->a_ops = &exofs_aops;
}
} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 28907460e8fa..09a6bb1ad63c 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -113,7 +113,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
oi = exofs_i(inode);
if (l > sizeof(oi->i_data)) {
/* slow symlink */
- inode->i_op = &exofs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
inode->i_mapping->a_ops = &exofs_aops;
memset(oi->i_data, 0, sizeof(oi->i_data));
@@ -122,7 +122,8 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
goto out_fail;
} else {
/* fast symlink */
- inode->i_op = &exofs_fast_symlink_inode_operations;
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = (char *)oi->i_data;
memcpy(oi->i_data, symname, l);
inode->i_size = l-1;
}
@@ -141,7 +142,7 @@ out_fail:
static int exofs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
@@ -191,7 +192,7 @@ out_dir:
static int exofs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct exofs_dir_entry *de;
struct page *page;
int err = -ENOENT;
@@ -213,7 +214,7 @@ out:
static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err = -ENOTEMPTY;
if (exofs_empty_dir(inode)) {
@@ -230,8 +231,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct page *dir_page = NULL;
struct exofs_dir_entry *dir_de = NULL;
struct page *old_page;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index fcc2e565f540..b795c567b5e1 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child)
if (!ino)
return ERR_PTR(-ESTALE);
- return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino));
}
static struct inode *exofs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
deleted file mode 100644
index 832e2624b80b..000000000000
--- a/fs/exofs/symlink.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (C) 2005, 2006
- * Avishay Traeger (avishay@gmail.com)
- * Copyright (C) 2008, 2009
- * Boaz Harrosh <ooo@electrozaur.com>
- *
- * Copyrights for code taken from ext2:
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- * from
- * linux/fs/minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This file is part of exofs.
- *
- * exofs is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation. Since it is based on ext2, and the only
- * valid version of GPL for the Linux kernel is version 2, the only valid
- * version of GPL for exofs is version 2.
- *
- * exofs is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with exofs; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <linux/namei.h>
-
-#include "exofs.h"
-
-static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct exofs_i_info *oi = exofs_i(dentry->d_inode);
-
- nd_set_link(nd, (char *)oi->i_data);
- return NULL;
-}
-
-const struct inode_operations exofs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
-
-const struct inode_operations exofs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = exofs_follow_link,
-};
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 6e1d4ab09d72..0c6638b40f21 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page)
page_cache_release(page);
}
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -486,7 +481,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
*/
int ext2_add_link (struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned chunk_size = ext2_chunk_size(dir);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 678f9ab08c48..8d15febd0aa3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -793,7 +793,6 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
extern const struct inode_operations ext2_file_inode_operations;
extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_dax_file_operations;
/* inode.c */
extern const struct address_space_operations ext2_aops;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index e31701713516..3b57c9f83c9b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -28,17 +28,18 @@
#ifdef CONFIG_FS_DAX
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block);
+ return dax_fault(vma, vmf, ext2_get_block, NULL);
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block);
+ return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.page_mkwrite = ext2_dax_mkwrite,
+ .pfn_mkwrite = dax_pfn_mkwrite,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -92,8 +93,6 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
@@ -108,24 +107,6 @@ const struct file_operations ext2_file_operations = {
.splice_write = iter_file_splice_write,
};
-#ifdef CONFIG_FS_DAX
-const struct file_operations ext2_dax_file_operations = {
- .llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .unlocked_ioctl = ext2_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext2_compat_ioctl,
-#endif
- .mmap = ext2_file_mmap,
- .open = dquot_file_open,
- .release = ext2_release_file,
- .fsync = ext2_fsync,
-};
-#endif
-
const struct inode_operations ext2_file_inode_operations = {
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 6c14bb8322fa..5c04a0ddea80 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -278,7 +278,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
avefreeb = free_blocks / ngroups;
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
+ if ((parent == d_inode(sb->s_root)) ||
(EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) {
struct ext2_group_desc *best_desc = NULL;
int best_ndir = inodes_per_group;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6434bc000125..5c09776d347f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,7 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
@@ -851,8 +851,7 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t
-ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -861,12 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
ssize_t ret;
if (IS_DAX(inode))
- ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
- NULL, DIO_LOCKING);
+ ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+ DIO_LOCKING);
else
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ ret = blockdev_direct_IO(iocb, inode, iter, offset,
ext2_get_block);
- if (ret < 0 && (rw & WRITE))
+ if (ret < 0 && iov_iter_rw(iter) == WRITE)
ext2_write_failed(mapping, offset + count);
return ret;
}
@@ -1388,10 +1387,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, DAX)) {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_dax_file_operations;
- } else if (test_opt(inode->i_sb, NOBH)) {
+ if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
} else {
@@ -1407,6 +1403,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (ext2_inode_is_fast_symlink(inode)) {
+ inode->i_link = (char *)ei->i_data;
inode->i_op = &ext2_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
@@ -1548,7 +1545,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, iattr);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 148f6e3789ea..13ec54a99c96 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -79,10 +79,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns
struct dentry *ext2_get_parent(struct dentry *child)
{
struct qstr dotdot = QSTR_INIT("..", 2);
- unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot);
+ unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot);
if (!ino)
return ERR_PTR(-ENOENT);
- return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino));
}
/*
@@ -104,10 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, DAX)) {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_dax_file_operations;
- } else if (test_opt(inode->i_sb, NOBH)) {
+ if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
} else {
@@ -125,10 +122,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (test_opt(inode->i_sb, DAX)) {
- inode->i_mapping->a_ops = &ext2_aops;
- inode->i_fop = &ext2_dax_file_operations;
- } else if (test_opt(inode->i_sb, NOBH)) {
+ if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
} else {
@@ -195,7 +189,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
} else {
/* fast symlink */
inode->i_op = &ext2_fast_symlink_inode_operations;
- memcpy((char*)(EXT2_I(inode)->i_data),symname,l);
+ inode->i_link = (char*)EXT2_I(inode)->i_data;
+ memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
}
mark_inode_dirty(inode);
@@ -214,7 +209,7 @@ out_fail:
static int ext2_link (struct dentry * old_dentry, struct inode * dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int err;
dquot_initialize(dir);
@@ -281,7 +276,7 @@ out_dir:
static int ext2_unlink(struct inode * dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
struct ext2_dir_entry_2 * de;
struct page * page;
int err = -ENOENT;
@@ -305,7 +300,7 @@ out:
static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
int err = -ENOTEMPTY;
if (ext2_empty_dir(inode)) {
@@ -322,8 +317,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry)
static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct inode * new_dir, struct dentry * new_dentry )
{
- struct inode * old_inode = old_dentry->d_inode;
- struct inode * new_inode = new_dentry->d_inode;
+ struct inode * old_inode = d_inode(old_dentry);
+ struct inode * new_inode = d_inode(new_dentry);
struct page * dir_page = NULL;
struct ext2_dir_entry_2 * dir_de = NULL;
struct page * old_page;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e96511..900e19cf9ef6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
+ sb->s_iflags |= SB_I_CGROUPWB;
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 565cf817bbf1..ae17179f3810 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -19,14 +19,6 @@
#include "ext2.h"
#include "xattr.h"
-#include <linux/namei.h>
-
-static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ext2_inode_info *ei = EXT2_I(dentry->d_inode);
- nd_set_link(nd, (char *)ei->i_data);
- return NULL;
-}
const struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
@@ -43,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext2_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 91426141c33a..0b6bfd3a398b 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -243,7 +243,7 @@ cleanup:
static int
ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct buffer_head *bh = NULL;
struct ext2_xattr_entry *entry;
char *end;
@@ -319,7 +319,7 @@ cleanup:
/*
* Inode operation listxattr()
*
- * dentry->d_inode->i_mutex: don't care
+ * d_inode(dentry)->i_mutex: don't care
*/
ssize_t
ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c0ebc4db8849..702fc6840246 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -28,7 +28,7 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
+ return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
@@ -38,7 +38,7 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
+ return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 7e192574c001..42b6e9874bcc 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -32,7 +32,7 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
+ return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
@@ -42,7 +42,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
+ return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index f470e44c4b8d..ecdc4605192c 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -36,7 +36,7 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER,
+ return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
name, buffer, size);
}
@@ -49,7 +49,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER,
+ return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index a062fa1e1b11..3b8f650de22c 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -50,8 +50,6 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl = ext3_ioctl,
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index a1b810230cc5..3ad242e5840e 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -210,7 +210,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
avefreeb = freeb / ngroups;
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
+ if ((parent == d_inode(sb->s_root)) ||
(EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {
int best_ndir = inodes_per_group;
int best_group = -1;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2c6ccc49ba27..6c7e5468a2f8 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,7 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
@@ -1820,8 +1820,8 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1832,9 +1832,9 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
int retries = 0;
- trace_ext3_direct_IO_enter(inode, offset, count, rw);
+ trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (rw == WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
loff_t final_size = offset + count;
if (final_size > inode->i_size) {
@@ -1856,12 +1856,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
@@ -1908,7 +1908,7 @@ retry:
ret = err;
}
out:
- trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
+ trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
return ret;
}
@@ -2999,6 +2999,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext3_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
+ inode->i_link = (char *)ei->i_data;
} else {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
@@ -3240,7 +3241,7 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
int ext3_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error, rc = 0;
const unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index f197736dccfa..c9e767cd4b67 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1049,19 +1049,19 @@ struct dentry *ext3_get_parent(struct dentry *child)
struct ext3_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext3_find_entry(child->d_inode, &dotdot, &de);
+ bh = ext3_find_entry(d_inode(child), &dotdot, &de);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
brelse(bh);
- if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
- ext3_error(child->d_inode->i_sb, "ext3_get_parent",
+ if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) {
+ ext3_error(d_inode(child)->i_sb, "ext3_get_parent",
"bad inode number: %lu", ino);
return ERR_PTR(-EIO);
}
- return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino));
}
#define S_SHIFT 12
@@ -1243,7 +1243,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext3_dir_entry_2 *de,
struct buffer_head * bh)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned long offset = 0;
@@ -1330,7 +1330,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct buffer_head *bh)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct buffer_head *bh2;
@@ -1435,7 +1435,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head * bh;
struct ext3_dir_entry_2 *de;
struct super_block * sb;
@@ -1489,7 +1489,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct dx_entry *entries, *at;
struct dx_hash_info hinfo;
struct buffer_head * bh;
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct super_block * sb = dir->i_sb;
struct ext3_dir_entry_2 *de;
int err;
@@ -2111,7 +2111,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
dquot_initialize(dir);
- dquot_initialize(dentry->d_inode);
+ dquot_initialize(d_inode(dentry));
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
@@ -2125,7 +2125,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
retval = -EIO;
if (le32_to_cpu(de->inode) != inode->i_ino)
@@ -2173,7 +2173,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
dquot_initialize(dir);
- dquot_initialize(dentry->d_inode);
+ dquot_initialize(d_inode(dentry));
handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
@@ -2187,7 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
if (!bh)
goto end_unlink;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
retval = -EIO;
if (le32_to_cpu(de->inode) != inode->i_ino)
@@ -2308,7 +2308,8 @@ retry:
}
} else {
inode->i_op = &ext3_fast_symlink_inode_operations;
- memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+ inode->i_link = (char*)&EXT3_I(inode)->i_data;
+ memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
}
EXT3_I(inode)->i_disksize = inode->i_size;
@@ -2328,7 +2329,7 @@ static int ext3_link (struct dentry * old_dentry,
struct inode * dir, struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
if (inode->i_nlink >= EXT3_LINK_MAX)
@@ -2391,8 +2392,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new_dentry->d_inode)
- dquot_initialize(new_dentry->d_inode);
+ if (d_really_is_positive(new_dentry))
+ dquot_initialize(d_inode(new_dentry));
handle = ext3_journal_start(old_dir, 2 *
EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
@@ -2409,12 +2410,12 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- old_inode = old_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
retval = -ENOENT;
if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
goto end_rename;
- new_inode = new_dentry->d_inode;
+ new_inode = d_inode(new_dentry);
new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
if (new_bh) {
if (!new_inode) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index d4dbf3c259b3..5ed0044fbb37 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -789,7 +789,7 @@ static const struct quotactl_ops ext3_qctl_operations = {
.quota_on = ext3_quota_on,
.quota_off = dquot_quota_off,
.quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
+ .get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
@@ -1170,7 +1170,7 @@ static int parse_options (char *options, struct super_block *sb,
return 0;
}
- journal_inode = path.dentry->d_inode;
+ journal_inode = d_inode(path.dentry);
if (!S_ISBLK(journal_inode->i_mode)) {
ext3_msg(sb, KERN_ERR, "error: journal path %s "
"is not a block device", journal_path);
@@ -1908,7 +1908,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
- for (i=0; i < 4; i++)
+ for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
i = le32_to_cpu(es->s_flags);
@@ -2947,7 +2947,7 @@ static int ext3_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext3_journal_start(sb->s_root->d_inode, 2);
+ handle = ext3_journal_start(d_inode(sb->s_root), 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -2994,7 +2994,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
* When we journal data on quota file, we have to flush journal to see
* all updates to the file when we bypass pagecache...
*/
- if (ext3_should_journal_data(path->dentry->d_inode)) {
+ if (ext3_should_journal_data(d_inode(path->dentry))) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 6b01c3eab1f3..c08c59094ae6 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -17,17 +17,9 @@
* ext3 symlink handling code
*/
-#include <linux/namei.h>
#include "ext3.h"
#include "xattr.h"
-static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ext3_inode_info *ei = EXT3_I(dentry->d_inode);
- nd_set_link(nd, (char*)ei->i_data);
- return NULL;
-}
-
const struct inode_operations ext3_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
@@ -43,7 +35,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
const struct inode_operations ext3_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext3_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ext3_setattr,
#ifdef CONFIG_EXT3_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index c6874be6d58b..7cf36501ccf4 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -137,7 +137,7 @@ ext3_xattr_handler(int name_index)
/*
* Inode operation listxattr()
*
- * dentry->d_inode->i_mutex: don't care
+ * d_inode(dentry)->i_mutex: don't care
*/
ssize_t
ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -355,7 +355,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
static int
ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct buffer_head *bh = NULL;
int error;
@@ -391,7 +391,7 @@ cleanup:
static int
ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ext3_xattr_ibody_header *header;
struct ext3_inode *raw_inode;
struct ext3_iloc iloc;
@@ -432,7 +432,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int i_error, b_error;
- down_read(&EXT3_I(dentry->d_inode)->xattr_sem);
+ down_read(&EXT3_I(d_inode(dentry))->xattr_sem);
i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
if (i_error < 0) {
b_error = 0;
@@ -445,7 +445,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (b_error < 0)
i_error = 0;
}
- up_read(&EXT3_I(dentry->d_inode)->xattr_sem);
+ up_read(&EXT3_I(d_inode(dentry))->xattr_sem);
return i_error + b_error;
}
@@ -546,8 +546,7 @@ ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s)
free += EXT3_XATTR_LEN(name_len);
}
if (i->value) {
- if (free < EXT3_XATTR_SIZE(i->value_len) ||
- free < EXT3_XATTR_LEN(name_len) +
+ if (free < EXT3_XATTR_LEN(name_len) +
EXT3_XATTR_SIZE(i->value_len))
return -ENOSPC;
}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 722c2bf9645d..c9506d5e3b13 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -29,7 +29,7 @@ ext3_xattr_security_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+ return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
name, buffer, size);
}
@@ -39,7 +39,7 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
+ return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index d75727cc67fa..206cc66dc285 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -32,7 +32,7 @@ ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED,
+ return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
@@ -42,7 +42,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name,
+ return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 5612af3567e0..021508ad1616 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -34,7 +34,7 @@ ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+ return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER,
name, buffer, size);
}
@@ -46,7 +46,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+ return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index efea5d5c44ce..bf8bc8aba471 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -64,6 +64,29 @@ config EXT4_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+config EXT4_ENCRYPTION
+ tristate "Ext4 Encryption"
+ depends on EXT4_FS
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of ext4 files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
+
+config EXT4_FS_ENCRYPTION
+ bool
+ default y
+ depends on EXT4_ENCRYPTION
+
config EXT4_DEBUG
bool "EXT4 debugging support"
depends on EXT4_FS
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 0310fec2ee3d..75285ea9aa05 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o
+ xattr_trusted.o inline.o readpage.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \
+ crypto_key.o crypto_fname.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d40c8dbbb0d6..69b1e73026a5 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -4,11 +4,6 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/capability.h>
-#include <linux/fs.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 83a6f497c4e0..cd6ea29be645 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -14,7 +14,6 @@
#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
@@ -370,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (buffer_verified(bh))
+ if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
return;
ext4_lock_group(sb, block_group);
@@ -447,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
unlock_buffer(bh);
if (err)
ext4_error(sb, "Checksum bad for grp %u", block_group);
- return bh;
+ goto verify;
}
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
@@ -641,8 +640,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
* fail EDQUOT for metdata, but we do account for it.
*/
if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
dquot_alloc_block_nofail(inode,
EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index b610779a958c..4a606afb171f 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -8,7 +8,6 @@
*/
#include <linux/buffer_head.h>
-#include <linux/jbd2.h>
#include "ext4.h"
unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 41eb9dcfac7e..3522340c7a99 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,7 +16,6 @@
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
-#include <linux/mutex.h>
#include <linux/slab.h>
#include "ext4.h"
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..45731558138c
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,475 @@
+/*
+ * linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption functions for ext4
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+
+#include "ext4_extents.h"
+#include "xattr.h"
+
+/* Encryption added and removed here! (L: */
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *ext4_bounce_page_pool;
+
+static LIST_HEAD(ext4_free_crypto_ctxs);
+static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
+
+static struct kmem_cache *ext4_crypto_ctx_cachep;
+struct kmem_cache *ext4_crypt_info_cachep;
+
+/**
+ * ext4_release_crypto_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page)
+ mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ ctx->w.control_page = NULL;
+ if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(ext4_crypto_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+ list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+ spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+ }
+}
+
+/**
+ * ext4_get_crypto_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
+{
+ struct ext4_crypto_ctx *ctx = NULL;
+ int res = 0;
+ unsigned long flags;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
+ struct ext4_crypto_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+ if (!ctx) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~EXT4_WRITE_PATH_FL;
+
+out:
+ if (res) {
+ if (!IS_ERR_OR_NULL(ctx))
+ ext4_release_crypto_ctx(ctx);
+ ctx = ERR_PTR(res);
+ }
+ return ctx;
+}
+
+struct workqueue_struct *ext4_read_workqueue;
+static DEFINE_MUTEX(crypto_init);
+
+/**
+ * ext4_exit_crypto() - Shutdown the ext4 encryption system
+ */
+void ext4_exit_crypto(void)
+{
+ struct ext4_crypto_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list)
+ kmem_cache_free(ext4_crypto_ctx_cachep, pos);
+ INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
+ if (ext4_bounce_page_pool)
+ mempool_destroy(ext4_bounce_page_pool);
+ ext4_bounce_page_pool = NULL;
+ if (ext4_read_workqueue)
+ destroy_workqueue(ext4_read_workqueue);
+ ext4_read_workqueue = NULL;
+ if (ext4_crypto_ctx_cachep)
+ kmem_cache_destroy(ext4_crypto_ctx_cachep);
+ ext4_crypto_ctx_cachep = NULL;
+ if (ext4_crypt_info_cachep)
+ kmem_cache_destroy(ext4_crypt_info_cachep);
+ ext4_crypt_info_cachep = NULL;
+}
+
+/**
+ * ext4_init_crypto() - Set up for ext4 encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_init_crypto(void)
+{
+ int i, res = -ENOMEM;
+
+ mutex_lock(&crypto_init);
+ if (ext4_read_workqueue)
+ goto already_initialized;
+ ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
+ if (!ext4_read_workqueue)
+ goto fail;
+
+ ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ext4_crypto_ctx_cachep)
+ goto fail;
+
+ ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!ext4_crypt_info_cachep)
+ goto fail;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct ext4_crypto_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+ if (!ctx) {
+ res = -ENOMEM;
+ goto fail;
+ }
+ list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
+ }
+
+ ext4_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!ext4_bounce_page_pool) {
+ res = -ENOMEM;
+ goto fail;
+ }
+already_initialized:
+ mutex_unlock(&crypto_init);
+ return 0;
+fail:
+ ext4_exit_crypto();
+ mutex_unlock(&crypto_init);
+ return res;
+}
+
+void ext4_restore_control_page(struct page *data_page)
+{
+ struct ext4_crypto_ctx *ctx =
+ (struct ext4_crypto_ctx *)page_private(data_page);
+
+ set_page_private(data_page, (unsigned long)NULL);
+ ClearPagePrivate(data_page);
+ unlock_page(data_page);
+ ext4_release_crypto_ctx(ctx);
+}
+
+/**
+ * ext4_crypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void ext4_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+typedef enum {
+ EXT4_DECRYPT = 0,
+ EXT4_ENCRYPT,
+} ext4_direction_t;
+
+static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
+ struct inode *inode,
+ ext4_direction_t rw,
+ pgoff_t index,
+ struct page *src_page,
+ struct page *dest_page)
+
+{
+ u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_crypt_complete, &ecr);
+
+ BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
+ memcpy(xts_tweak, &index, sizeof(index));
+ memset(&xts_tweak[sizeof(index)], 0,
+ EXT4_XTS_TWEAK_SIZE - sizeof(index));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+ ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ xts_tweak);
+ if (rw == EXT4_DECRYPT)
+ res = crypto_ablkcipher_decrypt(req);
+ else
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(
+ KERN_ERR
+ "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
+{
+ ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= EXT4_WRITE_PATH_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * ext4_encrypt() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path. The caller must call
+ * ext4_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *ext4_encrypt(struct inode *inode,
+ struct page *plaintext_page)
+{
+ struct ext4_crypto_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ int err;
+
+ BUG_ON(!PageLocked(plaintext_page));
+
+ ctx = ext4_get_crypto_ctx(inode);
+ if (IS_ERR(ctx))
+ return (struct page *) ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = alloc_bounce_page(ctx);
+ if (IS_ERR(ciphertext_page))
+ goto errout;
+ ctx->w.control_page = plaintext_page;
+ err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+ plaintext_page, ciphertext_page);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ errout:
+ ext4_release_crypto_ctx(ctx);
+ return ciphertext_page;
+ }
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+}
+
+/**
+ * ext4_decrypt() - Decrypts a page in-place
+ * @ctx: The encryption context.
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ return ext4_page_crypto(ctx, page->mapping->host,
+ EXT4_DECRYPT, page->index, page, page);
+}
+
+/*
+ * Convenience function which takes care of allocating and
+ * deallocating the encryption context
+ */
+int ext4_decrypt_one(struct inode *inode, struct page *page)
+{
+ int ret;
+
+ struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ ret = ext4_decrypt(ctx, page);
+ ext4_release_crypto_ctx(ctx);
+ return ret;
+}
+
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+{
+ struct ext4_crypto_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ ext4_lblk_t lblk = ex->ee_block;
+ ext4_fsblk_t pblk = ext4_ext_pblock(ex);
+ unsigned int len = ext4_ext_get_actual_len(ex);
+ int err = 0;
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+
+ ctx = ext4_get_crypto_ctx(inode);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = alloc_bounce_page(ctx);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector = pblk;
+ err = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (err) {
+ bio_put(bio);
+ goto errout;
+ }
+ err = submit_bio_wait(WRITE, bio);
+ bio_put(bio);
+ if (err)
+ goto errout;
+ }
+ err = 0;
+errout:
+ ext4_release_crypto_ctx(ctx);
+ return err;
+}
+
+bool ext4_valid_contents_enc_mode(uint32_t mode)
+{
+ return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+/**
+ * ext4_validate_encryption_key_size() - Validate the encryption key size
+ * @mode: The key mode.
+ * @size: The key size to validate.
+ *
+ * Return: The validated key size for @mode. Zero if invalid.
+ */
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
+{
+ if (size == ext4_encryption_key_size(mode))
+ return size;
+ return 0;
+}
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
new file mode 100644
index 000000000000..7dc4eb55913c
--- /dev/null
+++ b/fs/ext4/crypto_fname.c
@@ -0,0 +1,469 @@
+/*
+ * linux/fs/ext4/crypto_fname.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains functions for filename crypto management in ext4
+ *
+ * Written by Uday Savagaonkar, 2014.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+
+#include "ext4.h"
+#include "ext4_crypto.h"
+#include "xattr.h"
+
+/**
+ * ext4_dir_crypt_complete() -
+ */
+static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+bool ext4_valid_filenames_enc_mode(uint32_t mode)
+{
+ return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static unsigned max_name_len(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+ EXT4_NAME_LEN;
+}
+
+/**
+ * ext4_fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers. We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int ext4_fname_encrypt(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname)
+{
+ u32 ciphertext_len;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[EXT4_CRYPTO_BLOCK_SIZE];
+ struct scatterlist src_sg, dst_sg;
+ int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+ char *workbuf, buf[32], *alloc_buf = NULL;
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
+ EXT4_CRYPTO_BLOCK_SIZE : iname->len;
+ ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding);
+ ciphertext_len = (ciphertext_len > lim)
+ ? lim : ciphertext_len;
+
+ if (ciphertext_len <= sizeof(buf)) {
+ workbuf = buf;
+ } else {
+ alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+ if (!alloc_buf)
+ return -ENOMEM;
+ workbuf = alloc_buf;
+ }
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(
+ KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+ kfree(alloc_buf);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_dir_crypt_complete, &ecr);
+
+ /* Copy the input */
+ memcpy(workbuf, iname->name, iname->len);
+ if (iname->len < ciphertext_len)
+ memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+ /* Initialize IV */
+ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+ /* Create encryption request */
+ sg_init_one(&src_sg, workbuf, ciphertext_len);
+ sg_init_one(&dst_sg, oname->name, ciphertext_len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ kfree(alloc_buf);
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(
+ KERN_ERR "%s: Error (error code %d)\n", __func__, res);
+ }
+ oname->len = ciphertext_len;
+ return res;
+}
+
+/*
+ * ext4_fname_decrypt()
+ * This function decrypts the input filename, and returns
+ * the length of the plaintext.
+ * Errors are returned as negative numbers.
+ * We trust the caller to allocate sufficient memory to oname string.
+ */
+static int ext4_fname_decrypt(struct inode *inode,
+ const struct ext4_str *iname,
+ struct ext4_str *oname)
+{
+ struct ext4_str tmp_in[2], tmp_out[1];
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[EXT4_CRYPTO_BLOCK_SIZE];
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ tmp_in[0].name = iname->name;
+ tmp_in[0].len = iname->len;
+ tmp_out[0].name = oname->name;
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(
+ KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ ext4_dir_crypt_complete, &ecr);
+
+ /* Initialize IV */
+ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
+
+ /* Create encryption request */
+ sg_init_one(&src_sg, iname->name, iname->len);
+ sg_init_one(&dst_sg, oname->name, oname->len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_ablkcipher_decrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(
+ KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
+ __func__, res);
+ return res;
+ }
+
+ oname->len = strnlen(oname->name, iname->len);
+ return oname->len;
+}
+
+static const char *lookup_table =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * ext4_fname_encode_digest() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ char *cp = dst;
+
+ while (i < len) {
+ ac += (((unsigned char) src[i]) << bits);
+ bits += 8;
+ do {
+ *cp++ = lookup_table[ac & 0x3f];
+ ac >>= 6;
+ bits -= 6;
+ } while (bits >= 6);
+ i++;
+ }
+ if (bits)
+ *cp++ = lookup_table[ac & 0x3f];
+ return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ const char *p;
+ char *cp = dst;
+
+ while (i < len) {
+ p = strchr(lookup_table, src[i]);
+ if (p == NULL || src[i] == 0)
+ return -2;
+ ac += (p - lookup_table) << bits;
+ bits += 6;
+ if (bits >= 8) {
+ *cp++ = ac & 0xff;
+ ac >>= 8;
+ bits -= 8;
+ }
+ i++;
+ }
+ if (ac)
+ return -1;
+ return cp - dst;
+}
+
+/**
+ * ext4_fname_crypto_round_up() -
+ *
+ * Return: The next multiple of block size
+ */
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
+{
+ return ((size+blksize-1)/blksize)*blksize;
+}
+
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen)
+{
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+ int padding = 32;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK);
+ if (ilen < EXT4_CRYPTO_BLOCK_SIZE)
+ ilen = EXT4_CRYPTO_BLOCK_SIZE;
+ return ext4_fname_crypto_round_up(ilen, padding);
+}
+
+/*
+ * ext4_fname_crypto_alloc_buffer() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
+ u32 ilen, struct ext4_str *crypto_str)
+{
+ unsigned int olen = ext4_fname_encrypted_size(inode, ilen);
+
+ crypto_str->len = olen;
+ if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
+ olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
+ /* Allocated buffer can hold one more character to null-terminate the
+ * string */
+ crypto_str->name = kmalloc(olen+1, GFP_NOFS);
+ if (!(crypto_str->name))
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * ext4_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
+{
+ if (!crypto_str)
+ return;
+ kfree(crypto_str->name);
+ crypto_str->name = NULL;
+}
+
+/**
+ * ext4_fname_disk_to_usr() - converts a filename from disk space to user space
+ */
+int _ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_str *iname,
+ struct ext4_str *oname)
+{
+ char buf[24];
+ int ret;
+
+ if (iname->len < 3) {
+ /*Check for . and .. */
+ if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
+ oname->name[0] = '.';
+ oname->name[iname->len-1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+ }
+ if (EXT4_I(inode)->i_crypt_info)
+ return ext4_fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
+ ret = digest_encode(iname->name, iname->len, oname->name);
+ oname->len = ret;
+ return ret;
+ }
+ if (hinfo) {
+ memcpy(buf, &hinfo->hash, 4);
+ memcpy(buf+4, &hinfo->minor_hash, 4);
+ } else
+ memset(buf, 0, 8);
+ memcpy(buf + 8, iname->name + iname->len - 16, 16);
+ oname->name[0] = '_';
+ ret = digest_encode(buf, 24, oname->name+1);
+ oname->len = ret + 1;
+ return ret + 1;
+}
+
+int ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_dir_entry_2 *de,
+ struct ext4_str *oname)
+{
+ struct ext4_str iname = {.name = (unsigned char *) de->name,
+ .len = de->name_len };
+
+ return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname);
+}
+
+
+/**
+ * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
+ */
+int ext4_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname)
+{
+ int res;
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+ if (iname->len < 3) {
+ /*Check for . and .. */
+ if (iname->name[0] == '.' &&
+ iname->name[iname->len-1] == '.') {
+ oname->name[0] = '.';
+ oname->name[iname->len-1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+ }
+ if (ci) {
+ res = ext4_fname_encrypt(inode, iname, oname);
+ return res;
+ }
+ /* Without a proper key, a user is not allowed to modify the filenames
+ * in a directory. Consequently, a user space name cannot be mapped to
+ * a disk-space name */
+ return -EACCES;
+}
+
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ struct ext4_crypt_info *ci;
+ int ret = 0, bigname = 0;
+
+ memset(fname, 0, sizeof(struct ext4_filename));
+ fname->usr_fname = iname;
+
+ if (!ext4_encrypted_inode(dir) ||
+ ((iname->name[0] == '.') &&
+ ((iname->len == 1) ||
+ ((iname->name[1] == '.') && (iname->len == 2))))) {
+ fname->disk_name.name = (unsigned char *) iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+ }
+ ret = ext4_get_encryption_info(dir);
+ if (ret)
+ return ret;
+ ci = EXT4_I(dir)->i_crypt_info;
+ if (ci) {
+ ret = ext4_fname_crypto_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
+ if (ret < 0)
+ return ret;
+ ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf);
+ if (ret < 0)
+ goto errout;
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ return 0;
+ }
+ if (!lookup)
+ return -EACCES;
+
+ /* We don't have the key and we are doing a lookup; decode the
+ * user-supplied name
+ */
+ if (iname->name[0] == '_')
+ bigname = 1;
+ if ((bigname && (iname->len != 33)) ||
+ (!bigname && (iname->len > 43)))
+ return -ENOENT;
+
+ fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ if (fname->crypto_buf.name == NULL)
+ return -ENOMEM;
+ ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ fname->crypto_buf.name);
+ if (ret < 0) {
+ ret = -ENOENT;
+ goto errout;
+ }
+ fname->crypto_buf.len = ret;
+ if (bigname) {
+ memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4);
+ memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4);
+ } else {
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ }
+ return 0;
+errout:
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ return ret;
+}
+
+void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
new file mode 100644
index 000000000000..442d24e8efc0
--- /dev/null
+++ b/fs/ext4/crypto_key.c
@@ -0,0 +1,260 @@
+/*
+ * linux/fs/ext4/crypto_key.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions for ext4
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+
+#include "ext4.h"
+#include "xattr.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct ext4_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * ext4_derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivatio.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
+ char source_key[EXT4_AES_256_XTS_KEY_SIZE],
+ char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_EXT4_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+ 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_ablkcipher_setkey(tfm, deriving_key,
+ EXT4_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+ sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ EXT4_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+
+out:
+ if (req)
+ ablkcipher_request_free(req);
+ if (tfm)
+ crypto_free_ablkcipher(tfm);
+ return res;
+}
+
+void ext4_free_crypt_info(struct ext4_crypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ if (ci->ci_keyring_key)
+ key_put(ci->ci_keyring_key);
+ crypto_free_ablkcipher(ci->ci_ctfm);
+ kmem_cache_free(ext4_crypt_info_cachep, ci);
+}
+
+void ext4_free_encryption_info(struct inode *inode,
+ struct ext4_crypt_info *ci)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_crypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(ei->i_crypt_info);
+ if (ci == NULL)
+ return;
+ prev = cmpxchg(&ei->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ ext4_free_crypt_info(ci);
+}
+
+int _ext4_get_encryption_info(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_crypt_info *crypt_info;
+ char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+ (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
+ struct key *keyring_key = NULL;
+ struct ext4_encryption_key *master_key;
+ struct ext4_encryption_context ctx;
+ struct user_key_payload *ukp;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct crypto_ablkcipher *ctfm;
+ const char *cipher_str;
+ char raw_key[EXT4_MAX_KEY_SIZE];
+ char mode;
+ int res;
+
+ if (!ext4_read_workqueue) {
+ res = ext4_init_crypto();
+ if (res)
+ return res;
+ }
+
+retry:
+ crypt_info = ACCESS_ONCE(ei->i_crypt_info);
+ if (crypt_info) {
+ if (!crypt_info->ci_keyring_key ||
+ key_validate(crypt_info->ci_keyring_key) == 0)
+ return 0;
+ ext4_free_encryption_info(inode, crypt_info);
+ goto retry;
+ }
+
+ res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!DUMMY_ENCRYPTION_ENABLED(sbi))
+ return res;
+ ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode =
+ EXT4_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ } else if (res != sizeof(ctx))
+ return -EINVAL;
+ res = 0;
+
+ crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_keyring_key = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+ if (S_ISREG(inode->i_mode))
+ mode = crypt_info->ci_data_mode;
+ else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ mode = crypt_info->ci_filename_mode;
+ else
+ BUG();
+ switch (mode) {
+ case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+ cipher_str = "xts(aes)";
+ break;
+ case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+ cipher_str = "cts(cbc(aes))";
+ break;
+ default:
+ printk_once(KERN_WARNING
+ "ext4: unsupported key mode %d (ino %u)\n",
+ mode, (unsigned) inode->i_ino);
+ res = -ENOKEY;
+ goto out;
+ }
+ if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
+ memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
+ goto got_key;
+ }
+ memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
+ EXT4_KEY_DESC_PREFIX_SIZE);
+ sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
+ "%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
+ ctx.master_key_descriptor);
+ full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
+ (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ if (IS_ERR(keyring_key)) {
+ res = PTR_ERR(keyring_key);
+ keyring_key = NULL;
+ goto out;
+ }
+ crypt_info->ci_keyring_key = keyring_key;
+ BUG_ON(keyring_key->type != &key_type_logon);
+ ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
+ res = -EINVAL;
+ goto out;
+ }
+ master_key = (struct ext4_encryption_key *)ukp->data;
+ BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
+ EXT4_KEY_DERIVATION_NONCE_SIZE);
+ BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+ res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
+ raw_key);
+got_key:
+ ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_ablkcipher_clear_flags(ctfm, ~0);
+ crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+ CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_ablkcipher_setkey(ctfm, raw_key,
+ ext4_encryption_key_size(mode));
+ if (res)
+ goto out;
+ memzero_explicit(raw_key, sizeof(raw_key));
+ if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) {
+ ext4_free_crypt_info(crypt_info);
+ goto retry;
+ }
+ return 0;
+
+out:
+ if (res == -ENOKEY)
+ res = 0;
+ ext4_free_crypt_info(crypt_info);
+ memzero_explicit(raw_key, sizeof(raw_key));
+ return res;
+}
+
+int ext4_has_encryption_key(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ return (ei->i_crypt_info != NULL);
+}
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
new file mode 100644
index 000000000000..02c4e5df7afb
--- /dev/null
+++ b/fs/ext4/crypto_policy.c
@@ -0,0 +1,215 @@
+/*
+ * linux/fs/ext4/crypto_policy.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption policy functions for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "ext4.h"
+#include "xattr.h"
+
+static int ext4_inode_has_encryption_context(struct inode *inode)
+{
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
+ return (res > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int ext4_is_encryption_context_consistent_with_policy(
+ struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx));
+ if (res != sizeof(ctx))
+ return 0;
+ return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx.flags ==
+ policy->flags) &&
+ (ctx.contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx.filenames_encryption_mode ==
+ policy->filenames_encryption_mode));
+}
+
+static int ext4_create_encryption_context_from_policy(
+ struct inode *inode, const struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+ int res = 0;
+
+ res = ext4_convert_inline_data(inode);
+ if (res)
+ return res;
+
+ ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid contents encryption mode %d\n", __func__,
+ policy->contents_encryption_mode);
+ return -EINVAL;
+ }
+ if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid filenames encryption mode %d\n", __func__,
+ policy->filenames_encryption_mode);
+ return -EINVAL;
+ }
+ if (policy->flags & ~EXT4_POLICY_FLAGS_VALID)
+ return -EINVAL;
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+
+ res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), 0);
+ if (!res)
+ ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
+ return res;
+}
+
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+ struct inode *inode)
+{
+ if (policy->version != 0)
+ return -EINVAL;
+
+ if (!ext4_inode_has_encryption_context(inode)) {
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
+ if (!ext4_empty_dir(inode))
+ return -ENOTEMPTY;
+ return ext4_create_encryption_context_from_policy(inode,
+ policy);
+ }
+
+ if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
+ return 0;
+
+ printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ return -EINVAL;
+}
+
+int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
+{
+ struct ext4_encryption_context ctx;
+
+ int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return -ENOENT;
+ if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+ policy->version = 0;
+ policy->contents_encryption_mode = ctx.contents_encryption_mode;
+ policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy->flags = ctx.flags;
+ memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ return 0;
+}
+
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+ struct inode *child)
+{
+ struct ext4_crypt_info *parent_ci, *child_ci;
+ int res;
+
+ if ((parent == NULL) || (child == NULL)) {
+ pr_err("parent %p child %p\n", parent, child);
+ BUG_ON(1);
+ }
+ /* no restrictions if the parent directory is not encrypted */
+ if (!ext4_encrypted_inode(parent))
+ return 1;
+ /* if the child directory is not encrypted, this is always a problem */
+ if (!ext4_encrypted_inode(child))
+ return 0;
+ res = ext4_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = ext4_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = EXT4_I(parent)->i_crypt_info;
+ child_ci = EXT4_I(child)->i_crypt_info;
+ if (!parent_ci && !child_ci)
+ return 1;
+ if (!parent_ci || !child_ci)
+ return 0;
+
+ return (memcmp(parent_ci->ci_master_key,
+ child_ci->ci_master_key,
+ EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags));
+}
+
+/**
+ * ext4_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int ext4_inherit_context(struct inode *parent, struct inode *child)
+{
+ struct ext4_encryption_context ctx;
+ struct ext4_crypt_info *ci;
+ int res;
+
+ res = ext4_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+ ci = EXT4_I(parent)->i_crypt_info;
+ if (ci == NULL)
+ return -ENOKEY;
+
+ ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
+ if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
+ ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode =
+ EXT4_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ memset(ctx.master_key_descriptor, 0x42,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ res = 0;
+ } else {
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ EXT4_KEY_DESCRIPTOR_SIZE);
+ }
+ get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
+ res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
+ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), 0);
+ if (!res) {
+ ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
+ ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA);
+ res = ext4_get_encryption_info(child);
+ }
+ return res;
+}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c24143ea9c08..f9e14911918c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -22,10 +22,8 @@
*/
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
-#include <linux/rbtree.h>
#include "ext4.h"
#include "xattr.h"
@@ -110,7 +108,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int err;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh = NULL;
int dir_has_error = 0;
+ struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
@@ -127,17 +127,23 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
- int ret = ext4_read_inline_dir(file, ctx,
+ err = ext4_read_inline_dir(file, ctx,
&has_inline_data);
if (has_inline_data)
- return ret;
+ return err;
+ }
+
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN,
+ &fname_crypto_str);
+ if (err < 0)
+ return err;
}
offset = ctx->pos & (sb->s_blocksize - 1);
while (ctx->pos < inode->i_size) {
struct ext4_map_blocks map;
- struct buffer_head *bh = NULL;
map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
map.m_len = 1;
@@ -180,6 +186,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
(unsigned long long)ctx->pos);
ctx->pos += sb->s_blocksize - offset;
brelse(bh);
+ bh = NULL;
continue;
}
set_buffer_verified(bh);
@@ -226,25 +233,45 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
- if (!dir_emit(ctx, de->name,
- de->name_len,
- le32_to_cpu(de->inode),
- get_dtype(sb, de->file_type))) {
- brelse(bh);
- return 0;
+ if (!ext4_encrypted_inode(inode)) {
+ if (!dir_emit(ctx, de->name,
+ de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto done;
+ } else {
+ int save_len = fname_crypto_str.len;
+
+ /* Directory is encrypted */
+ err = ext4_fname_disk_to_usr(inode,
+ NULL, de, &fname_crypto_str);
+ fname_crypto_str.len = save_len;
+ if (err < 0)
+ goto errout;
+ if (!dir_emit(ctx,
+ fname_crypto_str.name, err,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto done;
}
}
ctx->pos += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
}
- offset = 0;
+ if ((ctx->pos < inode->i_size) && !dir_relax(inode))
+ goto done;
brelse(bh);
- if (ctx->pos < inode->i_size) {
- if (!dir_relax(inode))
- return 0;
- }
+ bh = NULL;
+ offset = 0;
}
- return 0;
+done:
+ err = 0;
+errout:
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
+ brelse(bh);
+ return err;
}
static inline int is_32bit_api(void)
@@ -384,10 +411,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p)
/*
* Given a directory entry, enter it into the fname rb tree.
+ *
+ * When filename encryption is enabled, the dirent will hold the
+ * encrypted filename, while the htree will hold decrypted filename.
+ * The decrypted filename is passed in via ent_name. parameter.
*/
int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
- struct ext4_dir_entry_2 *dirent)
+ struct ext4_dir_entry_2 *dirent,
+ struct ext4_str *ent_name)
{
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
@@ -398,17 +430,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + dirent->name_len + 1;
+ len = sizeof(struct fname) + ent_name->len + 1;
new_fn = kzalloc(len, GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
new_fn->minor_hash = minor_hash;
new_fn->inode = le32_to_cpu(dirent->inode);
- new_fn->name_len = dirent->name_len;
+ new_fn->name_len = ent_name->len;
new_fn->file_type = dirent->file_type;
- memcpy(new_fn->name, dirent->name, dirent->name_len);
- new_fn->name[dirent->name_len] = 0;
+ memcpy(new_fn->name, ent_name->name, ent_name->len);
+ new_fn->name[ent_name->len] = 0;
while (*p) {
parent = *p;
@@ -561,6 +593,13 @@ finished:
return 0;
}
+static int ext4_dir_open(struct inode * inode, struct file * filp)
+{
+ if (ext4_encrypted_inode(inode))
+ return ext4_get_encryption_info(inode) ? -EACCES : 0;
+ return 0;
+}
+
static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
@@ -603,5 +642,6 @@ const struct file_operations ext4_dir_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
+ .open = ext4_dir_open,
.release = ext4_release_dir,
};
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f63c3d5805c4..f5e9f04220c1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -69,15 +69,6 @@
#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-#define EXT4_ERROR_INODE(inode, fmt, a...) \
- ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
-
-#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
- ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
-
-#define EXT4_ERROR_FILE(file, block, fmt, a...) \
- ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
-
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -90,6 +81,11 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
+enum SHIFT_DIRECTION {
+ SHIFT_LEFT = 0,
+ SHIFT_RIGHT,
+};
+
/*
* Flags used in mballoc's allocation_context flags field.
*
@@ -422,7 +418,7 @@ enum {
EXT4_INODE_DIRTY = 8,
EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
EXT4_INODE_NOCOMPR = 10, /* Don't compress */
- EXT4_INODE_ENCRYPT = 11, /* Compression error */
+ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */
/* End compression flags --- maybe not all used */
EXT4_INODE_INDEX = 12, /* hash-indexed directory */
EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -582,6 +578,15 @@ enum {
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
+/* Encryption algorithms */
+#define EXT4_ENCRYPTION_MODE_INVALID 0
+#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
+#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
+#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
+#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4
+
+#include "ext4_crypto.h"
+
/*
* ioctl commands
*/
@@ -603,6 +608,9 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
+#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy)
+#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
+#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -939,6 +947,11 @@ struct ext4_inode_info {
/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
__u32 i_csum_seed;
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ /* Encryption params */
+ struct ext4_crypt_info *i_crypt_info;
+#endif
};
/*
@@ -1049,12 +1062,6 @@ extern void ext4_set_bits(void *bm, int cur, int len);
/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM 1
-/* Encryption algorithms */
-#define EXT4_ENCRYPTION_MODE_INVALID 0
-#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
-#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
-#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
-
/*
* Structure of the super block
*/
@@ -1142,7 +1149,8 @@ struct ext4_super_block {
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_checksum_type; /* metadata checksum algorithm used */
- __le16 s_reserved_pad;
+ __u8 s_encryption_level; /* versioning level for encryption */
+ __u8 s_reserved_pad; /* Padding to next 32bits */
__le64 s_kbytes_written; /* nr of lifetime kilobytes written */
__le32 s_snapshot_inum; /* Inode number of active snapshot */
__le32 s_snapshot_id; /* sequential ID of active snapshot */
@@ -1169,7 +1177,9 @@ struct ext4_super_block {
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
__le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
__u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
- __le32 s_reserved[105]; /* Padding to the end of the block */
+ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
+ __le32 s_lpf_ino; /* Location of the lost+found inode */
+ __le32 s_reserved[100]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1180,8 +1190,16 @@ struct ext4_super_block {
/*
* run-time mount flags
*/
-#define EXT4_MF_MNTDIR_SAMPLED 0x0001
-#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+#define EXT4_MF_MNTDIR_SAMPLED 0x0001
+#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
+ EXT4_MF_TEST_DUMMY_ENCRYPTION))
+#else
+#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
+#endif
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 2
@@ -1466,6 +1484,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
+/*
+ * Returns true if the inode is inode is encrypted
+ */
+static inline int ext4_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
+#else
+ return 0;
+#endif
+}
+
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@@ -1575,8 +1605,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_MMP | \
- EXT4_FEATURE_INCOMPAT_INLINE_DATA)
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+ EXT4_FEATURE_INCOMPAT_ENCRYPT)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1796,6 +1827,17 @@ struct dx_hash_info
*/
#define HASH_NB_ALWAYS 1
+struct ext4_filename {
+ const struct qstr *usr_fname;
+ struct ext4_str disk_name;
+ struct dx_hash_info hinfo;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_str crypto_buf;
+#endif
+};
+
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
/*
* Describe an inode's exact location on disk and in memory
@@ -2001,6 +2043,130 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+/* crypto_policy.c */
+int ext4_is_child_context_consistent_with_parent(struct inode *parent,
+ struct inode *child);
+int ext4_inherit_context(struct inode *parent, struct inode *child);
+void ext4_to_hex(char *dst, char *src, size_t src_size);
+int ext4_process_policy(const struct ext4_encryption_policy *policy,
+ struct inode *inode);
+int ext4_get_policy(struct inode *inode,
+ struct ext4_encryption_policy *policy);
+
+/* crypto.c */
+extern struct kmem_cache *ext4_crypt_info_cachep;
+bool ext4_valid_contents_enc_mode(uint32_t mode);
+uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
+extern struct workqueue_struct *ext4_read_workqueue;
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
+void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
+void ext4_restore_control_page(struct page *data_page);
+struct page *ext4_encrypt(struct inode *inode,
+ struct page *plaintext_page);
+int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
+int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_init_crypto(void);
+void ext4_exit_crypto(void);
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+ return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+}
+#else
+static inline int ext4_init_crypto(void) { return 0; }
+static inline void ext4_exit_crypto(void) { }
+static inline int ext4_sb_has_crypto(struct super_block *sb)
+{
+ return 0;
+}
+#endif
+
+/* crypto_fname.c */
+bool ext4_valid_filenames_enc_mode(uint32_t mode);
+u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
+unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen);
+int ext4_fname_crypto_alloc_buffer(struct inode *inode,
+ u32 ilen, struct ext4_str *crypto_str);
+int _ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_str *iname,
+ struct ext4_str *oname);
+int ext4_fname_disk_to_usr(struct inode *inode,
+ struct dx_hash_info *hinfo,
+ const struct ext4_dir_entry_2 *de,
+ struct ext4_str *oname);
+int ext4_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct ext4_str *oname);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
+int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct ext4_filename *fname);
+void ext4_fname_free_filename(struct ext4_filename *fname);
+#else
+static inline
+int ext4_setup_fname_crypto(struct inode *inode)
+{
+ return 0;
+}
+static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
+static inline int ext4_fname_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct ext4_filename *fname)
+{
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *) iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+#endif
+
+
+/* crypto_key.c */
+void ext4_free_crypt_info(struct ext4_crypt_info *ci);
+void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci);
+int _ext4_get_encryption_info(struct inode *inode);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+int ext4_has_encryption_key(struct inode *inode);
+
+static inline int ext4_get_encryption_info(struct inode *inode)
+{
+ struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
+
+ if (!ci ||
+ (ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD)))))
+ return _ext4_get_encryption_info(inode);
+ return 0;
+}
+
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return EXT4_I(inode)->i_crypt_info;
+}
+
+#else
+static inline int ext4_has_encryption_key(struct inode *inode)
+{
+ return 0;
+}
+static inline int ext4_get_encryption_info(struct inode *inode)
+{
+ return 0;
+}
+static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode)
+{
+ return NULL;
+}
+#endif
+
+
/* dir.c */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct file *,
@@ -2011,18 +2177,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
(de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
- __u32 minor_hash,
- struct ext4_dir_entry_2 *dirent);
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent,
+ struct ext4_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
void *buf, int buf_size,
- const char *name, int namelen,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
-void ext4_insert_dentry(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- const char *name, int namelen);
+int ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
@@ -2099,6 +2267,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
/* inode.c */
+int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
@@ -2152,8 +2321,8 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset);
+extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -2175,13 +2344,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
-extern int search_dir(struct buffer_head *bh,
- char *search_buf,
- int buf_size,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 **res_dir);
+extern int ext4_search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ struct ext4_filename *fname,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
extern int ext4_generic_delete_entry(handle_t *handle,
struct inode *dir,
struct ext4_dir_entry_2 *de_del,
@@ -2189,6 +2359,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
void *entry_buf,
int buf_size,
int csum_size);
+extern int ext4_empty_dir(struct inode *inode);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -2225,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int,
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
+extern __printf(4, 5)
+void __ext4_warning_inode(const struct inode *inode, const char *function,
+ unsigned int line, const char *fmt, ...);
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
@@ -2235,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int,
unsigned long, ext4_fsblk_t,
const char *, ...);
+#define EXT4_ERROR_INODE(inode, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
+
+#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
+ ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
+
+#define EXT4_ERROR_FILE(file, block, fmt, a...) \
+ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
+
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
@@ -2247,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int,
__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning_inode(inode, fmt, ...) \
+ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...) \
__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg) \
@@ -2282,6 +2467,11 @@ do { \
no_printk(fmt, ##__VA_ARGS__); \
__ext4_warning(sb, "", 0, " "); \
} while (0)
+#define ext4_warning_inode(inode, fmt, ...) \
+do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __ext4_warning_inode(inode, "", 0, " "); \
+} while (0)
#define ext4_msg(sb, level, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
@@ -2593,7 +2783,6 @@ extern const struct file_operations ext4_dir_operations;
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
-extern const struct file_operations ext4_dax_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
@@ -2626,7 +2815,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
unsigned len, unsigned copied,
struct page *page);
-extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+extern int ext4_try_add_inline_entry(handle_t *handle,
+ struct ext4_filename *fname,
+ struct dentry *dentry,
struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
@@ -2640,6 +2831,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file,
__u32 start_hash, __u32 start_minor_hash,
int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data);
@@ -2699,8 +2891,13 @@ static inline void ext4_set_de_type(struct super_block *sb,
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
+/* readpages.c */
+extern int ext4_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages);
/* symlink.c */
+extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
@@ -2743,7 +2940,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
@@ -2767,6 +2963,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
struct inode *inode2, ext4_lblk_t lblk1,
ext4_lblk_t lblk2, ext4_lblk_t count,
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
new file mode 100644
index 000000000000..ac7d4e813796
--- /dev/null
+++ b/fs/ext4/ext4_crypto.h
@@ -0,0 +1,159 @@
+/*
+ * linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for ext4
+ *
+ * Written by Michael Halcrow, 2015.
+ */
+
+#ifndef _EXT4_CRYPTO_H
+#define _EXT4_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define EXT4_KEY_DESCRIPTOR_SIZE 8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct ext4_encryption_policy {
+ char version;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
+#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
+
+#define EXT4_POLICY_FLAGS_PAD_4 0x00
+#define EXT4_POLICY_FLAGS_PAD_8 0x01
+#define EXT4_POLICY_FLAGS_PAD_16 0x02
+#define EXT4_POLICY_FLAGS_PAD_32 0x03
+#define EXT4_POLICY_FLAGS_PAD_MASK 0x03
+#define EXT4_POLICY_FLAGS_VALID 0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Reserved
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct ext4_encryption_context {
+ char format;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
+ char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define EXT4_XTS_TWEAK_SIZE 16
+#define EXT4_AES_128_ECB_KEY_SIZE 16
+#define EXT4_AES_256_GCM_KEY_SIZE 32
+#define EXT4_AES_256_CBC_KEY_SIZE 32
+#define EXT4_AES_256_CTS_KEY_SIZE 32
+#define EXT4_AES_256_XTS_KEY_SIZE 64
+#define EXT4_MAX_KEY_SIZE 64
+
+#define EXT4_KEY_DESC_PREFIX "ext4:"
+#define EXT4_KEY_DESC_PREFIX_SIZE 5
+
+/* This is passed in from userspace into the kernel keyring */
+struct ext4_encryption_key {
+ __u32 mode;
+ char raw[EXT4_MAX_KEY_SIZE];
+ __u32 size;
+} __attribute__((__packed__));
+
+struct ext4_crypt_info {
+ char ci_data_mode;
+ char ci_filename_mode;
+ char ci_flags;
+ struct crypto_ablkcipher *ci_ctfm;
+ struct key *ci_keyring_key;
+ char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
+};
+
+#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define EXT4_WRITE_PATH_FL 0x00000002
+
+struct ext4_crypto_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ char flags; /* Flags */
+ char mode; /* Encryption mode for tfm */
+};
+
+struct ext4_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
+ struct ext4_completion_result ecr = { \
+ COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int ext4_encryption_key_size(int mode)
+{
+ switch (mode) {
+ case EXT4_ENCRYPTION_MODE_AES_256_XTS:
+ return EXT4_AES_256_XTS_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_GCM:
+ return EXT4_AES_256_GCM_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_CBC:
+ return EXT4_AES_256_CBC_KEY_SIZE;
+ case EXT4_ENCRYPTION_MODE_AES_256_CTS:
+ return EXT4_AES_256_CTS_KEY_SIZE;
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4
+#define EXT4_CRYPTO_BLOCK_SIZE 16
+#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32
+
+struct ext4_str {
+ unsigned char *name;
+ u32 len;
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct ext4_encrypted_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+ if (l < EXT4_CRYPTO_BLOCK_SIZE)
+ l = EXT4_CRYPTO_BLOCK_SIZE;
+ return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
+}
+
+#endif /* _EXT4_CRYPTO_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3445035c7e01..d41843181818 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -87,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
ext4_put_nojournal(handle);
return 0;
}
+
+ if (!handle->h_transaction) {
+ err = jbd2_journal_stop(handle);
+ return handle->h_err ? handle->h_err : err;
+ }
+
sb = handle->h_transaction->t_journal->j_private;
err = handle->h_err;
rc = jbd2_journal_stop(handle);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bed43081720f..2553aa8b608d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
@@ -377,7 +378,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
ext4_lblk_t last = lblock + len - 1;
- if (lblock > last)
+ if (len == 0 || lblock > last)
return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -503,7 +504,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
struct buffer_head *bh;
int err;
- bh = sb_getblk(inode->i_sb, pblk);
+ bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -1088,7 +1089,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = -EIO;
goto cleanup;
}
- bh = sb_getblk(inode->i_sb, newblock);
+ bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh)) {
err = -ENOMEM;
goto cleanup;
@@ -1282,7 +1283,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
if (newblock == 0)
return err;
- bh = sb_getblk(inode->i_sb, newblock);
+ bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh))
return -ENOMEM;
lock_buffer(bh);
@@ -1717,12 +1718,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
{
unsigned short ext1_ee_len, ext2_ee_len;
- /*
- * Make sure that both extents are initialized. We don't merge
- * unwritten extents so that we can be sure that end_io code has
- * the extent that was written properly split out and conversion to
- * initialized is trivial.
- */
if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;
@@ -3128,6 +3123,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
+ if (ext4_encrypted_inode(inode))
+ return ext4_encrypted_zeroout(inode, ex);
+
ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
if (ret > 0)
ret = 0;
@@ -4459,6 +4457,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.flags |= EXT4_MB_HINT_NOPREALLOC;
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ ar.flags |= EXT4_MB_USE_RESERVED;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
@@ -4535,19 +4535,7 @@ got_allocated_blocks:
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
- if (map_from_cluster) {
- if (reserved_clusters) {
- /*
- * We have clusters reserved for this range.
- * But since we are not doing actual allocation
- * and are simply using blocks from previously
- * allocated cluster, we should release the
- * reservation and not claim quota.
- */
- ext4_da_update_reserve_space(inode,
- reserved_clusters, 0);
- }
- } else {
+ if (!map_from_cluster) {
BUG_ON(allocated_clusters < reserved_clusters);
if (reserved_clusters < allocated_clusters) {
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4678,6 +4666,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int ret = 0;
int ret2 = 0;
int retries = 0;
+ int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos;
@@ -4692,13 +4681,32 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (len <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
+ /*
+ * We can only call ext_depth() on extent based inodes
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ depth = ext_depth(inode);
+ else
+ depth = -1;
retry:
while (ret >= 0 && len) {
+ /*
+ * Recalculate credits when extent tree depth changes.
+ */
+ if (depth >= 0 && depth != ext_depth(inode)) {
+ credits = ext4_chunk_trans_blocks(inode, len);
+ depth = ext_depth(inode);
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
credits);
if (IS_ERR(handle)) {
@@ -4740,6 +4748,8 @@ retry:
goto retry;
}
+ ext4_inode_resume_unlocked_dio(inode);
+
return ret > 0 ? ret2 : ret;
}
@@ -4803,12 +4813,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
else
max_blocks -= lblk;
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
- EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE;
- if (mode & FALLOC_FL_KEEP_SIZE)
- flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
-
mutex_lock(&inode->i_mutex);
/*
@@ -4825,15 +4829,28 @@ static long ext4_zero_range(struct file *file, loff_t offset,
ret = inode_newsize_ok(inode, new_size);
if (ret)
goto out_mutex;
- /*
- * If we have a partial block after EOF we have to allocate
- * the entire block.
- */
- if (partial_end)
- max_blocks += 1;
}
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+ /* Preallocate the range including the unaligned edges */
+ if (partial_begin || partial_end) {
+ ret = ext4_alloc_file_blocks(file,
+ round_down(offset, 1 << blkbits) >> blkbits,
+ (round_up((offset + len), 1 << blkbits) -
+ round_down(offset, 1 << blkbits)) >> blkbits,
+ new_size, flags, mode);
+ if (ret)
+ goto out_mutex;
+
+ }
+
+ /* Zero range excluding the unaligned edges */
if (max_blocks > 0) {
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
@@ -4847,19 +4864,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags, mode);
if (ret)
goto out_dio;
- /*
- * Remove entire range from the extent status tree.
- *
- * ext4_es_remove_extent(inode, lblk, max_blocks) is
- * NOT sufficient. I'm not sure why this is the case,
- * but let's be conservative and remove the extent
- * status tree for the entire inode. There should be
- * no outstanding delalloc extents thanks to the
- * filemap_write_and_wait_range() call above.
- */
- ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
- if (ret)
- goto out_dio;
}
if (!partial_begin && !partial_end)
goto out_dio;
@@ -4922,9 +4926,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ext4_lblk_t lblk;
unsigned int blkbits = inode->i_blkbits;
+ /*
+ * Encrypted inodes can't handle collapse range or insert
+ * range since we would need to re-encrypt blocks with a
+ * different IV or XTS tweak (which are based on the logical
+ * block number).
+ *
+ * XXX It's not clear why zero range isn't working, but we'll
+ * leave it disabled for encrypted inodes for now. This is a
+ * bug we should fix....
+ */
+ if (ext4_encrypted_inode(inode) &&
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
+ FALLOC_FL_ZERO_RANGE)))
+ return -EOPNOTSUPP;
+
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4934,16 +4954,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
- /*
- * currently supporting (pre)allocate mode for extent-based
- * files _only_
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return -EOPNOTSUPP;
-
if (mode & FALLOC_FL_COLLAPSE_RANGE)
return ext4_collapse_range(inode, offset, len);
+ if (mode & FALLOC_FL_INSERT_RANGE)
+ return ext4_insert_range(inode, offset, len);
+
if (mode & FALLOC_FL_ZERO_RANGE)
return ext4_zero_range(file, offset, len, mode);
@@ -4962,6 +4978,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
mutex_lock(&inode->i_mutex);
+ /*
+ * We only support preallocation for extent-based files only
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
@@ -5230,13 +5254,13 @@ ext4_access_path(handle_t *handle, struct inode *inode,
/*
* ext4_ext_shift_path_extents:
* Shift the extents of a path structure lying between path[depth].p_ext
- * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
- * from starting block for each extent.
+ * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
+ * if it is right shift or left shift operation.
*/
static int
ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
struct inode *inode, handle_t *handle,
- ext4_lblk_t *start)
+ enum SHIFT_DIRECTION SHIFT)
{
int depth, err = 0;
struct ext4_extent *ex_start, *ex_last;
@@ -5258,19 +5282,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
update = 1;
- *start = le32_to_cpu(ex_last->ee_block) +
- ext4_ext_get_actual_len(ex_last);
-
while (ex_start <= ex_last) {
- le32_add_cpu(&ex_start->ee_block, -shift);
- /* Try to merge to the left. */
- if ((ex_start >
- EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
- ext4_ext_try_to_merge_right(inode,
- path, ex_start - 1))
+ if (SHIFT == SHIFT_LEFT) {
+ le32_add_cpu(&ex_start->ee_block,
+ -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr))
+ &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
+ } else {
+ le32_add_cpu(&ex_last->ee_block, shift);
+ ext4_ext_try_to_merge_right(inode, path,
+ ex_last);
ex_last--;
- else
- ex_start++;
+ }
}
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
@@ -5285,7 +5315,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (err)
goto out;
- le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ if (SHIFT == SHIFT_LEFT)
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ else
+ le32_add_cpu(&path[depth].p_idx->ei_block, shift);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto out;
@@ -5303,19 +5336,20 @@ out:
/*
* ext4_ext_shift_extents:
- * All the extents which lies in the range from start to the last allocated
- * block for the file are shifted downwards by shift blocks.
+ * All the extents which lies in the range from @start to the last allocated
+ * block for the @inode are shifted either towards left or right (depending
+ * upon @SHIFT) by @shift blocks.
* On success, 0 is returned, error otherwise.
*/
static int
ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
- ext4_lblk_t start, ext4_lblk_t shift)
+ ext4_lblk_t start, ext4_lblk_t shift,
+ enum SHIFT_DIRECTION SHIFT)
{
struct ext4_ext_path *path;
int ret = 0, depth;
struct ext4_extent *extent;
- ext4_lblk_t stop_block;
- ext4_lblk_t ex_start, ex_end;
+ ext4_lblk_t stop, *iterator, ex_start, ex_end;
/* Let path point to the last extent */
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
@@ -5327,58 +5361,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if (!extent)
goto out;
- stop_block = le32_to_cpu(extent->ee_block) +
+ stop = le32_to_cpu(extent->ee_block) +
ext4_ext_get_actual_len(extent);
- /* Nothing to shift, if hole is at the end of file */
- if (start >= stop_block)
- goto out;
+ /*
+ * In case of left shift, Don't start shifting extents until we make
+ * sure the hole is big enough to accommodate the shift.
+ */
+ if (SHIFT == SHIFT_LEFT) {
+ path = ext4_find_extent(inode, start - 1, &path, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
- /*
- * Don't start shifting extents until we make sure the hole is big
- * enough to accomodate the shift.
- */
- path = ext4_find_extent(inode, start - 1, &path, 0);
- if (IS_ERR(path))
- return PTR_ERR(path);
- depth = path->p_depth;
- extent = path[depth].p_ext;
- if (extent) {
- ex_start = le32_to_cpu(extent->ee_block);
- ex_end = le32_to_cpu(extent->ee_block) +
- ext4_ext_get_actual_len(extent);
- } else {
- ex_start = 0;
- ex_end = 0;
+ if ((start == ex_start && shift > ex_start) ||
+ (shift > start - ex_end)) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return -EINVAL;
+ }
}
- if ((start == ex_start && shift > ex_start) ||
- (shift > start - ex_end))
- return -EINVAL;
+ /*
+ * In case of left shift, iterator points to start and it is increased
+ * till we reach stop. In case of right shift, iterator points to stop
+ * and it is decreased till we reach start.
+ */
+ if (SHIFT == SHIFT_LEFT)
+ iterator = &start;
+ else
+ iterator = &stop;
/* Its safe to start updating extents */
- while (start < stop_block) {
- path = ext4_find_extent(inode, start, &path, 0);
+ while (start < stop) {
+ path = ext4_find_extent(inode, *iterator, &path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
depth = path->p_depth;
extent = path[depth].p_ext;
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
- (unsigned long) start);
+ (unsigned long) *iterator);
return -EIO;
}
- if (start > le32_to_cpu(extent->ee_block)) {
+ if (SHIFT == SHIFT_LEFT && *iterator >
+ le32_to_cpu(extent->ee_block)) {
/* Hole, move to the next extent */
if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
path[depth].p_ext++;
} else {
- start = ext4_ext_next_allocated_block(path);
+ *iterator = ext4_ext_next_allocated_block(path);
continue;
}
}
+
+ if (SHIFT == SHIFT_LEFT) {
+ extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+ *iterator = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
+ *iterator = le32_to_cpu(extent->ee_block) > 0 ?
+ le32_to_cpu(extent->ee_block) - 1 : 0;
+ /* Update path extent in case we need to stop */
+ while (le32_to_cpu(extent->ee_block) < start)
+ extent++;
+ path[depth].p_ext = extent;
+ }
ret = ext4_ext_shift_path_extents(path, shift, inode,
- handle, &start);
+ handle, SHIFT);
if (ret)
break;
}
@@ -5402,6 +5462,14 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
loff_t new_size, ioffset;
int ret;
+ /*
+ * We need to test this early because xfstests assumes that a
+ * collapse range of (0, 1) will return EOPNOTSUPP if the file
+ * system does not support collapse range.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
+
/* Collapse range works only on fs block size aligned offsets. */
if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
len & (EXT4_CLUSTER_SIZE(sb) - 1))
@@ -5483,7 +5551,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start);
+ punch_stop - punch_start, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
@@ -5508,6 +5576,174 @@ out_mutex:
return ret;
}
+/*
+ * ext4_insert_range:
+ * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
+ * The data blocks starting from @offset to the EOF are shifted by @len
+ * towards right to create a hole in the @inode. Inode size is increased
+ * by len bytes.
+ * Returns 0 on success, error otherwise.
+ */
+int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ handle_t *handle;
+ struct ext4_ext_path *path;
+ struct ext4_extent *extent;
+ ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ unsigned int credits, ee_len;
+ int ret = 0, depth, split_flag = 0;
+ loff_t ioffset;
+
+ /*
+ * We need to test this early because xfstests assumes that an
+ * insert range of (0, 1) will return EOPNOTSUPP if the file
+ * system does not support insert range.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
+
+ /* Insert range works only on fs block size aligned offsets. */
+ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
+ len & (EXT4_CLUSTER_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ trace_ext4_insert_range(inode, offset, len);
+
+ offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* Call ext4_force_commit to flush all data in case of data=journal */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down to align start offset to page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ /* Check for wrap through zero */
+ if (inode->i_size + len > inode->i_sb->s_maxbytes) {
+ ret = -EFBIG;
+ goto out_mutex;
+ }
+
+ /* Offset should be less than i_size */
+ if (offset >= i_size_read(inode)) {
+ ret = -EINVAL;
+ goto out_mutex;
+ }
+
+ truncate_pagecache(inode, ioffset);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ /* Expand file to avoid data loss if there is error while shifting */
+ inode->i_size += len;
+ EXT4_I(inode)->i_disksize += len;
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_stop;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ if (IS_ERR(path)) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ depth = ext_depth(inode);
+ extent = path[depth].p_ext;
+ if (extent) {
+ ee_start_lblk = le32_to_cpu(extent->ee_block);
+ ee_len = ext4_ext_get_actual_len(extent);
+
+ /*
+ * If offset_lblk is not the starting block of extent, split
+ * the extent @offset_lblk
+ */
+ if ((offset_lblk > ee_start_lblk) &&
+ (offset_lblk < (ee_start_lblk + ee_len))) {
+ if (ext4_ext_is_unwritten(extent))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
+ ret = ext4_split_extent_at(handle, inode, &path,
+ offset_lblk, split_flag,
+ EXT4_EX_NOCACHE |
+ EXT4_GET_BLOCKS_PRE_IO |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
+ }
+
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret < 0) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+ }
+
+ ret = ext4_es_remove_extent(inode, offset_lblk,
+ EXT_MAX_BLOCKS - offset_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ /*
+ * if offset_lblk lies in a hole which is at start of file, use
+ * ee_start_lblk to shift extents
+ */
+ ret = ext4_ext_shift_extents(inode, handle,
+ ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
+ len_lblk, SHIFT_RIGHT);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
/**
* ext4_swap_extents - Swap extents between two inodes
*
@@ -5540,7 +5776,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
BUG_ON(!mutex_is_locked(&inode1->i_mutex));
- BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+ BUG_ON(!mutex_is_locked(&inode2->i_mutex));
*erp = ext4_es_remove_extent(inode1, lblk1, count);
if (unlikely(*erp))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e04d45733976..26724aeece73 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -9,12 +9,10 @@
*
* Ext4 extents status tree core functions.
*/
-#include <linux/rbtree.h>
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"
-#include "extents_status.h"
#include <trace/events/ext4.h>
@@ -705,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
BUG_ON(end < lblk);
+ if ((status & EXTENT_STATUS_DELAYED) &&
+ (status & EXTENT_STATUS_WRITTEN)) {
+ ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
+ " delayed and written which can potentially "
+ " cause data loss.\n", lblk, len);
+ WARN_ON(1);
+ }
+
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 33a09da16c9c..bc313ac5d3fa 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -20,12 +20,11 @@
#include <linux/time.h>
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
-#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
+#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -95,11 +94,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int o_direct = io_is_direct(file);
+ int o_direct = iocb->ki_flags & IOCB_DIRECT;
int overwrite = 0;
- size_t length = iov_iter_count(from);
ssize_t ret;
- loff_t pos = iocb->ki_pos;
/*
* Unaligned direct AIO must be serialized; see comment above
@@ -108,16 +105,17 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct &&
ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
- (file->f_flags & O_APPEND ||
- ext4_unaligned_aio(inode, from, pos))) {
+ (iocb->ki_flags & IOCB_APPEND ||
+ ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
aio_mutex = ext4_aio_mutex(inode);
mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
}
mutex_lock(&inode->i_mutex);
- if (file->f_flags & O_APPEND)
- iocb->ki_pos = pos = i_size_read(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
/*
* If we have encountered a bitmap-format file, the size limit
@@ -126,22 +124,19 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if ((pos > sbi->s_bitmap_maxbytes) ||
- (pos == sbi->s_bitmap_maxbytes && length > 0)) {
- mutex_unlock(&inode->i_mutex);
+ if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
ret = -EFBIG;
- goto errout;
+ goto out;
}
-
- if (pos + length > sbi->s_bitmap_maxbytes)
- iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
iocb->private = &overwrite;
if (o_direct) {
+ size_t length = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
blk_start_plug(&plug);
-
/* check whether we do a DIO overwrite or not */
if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
@@ -185,27 +180,45 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct)
blk_finish_plug(&plug);
-errout:
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
+ return ret;
+
+out:
+ mutex_unlock(&inode->i_mutex);
if (aio_mutex)
mutex_unlock(aio_mutex);
return ret;
}
#ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+ struct inode *inode = bh->b_assoc_map->host;
+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+ int err;
+ if (!uptodate)
+ return;
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block);
+ return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
/* Is this the right get_block? */
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block);
+ return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.page_mkwrite = ext4_dax_mkwrite,
+ .pfn_mkwrite = dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
@@ -219,6 +232,15 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct inode *inode = file->f_mapping->host;
+
+ if (ext4_encrypted_inode(inode)) {
+ int err = ext4_get_encryption_info(inode);
+ if (err)
+ return 0;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
@@ -236,6 +258,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct vfsmount *mnt = filp->f_path.mnt;
struct path path;
char buf[64], *cp;
+ int ret;
if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
!(sb->s_flags & MS_RDONLY))) {
@@ -269,12 +292,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
ext4_journal_stop(handle);
}
}
+ if (ext4_encrypted_inode(inode)) {
+ ret = ext4_get_encryption_info(inode);
+ if (ret)
+ return -EACCES;
+ if (ext4_encryption_info(inode) == NULL)
+ return -ENOKEY;
+ }
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
*/
if (filp->f_mode & FMODE_WRITE) {
- int ret = ext4_inode_attach_jinode(inode);
+ ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
return ret;
}
@@ -607,8 +637,6 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
@@ -624,26 +652,6 @@ const struct file_operations ext4_file_operations = {
.fallocate = ext4_fallocate,
};
-#ifdef CONFIG_FS_DAX
-const struct file_operations ext4_dax_file_operations = {
- .llseek = ext4_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
- .read_iter = generic_file_read_iter,
- .write_iter = ext4_file_write_iter,
- .unlocked_ioctl = ext4_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = ext4_compat_ioctl,
-#endif
- .mmap = ext4_file_mmap,
- .open = ext4_file_open,
- .release = ext4_release_file,
- .fsync = ext4_sync_file,
- /* Splice not yet supported with DAX */
- .fallocate = ext4_fallocate,
-};
-#endif
-
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f75fa0..8850254136ae 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
-#include <linux/jbd2.h>
#include <linux/blkdev.h>
#include "ext4.h"
@@ -56,7 +55,7 @@ static int ext4_sync_parent(struct inode *inode)
dentry = d_find_any_alias(inode);
if (!dentry)
break;
- next = igrab(dentry->d_parent->d_inode);
+ next = igrab(d_inode(dentry->d_parent));
dput(dentry);
if (!next)
break;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3d586f02883e..e026aa941fd5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,6 @@
*/
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/cryptohash.h>
#include "ext4.h"
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ac644c31ca67..173c1ae21395 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -14,7 +14,6 @@
#include <linux/time.h>
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
@@ -444,7 +443,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
if (S_ISDIR(mode) &&
- ((parent == sb->s_root->d_inode) ||
+ ((parent == d_inode(sb->s_root)) ||
(ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
int ret = -1;
@@ -727,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
+ int encrypt = 0;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
+ if ((ext4_encrypted_inode(dir) ||
+ DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return ERR_PTR(err);
+ if (ext4_encryption_info(dir) == NULL)
+ return ERR_PTR(-EPERM);
+ if (!handle)
+ nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
+ encrypt = 1;
+ }
+
sb = dir->i_sb;
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -1029,11 +1042,9 @@ got:
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
-
ei->i_inline_off = 0;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
ret = inode;
err = dquot_alloc_inode(inode);
if (err)
@@ -1060,6 +1071,12 @@ got:
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
+ if (encrypt) {
+ err = ext4_inherit_context(dir, inode);
+ if (err)
+ goto fail_free_drop;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..4f6ac499f09e 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,9 +20,9 @@
* (sct@redhat.com), 1993, 1998
*/
-#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/uio.h>
#include <trace/events/ext4.h>
@@ -565,7 +565,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -ENOSPC;
+ return -EUCLEAN;
}
/* Set up for the direct block allocation */
@@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ar.flags |= EXT4_MB_DELALLOC_RESERVED;
+ if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
+ ar.flags |= EXT4_MB_USE_RESERVED;
ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
@@ -642,8 +644,8 @@ out:
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -654,7 +656,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
int retries = 0;
- if (rw == WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
loff_t final_size = offset + count;
if (final_size > inode->i_size) {
@@ -676,37 +678,38 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- if (rw == READ && ext4_should_dioread_nolock(inode)) {
+ if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) {
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
* while holding extra i_dio_count ref.
*/
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
smp_mb();
if (unlikely(ext4_test_inode_state(inode,
EXT4_STATE_DIOREAD_LOCK))) {
- inode_dio_done(inode);
+ inode_dio_end(inode);
goto locked;
}
if (IS_DAX(inode))
- ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ret = dax_do_io(iocb, inode, iter, offset,
ext4_get_block, NULL, 0);
else
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter, offset,
- ext4_get_block, NULL, NULL, 0);
- inode_dio_done(inode);
+ ret = __blockdev_direct_IO(iocb, inode,
+ inode->i_sb->s_bdev, iter,
+ offset, ext4_get_block, NULL,
+ NULL, 0);
+ inode_dio_end(inode);
} else {
locked:
if (IS_DAX(inode))
- ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ret = dax_do_io(iocb, inode, iter, offset,
ext4_get_block, NULL, DIO_LOCKING);
else
- ret = blockdev_direct_IO(rw, iocb, inode, iter,
- offset, ext4_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, offset,
+ ext4_get_block);
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 4b143febf21f..cd944a7a99cd 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -11,11 +11,13 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
+
+#include <linux/fiemap.h>
+
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "truncate.h"
-#include <linux/fiemap.h>
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -972,7 +974,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
offset = 0;
while ((void *)de < dlimit) {
de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
- trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
+ trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
offset, de_len, de->name_len, de->name,
de->name_len, le32_to_cpu(de->inode));
if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -993,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
* and -EEXIST if directory entry already exists.
*/
static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct ext4_filename *fname,
struct dentry *dentry,
struct inode *inode,
struct ext4_iloc *iloc,
void *inline_start, int inline_size)
{
- struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
+ struct inode *dir = d_inode(dentry->d_parent);
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh,
- inline_start, inline_size,
- name, namelen, &de);
+ err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ inline_size, fname, &de);
if (err)
return err;
@@ -1014,7 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
- ext4_insert_dentry(inode, de, inline_size, name, namelen);
+ ext4_insert_dentry(dir, inode, de, inline_size, fname);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1245,13 +1245,13 @@ out:
* If succeeds, return 0. If not, extended the inline dir and copied data to
* the new created block.
*/
-int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode)
{
int ret, inline_size;
void *inline_start;
struct ext4_iloc iloc;
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
ret = ext4_get_inode_loc(dir, &iloc);
if (ret)
@@ -1265,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
inline_start, inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1286,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
if (inline_size) {
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
- ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
- inline_start, inline_size);
+ ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+ inode, &iloc, inline_start,
+ inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1327,6 +1328,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
struct ext4_iloc iloc;
void *dir_buf = NULL;
struct ext4_dir_entry_2 fake;
+ struct ext4_str tmp_str;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -1398,8 +1400,10 @@ int htree_inlinedir_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- err = ext4_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de);
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, hinfo->hash,
+ hinfo->minor_hash, de, &tmp_str);
if (err) {
count = err;
goto out;
@@ -1605,6 +1609,7 @@ out:
}
struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *has_inline_data)
@@ -1626,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = search_dir(iloc.bh, inline_start, inline_size,
- dir, d_name, 0, res_dir);
+ ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ dir, fname, d_name, 0, res_dir);
if (ret == 1)
goto out_find;
if (ret < 0)
@@ -1639,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
- ret = search_dir(iloc.bh, inline_start, inline_size,
- dir, d_name, 0, res_dir);
+ ret = ext4_search_dir(iloc.bh, inline_start, inline_size,
+ dir, fname, d_name, 0, res_dir);
if (ret == 1)
goto out_find;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..cecf9aa10811 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,7 +20,6 @@
#include <linux/fs.h>
#include <linux/time.h>
-#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
@@ -36,8 +35,6 @@
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
-#include <linux/ratelimit.h>
-#include <linux/aio.h>
#include <linux/bitops.h>
#include "ext4_jbd2.h"
@@ -141,7 +138,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
/*
* Test whether an inode is a fast symlink.
*/
-static int ext4_inode_is_fast_symlink(struct inode *inode)
+int ext4_inode_is_fast_symlink(struct inode *inode)
{
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -534,6 +531,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ !(status & EXTENT_STATUS_WRITTEN) &&
ext4_find_delalloc_range(inode, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
@@ -638,6 +636,7 @@ found:
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
+ !(status & EXTENT_STATUS_WRITTEN) &&
ext4_find_delalloc_range(inode, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
@@ -657,18 +656,6 @@ has_zeroout:
return retval;
}
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
- return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -706,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ if (IS_DAX(inode) && buffer_unwritten(bh)) {
+ /*
+ * dgc: I suspect unwritten conversion on ext4+DAX is
+ * fundamentally broken here when there are concurrent
+ * read/write in progress on this inode.
+ */
+ WARN_ON_ONCE(io_end);
bh->b_assoc_map = inode->i_mapping;
bh->b_private = (void *)(unsigned long)iblock;
- bh->b_end_io = ext4_end_io_unwritten;
}
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
@@ -732,18 +724,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create)
+ ext4_lblk_t block, int map_flags)
{
struct ext4_map_blocks map;
struct buffer_head *bh;
+ int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
J_ASSERT(handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
- err = ext4_map_blocks(handle, inode, &map,
- create ? EXT4_GET_BLOCKS_CREATE : 0);
+ err = ext4_map_blocks(handle, inode, &map, map_flags);
if (err == 0)
return create ? ERR_PTR(-ENOSPC) : NULL;
@@ -789,11 +781,11 @@ errout:
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, int create)
+ ext4_lblk_t block, int map_flags)
{
struct buffer_head *bh;
- bh = ext4_getblk(handle, inode, block, create);
+ bh = ext4_getblk(handle, inode, block, map_flags);
if (IS_ERR(bh))
return bh;
if (!bh || buffer_uptodate(bh))
@@ -888,6 +880,95 @@ int do_journal_get_write_access(handle_t *handle,
static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
+ get_block_t *get_block)
+{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
+ struct inode *inode = page->mapping->host;
+ unsigned block_start, block_end;
+ sector_t block;
+ int err = 0;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ unsigned bbits;
+ struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+ bool decrypt = false;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > to);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+ head = page_buffers(page);
+ bbits = ilog2(blocksize);
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+
+ for (bh = head, block_start = 0; bh != head || !block_start;
+ block++, block_start = block_end, bh = bh->b_this_page) {
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ if (!buffer_mapped(bh)) {
+ WARN_ON(bh->b_size != blocksize);
+ err = get_block(inode, block, bh, 1);
+ if (err)
+ break;
+ if (buffer_new(bh)) {
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ if (PageUptodate(page)) {
+ clear_buffer_new(bh);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ continue;
+ }
+ if (block_end > to || block_start < from)
+ zero_user_segments(page, to, block_end,
+ block_start, from);
+ continue;
+ }
+ }
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+ !buffer_unwritten(bh) &&
+ (block_start < from || block_end > to)) {
+ ll_rw_block(READ, 1, &bh);
+ *wait_bh++ = bh;
+ decrypt = ext4_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode);
+ }
+ }
+ /*
+ * If we issued read requests, let them complete.
+ */
+ while (wait_bh > wait) {
+ wait_on_buffer(*--wait_bh);
+ if (!buffer_uptodate(*wait_bh))
+ err = -EIO;
+ }
+ if (unlikely(err))
+ page_zero_new_buffers(page, from, to);
+ else if (decrypt)
+ err = ext4_decrypt_one(inode, page);
+ return err;
+}
+#endif
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -950,11 +1031,19 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ext4_should_dioread_nolock(inode))
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_get_block_write);
+ else
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_get_block);
+#else
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len, ext4_get_block_write);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
-
+#endif
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, page_buffers(page),
from, to, NULL,
@@ -1165,13 +1254,12 @@ static int ext4_journalled_write_end(struct file *file,
}
/*
- * Reserve a single cluster located at lblock
+ * Reserve space for a single cluster
*/
-static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
+static int ext4_da_reserve_space(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned int md_needed;
int ret;
/*
@@ -1183,25 +1271,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
if (ret)
return ret;
- /*
- * recalculate the amount of metadata blocks to reserve
- * in order to allocate nrblocks
- * worse case is one extent per block
- */
spin_lock(&ei->i_block_reservation_lock);
- /*
- * ext4_calc_metadata_amount() has side effects, which we have
- * to be prepared undo if we fail to claim space.
- */
- md_needed = 0;
- trace_ext4_da_reserve_space(inode, 0);
-
if (ext4_claim_free_clusters(sbi, 1, 0)) {
spin_unlock(&ei->i_block_reservation_lock);
dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
ei->i_reserved_data_blocks++;
+ trace_ext4_da_reserve_space(inode);
spin_unlock(&ei->i_block_reservation_lock);
return 0; /* success */
@@ -1246,7 +1323,7 @@ static void ext4_da_page_release_reservation(struct page *page,
unsigned int offset,
unsigned int length)
{
- int to_release = 0;
+ int to_release = 0, contiguous_blks = 0;
struct buffer_head *head, *bh;
unsigned int curr_off = 0;
struct inode *inode = page->mapping->host;
@@ -1267,14 +1344,23 @@ static void ext4_da_page_release_reservation(struct page *page,
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
+ contiguous_blks++;
clear_buffer_delay(bh);
+ } else if (contiguous_blks) {
+ lblk = page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ lblk += (curr_off >> inode->i_blkbits) -
+ contiguous_blks;
+ ext4_es_remove_extent(inode, lblk, contiguous_blks);
+ contiguous_blks = 0;
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
- if (to_release) {
+ if (contiguous_blks) {
lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- ext4_es_remove_extent(inode, lblk, to_release);
+ lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
+ ext4_es_remove_extent(inode, lblk, contiguous_blks);
}
/* If we have released all the blocks belonging to a cluster, then we
@@ -1470,9 +1556,9 @@ add_delayed:
* then we don't need to reserve it again. However we still need
* to reserve metadata for every block we're going to write.
*/
- if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 ||
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, map->m_lblk)) {
- ret = ext4_da_reserve_space(inode, iblock);
+ ret = ext4_da_reserve_space(inode);
if (ret) {
/* not enough space to reserve */
retval = ret;
@@ -1605,19 +1691,32 @@ static int __ext4_journalled_writepage(struct page *page,
ext4_walk_page_buffers(handle, page_bufs, 0, len,
NULL, bget_one);
}
- /* As soon as we unlock the page, it can go away, but we have
- * references to buffers so we are safe */
+ /*
+ * We need to release the page lock before we start the
+ * journal, so grab a reference so the page won't disappear
+ * out from under us.
+ */
+ get_page(page);
unlock_page(page);
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out;
+ put_page(page);
+ goto out_no_pagelock;
}
-
BUG_ON(!ext4_handle_valid(handle));
+ lock_page(page);
+ put_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ ext4_journal_stop(handle);
+ ret = 0;
+ goto out;
+ }
+
if (inline_data) {
BUFFER_TRACE(inode_bh, "get write access");
ret = ext4_journal_get_write_access(handle, inode_bh);
@@ -1643,6 +1742,8 @@ static int __ext4_journalled_writepage(struct page *page,
NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ unlock_page(page);
+out_no_pagelock:
brelse(inode_bh);
return ret;
}
@@ -2576,7 +2677,12 @@ retry_journal:
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ret = ext4_block_write_begin(page, pos, len,
+ ext4_da_get_block_prep);
+#else
ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
+#endif
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
@@ -2822,7 +2928,7 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return mpage_readpage(page, ext4_get_block);
+ return ext4_mpage_readpages(page->mapping, NULL, page, 1);
return ret;
}
@@ -2837,7 +2943,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
if (ext4_has_inline_data(inode))
return 0;
- return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
+ return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -2953,8 +3059,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* if the machine crashes during the write.
*
*/
-static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -2967,8 +3073,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
- if (rw != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(rw, iocb, iter, offset);
+ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(iocb, iter, offset);
BUG_ON(iocb->private == NULL);
@@ -2977,8 +3083,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* conversion. This also disallows race between truncate() and
* overwrite DIO as i_dio_count needs to be incremented under i_mutex.
*/
- if (rw == WRITE)
- atomic_inc(&inode->i_dio_count);
+ if (iov_iter_rw(iter) == WRITE)
+ inode_dio_begin(inode);
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
@@ -3034,11 +3140,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
+#endif
if (IS_DAX(inode))
- ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+ ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
ext4_end_io_dio, dio_flags);
else
- ret = __blockdev_direct_IO(rw, iocb, inode,
+ ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter, offset,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);
@@ -3079,8 +3188,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
}
retake_lock:
- if (rw == WRITE)
- inode_dio_done(inode);
+ if (iov_iter_rw(iter) == WRITE)
+ inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
up_read(&EXT4_I(inode)->i_data_sem);
@@ -3090,14 +3199,19 @@ retake_lock:
return ret;
}
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return 0;
+#endif
+
/*
* If we are doing data journalling we don't support O_DIRECT
*/
@@ -3108,12 +3222,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_has_inline_data(inode))
return 0;
- trace_ext4_direct_IO_enter(inode, offset, count, rw);
+ trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
+ ret = ext4_ext_direct_IO(iocb, iter, offset);
else
- ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
- trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
+ ret = ext4_ind_direct_IO(iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
return ret;
}
@@ -3262,6 +3376,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* Uhhuh. Read error. Complain and punt. */
if (!buffer_uptodate(bh))
goto unlock;
+ if (S_ISREG(inode->i_mode) &&
+ ext4_encrypted_inode(inode)) {
+ /* We expect the key to be set. */
+ BUG_ON(!ext4_has_encryption_key(inode));
+ BUG_ON(blocksize != PAGE_CACHE_SIZE);
+ WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+ }
}
if (ext4_should_journal_data(inode)) {
BUFFER_TRACE(bh, "get write access");
@@ -4091,16 +4212,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
- if (test_opt(inode->i_sb, DAX))
- inode->i_fop = &ext4_dax_file_operations;
- else
- inode->i_fop = &ext4_file_operations;
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode)) {
+ if (ext4_encrypted_inode(inode)) {
+ inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ ext4_set_aops(inode);
+ } else if (ext4_inode_is_fast_symlink(inode)) {
+ inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1);
@@ -4231,7 +4353,12 @@ static void ext4_update_other_inodes_time(struct super_block *sb,
int inode_size = EXT4_INODE_SIZE(sb);
oi.orig_ino = orig_ino;
- ino = orig_ino & ~(inodes_per_block - 1);
+ /*
+ * Calculate the first inode in the inode table block. Inode
+ * numbers are one-based. That is, the first inode in a block
+ * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
+ */
+ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
if (ino == orig_ino)
continue;
@@ -4525,7 +4652,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error, rc = 0;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
@@ -4564,8 +4691,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_journal_stop(handle);
}
- if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
+ loff_t oldsize = inode->i_size;
+ int shrink = (attr->ia_size <= inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4573,24 +4702,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
inode_inc_iversion(inode);
- if (S_ISREG(inode->i_mode) &&
+ if (ext4_should_order_data(inode) &&
(attr->ia_size < inode->i_size)) {
- if (ext4_should_order_data(inode)) {
- error = ext4_begin_ordered_truncate(inode,
+ error = ext4_begin_ordered_truncate(inode,
attr->ia_size);
- if (error)
- goto err_out;
- }
+ if (error)
+ goto err_out;
+ }
+ if (attr->ia_size != inode->i_size) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- if (ext4_handle_valid(handle)) {
+ if (ext4_handle_valid(handle) && shrink) {
error = ext4_orphan_add(handle, inode);
orphan = 1;
}
@@ -4609,15 +4740,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error) {
- ext4_orphan_del(NULL, inode);
+ if (orphan)
+ ext4_orphan_del(NULL, inode);
goto err_out;
}
- } else {
- loff_t oldsize = inode->i_size;
-
- i_size_write(inode, attr->ia_size);
- pagecache_isize_extended(inode, oldsize, inode->i_size);
}
+ if (!shrink)
+ pagecache_isize_extended(inode, oldsize, inode->i_size);
/*
* Blocks are going to be removed from the inode. Wait
@@ -4637,13 +4766,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
+ if (shrink)
+ ext4_truncate(inode);
}
- /*
- * We want to call ext4_truncate() even if attr->ia_size ==
- * inode->i_size for cases like truncation of fallocated space
- */
- if (attr->ia_valid & ATTR_SIZE)
- ext4_truncate(inode);
if (!rc) {
setattr_copy(inode, attr);
@@ -4673,7 +4798,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode;
unsigned long long delalloc_blocks;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
generic_fillattr(inode, stat);
/*
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f58a0d106726..1346cfa355d0 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -8,12 +8,12 @@
*/
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/capability.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/file.h>
+#include <linux/random.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -31,14 +31,11 @@
static void memswap(void *a, void *b, size_t len)
{
unsigned char *ap, *bp;
- unsigned char tmp;
ap = (unsigned char *)a;
bp = (unsigned char *)b;
while (len-- > 0) {
- tmp = *ap;
- *ap = *bp;
- *bp = tmp;
+ swap(*ap, *bp);
ap++;
bp++;
}
@@ -196,6 +193,16 @@ journal_err_out:
return err;
}
+static int uuid_is_zero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return 0;
+ return 1;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -615,7 +622,78 @@ resizefs_out:
}
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
+ case EXT4_IOC_SET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_encryption_policy policy;
+ int err = 0;
+
+ if (copy_from_user(&policy,
+ (struct ext4_encryption_policy __user *)arg,
+ sizeof(policy))) {
+ err = -EFAULT;
+ goto encryption_policy_out;
+ }
+ err = ext4_process_policy(&policy, inode);
+encryption_policy_out:
+ return err;
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+ case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
+ int err, err2;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ handle_t *handle;
+
+ if (!ext4_sb_has_crypto(sb))
+ return -EOPNOTSUPP;
+ if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto pwsalt_err_exit;
+ }
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto pwsalt_err_journal;
+ generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
+ err = ext4_handle_dirty_metadata(handle, NULL,
+ sbi->s_sbh);
+ pwsalt_err_journal:
+ err2 = ext4_journal_stop(handle);
+ if (err2 && !err)
+ err = err2;
+ pwsalt_err_exit:
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+ }
+ if (copy_to_user((void __user *) arg,
+ sbi->s_es->s_encrypt_pw_salt, 16))
+ return -EFAULT;
+ return 0;
+ }
+ case EXT4_IOC_GET_ENCRYPTION_POLICY: {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_encryption_policy policy;
+ int err = 0;
+
+ if (!ext4_encrypted_inode(inode))
+ return -ENOENT;
+ err = ext4_get_policy(inode, &policy);
+ if (err)
+ return err;
+ if (copy_to_user((void __user *)arg, &policy, sizeof(policy)))
+ return -EFAULT;
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
default:
return -ENOTTY;
}
@@ -677,9 +755,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return err;
}
case EXT4_IOC_MOVE_EXT:
- case FITRIM:
case EXT4_IOC_RESIZE_FS:
case EXT4_IOC_PRECACHE_EXTENTS:
+ case EXT4_IOC_SET_ENCRYPTION_POLICY:
+ case EXT4_IOC_GET_ENCRYPTION_PWSALT:
+ case EXT4_IOC_GET_ENCRYPTION_POLICY:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d1e60214ef0..34b610ea5030 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -26,6 +26,7 @@
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/backing-dev.h>
#include <trace/events/ext4.h>
#ifdef CONFIG_EXT4_DEBUG
@@ -882,10 +883,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* wait for I/O completion */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
- if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
err = -EIO;
- goto out;
- }
}
first_block = page->index * blocks_per_page;
@@ -898,6 +897,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* skip initialized uptodate buddy */
continue;
+ if (!buffer_verified(bh[group - first_group]))
+ /* Skip faulty bitmaps */
+ continue;
+ err = 0;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -2008,7 +2012,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
-/* This is now called BEFORE we load the buddy bitmap. */
+/*
+ * This is now called BEFORE we load the buddy bitmap.
+ * Returns either 1 or 0 indicating that the group is either suitable
+ * for the allocation or not. In addition it can also return negative
+ * error code when something goes wrong.
+ */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
@@ -2031,7 +2040,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
if (ret)
- return 0;
+ return ret;
}
fragments = grp->bb_fragments;
@@ -2078,7 +2087,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t ngroups, group, i;
int cr;
- int err = 0;
+ int err = 0, first_err = 0;
struct ext4_sb_info *sbi;
struct super_block *sb;
struct ext4_buddy e4b;
@@ -2145,6 +2154,7 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
+ int ret = 0;
cond_resched();
/*
* Artificially restricted ngroups for non-extent
@@ -2154,8 +2164,12 @@ repeat:
group = 0;
/* This now checks without needing the buddy page */
- if (!ext4_mb_good_group(ac, group, cr))
+ ret = ext4_mb_good_group(ac, group, cr);
+ if (ret <= 0) {
+ if (!first_err)
+ first_err = ret;
continue;
+ }
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err)
@@ -2167,9 +2181,12 @@ repeat:
* We need to check again after locking the
* block group
*/
- if (!ext4_mb_good_group(ac, group, cr)) {
+ ret = ext4_mb_good_group(ac, group, cr);
+ if (ret <= 0) {
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
+ if (!first_err)
+ first_err = ret;
continue;
}
@@ -2216,6 +2233,8 @@ repeat:
}
}
out:
+ if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
+ err = first_err;
return err;
}
@@ -2257,12 +2276,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
group--;
if (group == 0)
- seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
- "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
- "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
- "group", "free", "frags", "first",
- "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
- "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
+ seq_puts(seq, "#group: free frags first ["
+ " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
+ " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]");
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
@@ -4800,18 +4816,12 @@ do_more:
/*
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
+ *
+ * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
+ * to fail.
*/
- retry:
- new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
- if (!new_entry) {
- /*
- * We use a retry loop because
- * ext4_free_blocks() is not allowed to fail.
- */
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry;
- }
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep,
+ GFP_NOFS|__GFP_NOFAIL);
new_entry->efd_start_cluster = bit;
new_entry->efd_group = block_group;
new_entry->efd_count = count_clusters;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 3cb267aee802..6163ad21cb0e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
owner[0] = i_uid_read(inode);
owner[1] = i_gid_read(inode);
- tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root),
S_IFREG, NULL, goal, owner);
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
@@ -620,6 +620,7 @@ int ext4_ind_migrate(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
+ ext4_lblk_t start, end;
ext4_fsblk_t blk;
handle_t *handle;
int ret;
@@ -633,6 +634,14 @@ int ext4_ind_migrate(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_BIGALLOC))
return -EOPNOTSUPP;
+ /*
+ * In order to get correct extent info, force all delayed allocation
+ * blocks to be allocated, otherwise delayed allocation blocks may not
+ * be reflected and bypass the checks on extent header.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ ext4_alloc_da_blocks(inode);
+
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -650,11 +659,13 @@ int ext4_ind_migrate(struct inode *inode)
goto errout;
}
if (eh->eh_entries == 0)
- blk = len = 0;
+ blk = len = start = end = 0;
else {
len = le16_to_cpu(ex->ee_len);
blk = ext4_ext_pblock(ex);
- if (len > EXT4_NDIR_BLOCKS) {
+ start = le32_to_cpu(ex->ee_block);
+ end = start + len - 1;
+ if (end >= EXT4_NDIR_BLOCKS) {
ret = -EOPNOTSUPP;
goto errout;
}
@@ -662,7 +673,7 @@ int ext4_ind_migrate(struct inode *inode)
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
memset(ei->i_data, 0, sizeof(ei->i_data));
- for (i=0; i < len; i++)
+ for (i = start; i <= end; i++)
ei->i_data[i] = cpu_to_le32(blk++);
ext4_mark_inode_dirty(handle, inode);
errout:
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 370420bfae8d..fb6f11709ae6 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
*/
wait_on_page_writeback(page[0]);
wait_on_page_writeback(page[1]);
- if (inode1 > inode2) {
- struct page *tmp;
- tmp = page[0];
- page[0] = page[1];
- page[1] = tmp;
- }
+ if (inode1 > inode2)
+ swap(page[0], page[1]);
+
return 0;
}
@@ -574,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
orig_inode->i_ino, donor_inode->i_ino);
return -EINVAL;
}
- /* TODO: This is non obvious task to swap blocks for inodes with full
- jornaling enabled */
+
+ /* TODO: it's not obvious how to swap blocks for inodes with full
+ journaling enabled */
if (ext4_should_journal_data(orig_inode) ||
ext4_should_journal_data(donor_inode)) {
- return -EINVAL;
+ ext4_msg(orig_inode->i_sb, KERN_ERR,
+ "Online defrag not supported with data journaling");
+ return -EOPNOTSUPP;
}
+
/* Protect orig and donor inodes against a truncate */
lock_two_nondirectories(orig_inode, donor_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 28fe71a2904c..011dcfb5cce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/jbd2.h>
#include <linux/time.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
@@ -62,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
- bh = ext4_bread(handle, inode, *block, 1);
+ bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
return bh;
inode->i_size += inode->i_sb->s_blocksize;
@@ -85,12 +84,13 @@ typedef enum {
} dirblock_type_t;
#define ext4_read_dirblock(inode, block, type) \
- __ext4_read_dirblock((inode), (block), (type), __LINE__)
+ __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)
static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
- ext4_lblk_t block,
- dirblock_type_t type,
- unsigned int line)
+ ext4_lblk_t block,
+ dirblock_type_t type,
+ const char *func,
+ unsigned int line)
{
struct buffer_head *bh;
struct ext4_dir_entry *dirent;
@@ -98,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
bh = ext4_bread(NULL, inode, block, 0);
if (IS_ERR(bh)) {
- __ext4_warning(inode->i_sb, __func__, line,
- "error %ld reading directory block "
- "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
- (unsigned long) block);
+ __ext4_warning(inode->i_sb, func, line,
+ "inode #%lu: lblock %lu: comm %s: "
+ "error %ld reading directory block",
+ inode->i_ino, (unsigned long)block,
+ current->comm, PTR_ERR(bh));
return bh;
}
if (!bh) {
- ext4_error_inode(inode, __func__, line, block, "Directory hole found");
+ ext4_error_inode(inode, func, line, block,
+ "Directory hole found");
return ERR_PTR(-EIO);
}
dirent = (struct ext4_dir_entry *) bh->b_data;
@@ -120,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
is_dx_block = 1;
}
if (!is_dx_block && type == INDEX) {
- ext4_error_inode(inode, __func__, line, block,
+ ext4_error_inode(inode, func, line, block,
"directory leaf block found instead of index block");
return ERR_PTR(-EIO);
}
@@ -137,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (ext4_dx_csum_verify(inode, dirent))
set_buffer_verified(bh);
else {
- ext4_error_inode(inode, __func__, line, block,
- "Directory index failed checksum");
+ ext4_error_inode(inode, func, line, block,
+ "Directory index failed checksum");
brelse(bh);
return ERR_PTR(-EIO);
}
@@ -147,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (ext4_dirent_csum_verify(inode, dirent))
set_buffer_verified(bh);
else {
- ext4_error_inode(inode, __func__, line, block,
- "Directory block failed checksum");
+ ext4_error_inode(inode, func, line, block,
+ "Directory block failed checksum");
brelse(bh);
return ERR_PTR(-EIO);
}
@@ -249,13 +251,14 @@ static void dx_set_count(struct dx_entry *entries, unsigned value);
static void dx_set_limit(struct dx_entry *entries, unsigned value);
static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(const struct qstr *d_name,
+static struct dx_frame *dx_probe(struct ext4_filename *fname,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+ unsigned blocksize, struct dx_hash_info *hinfo,
+ struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
struct dx_map_entry *offsets, int count, unsigned blocksize);
@@ -267,10 +270,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frames,
__u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
- const struct qstr *d_name,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode);
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode);
/* checksumming functions */
void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode,
return cpu_to_le32(csum);
}
-static void warn_no_space_for_csum(struct inode *inode)
+#define warn_no_space_for_csum(inode) \
+ __warn_no_space_for_csum((inode), __func__, __LINE__)
+
+static void __warn_no_space_for_csum(struct inode *inode, const char *func,
+ unsigned int line)
{
- ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
- "checksum. Please run e2fsck -D.", inode->i_ino);
+ __ext4_warning_inode(inode, func, line,
+ "No space for directory leaf checksum. Please run e2fsck -D.");
}
int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
@@ -586,8 +593,10 @@ struct stats
unsigned bcount;
};
-static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
- int size, int show_names)
+static struct stats dx_show_leaf(struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct ext4_dir_entry_2 *de,
+ int size, int show_names)
{
unsigned names = 0, space = 0;
char *base = (char *) de;
@@ -600,12 +609,69 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
{
if (show_names)
{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ int len;
+ char *name;
+ struct ext4_str fname_crypto_str
+ = {.name = NULL, .len = 0};
+ int res = 0;
+
+ name = de->name;
+ len = de->name_len;
+ if (ext4_encrypted_inode(inode))
+ res = ext4_get_encryption_info(dir);
+ if (res) {
+ printk(KERN_WARNING "Error setting up"
+ " fname crypto: %d\n", res);
+ }
+ if (ctx == NULL) {
+ /* Directory is not encrypted */
+ ext4fs_dirhash(de->name,
+ de->name_len, &h);
+ printk("%*.s:(U)%x.%u ", len,
+ name, h.hash,
+ (unsigned) ((char *) de
+ - base));
+ } else {
+ /* Directory is encrypted */
+ res = ext4_fname_crypto_alloc_buffer(
+ ctx, de->name_len,
+ &fname_crypto_str);
+ if (res < 0) {
+ printk(KERN_WARNING "Error "
+ "allocating crypto "
+ "buffer--skipping "
+ "crypto\n");
+ ctx = NULL;
+ }
+ res = ext4_fname_disk_to_usr(ctx, NULL, de,
+ &fname_crypto_str);
+ if (res < 0) {
+ printk(KERN_WARNING "Error "
+ "converting filename "
+ "from disk to usr"
+ "\n");
+ name = "??";
+ len = 2;
+ } else {
+ name = fname_crypto_str.name;
+ len = fname_crypto_str.len;
+ }
+ ext4fs_dirhash(de->name, de->name_len,
+ &h);
+ printk("%*.s:(E)%x.%u ", len, name,
+ h.hash, (unsigned) ((char *) de
+ - base));
+ ext4_fname_crypto_free_buffer(
+ &fname_crypto_str);
+ }
+#else
int len = de->name_len;
char *name = de->name;
- while (len--) printk("%c", *name++);
ext4fs_dirhash(de->name, de->name_len, &h);
- printk(":%x.%u ", h.hash,
+ printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
+#endif
}
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
@@ -623,7 +689,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
unsigned count = dx_get_count(entries), names = 0, space = 0, i;
unsigned bcount = 0;
struct buffer_head *bh;
- int err;
printk("%i indexed blocks...\n", count);
for (i = 0; i < count; i++, entries++)
{
@@ -637,7 +702,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
continue;
stats = levels?
dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
- dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+ dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
+ bh->b_data, blocksize, 0);
names += stats.names;
space += stats.space;
bcount += stats.bcount;
@@ -661,7 +727,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
* back to userspace.
*/
static struct dx_frame *
-dx_probe(const struct qstr *d_name, struct inode *dir,
+dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
unsigned count, indirect;
@@ -679,36 +745,41 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
root->info.hash_version != DX_HASH_LEGACY) {
- ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
- root->info.hash_version);
+ ext4_warning_inode(dir, "Unrecognised inode hash code %u",
+ root->info.hash_version);
goto fail;
}
+ if (fname)
+ hinfo = &fname->hinfo;
hinfo->hash_version = root->info.hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (d_name)
- ext4fs_dirhash(d_name->name, d_name->len, hinfo);
+ if (fname && fname_name(fname))
+ ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
- ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
- root->info.unused_flags);
+ ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
+ root->info.unused_flags);
goto fail;
}
- if ((indirect = root->info.indirect_levels) > 1) {
- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
- root->info.indirect_levels);
+ indirect = root->info.indirect_levels;
+ if (indirect > 1) {
+ ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
+ root->info.indirect_levels);
goto fail;
}
- entries = (struct dx_entry *) (((char *)&root->info) +
- root->info.info_length);
+ entries = (struct dx_entry *)(((char *)&root->info) +
+ root->info.info_length);
if (dx_get_limit(entries) != dx_root_limit(dir,
root->info.info_length)) {
- ext4_warning(dir->i_sb, "dx entry: limit != root limit");
+ ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
+ dx_get_limit(entries),
+ dx_root_limit(dir, root->info.info_length));
goto fail;
}
@@ -716,15 +787,16 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
- ext4_warning(dir->i_sb,
- "dx entry: no count or count > limit");
+ ext4_warning_inode(dir,
+ "dx entry: count %u beyond limit %u",
+ count, dx_get_limit(entries));
goto fail;
}
p = entries + 1;
q = entries + count - 1;
while (p <= q) {
- m = p + (q - p)/2;
+ m = p + (q - p) / 2;
dxtrace(printk("."));
if (dx_get_hash(m) > hash)
q = m - 1;
@@ -748,7 +820,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
}
at = p - 1;
- dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+ dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at),
+ dx_get_block(at)));
frame->entries = entries;
frame->at = at;
if (!indirect--)
@@ -762,9 +835,10 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
}
entries = ((struct dx_node *) frame->bh->b_data)->entries;
- if (dx_get_limit(entries) != dx_node_limit (dir)) {
- ext4_warning(dir->i_sb,
- "dx entry: limit != node limit");
+ if (dx_get_limit(entries) != dx_node_limit(dir)) {
+ ext4_warning_inode(dir,
+ "dx entry: limit %u != node limit %u",
+ dx_get_limit(entries), dx_node_limit(dir));
goto fail;
}
}
@@ -773,19 +847,19 @@ fail:
brelse(frame->bh);
frame--;
}
+
if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
- ext4_warning(dir->i_sb,
- "Corrupt dir inode %lu, running e2fsck is "
- "recommended.", dir->i_ino);
+ ext4_warning_inode(dir,
+ "Corrupt directory, running e2fsck is recommended");
return ret_err;
}
-static void dx_release (struct dx_frame *frames)
+static void dx_release(struct dx_frame *frames)
{
if (frames[0].bh == NULL)
return;
- if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+ if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
brelse(frames[1].bh);
brelse(frames[0].bh);
}
@@ -878,6 +952,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
+ struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -889,6 +964,22 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ /* Check if the directory is encrypted */
+ if (ext4_encrypted_inode(dir)) {
+ err = ext4_get_encryption_info(dir);
+ if (err < 0) {
+ brelse(bh);
+ return err;
+ }
+ err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN,
+ &fname_crypto_str);
+ if (err < 0) {
+ brelse(bh);
+ return err;
+ }
+ }
+#endif
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size,
@@ -904,14 +995,38 @@ static int htree_dirblock_to_tree(struct file *dir_file,
continue;
if (de->inode == 0)
continue;
- if ((err = ext4_htree_store_dirent(dir_file,
- hinfo->hash, hinfo->minor_hash, de)) != 0) {
- brelse(bh);
- return err;
+ if (!ext4_encrypted_inode(dir)) {
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de,
+ &tmp_str);
+ } else {
+ int save_len = fname_crypto_str.len;
+
+ /* Directory is encrypted */
+ err = ext4_fname_disk_to_usr(dir, hinfo, de,
+ &fname_crypto_str);
+ if (err < 0) {
+ count = err;
+ goto errout;
+ }
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de,
+ &fname_crypto_str);
+ fname_crypto_str.len = save_len;
+ }
+ if (err != 0) {
+ count = err;
+ goto errout;
}
count++;
}
+errout:
brelse(bh);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ext4_fname_crypto_free_buffer(&fname_crypto_str);
+#endif
return count;
}
@@ -935,6 +1050,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int count = 0;
int ret, err;
__u32 hashval;
+ struct ext4_str tmp_str;
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
@@ -970,14 +1086,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
/* Add '.' and '..' from the htree header */
if (!start_hash && !start_minor_hash) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
- if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, 0, 0,
+ de, &tmp_str);
+ if (err != 0)
goto errout;
count++;
}
if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
de = ext4_next_entry(de, dir->i_sb->s_blocksize);
- if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+ tmp_str.name = de->name;
+ tmp_str.len = de->name_len;
+ err = ext4_htree_store_dirent(dir_file, 2, 0,
+ de, &tmp_str);
+ if (err != 0)
goto errout;
count++;
}
@@ -1019,12 +1143,13 @@ errout:
static inline int search_dirblock(struct buffer_head *bh,
struct inode *dir,
+ struct ext4_filename *fname,
const struct qstr *d_name,
unsigned int offset,
struct ext4_dir_entry_2 **res_dir)
{
- return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
- d_name, offset, res_dir);
+ return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ fname, d_name, offset, res_dir);
}
/*
@@ -1035,8 +1160,8 @@ static inline int search_dirblock(struct buffer_head *bh,
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
- struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+ unsigned blocksize, struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
@@ -1106,57 +1231,87 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
* `len <= EXT4_NAME_LEN' is guaranteed by caller.
* `de != NULL' is guaranteed by caller.
*/
-static inline int ext4_match (int len, const char * const name,
- struct ext4_dir_entry_2 * de)
+static inline int ext4_match(struct ext4_filename *fname,
+ struct ext4_dir_entry_2 *de)
{
- if (len != de->name_len)
- return 0;
+ const void *name = fname_name(fname);
+ u32 len = fname_len(fname);
+
if (!de->inode)
return 0;
- return !memcmp(name, de->name, len);
+
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (unlikely(!name)) {
+ if (fname->usr_fname->name[0] == '_') {
+ int ret;
+ if (de->name_len < 16)
+ return 0;
+ ret = memcmp(de->name + de->name_len - 16,
+ fname->crypto_buf.name + 8, 16);
+ return (ret == 0) ? 1 : 0;
+ }
+ name = fname->crypto_buf.name;
+ len = fname->crypto_buf.len;
+ }
+#endif
+ if (de->name_len != len)
+ return 0;
+ return (memcmp(de->name, name, len) == 0) ? 1 : 0;
}
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-int search_dir(struct buffer_head *bh,
- char *search_buf,
- int buf_size,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 **res_dir)
+int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
+ struct inode *dir, struct ext4_filename *fname,
+ const struct qstr *d_name,
+ unsigned int offset, struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- const char *name = d_name->name;
- int namelen = d_name->len;
+ int res;
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
+ if ((char *) de + de->name_len <= dlimit) {
+ res = ext4_match(fname, de);
+ if (res < 0) {
+ res = -1;
+ goto return_result;
+ }
+ if (res > 0) {
+ /* found a match - just to be sure, do
+ * a full check */
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data,
+ bh->b_size, offset)) {
+ res = -1;
+ goto return_result;
+ }
+ *res_dir = de;
+ res = 1;
+ goto return_result;
+ }
- if ((char *) de + namelen <= dlimit &&
- ext4_match (namelen, name, de)) {
- /* found a match - just to be sure, do a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
- return -1;
- *res_dir = de;
- return 1;
}
/* prevent looping on a bad block */
de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize);
- if (de_len <= 0)
- return -1;
+ if (de_len <= 0) {
+ res = -1;
+ goto return_result;
+ }
offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
}
- return 0;
+
+ res = 0;
+return_result:
+ return res;
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
@@ -1202,7 +1357,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
buffer */
int num = 0;
ext4_lblk_t nblocks;
- int i, namelen;
+ int i, namelen, retval;
+ struct ext4_filename fname;
*res_dir = NULL;
sb = dir->i_sb;
@@ -1210,14 +1366,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
if (namelen > EXT4_NAME_LEN)
return NULL;
+ retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+ if (retval)
+ return ERR_PTR(retval);
+
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
- ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir,
&has_inline_data);
if (has_inline_data) {
if (inlined)
*inlined = 1;
- return ret;
+ goto cleanup_and_exit;
}
}
@@ -1232,14 +1392,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
goto restart;
}
if (is_dx(dir)) {
- bh = ext4_dx_find_entry(dir, d_name, res_dir);
+ ret = ext4_dx_find_entry(dir, &fname, res_dir);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
- return bh;
+ if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ goto cleanup_and_exit;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
}
@@ -1270,8 +1430,10 @@ restart:
num++;
bh = ext4_getblk(NULL, dir, b++, 0);
if (unlikely(IS_ERR(bh))) {
- if (ra_max == 0)
- return bh;
+ if (ra_max == 0) {
+ ret = bh;
+ goto cleanup_and_exit;
+ }
break;
}
bh_use[ra_max] = bh;
@@ -1301,7 +1463,7 @@ restart:
goto next;
}
set_buffer_verified(bh);
- i = search_dirblock(bh, dir, d_name,
+ i = search_dirblock(bh, dir, &fname, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -1332,20 +1494,25 @@ cleanup_and_exit:
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
brelse(bh_use[ra_ptr]);
+ ext4_fname_free_filename(&fname);
return ret;
}
-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
- struct ext4_dir_entry_2 **res_dir)
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ struct ext4_filename *fname,
+ struct ext4_dir_entry_2 **res_dir)
{
struct super_block * sb = dir->i_sb;
- struct dx_hash_info hinfo;
struct dx_frame frames[2], *frame;
+ const struct qstr *d_name = fname->usr_fname;
struct buffer_head *bh;
ext4_lblk_t block;
int retval;
- frame = dx_probe(d_name, dir, &hinfo, frames);
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ *res_dir = NULL;
+#endif
+ frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return (struct buffer_head *) frame;
do {
@@ -1354,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
if (IS_ERR(bh))
goto errout;
- retval = search_dirblock(bh, dir, d_name,
+ retval = search_dirblock(bh, dir, fname, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb),
res_dir);
if (retval == 1)
@@ -1366,12 +1533,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
}
/* Check to see if we should continue to search */
- retval = ext4_htree_next_block(dir, hinfo.hash, frame,
+ retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
frames, NULL);
if (retval < 0) {
- ext4_warning(sb,
- "error %d reading index page in directory #%lu",
- retval, dir->i_ino);
+ ext4_warning_inode(dir,
+ "error %d reading directory index block",
+ retval);
bh = ERR_PTR(retval);
goto errout;
}
@@ -1417,6 +1584,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
ino);
return ERR_PTR(-EIO);
}
+ if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
+ (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) &&
+ !ext4_is_child_context_consistent_with_parent(dir,
+ inode)) {
+ iput(inode);
+ ext4_warning(inode->i_sb,
+ "Inconsistent encryption contexts: %lu/%lu\n",
+ (unsigned long) dir->i_ino,
+ (unsigned long) inode->i_ino);
+ return ERR_PTR(-EPERM);
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1429,7 +1608,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
+ bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL);
if (IS_ERR(bh))
return (struct dentry *) bh;
if (!bh)
@@ -1437,13 +1616,13 @@ struct dentry *ext4_get_parent(struct dentry *child)
ino = le32_to_cpu(de->inode);
brelse(bh);
- if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
- EXT4_ERROR_INODE(child->d_inode,
+ if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
+ EXT4_ERROR_INODE(d_inode(child),
"bad parent inode number: %u", ino);
return ERR_PTR(-EIO);
}
- return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
}
/*
@@ -1541,7 +1720,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map((struct ext4_dir_entry_2 *) data1,
+ count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
dx_sort_map(map, count);
@@ -1564,7 +1743,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
+ de2 = dx_move_dirents(data1, data2, map + split, count - split,
+ blocksize);
de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
(char *) de,
@@ -1580,8 +1760,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
initialize_dirent_tail(t, blocksize);
}
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
- dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
+ blocksize, 1));
+ dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
+ blocksize, 1));
/* Which block gets the new entry? */
if (hinfo->hash >= hash2) {
@@ -1610,23 +1792,32 @@ journal_error:
int ext4_find_dest_de(struct inode *dir, struct inode *inode,
struct buffer_head *bh,
void *buf, int buf_size,
- const char *name, int namelen,
+ struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
int nlen, rlen;
unsigned int offset = 0;
char *top;
+ int res;
de = (struct ext4_dir_entry_2 *)buf;
top = buf + buf_size - reclen;
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- buf, buf_size, offset))
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
+ buf, buf_size, offset)) {
+ res = -EIO;
+ goto return_result;
+ }
+ /* Provide crypto context and crypto buffer to ext4 match */
+ res = ext4_match(fname, de);
+ if (res < 0)
+ goto return_result;
+ if (res > 0) {
+ res = -EEXIST;
+ goto return_result;
+ }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1634,17 +1825,22 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
- if ((char *) de > top)
- return -ENOSPC;
- *dest_de = de;
- return 0;
+ if ((char *) de > top)
+ res = -ENOSPC;
+ else {
+ *dest_de = de;
+ res = 0;
+ }
+return_result:
+ return res;
}
-void ext4_insert_dentry(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- const char *name, int namelen)
+int ext4_insert_dentry(struct inode *dir,
+ struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
{
int nlen, rlen;
@@ -1653,7 +1849,7 @@ void ext4_insert_dentry(struct inode *inode,
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
- (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
de = de1;
@@ -1661,9 +1857,11 @@ void ext4_insert_dentry(struct inode *inode,
de->file_type = EXT4_FT_UNKNOWN;
de->inode = cpu_to_le32(inode->i_ino);
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
+ de->name_len = fname_len(fname);
+ memcpy(de->name, fname_name(fname), fname_len(fname));
+ return 0;
}
+
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1672,13 +1870,11 @@ void ext4_insert_dentry(struct inode *inode,
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
*/
-static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+ struct inode *dir,
struct inode *inode, struct ext4_dir_entry_2 *de,
struct buffer_head *bh)
{
- struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err;
@@ -1687,9 +1883,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode,
- bh, bh->b_data, blocksize - csum_size,
- name, namelen, &de);
+ err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ blocksize - csum_size, fname, &de);
if (err)
return err;
}
@@ -1700,8 +1895,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return err;
}
- /* By now the buffer is marked for journaling */
- ext4_insert_dentry(inode, de, blocksize, name, namelen);
+ /* By now the buffer is marked for journaling. Due to crypto operations,
+ * the following function call may fail */
+ err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
+ if (err < 0)
+ return err;
/*
* XXX shouldn't update any times until successful
@@ -1729,12 +1927,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
*/
-static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry,
struct inode *inode, struct buffer_head *bh)
{
- struct inode *dir = dentry->d_parent->d_inode;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
+ struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh2;
struct dx_root *root;
struct dx_frame frames[2], *frame;
@@ -1745,10 +1942,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
unsigned len;
int retval;
unsigned blocksize;
- struct dx_hash_info hinfo;
ext4_lblk_t block;
struct fake_dirent *fde;
- int csum_size = 0;
+ int csum_size = 0;
if (ext4_has_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -1811,11 +2007,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
/* Initialize as for dx_probe */
- hinfo.hash_version = root->info.hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
- hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- ext4fs_dirhash(name, namelen, &hinfo);
+ fname->hinfo.hash_version = root->info.hash_version;
+ if (fname->hinfo.hash_version <= DX_HASH_TEA)
+ fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
+ fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
+
memset(frames, 0, sizeof(frames));
frame = frames;
frame->entries = entries;
@@ -1830,14 +2027,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (retval)
goto out_frames;
- de = do_split(handle,dir, &bh, frame, &hinfo);
+ de = do_split(handle,dir, &bh, frame, &fname->hinfo);
if (IS_ERR(de)) {
retval = PTR_ERR(de);
goto out_frames;
}
dx_release(frames);
- retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
brelse(bh);
return retval;
out_frames:
@@ -1864,11 +2061,12 @@ out_frames:
static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
- struct buffer_head *bh;
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct buffer_head *bh = NULL;
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
struct super_block *sb;
+ struct ext4_filename fname;
int retval;
int dx_fallback=0;
unsigned blocksize;
@@ -1883,20 +2081,25 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!dentry->d_name.len)
return -EINVAL;
+ retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (retval)
+ return retval;
+
if (ext4_has_inline_data(dir)) {
- retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ retval = ext4_try_add_inline_entry(handle, &fname,
+ dentry, inode);
if (retval < 0)
- return retval;
+ goto out;
if (retval == 1) {
retval = 0;
- return retval;
+ goto out;
}
}
if (is_dx(dir)) {
- retval = ext4_dx_add_entry(handle, dentry, inode);
+ retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
- return retval;
+ goto out;
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
@@ -1904,23 +2107,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_read_dirblock(dir, block, DIRENT);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
-
- retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC) {
- brelse(bh);
- return retval;
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
}
+ retval = add_dirent_to_buf(handle, &fname, dir, inode,
+ NULL, bh);
+ if (retval != -ENOSPC)
+ goto out;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
- return make_indexed_dir(handle, dentry, inode, bh);
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ retval = make_indexed_dir(handle, &fname, dentry,
+ inode, bh);
+ bh = NULL; /* make_indexed_dir releases bh */
+ goto out;
+ }
brelse(bh);
}
bh = ext4_append(handle, dir, &block);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ retval = PTR_ERR(bh);
+ bh = NULL;
+ goto out;
+ }
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
@@ -1930,7 +2141,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
initialize_dirent_tail(t, blocksize);
}
- retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
+out:
+ ext4_fname_free_filename(&fname);
brelse(bh);
if (retval == 0)
ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -1940,19 +2153,18 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
/*
* Returns 0 for success, or a negative error value
*/
-static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+ struct dentry *dentry, struct inode *inode)
{
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
- struct dx_hash_info hinfo;
struct buffer_head *bh;
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
+ frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
return PTR_ERR(frame);
entries = frame->entries;
@@ -1969,7 +2181,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
- err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
if (err != -ENOSPC)
goto cleanup;
@@ -1987,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels && (dx_get_count(frames->entries) ==
dx_get_limit(frames->entries))) {
- ext4_warning(sb, "Directory index full!");
+ ext4_warning_inode(dir, "Directory index full!");
err = -ENOSPC;
goto cleanup;
}
@@ -2065,12 +2277,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
}
}
- de = do_split(handle, dir, &bh, frame, &hinfo);
+ de = do_split(handle, dir, &bh, frame, &fname->hinfo);
if (IS_ERR(de)) {
err = PTR_ERR(de);
goto cleanup;
}
- err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
goto cleanup;
journal_error:
@@ -2235,10 +2447,7 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- if (test_opt(inode->i_sb, DAX))
- inode->i_fop = &ext4_dax_file_operations;
- else
- inode->i_fop = &ext4_file_operations;
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
@@ -2302,10 +2511,7 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- if (test_opt(inode->i_sb, DAX))
- inode->i_fop = &ext4_dax_file_operations;
- else
- inode->i_fop = &ext4_file_operations;
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
@@ -2456,7 +2662,7 @@ out_stop:
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-static int empty_dir(struct inode *inode)
+int ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
@@ -2484,12 +2690,9 @@ static int empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) bh->b_data;
de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
- !le32_to_cpu(de1->inode) ||
- strcmp(".", de->name) ||
- strcmp("..", de1->name)) {
- ext4_warning(inode->i_sb,
- "bad directory (dir #%lu) - no `.' or `..'",
- inode->i_ino);
+ le32_to_cpu(de1->inode) == 0 ||
+ strcmp(".", de->name) || strcmp("..", de1->name)) {
+ ext4_warning_inode(inode, "directory missing '.' and/or '..'");
brelse(bh);
return 1;
}
@@ -2708,7 +2911,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go in
* separate transaction */
dquot_initialize(dir);
- dquot_initialize(dentry->d_inode);
+ dquot_initialize(d_inode(dentry));
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -2717,14 +2920,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (!bh)
goto end_rmdir;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
retval = -EIO;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_rmdir;
retval = -ENOTEMPTY;
- if (!empty_dir(inode))
+ if (!ext4_empty_dir(inode))
goto end_rmdir;
handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -2742,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
- ext4_warning(inode->i_sb,
- "empty directory has too many links (%d)",
+ ext4_warning_inode(inode,
+ "empty directory '%.*s' has too many links (%u)",
+ dentry->d_name.len, dentry->d_name.name,
inode->i_nlink);
inode->i_version++;
clear_nlink(inode);
@@ -2777,7 +2981,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
/* Initialize quotas before so that eventual writes go
* in separate transaction */
dquot_initialize(dir);
- dquot_initialize(dentry->d_inode);
+ dquot_initialize(d_inode(dentry));
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
@@ -2786,7 +2990,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (!bh)
goto end_unlink;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
retval = -EIO;
if (le32_to_cpu(de->inode) != inode->i_ino)
@@ -2803,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (!inode->i_nlink) {
- ext4_warning(inode->i_sb,
- "Deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
+ if (inode->i_nlink == 0) {
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
set_nlink(inode, 1);
}
retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2834,16 +3037,38 @@ static int ext4_symlink(struct inode *dir,
{
handle_t *handle;
struct inode *inode;
- int l, err, retries = 0;
+ int err, len = strlen(symname);
int credits;
+ bool encryption_required;
+ struct ext4_str disk_link;
+ struct ext4_encrypted_symlink_data *sd = NULL;
- l = strlen(symname)+1;
- if (l > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
+ disk_link.len = len + 1;
+ disk_link.name = (char *) symname;
+
+ encryption_required = (ext4_encrypted_inode(dir) ||
+ DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
+ if (encryption_required) {
+ err = ext4_get_encryption_info(dir);
+ if (err)
+ return err;
+ if (ext4_encryption_info(dir) == NULL)
+ return -EPERM;
+ disk_link.len = (ext4_fname_encrypted_size(dir, len) +
+ sizeof(struct ext4_encrypted_symlink_data));
+ sd = kzalloc(disk_link.len, GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto err_free_sd;
+ }
dquot_initialize(dir);
- if (l > EXT4_N_BLOCKS * 4) {
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
/*
* For non-fast symlinks, we just allocate inode and put it on
* orphan list in the first transaction => we need bitmap,
@@ -2862,17 +3087,37 @@ static int ext4_symlink(struct inode *dir,
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
-retry:
+
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
&dentry->d_name, 0, NULL,
EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_stop;
+ if (IS_ERR(inode)) {
+ if (handle)
+ ext4_journal_stop(handle);
+ err = PTR_ERR(inode);
+ goto err_free_sd;
+ }
+
+ if (encryption_required) {
+ struct qstr istr;
+ struct ext4_str ostr;
+
+ istr.name = (const unsigned char *) symname;
+ istr.len = len;
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+ err = ext4_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err < 0)
+ goto err_drop_inode;
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *) sd;
+ inode->i_op = &ext4_encrypted_symlink_inode_operations;
+ }
- if (l > EXT4_N_BLOCKS * 4) {
- inode->i_op = &ext4_symlink_inode_operations;
+ if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
+ if (!encryption_required)
+ inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
* We cannot call page_symlink() with transaction started
@@ -2887,9 +3132,10 @@ retry:
drop_nlink(inode);
err = ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
+ handle = NULL;
if (err)
goto err_drop_inode;
- err = __page_symlink(inode, symname, l, 1);
+ err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
if (err)
goto err_drop_inode;
/*
@@ -2901,36 +3147,41 @@ retry:
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
+ handle = NULL;
goto err_drop_inode;
}
set_nlink(inode, 1);
err = ext4_orphan_del(handle, inode);
- if (err) {
- ext4_journal_stop(handle);
- clear_nlink(inode);
+ if (err)
goto err_drop_inode;
- }
} else {
/* clear the extent format for fast symlink */
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
- inode->i_op = &ext4_fast_symlink_inode_operations;
- memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
- inode->i_size = l-1;
+ if (!encryption_required) {
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ inode->i_link = (char *)&EXT4_I(inode)->i_data;
+ }
+ memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
+ disk_link.len);
+ inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
ext4_handle_sync(handle);
-out_stop:
if (handle)
ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
- goto retry;
+ kfree(sd);
return err;
err_drop_inode:
+ if (handle)
+ ext4_journal_stop(handle);
+ clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
+err_free_sd:
+ kfree(sd);
return err;
}
@@ -2938,12 +3189,14 @@ static int ext4_link(struct dentry *old_dentry,
struct inode *dir, struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int err, retries = 0;
if (inode->i_nlink >= EXT4_LINK_MAX)
return -EMLINK;
-
+ if (ext4_encrypted_inode(dir) &&
+ !ext4_is_child_context_consistent_with_parent(dir, inode))
+ return -EPERM;
dquot_initialize(dir);
retry:
@@ -3144,9 +3397,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
}
if (retval) {
- ext4_warning(ent->dir->i_sb,
- "Deleting old file (%lu), %d, error=%d",
- ent->dir->i_ino, ent->dir->i_nlink, retval);
+ ext4_warning_inode(ent->dir,
+ "Deleting old file: nlink %d, error=%d",
+ ent->dir->i_nlink, retval);
}
}
@@ -3210,12 +3463,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ext4_renament old = {
.dir = old_dir,
.dentry = old_dentry,
- .inode = old_dentry->d_inode,
+ .inode = d_inode(old_dentry),
};
struct ext4_renament new = {
.dir = new_dir,
.dentry = new_dentry,
- .inode = new_dentry->d_inode,
+ .inode = d_inode(new_dentry),
};
int force_reread;
int retval;
@@ -3244,6 +3497,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
+ if ((old.dir != new.dir) &&
+ ext4_encrypted_inode(new.dir) &&
+ !ext4_is_child_context_consistent_with_parent(new.dir,
+ old.inode)) {
+ retval = -EPERM;
+ goto end_rename;
+ }
+
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
@@ -3264,12 +3525,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
if (!(flags & RENAME_WHITEOUT)) {
handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rename;
+ }
} else {
whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
- if (IS_ERR(whiteout))
- return PTR_ERR(whiteout);
+ if (IS_ERR(whiteout)) {
+ retval = PTR_ERR(whiteout);
+ whiteout = NULL;
+ goto end_rename;
+ }
}
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
@@ -3278,7 +3545,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old.inode->i_mode)) {
if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir(new.inode))
+ if (!ext4_empty_dir(new.inode))
goto end_rename;
} else {
retval = -EMLINK;
@@ -3352,8 +3619,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_dec_count(handle, old.dir);
if (new.inode) {
- /* checked empty_dir above, can't have another parent,
- * ext4_dec_count() won't work for many-linked dirs */
+ /* checked ext4_empty_dir above, can't have another
+ * parent, ext4_dec_count() won't work for many-linked
+ * dirs */
clear_nlink(new.inode);
} else {
ext4_inc_count(handle, new.dir);
@@ -3391,16 +3659,25 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ext4_renament old = {
.dir = old_dir,
.dentry = old_dentry,
- .inode = old_dentry->d_inode,
+ .inode = d_inode(old_dentry),
};
struct ext4_renament new = {
.dir = new_dir,
.dentry = new_dentry,
- .inode = new_dentry->d_inode,
+ .inode = d_inode(new_dentry),
};
u8 new_file_type;
int retval;
+ if ((ext4_encrypted_inode(old_dir) ||
+ ext4_encrypted_inode(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!ext4_is_child_context_consistent_with_parent(new_dir,
+ old.inode) ||
+ !ext4_is_child_context_consistent_with_parent(old_dir,
+ new.inode)))
+ return -EPERM;
+
dquot_initialize(old.dir);
dquot_initialize(new.dir);
@@ -3433,8 +3710,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
(2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+ handle = NULL;
+ goto end_rename;
+ }
if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a2541a9ba..5602450f03f6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -8,7 +8,6 @@
#include <linux/fs.h>
#include <linux/time.h>
-#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
@@ -18,14 +17,12 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
-#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/ratelimit.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -69,6 +66,10 @@ static void ext4_finish_bio(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct page *data_page = NULL;
+ struct ext4_crypto_ctx *ctx = NULL;
+#endif
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
unsigned bio_end = bio_start + bvec->bv_len;
@@ -78,6 +79,15 @@ static void ext4_finish_bio(struct bio *bio)
if (!page)
continue;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (!page->mapping) {
+ /* The bounce data pages are unmapped. */
+ data_page = page;
+ ctx = (struct ext4_crypto_ctx *)page_private(data_page);
+ page = ctx->w.control_page;
+ }
+#endif
+
if (error) {
SetPageError(page);
set_bit(AS_EIO, &page->mapping->flags);
@@ -102,8 +112,13 @@ static void ext4_finish_bio(struct bio *bio)
} while ((bh = bh->b_this_page) != head);
bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
local_irq_restore(flags);
- if (!under_io)
+ if (!under_io) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (ctx)
+ ext4_restore_control_page(data_page);
+#endif
end_page_writeback(page);
+ }
}
}
@@ -344,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
bio_get(io->io_bio);
submit_bio(io->io_op, io->io_bio);
- BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
bio_put(io->io_bio);
}
io->io_bio = NULL;
@@ -378,6 +392,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
+ struct page *page,
struct buffer_head *bh)
{
int ret;
@@ -391,7 +406,7 @@ submit_and_retry:
if (ret)
return ret;
}
- ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
+ ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
io->io_next_block++;
@@ -404,6 +419,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct writeback_control *wbc,
bool keep_towrite)
{
+ struct page *data_page = NULL;
struct inode *inode = page->mapping->host;
unsigned block_start, blocksize;
struct buffer_head *bh, *head;
@@ -463,19 +479,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
set_buffer_async_write(bh);
} while ((bh = bh->b_this_page) != head);
- /* Now submit buffers to write */
bh = head = page_buffers(page);
+
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ data_page = ext4_encrypt(inode, page);
+ if (IS_ERR(data_page)) {
+ ret = PTR_ERR(data_page);
+ data_page = NULL;
+ goto out;
+ }
+ }
+
+ /* Now submit buffers to write */
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bh);
+ ret = io_submit_add_bh(io, inode,
+ data_page ? data_page : page, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
* we can do but mark the page as dirty, and
* better luck next time.
*/
- redirty_page_for_writepage(wbc, page);
break;
}
nr_submitted++;
@@ -484,6 +510,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/* Error stopped previous loop? Clean up buffers... */
if (ret) {
+ out:
+ if (data_page)
+ ext4_restore_control_page(data_page);
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
do {
clear_buffer_async_write(bh);
bh = bh->b_this_page;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
new file mode 100644
index 000000000000..ec3ef93a52db
--- /dev/null
+++ b/fs/ext4/readpage.c
@@ -0,0 +1,328 @@
+/*
+ * linux/fs/ext4/readpage.c
+ *
+ * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This was originally taken from fs/mpage.c
+ *
+ * The intent is the ext4_mpage_readpages() function here is intended
+ * to replace mpage_readpages() in the general case, not just for
+ * encrypted files. It has some limitations (see below), where it
+ * will fall back to read_block_full_page(), but these limitations
+ * should only be hit when page_size != block_size.
+ *
+ * This will allow us to attach a callback function to support ext4
+ * encryption.
+ *
+ * If anything unusual happens, such as:
+ *
+ * - encountering a page which has buffers
+ * - encountering a page which has a non-hole after a hole
+ * - encountering a page with non-contiguous blocks
+ *
+ * then this code just gives up and calls the buffer_head-based read function.
+ * It does handle a page which has holes at the end - that is a common case:
+ * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/kdev_t.h>
+#include <linux/gfp.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/highmem.h>
+#include <linux/prefetch.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/cleancache.h>
+
+#include "ext4.h"
+
+/*
+ * Call ext4_decrypt on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ struct ext4_crypto_ctx *ctx =
+ container_of(work, struct ext4_crypto_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ int ret = ext4_decrypt(ctx, page);
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ ext4_release_crypto_ctx(ctx);
+ bio_put(bio);
+#else
+ BUG();
+#endif
+}
+
+static inline bool ext4_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ return unlikely(bio->bi_private != NULL);
+#else
+ return false;
+#endif
+}
+
+/*
+ * I/O completion handler for multipage BIOs.
+ *
+ * The mpage code never puts partial pages into a BIO (except for end-of-file).
+ * If a page does not map to a contiguous run of blocks then it simply falls
+ * back to block_read_full_page().
+ *
+ * Why is this? If a page's completion depends on a number of different BIOs
+ * which can complete in any order (or at the same time) then determining the
+ * status of that page is hard. See end_buffer_async_read() for the details.
+ * There is no point in duplicating all that complexity.
+ */
+static void mpage_end_io(struct bio *bio, int err)
+{
+ struct bio_vec *bv;
+ int i;
+
+ if (ext4_bio_encrypted(bio)) {
+ struct ext4_crypto_ctx *ctx = bio->bi_private;
+
+ if (err) {
+ ext4_release_crypto_ctx(ctx);
+ } else {
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(ext4_read_workqueue, &ctx->r.work);
+ return;
+ }
+ }
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+
+ bio_put(bio);
+}
+
+int ext4_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ sector_t last_block_in_bio = 0;
+
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ sector_t block_in_file;
+ sector_t last_block;
+ sector_t last_block_in_file;
+ sector_t blocks[MAX_BUF_PER_PAGE];
+ unsigned page_block;
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int length;
+ unsigned relative_block = 0;
+ struct ext4_map_blocks map;
+
+ map.m_pblk = 0;
+ map.m_lblk = 0;
+ map.m_len = 0;
+ map.m_flags = 0;
+
+ for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+ int fully_mapped = 1;
+ unsigned first_hole = blocks_per_page;
+
+ prefetchw(&page->flags);
+ if (pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL))
+ goto next_page;
+ }
+
+ if (page_has_buffers(page))
+ goto confused;
+
+ block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ last_block = block_in_file + nr_pages * blocks_per_page;
+ last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ if (last_block > last_block_in_file)
+ last_block = last_block_in_file;
+ page_block = 0;
+
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & EXT4_MAP_MAPPED) &&
+ block_in_file > map.m_lblk &&
+ block_in_file < (map.m_lblk + map.m_len)) {
+ unsigned map_offset = block_in_file - map.m_lblk;
+ unsigned last = map.m_len - map_offset;
+
+ for (relative_block = 0; ; relative_block++) {
+ if (relative_block == last) {
+ /* needed? */
+ map.m_flags &= ~EXT4_MAP_MAPPED;
+ break;
+ }
+ if (page_block == blocks_per_page)
+ break;
+ blocks[page_block] = map.m_pblk + map_offset +
+ relative_block;
+ page_block++;
+ block_in_file++;
+ }
+ }
+
+ /*
+ * Then do more ext4_map_blocks() calls until we are
+ * done with this page.
+ */
+ while (page_block < blocks_per_page) {
+ if (block_in_file < last_block) {
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
+
+ if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
+ set_error_page:
+ SetPageError(page);
+ zero_user_segment(page, 0,
+ PAGE_CACHE_SIZE);
+ unlock_page(page);
+ goto next_page;
+ }
+ }
+ if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
+ fully_mapped = 0;
+ if (first_hole == blocks_per_page)
+ first_hole = page_block;
+ page_block++;
+ block_in_file++;
+ continue;
+ }
+ if (first_hole != blocks_per_page)
+ goto confused; /* hole -> non-hole */
+
+ /* Contiguous blocks? */
+ if (page_block && blocks[page_block-1] != map.m_pblk-1)
+ goto confused;
+ for (relative_block = 0; ; relative_block++) {
+ if (relative_block == map.m_len) {
+ /* needed? */
+ map.m_flags &= ~EXT4_MAP_MAPPED;
+ break;
+ } else if (page_block == blocks_per_page)
+ break;
+ blocks[page_block] = map.m_pblk+relative_block;
+ page_block++;
+ block_in_file++;
+ }
+ }
+ if (first_hole != blocks_per_page) {
+ zero_user_segment(page, first_hole << blkbits,
+ PAGE_CACHE_SIZE);
+ if (first_hole == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto next_page;
+ }
+ } else if (fully_mapped) {
+ SetPageMappedToDisk(page);
+ }
+ if (fully_mapped && blocks_per_page == 1 &&
+ !PageUptodate(page) && cleancache_get_page(page) == 0) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (last_block_in_bio != blocks[0] - 1)) {
+ submit_and_realloc:
+ submit_bio(READ, bio);
+ bio = NULL;
+ }
+ if (bio == NULL) {
+ struct ext4_crypto_ctx *ctx = NULL;
+
+ if (ext4_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode)) {
+ ctx = ext4_get_crypto_ctx(inode);
+ if (IS_ERR(ctx))
+ goto set_error_page;
+ }
+ bio = bio_alloc(GFP_KERNEL,
+ min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+ if (!bio) {
+ if (ctx)
+ ext4_release_crypto_ctx(ctx);
+ goto set_error_page;
+ }
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_end_io = mpage_end_io;
+ bio->bi_private = ctx;
+ }
+
+ length = first_hole << blkbits;
+ if (bio_add_page(bio, page, length, 0) < length)
+ goto submit_and_realloc;
+
+ if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
+ (relative_block == map.m_len)) ||
+ (first_hole != blocks_per_page)) {
+ submit_bio(READ, bio);
+ bio = NULL;
+ } else
+ last_block_in_bio = blocks[blocks_per_page - 1];
+ goto next_page;
+ confused:
+ if (bio) {
+ submit_bio(READ, bio);
+ bio = NULL;
+ }
+ if (!PageUptodate(page))
+ block_read_full_page(page, ext4_get_block);
+ else
+ unlock_page(page);
+ next_page:
+ if (pages)
+ page_cache_release(page);
+ }
+ BUG_ON(pages && !list_empty(pages));
+ if (bio)
+ submit_bio(READ, bio);
+ return 0;
+}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 8a8ec6293b19..cf0c472047e3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb,
goto exit;
/*
* We will always be modifying at least the superblock and GDT
- * block. If we are adding a group past the last current GDT block,
+ * blocks. If we are adding a group past the last current GDT block,
* we will also modify the inode and the dindirect block. If we
* are adding a group with superblock/GDT backups we will also
* modify each of the reserved GDT dindirect blocks.
*/
- credit = flex_gd->count * 4 + reserved_gdb;
+ credit = 3; /* sb, resize inode, resize inode dindirect */
+ /* GDT blocks */
+ credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb));
+ credit += reserved_gdb; /* Reserved GDT dindirect blocks */
handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e061e66c8280..58987b5c514b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -21,10 +21,10 @@
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/vmalloc.h>
-#include <linux/jbd2.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
@@ -295,6 +295,8 @@ static void __save_error_info(struct super_block *sb, const char *func,
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ if (bdev_read_only(sb->s_bdev))
+ return;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
es->s_last_error_time = cpu_to_le32(get_seconds());
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
@@ -323,22 +325,6 @@ static void save_error_info(struct super_block *sb, const char *func,
ext4_commit_super(sb, 1);
}
-/*
- * The del_gendisk() function uninitializes the disk-specific data
- * structures, including the bdi structure, without telling anyone
- * else. Once this happens, any attempt to call mark_buffer_dirty()
- * (for example, by ext4_commit_super), will cause a kernel OOPS.
- * This is a kludge to prevent these oops until we can put in a proper
- * hook in del_gendisk() to inform the VFS and file system layers.
- */
-static int block_device_ejected(struct super_block *sb)
-{
- struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
-
- return bdi->dev == NULL;
-}
-
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
@@ -466,7 +452,7 @@ void __ext4_error_file(struct file *file, const char *function,
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
if (ext4_error_ratelimit(inode->i_sb)) {
- path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ path = file_path(file, pathname, sizeof(pathname));
if (IS_ERR(path))
path = "(unknown)";
va_start(args, fmt);
@@ -606,14 +592,17 @@ void __ext4_msg(struct super_block *sb,
va_end(args);
}
+#define ext4_warning_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
+ "EXT4-fs warning")
+
void __ext4_warning(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
- "EXT4-fs warning"))
+ if (!ext4_warning_ratelimit(sb))
return;
va_start(args, fmt);
@@ -624,6 +613,24 @@ void __ext4_warning(struct super_block *sb, const char *function,
va_end(args);
}
+void __ext4_warning_inode(const struct inode *inode, const char *function,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (!ext4_warning_ratelimit(inode->i_sb))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
+ function, line, inode->i_ino, current->comm, &vaf);
+ va_end(args);
+}
+
void __ext4_grp_locked_error(const char *function, unsigned int line,
struct super_block *sb, ext4_group_t grp,
unsigned long ino, ext4_fsblk_t block,
@@ -822,6 +829,7 @@ static void ext4_put_super(struct super_block *sb)
dump_orphan_list(sb, sbi);
J_ASSERT(list_empty(&sbi->s_orphan));
+ sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
/*
@@ -893,7 +901,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
-
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ ei->i_crypt_info = NULL;
+#endif
return &ei->vfs_inode;
}
@@ -970,6 +980,10 @@ void ext4_clear_inode(struct inode *inode)
jbd2_free_inode(EXT4_I(inode)->jinode);
EXT4_I(inode)->jinode = NULL;
}
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ if (EXT4_I(inode)->i_crypt_info)
+ ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
+#endif
}
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1076,7 +1090,7 @@ static const struct quotactl_ops ext4_qctl_operations = {
.quota_on = ext4_quota_on,
.quota_off = ext4_quota_off,
.quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
+ .get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
@@ -1120,7 +1134,7 @@ enum {
Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
@@ -1211,6 +1225,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+ {Opt_test_dummy_encryption, "test_dummy_encryption"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1412,6 +1427,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+ {Opt_test_dummy_encryption, 0, MOPT_GTE0},
{Opt_err, 0, 0}
};
@@ -1568,7 +1584,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
- journal_inode = path.dentry->d_inode;
+ journal_inode = d_inode(path.dentry);
if (!S_ISBLK(journal_inode->i_mode)) {
ext4_msg(sb, KERN_ERR, "error: journal path %s "
"is not a block device", journal_path);
@@ -1588,6 +1604,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
*journal_ioprio =
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+ } else if (token == Opt_test_dummy_encryption) {
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+ sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mode enabled");
+#else
+ ext4_msg(sb, KERN_WARNING,
+ "Test dummy encryption mount option ignored");
+#endif
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
@@ -2685,11 +2710,13 @@ static struct attribute *ext4_attrs[] = {
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
EXT4_INFO_ATTR(meta_bg_resize);
+EXT4_INFO_ATTR(encryption);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
+ ATTR_LIST(encryption),
NULL,
};
@@ -3419,7 +3446,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
- char *cp;
const char *descr;
int ret = -ENOMEM;
int blocksize, clustersize;
@@ -3450,8 +3476,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
part_stat_read(sb->s_bdev->bd_part, sectors[1]);
/* Cleanup superblock name */
- for (cp = sb->s_id; (cp = strchr(cp, '/'));)
- *cp = '!';
+ strreplace(sb->s_id, '/', '!');
/* -EINVAL is default */
ret = -EINVAL;
@@ -3692,6 +3717,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
+ es->s_encryption_level) {
+ ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
+ es->s_encryption_level);
+ goto failed_mount;
+ }
+
if (sb->s_blocksize != blocksize) {
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
@@ -4054,6 +4086,21 @@ no_journal:
}
}
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ (blocksize != PAGE_CACHE_SIZE)) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported blocksize for fs encryption");
+ goto failed_mount_wq;
+ }
+
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
+ !(sb->s_flags & MS_RDONLY) &&
+ !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ ext4_commit_super(sb, 1);
+ }
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
@@ -4570,7 +4617,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
- if (!sbh || block_device_ejected(sb))
+ if (!sbh)
return error;
if (buffer_write_io_error(sbh)) {
/*
@@ -4923,6 +4970,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
}
+ if (*flags & MS_LAZYTIME)
+ sb->s_flags |= MS_LAZYTIME;
+
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
@@ -5199,7 +5249,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -5247,7 +5297,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* all updates to the file when we bypass pagecache...
*/
if (EXT4_SB(sb)->s_journal &&
- ext4_should_journal_data(path->dentry->d_inode)) {
+ ext4_should_journal_data(d_inode(path->dentry))) {
/*
* We don't need to lock updates but journal_flush() could
* otherwise be livelocked...
@@ -5390,6 +5440,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
struct inode *inode = sb_dqopt(sb)->files[type];
ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
int err, offset = off & (sb->s_blocksize - 1);
+ int retries = 0;
struct buffer_head *bh;
handle_t *handle = journal_current_handle();
@@ -5410,7 +5461,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
return -EIO;
}
- bh = ext4_bread(handle, inode, blk, 1);
+ do {
+ bh = ext4_bread(handle, inode, blk,
+ EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_METADATA_NOFAIL);
+ } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
+ ext4_should_retry_alloc(inode->i_sb, &retries));
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
@@ -5627,6 +5683,7 @@ out7:
static void __exit ext4_exit_fs(void)
{
+ ext4_exit_crypto();
ext4_destroy_lazyinit_thread();
unregister_as_ext2();
unregister_as_ext3();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ff3711932018..c677f2c1044b 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -18,18 +18,89 @@
*/
#include <linux/fs.h>
-#include <linux/jbd2.h>
#include <linux/namei.h>
#include "ext4.h"
#include "xattr.h"
-static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
{
- struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
- nd_set_link(nd, (char *) ei->i_data);
- return NULL;
+ struct page *cpage = NULL;
+ char *caddr, *paddr = NULL;
+ struct ext4_str cstr, pstr;
+ struct inode *inode = d_inode(dentry);
+ struct ext4_encrypted_symlink_data *sd;
+ loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
+ int res;
+ u32 plen, max_size = inode->i_sb->s_blocksize;
+
+ res = ext4_get_encryption_info(inode);
+ if (res)
+ return ERR_PTR(res);
+
+ if (ext4_inode_is_fast_symlink(inode)) {
+ caddr = (char *) EXT4_I(inode)->i_data;
+ max_size = sizeof(EXT4_I(inode)->i_data);
+ } else {
+ cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+ if (IS_ERR(cpage))
+ return ERR_CAST(cpage);
+ caddr = kmap(cpage);
+ caddr[size] = 0;
+ }
+
+ /* Symlink is encrypted */
+ sd = (struct ext4_encrypted_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le32_to_cpu(sd->len);
+ if ((cstr.len +
+ sizeof(struct ext4_encrypted_symlink_data) - 1) >
+ max_size) {
+ /* Symlink data on the disk is corrupted */
+ res = -EIO;
+ goto errout;
+ }
+ plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
+ EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
+ paddr = kmalloc(plen + 1, GFP_NOFS);
+ if (!paddr) {
+ res = -ENOMEM;
+ goto errout;
+ }
+ pstr.name = paddr;
+ pstr.len = plen;
+ res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+ if (res < 0)
+ goto errout;
+ /* Null-terminate the name */
+ if (res <= plen)
+ paddr[res] = '\0';
+ if (cpage) {
+ kunmap(cpage);
+ page_cache_release(cpage);
+ }
+ return *cookie = paddr;
+errout:
+ if (cpage) {
+ kunmap(cpage);
+ page_cache_release(cpage);
+ }
+ kfree(paddr);
+ return ERR_PTR(res);
}
+const struct inode_operations ext4_encrypted_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = ext4_encrypted_follow_link,
+ .put_link = kfree_put_link,
+ .setattr = ext4_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+};
+#endif
+
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
@@ -43,7 +114,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1e09fc77395c..16e28c08d1e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -55,7 +55,6 @@
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
-#include <linux/rwsem.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
@@ -179,7 +178,7 @@ ext4_xattr_handler(int name_index)
/*
* Inode operation listxattr()
*
- * dentry->d_inode->i_mutex: don't care
+ * d_inode(dentry)->i_mutex: don't care
*/
ssize_t
ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -424,7 +423,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
static int
ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct buffer_head *bh = NULL;
int error;
struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
@@ -461,7 +460,7 @@ cleanup:
static int
ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
@@ -502,7 +501,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int ret, ret2;
- down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
+ down_read(&EXT4_I(d_inode(dentry))->xattr_sem);
ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
if (ret < 0)
goto errout;
@@ -515,7 +514,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
goto errout;
ret += ret2;
errout:
- up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
+ up_read(&EXT4_I(d_inode(dentry))->xattr_sem);
return ret;
}
@@ -639,8 +638,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
free += EXT4_XATTR_LEN(name_len);
}
if (i->value) {
- if (free < EXT4_XATTR_SIZE(i->value_len) ||
- free < EXT4_XATTR_LEN(name_len) +
+ if (free < EXT4_XATTR_LEN(name_len) +
EXT4_XATTR_SIZE(i->value_len))
return -ENOSPC;
}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 29bedf5589f6..ddc0957760ba 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -23,6 +23,7 @@
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
+#define EXT4_XATTR_INDEX_ENCRYPTION 9
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
@@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
+#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
+
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index d2a200624af5..95d90e0560f0 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -33,7 +33,7 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
@@ -43,7 +43,7 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 95f1f4ab59a4..891ee2ddfbd6 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -36,7 +36,7 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
@@ -46,7 +46,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 0edb7611ffbe..6ed932b3c043 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -37,7 +37,7 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
name, buffer, size);
}
@@ -49,7 +49,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 94e2d2ffabe1..c629762005bc 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,5 +1,5 @@
config F2FS_FS
- tristate "F2FS filesystem support (EXPERIMENTAL)"
+ tristate "F2FS filesystem support"
depends on BLOCK
help
F2FS is based on Log-structured File System (LFS), which supports
@@ -72,6 +72,25 @@ config F2FS_CHECK_FS
If you want to improve the performance, say N.
+config F2FS_FS_ENCRYPTION
+ bool "F2FS Encryption"
+ depends on F2FS_FS
+ depends on F2FS_FS_XATTR
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of f2fs files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
+
config F2FS_IO_TRACE
bool "F2FS IO tracer"
depends on F2FS_FS
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index d92397731db8..396be1a39e55 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -6,3 +6,5 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
+f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
+ crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 742202779bd5..c8f25f7241f0 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -334,50 +334,48 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
struct page *dpage)
{
struct posix_acl *p;
+ struct posix_acl *clone;
int ret;
+ *acl = NULL;
+ *default_acl = NULL;
+
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
- goto no_acl;
+ return 0;
p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
- if (IS_ERR(p)) {
- if (p == ERR_PTR(-EOPNOTSUPP))
- goto apply_umask;
- return PTR_ERR(p);
+ if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+ *mode &= ~current_umask();
+ return 0;
}
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- if (!p)
- goto apply_umask;
-
- *acl = f2fs_acl_clone(p, GFP_NOFS);
- if (!*acl)
- return -ENOMEM;
+ clone = f2fs_acl_clone(p, GFP_NOFS);
+ if (!clone)
+ goto no_mem;
- ret = f2fs_acl_create_masq(*acl, mode);
- if (ret < 0) {
- posix_acl_release(*acl);
- return -ENOMEM;
- }
+ ret = f2fs_acl_create_masq(clone, mode);
+ if (ret < 0)
+ goto no_mem_clone;
- if (ret == 0) {
- posix_acl_release(*acl);
- *acl = NULL;
- }
+ if (ret == 0)
+ posix_acl_release(clone);
+ else
+ *acl = clone;
- if (!S_ISDIR(*mode)) {
+ if (!S_ISDIR(*mode))
posix_acl_release(p);
- *default_acl = NULL;
- } else {
+ else
*default_acl = p;
- }
- return 0;
-apply_umask:
- *mode &= ~current_umask();
-no_acl:
- *default_acl = NULL;
- *acl = NULL;
return 0;
+
+no_mem_clone:
+ posix_acl_release(clone);
+no_mem:
+ posix_acl_release(p);
+ return -ENOMEM;
}
int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7f794b72b3b7..b70bbe1a6a8c 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -52,9 +52,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = META,
.rw = READ_SYNC | REQ_META | REQ_PRIO,
.blk_addr = index,
+ .encrypted_page = NULL,
};
repeat:
page = grab_cache_page(mapping, index);
@@ -65,7 +67,9 @@ repeat:
if (PageUptodate(page))
goto out;
- if (f2fs_submit_page_bio(sbi, page, &fio))
+ fio.page = page;
+
+ if (f2fs_submit_page_bio(&fio))
goto repeat;
lock_page(page);
@@ -77,8 +81,7 @@ out:
return page;
}
-static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi,
- block_t blkaddr, int type)
+bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
{
switch (type) {
case META_NAT:
@@ -118,8 +121,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
struct page *page;
block_t blkno = start;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
+ .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .encrypted_page = NULL,
};
for (; nrpages-- > 0; blkno++) {
@@ -161,7 +166,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
continue;
}
- f2fs_submit_page_mbio(sbi, page, &fio);
+ fio.page = page;
+ f2fs_submit_page_mbio(&fio);
f2fs_put_page(page, 0);
}
out:
@@ -276,7 +282,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- if (f2fs_write_meta_page(page, &wbc)) {
+ if (mapping->a_ops->writepage(page, &wbc)) {
unlock_page(page);
break;
}
@@ -464,20 +470,19 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
void recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
- block_t start_blk, orphan_blkaddr, i, j;
+ block_t start_blk, orphan_blocks, i, j;
if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
return;
set_sbi_flag(sbi, SBI_POR_DOING);
- start_blk = __start_cp_addr(sbi) + 1 +
- le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
- orphan_blkaddr = __start_sum_addr(sbi) - 1;
+ start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
+ orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+ ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
- for (i = 0; i < orphan_blkaddr; i++) {
+ for (i = 0; i < orphan_blocks; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
struct f2fs_orphan_block *orphan_blk;
@@ -511,7 +516,12 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
grab_meta_page(sbi, start_blk + index);
index = 1;
- spin_lock(&im->ino_lock);
+
+ /*
+ * we don't need to do spin_lock(&im->ino_lock) here, since all the
+ * orphan inode operations are covered under f2fs_lock_op().
+ * And, spin_lock should be avoided due to page operations below.
+ */
head = &im->ino_list;
/* loop for each orphan inode entry and write them in Jornal block */
@@ -551,8 +561,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
set_page_dirty(page);
f2fs_put_page(page, 1);
}
-
- spin_unlock(&im->ino_lock);
}
static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -615,7 +623,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
unsigned long blk_size = sbi->blocksize;
unsigned long long cp1_version = 0, cp2_version = 0;
unsigned long long cp_start_blk_no;
- unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+ unsigned int cp_blks = 1 + __cp_payload(sbi);
block_t cp_blk_no;
int i;
@@ -796,6 +804,7 @@ retry:
* wribacking dentry pages in the freeing inode.
*/
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ cond_resched();
}
goto retry;
}
@@ -879,12 +888,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num;
nid_t last_nid = nm_i->next_scan_nid;
block_t start_blk;
- struct page *cp_page;
unsigned int data_sum_blocks, orphan_blocks;
__u32 crc32 = 0;
- void *kaddr;
int i;
- int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+ int cp_payload_blks = __cp_payload(sbi);
/*
* This avoids to conduct wrong roll-forward operations and uses
@@ -979,19 +986,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_addr(sbi);
/* write out checkpoint buffer at block 0 */
- cp_page = grab_meta_page(sbi, start_blk++);
- kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, F2FS_BLKSIZE);
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
-
- for (i = 1; i < 1 + cp_payload_blks; i++) {
- cp_page = grab_meta_page(sbi, start_blk++);
- kaddr = page_address(cp_page);
- memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
- }
+ update_meta_page(sbi, ckpt, start_blk++);
+
+ for (i = 1; i < 1 + cp_payload_blks; i++)
+ update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE,
+ start_blk++);
if (orphan_num) {
write_orphan_inodes(sbi, start_blk);
@@ -1006,11 +1005,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
/* writeout checkpoint block */
- cp_page = grab_meta_page(sbi, start_blk);
- kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, F2FS_BLKSIZE);
- set_page_dirty(cp_page);
- f2fs_put_page(cp_page, 1);
+ update_meta_page(sbi, ckpt, start_blk);
/* wait for previous submitted node/meta pages writeback */
wait_on_all_pages_writeback(sbi);
@@ -1036,7 +1031,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (unlikely(f2fs_cp_error(sbi)))
return;
- clear_prefree_segments(sbi);
+ clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
}
@@ -1048,17 +1043,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
-
mutex_lock(&sbi->cp_mutex);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
- cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
+ (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
+ (cpc->reason == CP_DISCARD && !sbi->discard_blks)))
goto out;
if (unlikely(f2fs_cp_error(sbi)))
goto out;
if (f2fs_readonly(sbi->sb))
goto out;
+
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+
if (block_operations(sbi))
goto out;
@@ -1085,6 +1082,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
+
+ if (cpc->reason == CP_RECOVERY)
+ f2fs_msg(sbi->sb, KERN_NOTICE,
+ "checkpoint: version = %llx", ckpt_ver);
out:
mutex_unlock(&sbi->cp_mutex);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
@@ -1103,14 +1104,9 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
im->ino_num = 0;
}
- /*
- * considering 512 blocks in a segment 8 blocks are needed for cp
- * and log segment summaries. Remaining blocks are used to keep
- * orphan entries with the limitation one reserved segment
- * for cp pack we can have max 1020*504 orphan entries
- */
sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
- NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
+ NR_CURSEG_TYPE - __cp_payload(sbi)) *
+ F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
new file mode 100644
index 000000000000..4a62ef14e932
--- /dev/null
+++ b/fs/f2fs/crypto.c
@@ -0,0 +1,491 @@
+/*
+ * linux/fs/f2fs/crypto.c
+ *
+ * Copied from linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * This contains encryption functions for f2fs
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Remove ext4_encrypted_zeroout(),
+ * add f2fs_restore_and_release_control_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/user-type.h>
+#include <keys/encrypted-type.h>
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+#include <linux/f2fs_fs.h>
+#include <linux/ratelimit.h>
+#include <linux/bio.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+/* Encryption added and removed here! (L: */
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *f2fs_bounce_page_pool;
+
+static LIST_HEAD(f2fs_free_crypto_ctxs);
+static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
+
+static struct workqueue_struct *f2fs_read_workqueue;
+static DEFINE_MUTEX(crypto_init);
+
+static struct kmem_cache *f2fs_crypto_ctx_cachep;
+struct kmem_cache *f2fs_crypt_info_cachep;
+
+/**
+ * f2fs_release_crypto_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ }
+ ctx->w.control_page = NULL;
+ if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
+ list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
+ spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
+ }
+}
+
+/**
+ * f2fs_get_crypto_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
+{
+ struct f2fs_crypto_ctx *ctx = NULL;
+ unsigned long flags;
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
+ struct f2fs_crypto_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~F2FS_WRITE_PATH_FL;
+ return ctx;
+}
+
+/*
+ * Call f2fs_decrypt on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+ struct f2fs_crypto_ctx *ctx =
+ container_of(work, struct f2fs_crypto_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ int ret = f2fs_decrypt(ctx, page);
+
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ f2fs_release_crypto_ctx(ctx);
+ bio_put(bio);
+}
+
+void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
+{
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(f2fs_read_workqueue, &ctx->r.work);
+}
+
+static void f2fs_crypto_destroy(void)
+{
+ struct f2fs_crypto_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
+ kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
+ INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
+ if (f2fs_bounce_page_pool)
+ mempool_destroy(f2fs_bounce_page_pool);
+ f2fs_bounce_page_pool = NULL;
+}
+
+/**
+ * f2fs_crypto_initialize() - Set up for f2fs encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int f2fs_crypto_initialize(void)
+{
+ int i, res = -ENOMEM;
+
+ if (f2fs_bounce_page_pool)
+ return 0;
+
+ mutex_lock(&crypto_init);
+ if (f2fs_bounce_page_pool)
+ goto already_initialized;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct f2fs_crypto_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto fail;
+ list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
+ }
+
+ /* must be allocated at the last step to avoid race condition above */
+ f2fs_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!f2fs_bounce_page_pool)
+ goto fail;
+
+already_initialized:
+ mutex_unlock(&crypto_init);
+ return 0;
+fail:
+ f2fs_crypto_destroy();
+ mutex_unlock(&crypto_init);
+ return res;
+}
+
+/**
+ * f2fs_exit_crypto() - Shutdown the f2fs encryption system
+ */
+void f2fs_exit_crypto(void)
+{
+ f2fs_crypto_destroy();
+
+ if (f2fs_read_workqueue)
+ destroy_workqueue(f2fs_read_workqueue);
+ if (f2fs_crypto_ctx_cachep)
+ kmem_cache_destroy(f2fs_crypto_ctx_cachep);
+ if (f2fs_crypt_info_cachep)
+ kmem_cache_destroy(f2fs_crypt_info_cachep);
+}
+
+int __init f2fs_init_crypto(void)
+{
+ int res = -ENOMEM;
+
+ f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
+ if (!f2fs_read_workqueue)
+ goto fail;
+
+ f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!f2fs_crypto_ctx_cachep)
+ goto fail;
+
+ f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
+ SLAB_RECLAIM_ACCOUNT);
+ if (!f2fs_crypt_info_cachep)
+ goto fail;
+
+ return 0;
+fail:
+ f2fs_exit_crypto();
+ return res;
+}
+
+void f2fs_restore_and_release_control_page(struct page **page)
+{
+ struct f2fs_crypto_ctx *ctx;
+ struct page *bounce_page;
+
+ /* The bounce data pages are unmapped. */
+ if ((*page)->mapping)
+ return;
+
+ /* The bounce data page is unmapped. */
+ bounce_page = *page;
+ ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
+
+ /* restore control page */
+ *page = ctx->w.control_page;
+
+ f2fs_restore_control_page(bounce_page);
+}
+
+void f2fs_restore_control_page(struct page *data_page)
+{
+ struct f2fs_crypto_ctx *ctx =
+ (struct f2fs_crypto_ctx *)page_private(data_page);
+
+ set_page_private(data_page, (unsigned long)NULL);
+ ClearPagePrivate(data_page);
+ unlock_page(data_page);
+ f2fs_release_crypto_ctx(ctx);
+}
+
+/**
+ * f2fs_crypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct f2fs_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+typedef enum {
+ F2FS_DECRYPT = 0,
+ F2FS_ENCRYPT,
+} f2fs_direction_t;
+
+static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
+ struct inode *inode,
+ f2fs_direction_t rw,
+ pgoff_t index,
+ struct page *src_page,
+ struct page *dest_page)
+{
+ u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
+ struct ablkcipher_request *req = NULL;
+ DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ f2fs_crypt_complete, &ecr);
+
+ BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
+ memcpy(xts_tweak, &index, sizeof(index));
+ memset(&xts_tweak[sizeof(index)], 0,
+ F2FS_XTS_TWEAK_SIZE - sizeof(index));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+ ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+ xts_tweak);
+ if (rw == F2FS_DECRYPT)
+ res = crypto_ablkcipher_decrypt(req);
+ else
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
+{
+ ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= F2FS_WRITE_PATH_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * f2fs_encrypt() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path. The caller must call
+ * f2fs_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *f2fs_encrypt(struct inode *inode,
+ struct page *plaintext_page)
+{
+ struct f2fs_crypto_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ int err;
+
+ BUG_ON(!PageLocked(plaintext_page));
+
+ ctx = f2fs_get_crypto_ctx(inode);
+ if (IS_ERR(ctx))
+ return (struct page *)ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = alloc_bounce_page(ctx);
+ if (IS_ERR(ciphertext_page))
+ goto err_out;
+
+ ctx->w.control_page = plaintext_page;
+ err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
+ plaintext_page, ciphertext_page);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ goto err_out;
+ }
+
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+
+err_out:
+ f2fs_release_crypto_ctx(ctx);
+ return ciphertext_page;
+}
+
+/**
+ * f2fs_decrypt() - Decrypts a page in-place
+ * @ctx: The encryption context.
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ return f2fs_page_crypto(ctx, page->mapping->host,
+ F2FS_DECRYPT, page->index, page, page);
+}
+
+/*
+ * Convenience function which takes care of allocating and
+ * deallocating the encryption context
+ */
+int f2fs_decrypt_one(struct inode *inode, struct page *page)
+{
+ struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
+ int ret;
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ ret = f2fs_decrypt(ctx, page);
+ f2fs_release_crypto_ctx(ctx);
+ return ret;
+}
+
+bool f2fs_valid_contents_enc_mode(uint32_t mode)
+{
+ return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+/**
+ * f2fs_validate_encryption_key_size() - Validate the encryption key size
+ * @mode: The key mode.
+ * @size: The key size to validate.
+ *
+ * Return: The validated key size for @mode. Zero if invalid.
+ */
+uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
+{
+ if (size == f2fs_encryption_key_size(mode))
+ return size;
+ return 0;
+}
diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c
new file mode 100644
index 000000000000..ab377d496a39
--- /dev/null
+++ b/fs/f2fs/crypto_fname.c
@@ -0,0 +1,440 @@
+/*
+ * linux/fs/f2fs/crypto_fname.c
+ *
+ * Copied from linux/fs/ext4/crypto.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * This contains functions for filename crypto management in f2fs
+ *
+ * Written by Uday Savagaonkar, 2014.
+ *
+ * Adjust f2fs dentry structure
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <linux/spinlock_types.h>
+#include <linux/f2fs_fs.h>
+#include <linux/ratelimit.h>
+
+#include "f2fs.h"
+#include "f2fs_crypto.h"
+#include "xattr.h"
+
+/**
+ * f2fs_dir_crypt_complete() -
+ */
+static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+ struct f2fs_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+bool f2fs_valid_filenames_enc_mode(uint32_t mode)
+{
+ return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static unsigned max_name_len(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
+ F2FS_NAME_LEN;
+}
+
+/**
+ * f2fs_fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers. We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int f2fs_fname_encrypt(struct inode *inode,
+ const struct qstr *iname, struct f2fs_str *oname)
+{
+ u32 ciphertext_len;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[F2FS_CRYPTO_BLOCK_SIZE];
+ struct scatterlist src_sg, dst_sg;
+ int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+ char *workbuf, buf[32], *alloc_buf = NULL;
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ?
+ F2FS_CRYPTO_BLOCK_SIZE : iname->len;
+ ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding);
+ ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
+
+ if (ciphertext_len <= sizeof(buf)) {
+ workbuf = buf;
+ } else {
+ alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+ if (!alloc_buf)
+ return -ENOMEM;
+ workbuf = alloc_buf;
+ }
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n", __func__);
+ kfree(alloc_buf);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ f2fs_dir_crypt_complete, &ecr);
+
+ /* Copy the input */
+ memcpy(workbuf, iname->name, iname->len);
+ if (iname->len < ciphertext_len)
+ memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+ /* Initialize IV */
+ memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+
+ /* Create encryption request */
+ sg_init_one(&src_sg, workbuf, ciphertext_len);
+ sg_init_one(&dst_sg, oname->name, ciphertext_len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ kfree(alloc_buf);
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(KERN_ERR
+ "%s: Error (error code %d)\n", __func__, res);
+ }
+ oname->len = ciphertext_len;
+ return res;
+}
+
+/*
+ * f2fs_fname_decrypt()
+ * This function decrypts the input filename, and returns
+ * the length of the plaintext.
+ * Errors are returned as negative numbers.
+ * We trust the caller to allocate sufficient memory to oname string.
+ */
+static int f2fs_fname_decrypt(struct inode *inode,
+ const struct f2fs_str *iname, struct f2fs_str *oname)
+{
+ struct ablkcipher_request *req = NULL;
+ DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+ struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+ char iv[F2FS_CRYPTO_BLOCK_SIZE];
+ unsigned lim = max_name_len(inode);
+
+ if (iname->len <= 0 || iname->len > lim)
+ return -EIO;
+
+ /* Allocate request */
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n", __func__);
+ return -ENOMEM;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ f2fs_dir_crypt_complete, &ecr);
+
+ /* Initialize IV */
+ memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+
+ /* Create decryption request */
+ sg_init_one(&src_sg, iname->name, iname->len);
+ sg_init_one(&dst_sg, oname->name, oname->len);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_ablkcipher_decrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ ablkcipher_request_free(req);
+ if (res < 0) {
+ printk_ratelimited(KERN_ERR
+ "%s: Error in f2fs_fname_decrypt (error code %d)\n",
+ __func__, res);
+ return res;
+ }
+
+ oname->len = strnlen(oname->name, iname->len);
+ return oname->len;
+}
+
+static const char *lookup_table =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * f2fs_fname_encode_digest() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ char *cp = dst;
+
+ while (i < len) {
+ ac += (((unsigned char) src[i]) << bits);
+ bits += 8;
+ do {
+ *cp++ = lookup_table[ac & 0x3f];
+ ac >>= 6;
+ bits -= 6;
+ } while (bits >= 6);
+ i++;
+ }
+ if (bits)
+ *cp++ = lookup_table[ac & 0x3f];
+ return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+ int i = 0, bits = 0, ac = 0;
+ const char *p;
+ char *cp = dst;
+
+ while (i < len) {
+ p = strchr(lookup_table, src[i]);
+ if (p == NULL || src[i] == 0)
+ return -2;
+ ac += (p - lookup_table) << bits;
+ bits += 6;
+ if (bits >= 8) {
+ *cp++ = ac & 0xff;
+ ac >>= 8;
+ bits -= 8;
+ }
+ i++;
+ }
+ if (ac)
+ return -1;
+ return cp - dst;
+}
+
+/**
+ * f2fs_fname_crypto_round_up() -
+ *
+ * Return: The next multiple of block size
+ */
+u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
+{
+ return ((size + blksize - 1) / blksize) * blksize;
+}
+
+/**
+ * f2fs_fname_crypto_alloc_obuff() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int f2fs_fname_crypto_alloc_buffer(struct inode *inode,
+ u32 ilen, struct f2fs_str *crypto_str)
+{
+ unsigned int olen;
+ int padding = 16;
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+ if (padding < F2FS_CRYPTO_BLOCK_SIZE)
+ padding = F2FS_CRYPTO_BLOCK_SIZE;
+ olen = f2fs_fname_crypto_round_up(ilen, padding);
+ crypto_str->len = olen;
+ if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
+ olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+ /* Allocated buffer can hold one more character to null-terminate the
+ * string */
+ crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
+ if (!(crypto_str->name))
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * f2fs_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str)
+{
+ if (!crypto_str)
+ return;
+ kfree(crypto_str->name);
+ crypto_str->name = NULL;
+}
+
+/**
+ * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space
+ */
+int f2fs_fname_disk_to_usr(struct inode *inode,
+ f2fs_hash_t *hash,
+ const struct f2fs_str *iname,
+ struct f2fs_str *oname)
+{
+ const struct qstr qname = FSTR_TO_QSTR(iname);
+ char buf[24];
+ int ret;
+
+ if (is_dot_dotdot(&qname)) {
+ oname->name[0] = '.';
+ oname->name[iname->len - 1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+
+ if (F2FS_I(inode)->i_crypt_info)
+ return f2fs_fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ ret = digest_encode(iname->name, iname->len, oname->name);
+ oname->len = ret;
+ return ret;
+ }
+ if (hash) {
+ memcpy(buf, hash, 4);
+ memset(buf + 4, 0, 4);
+ } else
+ memset(buf, 0, 8);
+ memcpy(buf + 8, iname->name + iname->len - 16, 16);
+ oname->name[0] = '_';
+ ret = digest_encode(buf, 24, oname->name + 1);
+ oname->len = ret + 1;
+ return ret + 1;
+}
+
+/**
+ * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space
+ */
+int f2fs_fname_usr_to_disk(struct inode *inode,
+ const struct qstr *iname,
+ struct f2fs_str *oname)
+{
+ int res;
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+ if (is_dot_dotdot(iname)) {
+ oname->name[0] = '.';
+ oname->name[iname->len - 1] = '.';
+ oname->len = iname->len;
+ return oname->len;
+ }
+
+ if (ci) {
+ res = f2fs_fname_encrypt(inode, iname, oname);
+ return res;
+ }
+ /* Without a proper key, a user is not allowed to modify the filenames
+ * in a directory. Consequently, a user space name cannot be mapped to
+ * a disk-space name */
+ return -EACCES;
+}
+
+int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct f2fs_filename *fname)
+{
+ struct f2fs_crypt_info *ci;
+ int ret = 0, bigname = 0;
+
+ memset(fname, 0, sizeof(struct f2fs_filename));
+ fname->usr_fname = iname;
+
+ if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) {
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+ }
+ ret = f2fs_get_encryption_info(dir);
+ if (ret)
+ return ret;
+ ci = F2FS_I(dir)->i_crypt_info;
+ if (ci) {
+ ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
+ if (ret < 0)
+ return ret;
+ ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf);
+ if (ret < 0)
+ goto errout;
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ return 0;
+ }
+ if (!lookup)
+ return -EACCES;
+
+ /* We don't have the key and we are doing a lookup; decode the
+ * user-supplied name
+ */
+ if (iname->name[0] == '_')
+ bigname = 1;
+ if ((bigname && (iname->len != 33)) ||
+ (!bigname && (iname->len > 43)))
+ return -ENOENT;
+
+ fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+ if (fname->crypto_buf.name == NULL)
+ return -ENOMEM;
+ ret = digest_decode(iname->name + bigname, iname->len - bigname,
+ fname->crypto_buf.name);
+ if (ret < 0) {
+ ret = -ENOENT;
+ goto errout;
+ }
+ fname->crypto_buf.len = ret;
+ if (bigname) {
+ memcpy(&fname->hash, fname->crypto_buf.name, 4);
+ } else {
+ fname->disk_name.name = fname->crypto_buf.name;
+ fname->disk_name.len = fname->crypto_buf.len;
+ }
+ return 0;
+errout:
+ f2fs_fname_crypto_free_buffer(&fname->crypto_buf);
+ return ret;
+}
+
+void f2fs_fname_free_filename(struct f2fs_filename *fname)
+{
+ kfree(fname->crypto_buf.name);
+ fname->crypto_buf.name = NULL;
+ fname->usr_fname = NULL;
+ fname->disk_name.name = NULL;
+}
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
new file mode 100644
index 000000000000..95b8f936f00b
--- /dev/null
+++ b/fs/f2fs/crypto_key.c
@@ -0,0 +1,255 @@
+/*
+ * linux/fs/f2fs/crypto_key.c
+ *
+ * Copied from linux/fs/f2fs/crypto_key.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions for f2fs
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+#include <crypto/hash.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct f2fs_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivatio.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
+ char source_key[F2FS_AES_256_XTS_KEY_SIZE],
+ char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct ablkcipher_request *req = NULL;
+ DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+ 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ ablkcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_ablkcipher_setkey(tfm, deriving_key,
+ F2FS_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+
+ sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
+ ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ F2FS_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_ablkcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+out:
+ if (req)
+ ablkcipher_request_free(req);
+ if (tfm)
+ crypto_free_ablkcipher(tfm);
+ return res;
+}
+
+static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ if (ci->ci_keyring_key)
+ key_put(ci->ci_keyring_key);
+ crypto_free_ablkcipher(ci->ci_ctfm);
+ kmem_cache_free(f2fs_crypt_info_cachep, ci);
+}
+
+void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_crypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(fi->i_crypt_info);
+ if (ci == NULL)
+ return;
+ prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ f2fs_free_crypt_info(ci);
+}
+
+int _f2fs_get_encryption_info(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_crypt_info *crypt_info;
+ char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
+ (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
+ struct key *keyring_key = NULL;
+ struct f2fs_encryption_key *master_key;
+ struct f2fs_encryption_context ctx;
+ struct user_key_payload *ukp;
+ struct crypto_ablkcipher *ctfm;
+ const char *cipher_str;
+ char raw_key[F2FS_MAX_KEY_SIZE];
+ char mode;
+ int res;
+
+ res = f2fs_crypto_initialize();
+ if (res)
+ return res;
+retry:
+ crypt_info = ACCESS_ONCE(fi->i_crypt_info);
+ if (crypt_info) {
+ if (!crypt_info->ci_keyring_key ||
+ key_validate(crypt_info->ci_keyring_key) == 0)
+ return 0;
+ f2fs_free_encryption_info(inode, crypt_info);
+ goto retry;
+ }
+
+ res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx), NULL);
+ if (res < 0)
+ return res;
+ else if (res != sizeof(ctx))
+ return -EINVAL;
+ res = 0;
+
+ crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_keyring_key = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+ if (S_ISREG(inode->i_mode))
+ mode = crypt_info->ci_data_mode;
+ else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ mode = crypt_info->ci_filename_mode;
+ else
+ BUG();
+
+ switch (mode) {
+ case F2FS_ENCRYPTION_MODE_AES_256_XTS:
+ cipher_str = "xts(aes)";
+ break;
+ case F2FS_ENCRYPTION_MODE_AES_256_CTS:
+ cipher_str = "cts(cbc(aes))";
+ break;
+ default:
+ printk_once(KERN_WARNING
+ "f2fs: unsupported key mode %d (ino %u)\n",
+ mode, (unsigned) inode->i_ino);
+ res = -ENOKEY;
+ goto out;
+ }
+
+ memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
+ F2FS_KEY_DESC_PREFIX_SIZE);
+ sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
+ "%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
+ ctx.master_key_descriptor);
+ full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
+ (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ if (IS_ERR(keyring_key)) {
+ res = PTR_ERR(keyring_key);
+ keyring_key = NULL;
+ goto out;
+ }
+ crypt_info->ci_keyring_key = keyring_key;
+ BUG_ON(keyring_key->type != &key_type_logon);
+ ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
+ res = -EINVAL;
+ goto out;
+ }
+ master_key = (struct f2fs_encryption_key *)ukp->data;
+ BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
+ F2FS_KEY_DERIVATION_NONCE_SIZE);
+ BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
+ res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
+ raw_key);
+ if (res)
+ goto out;
+
+ ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_ablkcipher_clear_flags(ctfm, ~0);
+ crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+ CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_ablkcipher_setkey(ctfm, raw_key,
+ f2fs_encryption_key_size(mode));
+ if (res)
+ goto out;
+
+ memzero_explicit(raw_key, sizeof(raw_key));
+ if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
+ f2fs_free_crypt_info(crypt_info);
+ goto retry;
+ }
+ return 0;
+
+out:
+ if (res == -ENOKEY && !S_ISREG(inode->i_mode))
+ res = 0;
+
+ f2fs_free_crypt_info(crypt_info);
+ memzero_explicit(raw_key, sizeof(raw_key));
+ return res;
+}
+
+int f2fs_has_encryption_key(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ return (fi->i_crypt_info != NULL);
+}
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
new file mode 100644
index 000000000000..d4a96af513c2
--- /dev/null
+++ b/fs/f2fs/crypto_policy.c
@@ -0,0 +1,209 @@
+/*
+ * copied from linux/fs/ext4/crypto_policy.c
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * This contains encryption policy functions for f2fs with some modifications
+ * to support f2fs-specific xattr APIs.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+
+static int f2fs_inode_has_encryption_context(struct inode *inode)
+{
+ int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
+ return (res > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int f2fs_is_encryption_context_consistent_with_policy(
+ struct inode *inode, const struct f2fs_encryption_policy *policy)
+{
+ struct f2fs_encryption_context ctx;
+ int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), NULL);
+
+ if (res != sizeof(ctx))
+ return 0;
+
+ return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+ F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx.flags == policy->flags) &&
+ (ctx.contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx.filenames_encryption_mode ==
+ policy->filenames_encryption_mode));
+}
+
+static int f2fs_create_encryption_context_from_policy(
+ struct inode *inode, const struct f2fs_encryption_policy *policy)
+{
+ struct f2fs_encryption_context ctx;
+
+ ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ F2FS_KEY_DESCRIPTOR_SIZE);
+
+ if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid contents encryption mode %d\n", __func__,
+ policy->contents_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid filenames encryption mode %d\n", __func__,
+ policy->filenames_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
+
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
+
+ return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), NULL, XATTR_CREATE);
+}
+
+int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
+ struct inode *inode)
+{
+ if (policy->version != 0)
+ return -EINVAL;
+
+ if (!S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ if (!f2fs_inode_has_encryption_context(inode)) {
+ if (!f2fs_empty_dir(inode))
+ return -ENOTEMPTY;
+ return f2fs_create_encryption_context_from_policy(inode,
+ policy);
+ }
+
+ if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
+ return 0;
+
+ printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ return -EINVAL;
+}
+
+int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
+{
+ struct f2fs_encryption_context ctx;
+ int res;
+
+ if (!f2fs_encrypted_inode(inode))
+ return -ENODATA;
+
+ res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ &ctx, sizeof(ctx), NULL);
+ if (res != sizeof(ctx))
+ return -ENODATA;
+ if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ policy->version = 0;
+ policy->contents_encryption_mode = ctx.contents_encryption_mode;
+ policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy->flags = ctx.flags;
+ memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ F2FS_KEY_DESCRIPTOR_SIZE);
+ return 0;
+}
+
+int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
+ struct inode *child)
+{
+ struct f2fs_crypt_info *parent_ci, *child_ci;
+ int res;
+
+ if ((parent == NULL) || (child == NULL)) {
+ pr_err("parent %p child %p\n", parent, child);
+ BUG_ON(1);
+ }
+
+ /* no restrictions if the parent directory is not encrypted */
+ if (!f2fs_encrypted_inode(parent))
+ return 1;
+ /* if the child directory is not encrypted, this is always a problem */
+ if (!f2fs_encrypted_inode(child))
+ return 0;
+ res = f2fs_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = f2fs_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = F2FS_I(parent)->i_crypt_info;
+ child_ci = F2FS_I(child)->i_crypt_info;
+ if (!parent_ci && !child_ci)
+ return 1;
+ if (!parent_ci || !child_ci)
+ return 0;
+
+ return (memcmp(parent_ci->ci_master_key,
+ child_ci->ci_master_key,
+ F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags));
+}
+
+/**
+ * f2fs_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int f2fs_inherit_context(struct inode *parent, struct inode *child,
+ struct page *ipage)
+{
+ struct f2fs_encryption_context ctx;
+ struct f2fs_crypt_info *ci;
+ int res;
+
+ res = f2fs_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+
+ ci = F2FS_I(parent)->i_crypt_info;
+ BUG_ON(ci == NULL);
+
+ ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ F2FS_KEY_DESCRIPTOR_SIZE);
+
+ get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
+ return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
+ sizeof(ctx), ipage, XATTR_CREATE);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 985ed023a750..f71e19a9dd3c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,12 +12,13 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/prefetch.h>
+#include <linux/uio.h>
+#include <linux/cleancache.h>
#include "f2fs.h"
#include "node.h"
@@ -25,11 +26,23 @@
#include "trace.h"
#include <trace/events/f2fs.h>
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
static void f2fs_read_end_io(struct bio *bio, int err)
{
struct bio_vec *bvec;
int i;
+ if (f2fs_bio_encrypted(bio)) {
+ if (err) {
+ f2fs_release_crypto_ctx(bio->bi_private);
+ } else {
+ f2fs_end_io_crypto_work(bio->bi_private, bio);
+ return;
+ }
+ }
+
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
@@ -53,6 +66,8 @@ static void f2fs_write_end_io(struct bio *bio, int err)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
+ f2fs_restore_and_release_control_page(&page);
+
if (unlikely(err)) {
set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
@@ -83,7 +98,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
bio->bi_bdev = sbi->sb->s_bdev;
bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
- bio->bi_private = sbi;
+ bio->bi_private = is_read ? NULL : sbi;
return bio;
}
@@ -130,16 +145,16 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
* Fill the locked page with data located in the block address.
* Return unlocked page.
*/
-int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_io_info *fio)
+int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
+ struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page;
trace_f2fs_submit_page_bio(page, fio);
- f2fs_trace_ios(page, fio, 0);
+ f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
+ bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw));
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
bio_put(bio);
@@ -151,12 +166,13 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
return 0;
}
-void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_io_info *fio)
+void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
{
+ struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io;
bool is_read = is_read_io(fio->rw);
+ struct page *bio_page;
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
@@ -178,17 +194,19 @@ alloc_new:
io->fio = *fio;
}
- if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
+ bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+
+ if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) <
PAGE_CACHE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
}
io->last_block_in_bio = fio->blk_addr;
- f2fs_trace_ios(page, fio, 0);
+ f2fs_trace_ios(fio, 0);
up_write(&io->io_rwsem);
- trace_f2fs_submit_page_mbio(page, fio);
+ trace_f2fs_submit_page_mbio(fio->page, fio);
}
/*
@@ -197,7 +215,7 @@ alloc_new:
* ->node_page
* update block addresses in the node page
*/
-static void __set_data_blkaddr(struct dnode_of_data *dn)
+void set_data_blkaddr(struct dnode_of_data *dn)
{
struct f2fs_node *rn;
__le32 *addr_array;
@@ -226,7 +244,7 @@ int reserve_new_block(struct dnode_of_data *dn)
trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
dn->data_blkaddr = NEW_ADDR;
- __set_data_blkaddr(dn);
+ set_data_blkaddr(dn);
mark_inode_dirty(dn->inode);
sync_inode_page(dn);
return 0;
@@ -248,73 +266,49 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct buffer_head *bh_result)
+static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
- if (is_inode_flag_set(fi, FI_NO_EXTENT))
- return 0;
-
- read_lock(&fi->ext.ext_lock);
+ read_lock(&fi->ext_lock);
if (fi->ext.len == 0) {
- read_unlock(&fi->ext.ext_lock);
- return 0;
+ read_unlock(&fi->ext_lock);
+ return false;
}
stat_inc_total_hit(inode->i_sb);
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk_addr;
+ start_blkaddr = fi->ext.blk;
if (pgofs >= start_fofs && pgofs <= end_fofs) {
- unsigned int blkbits = inode->i_sb->s_blocksize_bits;
- size_t count;
-
- set_buffer_new(bh_result);
- map_bh(bh_result, inode->i_sb,
- start_blkaddr + pgofs - start_fofs);
- count = end_fofs - pgofs + 1;
- if (count < (UINT_MAX >> blkbits))
- bh_result->b_size = (count << blkbits);
- else
- bh_result->b_size = UINT_MAX;
-
+ *ei = fi->ext;
stat_inc_read_hit(inode->i_sb);
- read_unlock(&fi->ext.ext_lock);
- return 1;
+ read_unlock(&fi->ext_lock);
+ return true;
}
- read_unlock(&fi->ext.ext_lock);
- return 0;
+ read_unlock(&fi->ext_lock);
+ return false;
}
-void update_extent_cache(struct dnode_of_data *dn)
+static bool update_extent_info(struct inode *inode, pgoff_t fofs,
+ block_t blkaddr)
{
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
- pgoff_t fofs, start_fofs, end_fofs;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ pgoff_t start_fofs, end_fofs;
block_t start_blkaddr, end_blkaddr;
int need_update = true;
- f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
- /* Update the page address in the parent node */
- __set_data_blkaddr(dn);
-
- if (is_inode_flag_set(fi, FI_NO_EXTENT))
- return;
-
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
-
- write_lock(&fi->ext.ext_lock);
+ write_lock(&fi->ext_lock);
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
- start_blkaddr = fi->ext.blk_addr;
- end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+ start_blkaddr = fi->ext.blk;
+ end_blkaddr = fi->ext.blk + fi->ext.len - 1;
/* Drop and initialize the matched extent */
if (fi->ext.len == 1 && fofs == start_fofs)
@@ -322,24 +316,24 @@ void update_extent_cache(struct dnode_of_data *dn)
/* Initial extent */
if (fi->ext.len == 0) {
- if (dn->data_blkaddr != NULL_ADDR) {
+ if (blkaddr != NULL_ADDR) {
fi->ext.fofs = fofs;
- fi->ext.blk_addr = dn->data_blkaddr;
+ fi->ext.blk = blkaddr;
fi->ext.len = 1;
}
goto end_update;
}
/* Front merge */
- if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
+ if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
fi->ext.fofs--;
- fi->ext.blk_addr--;
+ fi->ext.blk--;
fi->ext.len++;
goto end_update;
}
/* Back merge */
- if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
+ if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
fi->ext.len++;
goto end_update;
}
@@ -351,8 +345,7 @@ void update_extent_cache(struct dnode_of_data *dn)
fi->ext.len = fofs - start_fofs;
} else {
fi->ext.fofs = fofs + 1;
- fi->ext.blk_addr = start_blkaddr +
- fofs - start_fofs + 1;
+ fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
fi->ext.len -= fofs - start_fofs + 1;
}
} else {
@@ -366,85 +359,580 @@ void update_extent_cache(struct dnode_of_data *dn)
need_update = true;
}
end_update:
- write_unlock(&fi->ext.ext_lock);
- if (need_update)
- sync_inode_page(dn);
- return;
+ write_unlock(&fi->ext_lock);
+ return need_update;
}
-struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct rb_node *parent, struct rb_node **p)
{
- struct address_space *mapping = inode->i_mapping;
- struct dnode_of_data dn;
- struct page *page;
- int err;
- struct f2fs_io_info fio = {
- .type = DATA,
- .rw = sync ? READ_SYNC : READA,
- };
+ struct extent_node *en;
- page = find_get_page(mapping, index);
- if (page && PageUptodate(page))
- return page;
- f2fs_put_page(page, 0);
+ en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+ if (!en)
+ return NULL;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
- return ERR_PTR(err);
- f2fs_put_dnode(&dn);
+ en->ei = *ei;
+ INIT_LIST_HEAD(&en->list);
- if (dn.data_blkaddr == NULL_ADDR)
- return ERR_PTR(-ENOENT);
+ rb_link_node(&en->rb_node, parent, p);
+ rb_insert_color(&en->rb_node, &et->root);
+ et->count++;
+ atomic_inc(&sbi->total_ext_node);
+ return en;
+}
- /* By fallocate(), there is no cached page, but with NEW_ADDR */
- if (unlikely(dn.data_blkaddr == NEW_ADDR))
- return ERR_PTR(-EINVAL);
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ rb_erase(&en->rb_node, &et->root);
+ et->count--;
+ atomic_dec(&sbi->total_ext_node);
- page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ if (et->cached_en == en)
+ et->cached_en = NULL;
+}
- if (PageUptodate(page)) {
- unlock_page(page);
- return page;
+static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
+ nid_t ino)
+{
+ struct extent_tree *et;
+
+ down_read(&sbi->extent_tree_lock);
+ et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ if (!et) {
+ up_read(&sbi->extent_tree_lock);
+ return NULL;
}
+ atomic_inc(&et->refcount);
+ up_read(&sbi->extent_tree_lock);
- fio.blk_addr = dn.data_blkaddr;
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
- if (err)
- return ERR_PTR(err);
+ return et;
+}
- if (sync) {
- wait_on_page_locked(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 0);
- return ERR_PTR(-EIO);
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ nid_t ino = inode->i_ino;
+
+ down_write(&sbi->extent_tree_lock);
+ et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ if (!et) {
+ et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+ f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ memset(et, 0, sizeof(struct extent_tree));
+ et->ino = ino;
+ et->root = RB_ROOT;
+ et->cached_en = NULL;
+ rwlock_init(&et->lock);
+ atomic_set(&et->refcount, 0);
+ et->count = 0;
+ sbi->total_ext_tree++;
+ }
+ atomic_inc(&et->refcount);
+ up_write(&sbi->extent_tree_lock);
+
+ return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
+ unsigned int fofs)
+{
+ struct rb_node *node = et->root.rb_node;
+ struct extent_node *en;
+
+ if (et->cached_en) {
+ struct extent_info *cei = &et->cached_en->ei;
+
+ if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+ return et->cached_en;
+ }
+
+ while (node) {
+ en = rb_entry(node, struct extent_node, rb_node);
+
+ if (fofs < en->ei.fofs) {
+ node = node->rb_left;
+ } else if (fofs >= en->ei.fofs + en->ei.len) {
+ node = node->rb_right;
+ } else {
+ et->cached_en = en;
+ return en;
}
}
- return page;
+ return NULL;
}
-/*
- * If it tries to access a hole, return an error.
- * Because, the callers, functions in dir.c and GC, should be able to know
- * whether this page exists or not.
- */
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ struct extent_node *prev;
+ struct rb_node *node;
+
+ node = rb_prev(&en->rb_node);
+ if (!node)
+ return NULL;
+
+ prev = rb_entry(node, struct extent_node, rb_node);
+ if (__is_back_mergeable(&en->ei, &prev->ei)) {
+ en->ei.fofs = prev->ei.fofs;
+ en->ei.blk = prev->ei.blk;
+ en->ei.len += prev->ei.len;
+ __detach_extent_node(sbi, et, prev);
+ return prev;
+ }
+ return NULL;
+}
+
+static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ struct extent_node *next;
+ struct rb_node *node;
+
+ node = rb_next(&en->rb_node);
+ if (!node)
+ return NULL;
+
+ next = rb_entry(node, struct extent_node, rb_node);
+ if (__is_front_mergeable(&en->ei, &next->ei)) {
+ en->ei.len += next->ei.len;
+ __detach_extent_node(sbi, et, next);
+ return next;
+ }
+ return NULL;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_info *ei,
+ struct extent_node **den)
+{
+ struct rb_node **p = &et->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_node *en;
+
+ while (*p) {
+ parent = *p;
+ en = rb_entry(parent, struct extent_node, rb_node);
+
+ if (ei->fofs < en->ei.fofs) {
+ if (__is_front_mergeable(ei, &en->ei)) {
+ f2fs_bug_on(sbi, !den);
+ en->ei.fofs = ei->fofs;
+ en->ei.blk = ei->blk;
+ en->ei.len += ei->len;
+ *den = __try_back_merge(sbi, et, en);
+ return en;
+ }
+ p = &(*p)->rb_left;
+ } else if (ei->fofs >= en->ei.fofs + en->ei.len) {
+ if (__is_back_mergeable(ei, &en->ei)) {
+ f2fs_bug_on(sbi, !den);
+ en->ei.len += ei->len;
+ *den = __try_front_merge(sbi, et, en);
+ return en;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ f2fs_bug_on(sbi, 1);
+ }
+ }
+
+ return __attach_extent_node(sbi, et, ei, parent, p);
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, bool free_all)
+{
+ struct rb_node *node, *next;
+ struct extent_node *en;
+ unsigned int count = et->count;
+
+ node = rb_first(&et->root);
+ while (node) {
+ next = rb_next(node);
+ en = rb_entry(node, struct extent_node, rb_node);
+
+ if (free_all) {
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list))
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+ }
+
+ if (free_all || list_empty(&en->list)) {
+ __detach_extent_node(sbi, et, en);
+ kmem_cache_free(extent_node_slab, en);
+ }
+ node = next;
+ }
+
+ return count - et->count;
+}
+
+static void f2fs_init_extent_tree(struct inode *inode,
+ struct f2fs_extent *i_ext)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ struct extent_node *en;
+ struct extent_info ei;
+
+ if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+ return;
+
+ et = __grab_extent_tree(inode);
+
+ write_lock(&et->lock);
+ if (et->count)
+ goto out;
+
+ set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+ le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+ en = __insert_extent_tree(sbi, et, &ei, NULL);
+ if (en) {
+ et->cached_en = en;
+
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
+ spin_unlock(&sbi->extent_lock);
+ }
+out:
+ write_unlock(&et->lock);
+ atomic_dec(&et->refcount);
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ struct extent_node *en;
+
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+ et = __find_extent_tree(sbi, inode->i_ino);
+ if (!et)
+ return false;
+
+ read_lock(&et->lock);
+ en = __lookup_extent_tree(et, pgofs);
+ if (en) {
+ *ei = en->ei;
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list))
+ list_move_tail(&en->list, &sbi->extent_list);
+ spin_unlock(&sbi->extent_lock);
+ stat_inc_read_hit(sbi->sb);
+ }
+ stat_inc_total_hit(sbi->sb);
+ read_unlock(&et->lock);
+
+ trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
+
+ atomic_dec(&et->refcount);
+ return en ? true : false;
+}
+
+static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+ block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+ struct extent_node *den = NULL;
+ struct extent_info ei, dei;
+ unsigned int endofs;
+
+ trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+
+ et = __grab_extent_tree(inode);
+
+ write_lock(&et->lock);
+
+ /* 1. lookup and remove existing extent info in cache */
+ en = __lookup_extent_tree(et, fofs);
+ if (!en)
+ goto update_extent;
+
+ dei = en->ei;
+ __detach_extent_node(sbi, et, en);
+
+ /* 2. if extent can be split more, split and insert the left part */
+ if (dei.len > 1) {
+ /* insert left part of split extent into cache */
+ if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ set_extent_info(&ei, dei.fofs, dei.blk,
+ fofs - dei.fofs);
+ en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+ }
+
+ /* insert right part of split extent into cache */
+ endofs = dei.fofs + dei.len - 1;
+ if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
+ set_extent_info(&ei, fofs + 1,
+ fofs - dei.fofs + dei.blk, endofs - fofs);
+ en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+ }
+ }
+
+update_extent:
+ /* 3. update extent in extent cache */
+ if (blkaddr) {
+ set_extent_info(&ei, fofs, blkaddr, 1);
+ en3 = __insert_extent_tree(sbi, et, &ei, &den);
+ }
+
+ /* 4. update in global extent list */
+ spin_lock(&sbi->extent_lock);
+ if (en && !list_empty(&en->list))
+ list_del(&en->list);
+ /*
+ * en1 and en2 split from en, they will become more and more smaller
+ * fragments after splitting several times. So if the length is smaller
+ * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
+ */
+ if (en1)
+ list_add_tail(&en1->list, &sbi->extent_list);
+ if (en2)
+ list_add_tail(&en2->list, &sbi->extent_list);
+ if (en3) {
+ if (list_empty(&en3->list))
+ list_add_tail(&en3->list, &sbi->extent_list);
+ else
+ list_move_tail(&en3->list, &sbi->extent_list);
+ }
+ if (den && !list_empty(&den->list))
+ list_del(&den->list);
+ spin_unlock(&sbi->extent_lock);
+
+ /* 5. release extent node */
+ if (en)
+ kmem_cache_free(extent_node_slab, en);
+ if (den)
+ kmem_cache_free(extent_node_slab, den);
+
+ write_unlock(&et->lock);
+ atomic_dec(&et->refcount);
+}
+
+void f2fs_preserve_extent_tree(struct inode *inode)
+{
+ struct extent_tree *et;
+ struct extent_info *ext = &F2FS_I(inode)->ext;
+ bool sync = false;
+
+ if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+ return;
+
+ et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
+ if (!et) {
+ if (ext->len) {
+ ext->len = 0;
+ update_inode_page(inode);
+ }
+ return;
+ }
+
+ read_lock(&et->lock);
+ if (et->count) {
+ struct extent_node *en;
+
+ if (et->cached_en) {
+ en = et->cached_en;
+ } else {
+ struct rb_node *node = rb_first(&et->root);
+
+ if (!node)
+ node = rb_last(&et->root);
+ en = rb_entry(node, struct extent_node, rb_node);
+ }
+
+ if (__is_extent_same(ext, &en->ei))
+ goto out;
+
+ *ext = en->ei;
+ sync = true;
+ } else if (ext->len) {
+ ext->len = 0;
+ sync = true;
+ }
+out:
+ read_unlock(&et->lock);
+ atomic_dec(&et->refcount);
+
+ if (sync)
+ update_inode_page(inode);
+}
+
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+ struct extent_node *en, *tmp;
+ unsigned long ino = F2FS_ROOT_INO(sbi);
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int found;
+ unsigned int node_cnt = 0, tree_cnt = 0;
+
+ if (!test_opt(sbi, EXTENT_CACHE))
+ return;
+
+ if (available_free_memory(sbi, EXTENT_CACHE))
+ return;
+
+ spin_lock(&sbi->extent_lock);
+ list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+ if (!nr_shrink--)
+ break;
+ list_del_init(&en->list);
+ }
+ spin_unlock(&sbi->extent_lock);
+
+ down_read(&sbi->extent_tree_lock);
+ while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+ (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+ unsigned i;
+
+ ino = treevec[found - 1]->ino + 1;
+ for (i = 0; i < found; i++) {
+ struct extent_tree *et = treevec[i];
+
+ atomic_inc(&et->refcount);
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et, false);
+ write_unlock(&et->lock);
+ atomic_dec(&et->refcount);
+ }
+ }
+ up_read(&sbi->extent_tree_lock);
+
+ down_write(&sbi->extent_tree_lock);
+ radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
+ F2FS_ROOT_INO(sbi)) {
+ struct extent_tree *et = (struct extent_tree *)*slot;
+
+ if (!atomic_read(&et->refcount) && !et->count) {
+ radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ kmem_cache_free(extent_tree_slab, et);
+ sbi->total_ext_tree--;
+ tree_cnt++;
+ }
+ }
+ up_write(&sbi->extent_tree_lock);
+
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+}
+
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree *et;
+ unsigned int node_cnt = 0;
+
+ if (!test_opt(sbi, EXTENT_CACHE))
+ return;
+
+ et = __find_extent_tree(sbi, inode->i_ino);
+ if (!et)
+ goto out;
+
+ /* free all extent info belong to this extent tree */
+ write_lock(&et->lock);
+ node_cnt = __free_extent_tree(sbi, et, true);
+ write_unlock(&et->lock);
+
+ atomic_dec(&et->refcount);
+
+ /* try to find and delete extent tree entry in radix tree */
+ down_write(&sbi->extent_tree_lock);
+ et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
+ if (!et) {
+ up_write(&sbi->extent_tree_lock);
+ goto out;
+ }
+ f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+ radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ kmem_cache_free(extent_tree_slab, et);
+ sbi->total_ext_tree--;
+ up_write(&sbi->extent_tree_lock);
+out:
+ trace_f2fs_destroy_extent_tree(inode, node_cnt);
+ return;
+}
+
+void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
+{
+ if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+ f2fs_init_extent_tree(inode, i_ext);
+
+ write_lock(&F2FS_I(inode)->ext_lock);
+ get_extent_info(&F2FS_I(inode)->ext, *i_ext);
+ write_unlock(&F2FS_I(inode)->ext_lock);
+}
+
+static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+ return false;
+
+ if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+ return f2fs_lookup_extent_tree(inode, pgofs, ei);
+
+ return lookup_extent_info(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+ pgoff_t fofs;
+
+ f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+ if (is_inode_flag_set(fi, FI_NO_EXTENT))
+ return;
+
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+
+ if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
+ return f2fs_update_extent_tree(dn->inode, fofs,
+ dn->data_blkaddr);
+
+ if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
+ sync_inode_page(dn);
+}
+
+struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
+ struct extent_info ei;
int err;
struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = READ_SYNC,
+ .rw = rw,
+ .encrypted_page = NULL,
};
-repeat:
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return read_mapping_page(mapping, index, NULL);
+
page = grab_cache_page(mapping, index);
if (!page)
return ERR_PTR(-ENOMEM);
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ goto got_it;
+ }
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err) {
@@ -457,9 +945,11 @@ repeat:
f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
}
-
- if (PageUptodate(page))
+got_it:
+ if (PageUptodate(page)) {
+ unlock_page(page);
return page;
+ }
/*
* A new dentry page is allocated but not able to be written, since its
@@ -470,14 +960,58 @@ repeat:
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
+ unlock_page(page);
return page;
}
fio.blk_addr = dn.data_blkaddr;
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
+ fio.page = page;
+ err = f2fs_submit_page_bio(&fio);
if (err)
return ERR_PTR(err);
+ return page;
+}
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+
+ page = find_get_page(mapping, index);
+ if (page && PageUptodate(page))
+ return page;
+ f2fs_put_page(page, 0);
+
+ page = get_read_data_page(inode, index, READ_SYNC);
+ if (IS_ERR(page))
+ return page;
+
+ if (PageUptodate(page))
+ return page;
+
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 0);
+ return ERR_PTR(-EIO);
+ }
+ return page;
+}
+
+/*
+ * If it tries to access a hole, return an error.
+ * Because, the callers, functions in dir.c and GC, should be able to know
+ * whether this page exists or not.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+repeat:
+ page = get_read_data_page(inode, index, READ_SYNC);
+ if (IS_ERR(page))
+ return page;
+
+ /* wait for read completion */
lock_page(page);
if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
@@ -505,46 +1039,37 @@ struct page *get_new_data_page(struct inode *inode,
struct page *page;
struct dnode_of_data dn;
int err;
+repeat:
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
set_new_dnode(&dn, inode, ipage, NULL, 0);
err = f2fs_reserve_block(&dn, index);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return ERR_PTR(err);
-repeat:
- page = grab_cache_page(mapping, index);
- if (!page) {
- err = -ENOMEM;
- goto put_err;
}
+ if (!ipage)
+ f2fs_put_dnode(&dn);
if (PageUptodate(page))
- return page;
+ goto got_it;
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- struct f2fs_io_info fio = {
- .type = DATA,
- .rw = READ_SYNC,
- .blk_addr = dn.data_blkaddr,
- };
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
- if (err)
- goto put_err;
+ f2fs_put_page(page, 1);
- lock_page(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- err = -EIO;
- goto put_err;
- }
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
+ page = get_read_data_page(inode, index, READ_SYNC);
+ if (IS_ERR(page))
goto repeat;
- }
- }
+ /* wait for read completion */
+ lock_page(page);
+ }
+got_it:
if (new_i_size &&
i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
@@ -552,10 +1077,6 @@ repeat:
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
}
return page;
-
-put_err:
- f2fs_put_dnode(&dn);
- return ERR_PTR(err);
}
static int __allocate_data_block(struct dnode_of_data *dn)
@@ -569,19 +1090,26 @@ static int __allocate_data_block(struct dnode_of_data *dn)
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
+
+ dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ if (dn->data_blkaddr == NEW_ADDR)
+ goto alloc;
+
if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
return -ENOSPC;
+alloc:
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
seg = CURSEG_DIRECT_IO;
- allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
+ allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
+ &sum, seg);
/* direct IO doesn't use extent cache to maximize the performance */
- __set_data_blkaddr(dn);
+ set_data_blkaddr(dn);
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -615,7 +1143,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
while (dn.ofs_in_node < end_offset && len) {
- if (dn.data_blkaddr == NULL_ADDR) {
+ block_t blkaddr;
+
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
if (__allocate_data_block(&dn))
goto sync_out;
allocated = true;
@@ -643,29 +1174,37 @@ out:
}
/*
- * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
+ * f2fs_map_blocks structure.
* If original data blocks are allocated, then give them to blockdev.
* Otherwise,
* a. preallocate requested block addresses
* b. do not use extent cache for better performance
* c. give the block addresses to blockdev
*/
-static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, bool fiemap)
+static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+ int create, bool fiemap)
{
- unsigned int blkbits = inode->i_sb->s_blocksize_bits;
- unsigned maxblocks = bh_result->b_size >> blkbits;
+ unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
pgoff_t pgofs, end_offset;
int err = 0, ofs = 1;
+ struct extent_info ei;
bool allocated = false;
- /* Get the page offset from the block offset(iblock) */
- pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+ map->m_len = 0;
+ map->m_flags = 0;
+
+ /* it only supports block size == page size */
+ pgofs = (pgoff_t)map->m_lblk;
- if (check_extent_cache(inode, pgofs, bh_result))
+ if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ map->m_pblk = ei.blk + pgofs - ei.fofs;
+ map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
+ map->m_flags = F2FS_MAP_MAPPED;
goto out;
+ }
if (create)
f2fs_lock_op(F2FS_I_SB(inode));
@@ -682,21 +1221,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
goto put_out;
if (dn.data_blkaddr != NULL_ADDR) {
- set_buffer_new(bh_result);
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ map->m_flags = F2FS_MAP_MAPPED;
+ map->m_pblk = dn.data_blkaddr;
+ if (dn.data_blkaddr == NEW_ADDR)
+ map->m_flags |= F2FS_MAP_UNWRITTEN;
} else if (create) {
err = __allocate_data_block(&dn);
if (err)
goto put_out;
allocated = true;
- set_buffer_new(bh_result);
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
+ map->m_pblk = dn.data_blkaddr;
} else {
goto put_out;
}
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
- bh_result->b_size = (((size_t)1) << blkbits);
+ map->m_len = 1;
dn.ofs_in_node++;
pgofs++;
@@ -720,21 +1261,25 @@ get_next:
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
}
- if (maxblocks > (bh_result->b_size >> blkbits)) {
+ if (maxblocks > map->m_len) {
block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
if (blkaddr == NULL_ADDR && create) {
err = __allocate_data_block(&dn);
if (err)
goto sync_out;
allocated = true;
+ map->m_flags |= F2FS_MAP_NEW;
blkaddr = dn.data_blkaddr;
}
/* Give more consecutive addresses for the readahead */
- if (blkaddr == (bh_result->b_blocknr + ofs)) {
+ if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR &&
+ blkaddr == NEW_ADDR)) {
ofs++;
dn.ofs_in_node++;
pgofs++;
- bh_result->b_size += (((size_t)1) << blkbits);
+ map->m_len++;
goto get_next;
}
}
@@ -747,10 +1292,28 @@ unlock_out:
if (create)
f2fs_unlock_op(F2FS_I_SB(inode));
out:
- trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+ trace_f2fs_map_blocks(inode, map, err);
return err;
}
+static int __get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create, bool fiemap)
+{
+ struct f2fs_map_blocks map;
+ int ret;
+
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
+
+ ret = f2fs_map_blocks(inode, &map, create, fiemap);
+ if (!ret) {
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
+ bh->b_size = map.m_len << inode->i_blkbits;
+ }
+ return ret;
+}
+
static int get_data_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
@@ -763,11 +1326,268 @@ static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
return __get_data_block(inode, iblock, bh_result, create, true);
}
+static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
+{
+ return (offset >> inode->i_blkbits);
+}
+
+static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
+{
+ return (blk << inode->i_blkbits);
+}
+
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
- return generic_block_fiemap(inode, fieinfo,
- start, len, get_data_block_fiemap);
+ struct buffer_head map_bh;
+ sector_t start_blk, last_blk;
+ loff_t isize = i_size_read(inode);
+ u64 logical = 0, phys = 0, size = 0;
+ u32 flags = 0;
+ bool past_eof = false, whole_file = false;
+ int ret = 0;
+
+ ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (len >= isize) {
+ whole_file = true;
+ len = isize;
+ }
+
+ if (logical_to_blk(inode, len) == 0)
+ len = blk_to_logical(inode, 1);
+
+ start_blk = logical_to_blk(inode, start);
+ last_blk = logical_to_blk(inode, start + len - 1);
+next:
+ memset(&map_bh, 0, sizeof(struct buffer_head));
+ map_bh.b_size = len;
+
+ ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0);
+ if (ret)
+ goto out;
+
+ /* HOLE */
+ if (!buffer_mapped(&map_bh)) {
+ start_blk++;
+
+ if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
+ past_eof = 1;
+
+ if (past_eof && size) {
+ flags |= FIEMAP_EXTENT_LAST;
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ } else if (size) {
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ size = 0;
+ }
+
+ /* if we have holes up to/past EOF then we're done */
+ if (start_blk > last_blk || past_eof || ret)
+ goto out;
+ } else {
+ if (start_blk > last_blk && !whole_file) {
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ goto out;
+ }
+
+ /*
+ * if size != 0 then we know we already have an extent
+ * to add, so add it.
+ */
+ if (size) {
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ if (ret)
+ goto out;
+ }
+
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = map_bh.b_size;
+ flags = 0;
+ if (buffer_unwritten(&map_bh))
+ flags = FIEMAP_EXTENT_UNWRITTEN;
+
+ start_blk += logical_to_blk(inode, size);
+
+ /*
+ * If we are past the EOF, then we need to make sure as
+ * soon as we find a hole that the last extent we found
+ * is marked with FIEMAP_EXTENT_LAST
+ */
+ if (!past_eof && logical + size >= isize)
+ past_eof = true;
+ }
+ cond_resched();
+ if (fatal_signal_pending(current))
+ ret = -EINTR;
+ else
+ goto next;
+out:
+ if (ret == 1)
+ ret = 0;
+
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+/*
+ * This function was originally taken from fs/mpage.c, and customized for f2fs.
+ * Major change was from block_size == page_size in f2fs by default.
+ */
+static int f2fs_mpage_readpages(struct address_space *mapping,
+ struct list_head *pages, struct page *page,
+ unsigned nr_pages)
+{
+ struct bio *bio = NULL;
+ unsigned page_idx;
+ sector_t last_block_in_bio = 0;
+ struct inode *inode = mapping->host;
+ const unsigned blkbits = inode->i_blkbits;
+ const unsigned blocksize = 1 << blkbits;
+ sector_t block_in_file;
+ sector_t last_block;
+ sector_t last_block_in_file;
+ sector_t block_nr;
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ struct f2fs_map_blocks map;
+
+ map.m_pblk = 0;
+ map.m_lblk = 0;
+ map.m_len = 0;
+ map.m_flags = 0;
+
+ for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
+
+ prefetchw(&page->flags);
+ if (pages) {
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (add_to_page_cache_lru(page, mapping,
+ page->index, GFP_KERNEL))
+ goto next_page;
+ }
+
+ block_in_file = (sector_t)page->index;
+ last_block = block_in_file + nr_pages;
+ last_block_in_file = (i_size_read(inode) + blocksize - 1) >>
+ blkbits;
+ if (last_block > last_block_in_file)
+ last_block = last_block_in_file;
+
+ /*
+ * Map blocks using the previous result first.
+ */
+ if ((map.m_flags & F2FS_MAP_MAPPED) &&
+ block_in_file > map.m_lblk &&
+ block_in_file < (map.m_lblk + map.m_len))
+ goto got_it;
+
+ /*
+ * Then do more f2fs_map_blocks() calls until we are
+ * done with this page.
+ */
+ map.m_flags = 0;
+
+ if (block_in_file < last_block) {
+ map.m_lblk = block_in_file;
+ map.m_len = last_block - block_in_file;
+
+ if (f2fs_map_blocks(inode, &map, 0, false))
+ goto set_error_page;
+ }
+got_it:
+ if ((map.m_flags & F2FS_MAP_MAPPED)) {
+ block_nr = map.m_pblk + block_in_file - map.m_lblk;
+ SetPageMappedToDisk(page);
+
+ if (!PageUptodate(page) && !cleancache_get_page(page)) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+ } else {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto next_page;
+ }
+
+ /*
+ * This page will go to BIO. Do we need to send this
+ * BIO off first?
+ */
+ if (bio && (last_block_in_bio != block_nr - 1)) {
+submit_and_realloc:
+ submit_bio(READ, bio);
+ bio = NULL;
+ }
+ if (bio == NULL) {
+ struct f2fs_crypto_ctx *ctx = NULL;
+
+ if (f2fs_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode)) {
+ struct page *cpage;
+
+ ctx = f2fs_get_crypto_ctx(inode);
+ if (IS_ERR(ctx))
+ goto set_error_page;
+
+ /* wait the page to be moved by cleaning */
+ cpage = find_lock_page(
+ META_MAPPING(F2FS_I_SB(inode)),
+ block_nr);
+ if (cpage) {
+ f2fs_wait_on_page_writeback(cpage,
+ DATA);
+ f2fs_put_page(cpage, 1);
+ }
+ }
+
+ bio = bio_alloc(GFP_KERNEL,
+ min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
+ if (!bio) {
+ if (ctx)
+ f2fs_release_crypto_ctx(ctx);
+ goto set_error_page;
+ }
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = ctx;
+ }
+
+ if (bio_add_page(bio, page, blocksize, 0) < blocksize)
+ goto submit_and_realloc;
+
+ last_block_in_bio = block_nr;
+ goto next_page;
+set_error_page:
+ SetPageError(page);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ goto next_page;
+confused:
+ if (bio) {
+ submit_bio(READ, bio);
+ bio = NULL;
+ }
+ unlock_page(page);
+next_page:
+ if (pages)
+ page_cache_release(page);
+ }
+ BUG_ON(pages && !list_empty(pages));
+ if (bio)
+ submit_bio(READ, bio);
+ return 0;
}
static int f2fs_read_data_page(struct file *file, struct page *page)
@@ -781,8 +1601,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
if (f2fs_has_inline_data(inode))
ret = f2fs_read_inline_data(inode, page);
if (ret == -EAGAIN)
- ret = mpage_readpage(page, get_data_block);
-
+ ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1);
return ret;
}
@@ -796,11 +1615,12 @@ static int f2fs_read_data_pages(struct file *file,
if (f2fs_has_inline_data(inode))
return 0;
- return mpage_readpages(mapping, pages, nr_pages, get_data_block);
+ return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages);
}
-int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
+int do_write_data_page(struct f2fs_io_info *fio)
{
+ struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
int err = 0;
@@ -813,8 +1633,18 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
fio->blk_addr = dn.data_blkaddr;
/* This page is already truncated */
- if (fio->blk_addr == NULL_ADDR)
+ if (fio->blk_addr == NULL_ADDR) {
+ ClearPageUptodate(page);
goto out_writepage;
+ }
+
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ fio->encrypted_page = f2fs_encrypt(inode, fio->page);
+ if (IS_ERR(fio->encrypted_page)) {
+ err = PTR_ERR(fio->encrypted_page);
+ goto out_writepage;
+ }
+ }
set_page_writeback(page);
@@ -825,12 +1655,17 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
if (unlikely(fio->blk_addr != NEW_ADDR &&
!is_cold_data(page) &&
need_inplace_update(inode))) {
- rewrite_data_page(page, fio);
+ rewrite_data_page(fio);
set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+ trace_f2fs_do_write_data_page(page, IPU);
} else {
- write_data_page(page, &dn, fio);
- update_extent_cache(&dn);
+ write_data_page(&dn, fio);
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
}
out_writepage:
f2fs_put_dnode(&dn);
@@ -849,8 +1684,11 @@ static int f2fs_write_data_page(struct page *page,
bool need_balance_fs = false;
int err = 0;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = DATA,
.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ .page = page,
+ .encrypted_page = NULL,
};
trace_f2fs_writepage(page, DATA);
@@ -880,7 +1718,7 @@ write:
if (S_ISDIR(inode->i_mode)) {
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- err = do_write_data_page(page, &fio);
+ err = do_write_data_page(&fio);
goto done;
}
@@ -900,7 +1738,7 @@ write:
if (f2fs_has_inline_data(inode))
err = f2fs_write_inline_data(inode, page);
if (err == -EAGAIN)
- err = do_write_data_page(page, &fio);
+ err = do_write_data_page(&fio);
f2fs_unlock_op(sbi);
done:
if (err && err != -ENOENT)
@@ -909,6 +1747,8 @@ done:
clear_cold_data(page);
out:
inode_dec_dirty_pages(inode);
+ if (err)
+ ClearPageUptodate(page);
unlock_page(page);
if (need_balance_fs)
f2fs_balance_fs(sbi);
@@ -950,6 +1790,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
+ /* during POR, we don't need to trigger writepage at all. */
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto skip_write;
+
diff = nr_pages_to_write(sbi, DATA, wbc);
if (!S_ISDIR(inode->i_mode)) {
@@ -1063,11 +1907,14 @@ put_next:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = DATA,
.rw = READ_SYNC,
.blk_addr = dn.data_blkaddr,
+ .page = page,
+ .encrypted_page = NULL,
};
- err = f2fs_submit_page_bio(sbi, page, &fio);
+ err = f2fs_submit_page_bio(&fio);
if (err)
goto fail;
@@ -1081,6 +1928,15 @@ put_next:
f2fs_put_page(page, 1);
goto repeat;
}
+
+ /* avoid symlink page */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ err = f2fs_decrypt_one(inode, page);
+ if (err) {
+ f2fs_put_page(page, 1);
+ goto fail;
+ }
+ }
}
out:
SetPageUptodate(page);
@@ -1118,12 +1974,12 @@ static int f2fs_write_end(struct file *file,
return copied;
}
-static int check_direct_IO(struct inode *inode, int rw,
- struct iov_iter *iter, loff_t offset)
+static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
+ loff_t offset)
{
unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
- if (rw == READ)
+ if (iov_iter_rw(iter) == READ)
return 0;
if (offset & blocksize_mask)
@@ -1135,8 +1991,8 @@ static int check_direct_IO(struct inode *inode, int rw,
return 0;
}
-static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -1151,19 +2007,22 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
return err;
}
- if (check_direct_IO(inode, rw, iter, offset))
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return 0;
+
+ if (check_direct_IO(inode, iter, offset))
return 0;
- trace_f2fs_direct_IO_enter(inode, offset, count, rw);
+ trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (rw & WRITE)
+ if (iov_iter_rw(iter) == WRITE)
__allocate_data_blocks(inode, offset, count);
- err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
- if (err < 0 && (rw & WRITE))
+ err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block);
+ if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
- trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
+ trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);
return err;
}
@@ -1213,8 +2072,6 @@ static int f2fs_set_data_page_dirty(struct page *page)
return 1;
}
- mark_inode_dirty(inode);
-
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
update_dirty_page(inode, page);
@@ -1236,6 +2093,37 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, get_data_block);
}
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+ INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+ init_rwsem(&sbi->extent_tree_lock);
+ INIT_LIST_HEAD(&sbi->extent_list);
+ spin_lock_init(&sbi->extent_lock);
+ sbi->total_ext_tree = 0;
+ atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+ extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+ sizeof(struct extent_tree));
+ if (!extent_tree_slab)
+ return -ENOMEM;
+ extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+ sizeof(struct extent_node));
+ if (!extent_node_slab) {
+ kmem_cache_destroy(extent_tree_slab);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void destroy_extent_cache(void)
+{
+ kmem_cache_destroy(extent_node_slab);
+ kmem_cache_destroy(extent_tree_slab);
+}
+
const struct address_space_operations f2fs_dblock_aops = {
.readpage = f2fs_read_data_page,
.readpages = f2fs_read_data_pages,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index e671373cc8ab..75176e0dd6c8 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -35,6 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
/* validation check of the segment numbers */
si->hit_ext = sbi->read_hit_ext;
si->total_ext = sbi->total_hit_ext;
+ si->ext_tree = sbi->total_ext_tree;
+ si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_dirs = sbi->n_dirty_dirs;
@@ -92,7 +94,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
static void update_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+ unsigned long long blks_per_sec, hblks_per_sec, total_vblocks;
+ unsigned long long bimodal, dist;
unsigned int segno, vblocks;
int ndirty = 0;
@@ -110,10 +113,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
- si->bimodal = bimodal / dist;
+ dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
+ si->bimodal = div_u64(bimodal, dist);
if (si->dirty_count)
- si->avg_vblocks = total_vblocks / ndirty;
+ si->avg_vblocks = div_u64(total_vblocks, ndirty);
else
si->avg_vblocks = 0;
}
@@ -141,7 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += sizeof(struct sit_info);
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
- si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
si->base_mem += SIT_VBLOCK_MAP_SIZE;
if (sbi->segs_per_sec > 1)
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
@@ -185,6 +188,9 @@ get_cache:
si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
for (i = 0; i <= UPDATE_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+ si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+ si->cache_mem += atomic_read(&sbi->total_ext_node) *
+ sizeof(struct extent_node);
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
@@ -260,13 +266,20 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "CP calls: %d\n", si->cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
- seq_printf(s, " - data segments : %d\n", si->data_segs);
- seq_printf(s, " - node segments : %d\n", si->node_segs);
- seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
- seq_printf(s, " - data blocks : %d\n", si->data_blks);
- seq_printf(s, " - node blocks : %d\n", si->node_blks);
+ seq_printf(s, " - data segments : %d (%d)\n",
+ si->data_segs, si->bg_data_segs);
+ seq_printf(s, " - node segments : %d (%d)\n",
+ si->node_segs, si->bg_node_segs);
+ seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
+ si->bg_data_blks + si->bg_node_blks);
+ seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
+ si->bg_data_blks);
+ seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
+ si->bg_node_blks);
seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
si->hit_ext, si->total_ext);
+ seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
+ seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - inmem: %4d, wb: %4d\n",
si->inmem_pages, si->wb_pages);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b74097a7f6d9..a34ebd8312ab 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -59,9 +59,8 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
};
-void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
{
- umode_t mode = inode->i_mode;
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
@@ -77,20 +76,10 @@ static unsigned long dir_block_index(unsigned int level,
return bidx;
}
-static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
- struct f2fs_dir_entry *de)
-{
- if (le16_to_cpu(de->name_len) != namelen)
- return false;
-
- if (de->hash_code != namehash)
- return false;
-
- return true;
-}
-
static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
- struct qstr *name, int *max_slots,
+ struct f2fs_filename *fname,
+ f2fs_hash_t namehash,
+ int *max_slots,
struct page **res_page)
{
struct f2fs_dentry_block *dentry_blk;
@@ -99,9 +88,8 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page);
- make_dentry_ptr(&d, (void *)dentry_blk, 1);
- de = find_target_dentry(name, max_slots, &d);
-
+ make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ de = find_target_dentry(fname, namehash, max_slots, &d);
if (de)
*res_page = dentry_page;
else
@@ -115,34 +103,43 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
return de;
}
-struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
- struct f2fs_dentry_ptr *d)
+struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname,
+ f2fs_hash_t namehash, int *max_slots,
+ struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
- f2fs_hash_t namehash = f2fs_dentry_hash(name);
int max_len = 0;
+ struct f2fs_str de_name = FSTR_INIT(NULL, 0);
+ struct f2fs_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
while (bit_pos < d->max) {
if (!test_bit_le(bit_pos, d->bitmap)) {
- if (bit_pos == 0)
- max_len = 1;
- else if (!test_bit_le(bit_pos - 1, d->bitmap))
- max_len++;
bit_pos++;
+ max_len++;
continue;
}
+
de = &d->dentry[bit_pos];
- if (early_match_name(name->len, namehash, de) &&
- !memcmp(d->filename[bit_pos], name->name, name->len))
+
+ /* encrypted case */
+ de_name.name = d->filename[bit_pos];
+ de_name.len = le16_to_cpu(de->name_len);
+
+ /* show encrypted name */
+ if (fname->hash) {
+ if (de->hash_code == fname->hash)
+ goto found;
+ } else if (de_name.len == name->len &&
+ de->hash_code == namehash &&
+ !memcmp(de_name.name, name->name, name->len))
goto found;
- if (max_slots && *max_slots >= 0 && max_len > *max_slots) {
+ if (max_slots && max_len > *max_slots)
*max_slots = max_len;
- max_len = 0;
- }
+ max_len = 0;
/* remain bug on condition */
if (unlikely(!de->name_len))
@@ -159,16 +156,21 @@ found:
}
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
- unsigned int level, struct qstr *name,
- f2fs_hash_t namehash, struct page **res_page)
+ unsigned int level,
+ struct f2fs_filename *fname,
+ struct page **res_page)
{
- int s = GET_DENTRY_SLOTS(name->len);
+ struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
+ int s = GET_DENTRY_SLOTS(name.len);
unsigned int nbucket, nblock;
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
bool room = false;
int max_slots;
+ f2fs_hash_t namehash;
+
+ namehash = f2fs_dentry_hash(&name);
f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
@@ -181,13 +183,14 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
for (; bidx < end_block; bidx++) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = find_data_page(dir, bidx, true);
+ dentry_page = find_data_page(dir, bidx);
if (IS_ERR(dentry_page)) {
room = true;
continue;
}
- de = find_in_block(dentry_page, name, &max_slots, res_page);
+ de = find_in_block(dentry_page, fname, namehash, &max_slots,
+ res_page);
if (de)
break;
@@ -215,30 +218,34 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
- f2fs_hash_t name_hash;
unsigned int max_depth;
unsigned int level;
+ struct f2fs_filename fname;
+ int err;
- if (f2fs_has_inline_dentry(dir))
- return find_in_inline_dir(dir, child, res_page);
+ *res_page = NULL;
- if (npages == 0)
+ err = f2fs_fname_setup_filename(dir, child, 1, &fname);
+ if (err)
return NULL;
- *res_page = NULL;
+ if (f2fs_has_inline_dentry(dir)) {
+ de = find_in_inline_dir(dir, &fname, res_page);
+ goto out;
+ }
+
+ if (npages == 0)
+ goto out;
- name_hash = f2fs_dentry_hash(child);
max_depth = F2FS_I(dir)->i_current_depth;
for (level = 0; level < max_depth; level++) {
- de = find_in_level(dir, level, child, name_hash, res_page);
+ de = find_in_level(dir, level, &fname, res_page);
if (de)
break;
}
- if (!de && F2FS_I(dir)->chash != name_hash) {
- F2FS_I(dir)->chash = name_hash;
- F2FS_I(dir)->clevel = level - 1;
- }
+out:
+ f2fs_fname_free_filename(&fname);
return de;
}
@@ -285,7 +292,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
lock_page(page);
f2fs_wait_on_page_writeback(page, type);
de->ino = cpu_to_le32(inode->i_ino);
- set_de_type(de, inode);
+ set_de_type(de, inode->i_mode);
f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -307,10 +314,14 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
set_page_dirty(ipage);
}
-int update_dent_inode(struct inode *inode, const struct qstr *name)
+int update_dent_inode(struct inode *inode, struct inode *to,
+ const struct qstr *name)
{
struct page *page;
+ if (file_enc_name(to))
+ return 0;
+
page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -331,14 +342,14 @@ void do_make_empty_dir(struct inode *inode, struct inode *parent,
de->hash_code = 0;
de->ino = cpu_to_le32(inode->i_ino);
memcpy(d->filename[0], ".", 1);
- set_de_type(de, inode);
+ set_de_type(de, inode->i_mode);
de = &d->dentry[1];
de->hash_code = 0;
de->name_len = cpu_to_le16(2);
de->ino = cpu_to_le32(parent->i_ino);
memcpy(d->filename[1], "..", 2);
- set_de_type(de, inode);
+ set_de_type(de, parent->i_mode);
test_and_set_bit_le(0, (void *)d->bitmap);
test_and_set_bit_le(1, (void *)d->bitmap);
@@ -360,7 +371,7 @@ static int make_empty_dir(struct inode *inode,
dentry_blk = kmap_atomic(dentry_page);
- make_dentry_ptr(&d, (void *)dentry_blk, 1);
+ make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
do_make_empty_dir(inode, parent, &d);
kunmap_atomic(dentry_blk);
@@ -394,6 +405,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
err = f2fs_init_security(inode, dir, name, page);
if (err)
goto put_error;
+
+ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
+ err = f2fs_inherit_context(dir, inode, page);
+ if (err)
+ goto put_error;
+ }
} else {
page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
if (IS_ERR(page))
@@ -435,7 +452,7 @@ error:
void update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth)
{
- if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
if (S_ISDIR(inode->i_mode)) {
inc_nlink(dir);
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -450,7 +467,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+ if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
}
@@ -474,38 +491,64 @@ next:
goto next;
}
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+ const struct qstr *name, f2fs_hash_t name_hash,
+ unsigned int bit_pos)
+{
+ struct f2fs_dir_entry *de;
+ int slots = GET_DENTRY_SLOTS(name->len);
+ int i;
+
+ de = &d->dentry[bit_pos];
+ de->hash_code = name_hash;
+ de->name_len = cpu_to_le16(name->len);
+ memcpy(d->filename[bit_pos], name->name, name->len);
+ de->ino = cpu_to_le32(ino);
+ set_de_type(de, mode);
+ for (i = 0; i < slots; i++)
+ test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+}
+
/*
* Caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
*/
int __f2fs_add_link(struct inode *dir, const struct qstr *name,
- struct inode *inode)
+ struct inode *inode, nid_t ino, umode_t mode)
{
unsigned int bit_pos;
unsigned int level;
unsigned int current_depth;
unsigned long bidx, block;
f2fs_hash_t dentry_hash;
- struct f2fs_dir_entry *de;
unsigned int nbucket, nblock;
- size_t namelen = name->len;
struct page *dentry_page = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
- int slots = GET_DENTRY_SLOTS(namelen);
- struct page *page;
- int err = 0;
- int i;
+ struct f2fs_dentry_ptr d;
+ struct page *page = NULL;
+ struct f2fs_filename fname;
+ struct qstr new_name;
+ int slots, err;
+
+ err = f2fs_fname_setup_filename(dir, name, 0, &fname);
+ if (err)
+ return err;
+
+ new_name.name = fname_name(&fname);
+ new_name.len = fname_len(&fname);
if (f2fs_has_inline_dentry(dir)) {
- err = f2fs_add_inline_entry(dir, name, inode);
+ err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode);
if (!err || err != -EAGAIN)
- return err;
+ goto out;
else
err = 0;
}
- dentry_hash = f2fs_dentry_hash(name);
level = 0;
+ slots = GET_DENTRY_SLOTS(new_name.len);
+ dentry_hash = f2fs_dentry_hash(&new_name);
+
current_depth = F2FS_I(dir)->i_current_depth;
if (F2FS_I(dir)->chash == dentry_hash) {
level = F2FS_I(dir)->clevel;
@@ -513,8 +556,10 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
}
start:
- if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
- return -ENOSPC;
+ if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) {
+ err = -ENOSPC;
+ goto out;
+ }
/* Increase the depth, if required */
if (level == current_depth)
@@ -528,8 +573,10 @@ start:
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = get_new_data_page(dir, NULL, block, true);
- if (IS_ERR(dentry_page))
- return PTR_ERR(dentry_page);
+ if (IS_ERR(dentry_page)) {
+ err = PTR_ERR(dentry_page);
+ goto out;
+ }
dentry_blk = kmap(dentry_page);
bit_pos = room_for_filename(&dentry_blk->dentry_bitmap,
@@ -547,30 +594,33 @@ start:
add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA);
- down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, name, NULL);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto fail;
+ if (inode) {
+ down_write(&F2FS_I(inode)->i_sem);
+ page = init_inode_metadata(inode, dir, &new_name, NULL);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
+ if (f2fs_encrypted_inode(dir))
+ file_set_enc_name(inode);
}
- de = &dentry_blk->dentry[bit_pos];
- de->hash_code = dentry_hash;
- de->name_len = cpu_to_le16(namelen);
- memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
- de->ino = cpu_to_le32(inode->i_ino);
- set_de_type(de, inode);
- for (i = 0; i < slots; i++)
- test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+ make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1);
+ f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos);
+
set_page_dirty(dentry_page);
- /* we don't need to mark_inode_dirty now */
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
- f2fs_put_page(page, 1);
+ if (inode) {
+ /* we don't need to mark_inode_dirty now */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+ }
update_parent_metadata(dir, inode, current_depth);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ if (inode)
+ up_write(&F2FS_I(inode)->i_sem);
if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
update_inode_page(dir);
@@ -578,6 +628,8 @@ fail:
}
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
+out:
+ f2fs_fname_free_filename(&fname);
return err;
}
@@ -669,6 +721,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK) {
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
+ ClearPagePrivate(page);
ClearPageUptodate(page);
inode_dec_dirty_pages(dir);
}
@@ -714,11 +767,12 @@ bool f2fs_empty_dir(struct inode *dir)
}
bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
- unsigned int start_pos)
+ unsigned int start_pos, struct f2fs_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
unsigned int bit_pos;
struct f2fs_dir_entry *de = NULL;
+ struct f2fs_str de_name = FSTR_INIT(NULL, 0);
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -732,8 +786,24 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
d_type = f2fs_filetype_table[de->file_type];
else
d_type = DT_UNKNOWN;
- if (!dir_emit(ctx, d->filename[bit_pos],
- le16_to_cpu(de->name_len),
+
+ /* encrypted case */
+ de_name.name = d->filename[bit_pos];
+ de_name.len = le16_to_cpu(de->name_len);
+
+ if (f2fs_encrypted_inode(d->inode)) {
+ int save_len = fstr->len;
+ int ret;
+
+ ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
+ &de_name, fstr);
+ de_name = *fstr;
+ fstr->len = save_len;
+ if (ret < 0)
+ return true;
+ }
+
+ if (!dir_emit(ctx, de_name.name, de_name.len,
le32_to_cpu(de->ino), d_type))
return true;
@@ -752,9 +822,24 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
struct f2fs_dentry_ptr d;
+ struct f2fs_str fstr = FSTR_INIT(NULL, 0);
+ int err = 0;
- if (f2fs_has_inline_dentry(inode))
- return f2fs_read_inline_dir(file, ctx);
+ if (f2fs_encrypted_inode(inode)) {
+ err = f2fs_get_encryption_info(inode);
+ if (err)
+ return err;
+
+ err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN,
+ &fstr);
+ if (err < 0)
+ return err;
+ }
+
+ if (f2fs_has_inline_dentry(inode)) {
+ err = f2fs_read_inline_dir(file, ctx, &fstr);
+ goto out;
+ }
/* readahead for multi pages of dir */
if (npages - n > 1 && !ra_has_index(ra, n))
@@ -768,9 +853,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
dentry_blk = kmap(dentry_page);
- make_dentry_ptr(&d, (void *)dentry_blk, 1);
+ make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK))
+ if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
goto stop;
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
@@ -783,8 +868,9 @@ stop:
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
}
-
- return 0;
+out:
+ f2fs_fname_crypto_free_buffer(&fstr);
+ return err;
}
const struct file_operations f2fs_dir_operations = {
@@ -793,4 +879,7 @@ const struct file_operations f2fs_dir_operations = {
.iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
.unlocked_ioctl = f2fs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = f2fs_compat_ioctl,
+#endif
};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fa3313ab0e2..a8327ed73898 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -50,6 +50,7 @@
#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
+#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -69,6 +70,15 @@ struct f2fs_mount_info {
unsigned int opt;
};
+#define F2FS_FEATURE_ENCRYPT 0x0001
+
+#define F2FS_HAS_FEATURE(sb, mask) \
+ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
+#define F2FS_SET_FEATURE(sb, mask) \
+ F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)
+#define F2FS_CLEAR_FEATURE(sb, mask) \
+ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
+
#define CRCPOLY_LE 0xedb88320
static inline __u32 f2fs_crc32(void *buf, size_t len)
@@ -102,12 +112,15 @@ enum {
CP_UMOUNT,
CP_FASTBOOT,
CP_SYNC,
+ CP_RECOVERY,
CP_DISCARD,
};
#define DEF_BATCHED_TRIM_SECTIONS 32
#define BATCHED_TRIM_SEGMENTS(sbi) \
(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+#define BATCHED_TRIM_BLOCKS(sbi) \
+ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
struct cp_control {
int reason;
@@ -216,6 +229,22 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
+#define F2FS_IOC_SET_ENCRYPTION_POLICY \
+ _IOR('f', 19, struct f2fs_encryption_policy)
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT \
+ _IOW('f', 20, __u8[16])
+#define F2FS_IOC_GET_ENCRYPTION_POLICY \
+ _IOW('f', 21, struct f2fs_encryption_policy)
+
+/*
+ * should be same as XFS_IOC_GOINGDOWN.
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */
+#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
+#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
+#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -228,16 +257,38 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
* For INODE and NODE manager
*/
/* for directory operations */
+struct f2fs_str {
+ unsigned char *name;
+ u32 len;
+};
+
+struct f2fs_filename {
+ const struct qstr *usr_fname;
+ struct f2fs_str disk_name;
+ f2fs_hash_t hash;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ struct f2fs_str crypto_buf;
+#endif
+};
+
+#define FSTR_INIT(n, l) { .name = n, .len = l }
+#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p) ((p)->disk_name.name)
+#define fname_len(p) ((p)->disk_name.len)
+
struct f2fs_dentry_ptr {
+ struct inode *inode;
const void *bitmap;
struct f2fs_dir_entry *dentry;
__u8 (*filename)[F2FS_SLOT_LEN];
int max;
};
-static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d,
- void *src, int type)
+static inline void make_dentry_ptr(struct inode *inode,
+ struct f2fs_dentry_ptr *d, void *src, int type)
{
+ d->inode = inode;
+
if (type == 1) {
struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src;
d->max = NR_DENTRY_IN_BLOCK;
@@ -273,14 +324,52 @@ enum {
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
+/* vector size for gang look-up from extent cache that consists of radix tree */
+#define EXT_TREE_VEC_SIZE 64
+
/* for in-memory extent cache entry */
-#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
+#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
+
+/* number of extent info in extent cache we try to shrink */
+#define EXTENT_CACHE_SHRINK_NUMBER 128
struct extent_info {
- rwlock_t ext_lock; /* rwlock for consistency */
- unsigned int fofs; /* start offset in a file */
- u32 blk_addr; /* start block address of the extent */
- unsigned int len; /* length of the extent */
+ unsigned int fofs; /* start offset in a file */
+ u32 blk; /* start block address of the extent */
+ unsigned int len; /* length of the extent */
+};
+
+struct extent_node {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ struct list_head list; /* node in global extent list of sbi */
+ struct extent_info ei; /* extent info */
+};
+
+struct extent_tree {
+ nid_t ino; /* inode number */
+ struct rb_root root; /* root of extent info rb-tree */
+ struct extent_node *cached_en; /* recently accessed extent node */
+ rwlock_t lock; /* protect extent info rb-tree */
+ atomic_t refcount; /* reference count of rb-tree */
+ unsigned int count; /* # of extent node in rb-tree*/
+};
+
+/*
+ * This structure is taken from ext4_map_blocks.
+ *
+ * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks().
+ */
+#define F2FS_MAP_NEW (1 << BH_New)
+#define F2FS_MAP_MAPPED (1 << BH_Mapped)
+#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\
+ F2FS_MAP_UNWRITTEN)
+
+struct f2fs_map_blocks {
+ block_t m_pblk;
+ block_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
};
/*
@@ -288,6 +377,29 @@ struct extent_info {
*/
#define FADVISE_COLD_BIT 0x01
#define FADVISE_LOST_PINO_BIT 0x02
+#define FADVISE_ENCRYPT_BIT 0x04
+#define FADVISE_ENC_NAME_BIT 0x08
+
+#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
+#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
+#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
+#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
+#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT)
+#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
+#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
+
+/* Encryption algorithms */
+#define F2FS_ENCRYPTION_MODE_INVALID 0
+#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1
+#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2
+#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3
+#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4
+
+#include "f2fs_crypto.h"
#define DEF_DIR_LEVEL 0
@@ -309,31 +421,67 @@ struct f2fs_inode_info {
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
+ rwlock_t ext_lock; /* rwlock for single extent cache */
struct inode_entry *dirty_dir; /* the pointer of dirty dir */
struct radix_tree_root inmem_root; /* radix tree for inmem pages */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ /* Encryption params */
+ struct f2fs_crypt_info *i_crypt_info;
+#endif
};
static inline void get_extent_info(struct extent_info *ext,
struct f2fs_extent i_ext)
{
- write_lock(&ext->ext_lock);
ext->fofs = le32_to_cpu(i_ext.fofs);
- ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+ ext->blk = le32_to_cpu(i_ext.blk);
ext->len = le32_to_cpu(i_ext.len);
- write_unlock(&ext->ext_lock);
}
static inline void set_raw_extent(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
- read_lock(&ext->ext_lock);
i_ext->fofs = cpu_to_le32(ext->fofs);
- i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+ i_ext->blk = cpu_to_le32(ext->blk);
i_ext->len = cpu_to_le32(ext->len);
- read_unlock(&ext->ext_lock);
+}
+
+static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
+ u32 blk, unsigned int len)
+{
+ ei->fofs = fofs;
+ ei->blk = blk;
+ ei->len = len;
+}
+
+static inline bool __is_extent_same(struct extent_info *ei1,
+ struct extent_info *ei2)
+{
+ return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk &&
+ ei1->len == ei2->len);
+}
+
+static inline bool __is_extent_mergeable(struct extent_info *back,
+ struct extent_info *front)
+{
+ return (back->fofs + back->len == front->fofs &&
+ back->blk + back->len == front->blk);
+}
+
+static inline bool __is_back_mergeable(struct extent_info *cur,
+ struct extent_info *back)
+{
+ return __is_extent_mergeable(back, cur);
+}
+
+static inline bool __is_front_mergeable(struct extent_info *cur,
+ struct extent_info *front)
+{
+ return __is_extent_mergeable(cur, front);
}
struct f2fs_nm_info {
@@ -502,12 +650,19 @@ enum page_type {
META,
NR_PAGE_TYPE,
META_FLUSH,
+ INMEM, /* the below types are used by tracepoints only. */
+ INMEM_DROP,
+ IPU,
+ OPU,
};
struct f2fs_io_info {
+ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
block_t blk_addr; /* block address to be written */
+ struct page *page; /* page to be written */
+ struct page *encrypted_page; /* encrypted page */
};
#define is_read_io(rw) (((rw) & 1) == READ)
@@ -571,6 +726,14 @@ struct f2fs_sb_info {
struct list_head dir_inode_list; /* dir inode list */
spinlock_t dir_inode_lock; /* for dir inode list lock */
+ /* for extent tree cache */
+ struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */
+ struct list_head extent_list; /* lru list for shrinker */
+ spinlock_t extent_lock; /* locking extent lru list */
+ int total_ext_tree; /* extent tree count */
+ atomic_t total_ext_node; /* extent info count */
+
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
@@ -592,6 +755,7 @@ struct f2fs_sb_info {
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
block_t alloc_valid_block_count; /* # of allocated blocks */
+ block_t discard_blks; /* discard command candidats */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
@@ -920,12 +1084,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
return 0;
}
+static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
+{
+ return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+}
+
static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
int offset;
- if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) {
+ if (__cp_payload(sbi) > 0) {
if (flag == NAT_BITMAP)
return &ckpt->sit_nat_version_bitmap;
else
@@ -1114,6 +1283,24 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
return mask & *addr;
}
+static inline void f2fs_set_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr |= mask;
+}
+
+static inline void f2fs_clear_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ *addr &= ~mask;
+}
+
static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
{
int mask;
@@ -1166,8 +1353,10 @@ enum {
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
+ FI_INLINE_DOTS, /* indicate inline dot dentries */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1204,6 +1393,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
set_inode_flag(fi, FI_INLINE_DENTRY);
if (ri->i_inline & F2FS_DATA_EXIST)
set_inode_flag(fi, FI_DATA_EXIST);
+ if (ri->i_inline & F2FS_INLINE_DOTS)
+ set_inode_flag(fi, FI_INLINE_DOTS);
}
static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -1219,6 +1410,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
ri->i_inline |= F2FS_INLINE_DENTRY;
if (is_inode_flag_set(fi, FI_DATA_EXIST))
ri->i_inline |= F2FS_DATA_EXIST;
+ if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+ ri->i_inline |= F2FS_INLINE_DOTS;
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -1264,6 +1457,11 @@ static inline int f2fs_exist_data(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
}
+static inline int f2fs_has_inline_dots(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+}
+
static inline bool f2fs_is_atomic_file(struct inode *inode)
{
return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
@@ -1274,6 +1472,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
}
+static inline bool f2fs_is_first_block_written(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+}
+
static inline bool f2fs_is_drop_cache(struct inode *inode)
{
return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
@@ -1290,18 +1493,27 @@ static inline int f2fs_has_inline_dentry(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
}
-static inline void *inline_dentry_addr(struct page *page)
-{
- struct f2fs_inode *ri = F2FS_INODE(page);
- return (void *)&(ri->i_addr[1]);
-}
-
static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
{
if (!f2fs_has_inline_dentry(dir))
kunmap(page);
}
+static inline int is_file(struct inode *inode, int type)
+{
+ return F2FS_I(inode)->i_advise & type;
+}
+
+static inline void set_file(struct inode *inode, int type)
+{
+ F2FS_I(inode)->i_advise |= type;
+}
+
+static inline void clear_file(struct inode *inode, int type)
+{
+ F2FS_I(inode)->i_advise &= ~type;
+}
+
static inline int f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
@@ -1318,6 +1530,17 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
sbi->sb->s_flags |= MS_RDONLY;
}
+static inline bool is_dot_dotdot(const struct qstr *str)
+{
+ if (str->len == 1 && str->name[0] == '.')
+ return true;
+
+ if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+ return true;
+
+ return false;
+}
+
#define get_inode_mode(i) \
((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1363,11 +1586,12 @@ struct dentry *f2fs_get_parent(struct dentry *child);
* dir.c
*/
extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
-void set_de_type(struct f2fs_dir_entry *, struct inode *);
-struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
- struct f2fs_dentry_ptr *);
+void set_de_type(struct f2fs_dir_entry *, umode_t);
+
+struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *,
+ f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
- unsigned int);
+ unsigned int, struct f2fs_str *);
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1381,23 +1605,26 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
struct page *, struct inode *);
-int update_dent_inode(struct inode *, const struct qstr *);
-int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
+int update_dent_inode(struct inode *, struct inode *, const struct qstr *);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
+ const struct qstr *, f2fs_hash_t , unsigned int);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
+ umode_t);
void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
struct inode *);
int f2fs_do_tmpfile(struct inode *, struct inode *);
-int f2fs_make_empty(struct inode *, struct inode *);
bool f2fs_empty_dir(struct inode *);
static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
- return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
- inode);
+ return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name,
+ inode, inode->i_ino, inode->i_mode);
}
/*
* super.c
*/
+int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
extern __printf(3, 4)
void f2fs_msg(struct super_block *, const char *, const char *, ...);
@@ -1414,8 +1641,8 @@ struct dnode_of_data;
struct node_info;
bool available_free_memory(struct f2fs_sb_info *, int);
+int need_dentry_mark(struct f2fs_sb_info *, nid_t);
bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
@@ -1456,21 +1683,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *);
void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
-void clear_prefree_segments(struct f2fs_sb_info *);
+void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
void release_discard_addrs(struct f2fs_sb_info *);
void discard_next_dnode(struct f2fs_sb_info *, block_t);
int npages_for_summary_flush(struct f2fs_sb_info *, bool);
void allocate_new_segments(struct f2fs_sb_info *);
int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+void update_meta_page(struct f2fs_sb_info *, void *, block_t);
void write_meta_page(struct f2fs_sb_info *, struct page *);
-void write_node_page(struct f2fs_sb_info *, struct page *,
- unsigned int, struct f2fs_io_info *);
-void write_data_page(struct page *, struct dnode_of_data *,
- struct f2fs_io_info *);
-void rewrite_data_page(struct page *, struct f2fs_io_info *);
-void recover_data_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_summary *, block_t, block_t);
+void write_node_page(unsigned int, struct f2fs_io_info *);
+void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
+void rewrite_data_page(struct f2fs_io_info *);
+void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
+ block_t, block_t, unsigned char, bool);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1489,6 +1715,7 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
@@ -1515,18 +1742,25 @@ void destroy_checkpoint_caches(void);
* data.c
*/
void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
- struct f2fs_io_info *);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
- struct f2fs_io_info *);
+int f2fs_submit_page_bio(struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_io_info *);
+void set_data_blkaddr(struct dnode_of_data *);
int reserve_new_block(struct dnode_of_data *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(struct dnode_of_data *);
-struct page *find_data_page(struct inode *, pgoff_t, bool);
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_destroy_extent_tree(struct inode *);
+void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_preserve_extent_tree(struct inode *);
+struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *find_data_page(struct inode *, pgoff_t);
struct page *get_lock_data_page(struct inode *, pgoff_t);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
-int do_write_data_page(struct page *, struct f2fs_io_info *);
+int do_write_data_page(struct f2fs_io_info *);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -1554,7 +1788,7 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- int hit_ext, total_ext;
+ int hit_ext, total_ext, ext_tree, ext_node;
int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
@@ -1566,7 +1800,9 @@ struct f2fs_stat_info {
int dirty_count, node_pages, meta_pages;
int prefree_count, call_count, cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
+ int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
+ int bg_data_blks, bg_node_blks;
int curseg[NR_CURSEG_TYPE];
int cursec[NR_CURSEG_TYPE];
int curzone[NR_CURSEG_TYPE];
@@ -1615,31 +1851,36 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
((sbi)->block_count[(curseg)->alloc_type]++)
#define stat_inc_inplace_blocks(sbi) \
(atomic_inc(&(sbi)->inplace_count))
-#define stat_inc_seg_count(sbi, type) \
+#define stat_inc_seg_count(sbi, type, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
(si)->tot_segs++; \
- if (type == SUM_TYPE_DATA) \
+ if (type == SUM_TYPE_DATA) { \
si->data_segs++; \
- else \
+ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
+ } else { \
si->node_segs++; \
+ si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \
+ } \
} while (0)
#define stat_inc_tot_blk_count(si, blks) \
(si->tot_blks += (blks))
-#define stat_inc_data_blk_count(sbi, blks) \
+#define stat_inc_data_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->data_blks += (blks); \
+ si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \
} while (0)
-#define stat_inc_node_blk_count(sbi, blks) \
+#define stat_inc_node_blk_count(sbi, blks, gc_type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
stat_inc_tot_blk_count(si, blks); \
si->node_blks += (blks); \
+ si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \
} while (0)
int f2fs_build_stats(struct f2fs_sb_info *);
@@ -1661,10 +1902,10 @@ void f2fs_destroy_root_stats(void);
#define stat_inc_seg_type(sbi, curseg)
#define stat_inc_block_count(sbi, curseg)
#define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(si, type)
+#define stat_inc_seg_count(sbi, type, gc_type)
#define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(si, blks)
-#define stat_inc_node_blk_count(sbi, blks)
+#define stat_inc_data_blk_count(sbi, blks, gc_type)
+#define stat_inc_node_blk_count(sbi, blks, gc_type)
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -1680,26 +1921,162 @@ extern const struct address_space_operations f2fs_node_aops;
extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_encrypted_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
extern struct kmem_cache *inode_entry_slab;
/*
* inline.c
*/
-bool f2fs_may_inline(struct inode *);
+bool f2fs_may_inline_data(struct inode *);
+bool f2fs_may_inline_dentry(struct inode *);
void read_inline_data(struct page *, struct page *);
+bool truncate_inline_inode(struct page *, u64);
int f2fs_read_inline_data(struct inode *, struct page *);
int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
int f2fs_convert_inline_inode(struct inode *);
int f2fs_write_inline_data(struct inode *, struct page *);
bool recover_inline_data(struct inode *, struct page *);
-struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
- struct page **);
+struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
+ struct f2fs_filename *, struct page **);
struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
+ nid_t, umode_t);
void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
struct inode *, struct inode *);
bool f2fs_empty_inline_dir(struct inode *);
-int f2fs_read_inline_dir(struct file *, struct dir_context *);
+int f2fs_read_inline_dir(struct file *, struct dir_context *,
+ struct f2fs_str *);
+
+/*
+ * crypto support
+ */
+static inline int f2fs_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ return file_is_encrypt(inode);
+#else
+ return 0;
+#endif
+}
+
+static inline void f2fs_set_encrypted_inode(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ file_set_encrypt(inode);
+#endif
+}
+
+static inline bool f2fs_bio_encrypted(struct bio *bio)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ return unlikely(bio->bi_private != NULL);
+#else
+ return false;
+#endif
+}
+
+static inline int f2fs_sb_has_crypto(struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
+#else
+ return 0;
+#endif
+}
+
+static inline bool f2fs_may_encrypt(struct inode *inode)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ mode_t mode = inode->i_mode;
+
+ return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
+#else
+ return 0;
+#endif
+}
+
+/* crypto_policy.c */
+int f2fs_is_child_context_consistent_with_parent(struct inode *,
+ struct inode *);
+int f2fs_inherit_context(struct inode *, struct inode *, struct page *);
+int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *);
+int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *);
+
+/* crypt.c */
+extern struct kmem_cache *f2fs_crypt_info_cachep;
+bool f2fs_valid_contents_enc_mode(uint32_t);
+uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t);
+struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *);
+void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *);
+struct page *f2fs_encrypt(struct inode *, struct page *);
+int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *);
+int f2fs_decrypt_one(struct inode *, struct page *);
+void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *);
+
+/* crypto_key.c */
+void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *);
+int _f2fs_get_encryption_info(struct inode *inode);
+
+/* crypto_fname.c */
+bool f2fs_valid_filenames_enc_mode(uint32_t);
+u32 f2fs_fname_crypto_round_up(u32, u32);
+int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
+int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
+ const struct f2fs_str *, struct f2fs_str *);
+int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
+ struct f2fs_str *);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+void f2fs_restore_and_release_control_page(struct page **);
+void f2fs_restore_control_page(struct page *);
+
+int __init f2fs_init_crypto(void);
+int f2fs_crypto_initialize(void);
+void f2fs_exit_crypto(void);
+
+int f2fs_has_encryption_key(struct inode *);
+
+static inline int f2fs_get_encryption_info(struct inode *inode)
+{
+ struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+
+ if (!ci ||
+ (ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD)))))
+ return _f2fs_get_encryption_info(inode);
+ return 0;
+}
+
+void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
+int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
+ int lookup, struct f2fs_filename *);
+void f2fs_fname_free_filename(struct f2fs_filename *);
+#else
+static inline void f2fs_restore_and_release_control_page(struct page **p) { }
+static inline void f2fs_restore_control_page(struct page *p) { }
+
+static inline int __init f2fs_init_crypto(void) { return 0; }
+static inline void f2fs_exit_crypto(void) { }
+
+static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
+static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
+static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
+
+static inline int f2fs_fname_setup_filename(struct inode *dir,
+ const struct qstr *iname,
+ int lookup, struct f2fs_filename *fname)
+{
+ memset(fname, 0, sizeof(struct f2fs_filename));
+ fname->usr_fname = iname;
+ fname->disk_name.name = (unsigned char *)iname->name;
+ fname->disk_name.len = iname->len;
+ return 0;
+}
+
+static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
+#endif
#endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
new file mode 100644
index 000000000000..c2c1c2b63b25
--- /dev/null
+++ b/fs/f2fs/f2fs_crypto.h
@@ -0,0 +1,151 @@
+/*
+ * linux/fs/f2fs/f2fs_crypto.h
+ *
+ * Copied from linux/fs/ext4/ext4_crypto.h
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption header content for f2fs
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#ifndef _F2FS_CRYPTO_H
+#define _F2FS_CRYPTO_H
+
+#include <linux/fs.h>
+
+#define F2FS_KEY_DESCRIPTOR_SIZE 8
+
+/* Policy provided via an ioctl on the topmost directory */
+struct f2fs_encryption_policy {
+ char version;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+} __attribute__((__packed__));
+
+#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
+#define F2FS_KEY_DERIVATION_NONCE_SIZE 16
+
+#define F2FS_POLICY_FLAGS_PAD_4 0x00
+#define F2FS_POLICY_FLAGS_PAD_8 0x01
+#define F2FS_POLICY_FLAGS_PAD_16 0x02
+#define F2FS_POLICY_FLAGS_PAD_32 0x03
+#define F2FS_POLICY_FLAGS_PAD_MASK 0x03
+#define F2FS_POLICY_FLAGS_VALID 0x03
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ * 1 byte: Protector format (1 = this version)
+ * 1 byte: File contents encryption mode
+ * 1 byte: File names encryption mode
+ * 1 byte: Flags
+ * 8 bytes: Master Key descriptor
+ * 16 bytes: Encryption Key derivation nonce
+ */
+struct f2fs_encryption_context {
+ char format;
+ char contents_encryption_mode;
+ char filenames_encryption_mode;
+ char flags;
+ char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
+ char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
+} __attribute__((__packed__));
+
+/* Encryption parameters */
+#define F2FS_XTS_TWEAK_SIZE 16
+#define F2FS_AES_128_ECB_KEY_SIZE 16
+#define F2FS_AES_256_GCM_KEY_SIZE 32
+#define F2FS_AES_256_CBC_KEY_SIZE 32
+#define F2FS_AES_256_CTS_KEY_SIZE 32
+#define F2FS_AES_256_XTS_KEY_SIZE 64
+#define F2FS_MAX_KEY_SIZE 64
+
+#define F2FS_KEY_DESC_PREFIX "f2fs:"
+#define F2FS_KEY_DESC_PREFIX_SIZE 5
+
+struct f2fs_encryption_key {
+ __u32 mode;
+ char raw[F2FS_MAX_KEY_SIZE];
+ __u32 size;
+} __attribute__((__packed__));
+
+struct f2fs_crypt_info {
+ char ci_data_mode;
+ char ci_filename_mode;
+ char ci_flags;
+ struct crypto_ablkcipher *ci_ctfm;
+ struct key *ci_keyring_key;
+ char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
+#define F2FS_WRITE_PATH_FL 0x00000002
+
+struct f2fs_crypto_ctx {
+ union {
+ struct {
+ struct page *bounce_page; /* Ciphertext page */
+ struct page *control_page; /* Original page */
+ } w;
+ struct {
+ struct bio *bio;
+ struct work_struct work;
+ } r;
+ struct list_head free_list; /* Free list */
+ };
+ char flags; /* Flags */
+};
+
+struct f2fs_completion_result {
+ struct completion completion;
+ int res;
+};
+
+#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
+ struct f2fs_completion_result ecr = { \
+ COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int f2fs_encryption_key_size(int mode)
+{
+ switch (mode) {
+ case F2FS_ENCRYPTION_MODE_AES_256_XTS:
+ return F2FS_AES_256_XTS_KEY_SIZE;
+ case F2FS_ENCRYPTION_MODE_AES_256_GCM:
+ return F2FS_AES_256_GCM_KEY_SIZE;
+ case F2FS_ENCRYPTION_MODE_AES_256_CBC:
+ return F2FS_AES_256_CBC_KEY_SIZE;
+ case F2FS_ENCRYPTION_MODE_AES_256_CTS:
+ return F2FS_AES_256_CTS_KEY_SIZE;
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4
+#define F2FS_CRYPTO_BLOCK_SIZE 16
+#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct f2fs_encrypted_symlink_data {
+ __le16 len;
+ char encrypted_path[1];
+} __attribute__((__packed__));
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 encrypted_symlink_data_len(u32 l)
+{
+ return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
+}
+#endif /* _F2FS_CRYPTO_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 98dac27bc3f7..b0f38c3b37f4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -20,6 +20,7 @@
#include <linux/uaccess.h>
#include <linux/mount.h>
#include <linux/pagevec.h>
+#include <linux/random.h>
#include "f2fs.h"
#include "node.h"
@@ -105,7 +106,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
if (!dentry)
return 0;
- if (update_dent_inode(inode, &dentry->d_name)) {
+ if (update_dent_inode(inode, inode, &dentry->d_name)) {
dput(dentry);
return 0;
}
@@ -122,6 +123,8 @@ static inline bool need_do_checkpoint(struct inode *inode)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
+ else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino))
+ need_cp = true;
else if (file_wrong_pino(inode))
need_cp = true;
else if (!space_for_roll_forward(sbi))
@@ -241,6 +244,8 @@ go_write:
* will be used only for fsynced inodes after checkpoint.
*/
try_to_fix_pino(inode);
+ clear_inode_flag(fi, FI_APPEND_WRITE);
+ clear_inode_flag(fi, FI_UPDATE_WRITE);
goto out;
}
sync_nodes:
@@ -269,7 +274,7 @@ flush_out:
ret = f2fs_issue_flush(sbi);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
- f2fs_trace_ios(NULL, NULL, 1);
+ f2fs_trace_ios(NULL, 1);
return ret;
}
@@ -405,6 +410,12 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ if (f2fs_encrypted_inode(inode)) {
+ int err = f2fs_get_encryption_info(inode);
+ if (err)
+ return 0;
+ }
+
/* we don't need to use inline_data strictly */
if (f2fs_has_inline_data(inode)) {
int err = f2fs_convert_inline_inode(inode);
@@ -417,6 +428,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+static int f2fs_file_open(struct inode *inode, struct file *filp)
+{
+ int ret = generic_file_open(inode, filp);
+
+ if (!ret && f2fs_encrypted_inode(inode)) {
+ ret = f2fs_get_encryption_info(inode);
+ if (ret)
+ ret = -EACCES;
+ }
+ return ret;
+}
+
int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
int nr_free = 0, ofs = dn->ofs_in_node;
@@ -433,8 +456,12 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
continue;
dn->data_blkaddr = NULL_ADDR;
- update_extent_cache(dn);
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
invalidate_blocks(sbi, blkaddr);
+ if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
+ clear_inode_flag(F2FS_I(dn->inode),
+ FI_FIRST_BLOCK_WRITTEN);
nr_free++;
}
if (nr_free) {
@@ -454,27 +481,33 @@ void truncate_data_blocks(struct dnode_of_data *dn)
truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
}
-static int truncate_partial_data_page(struct inode *inode, u64 from)
+static int truncate_partial_data_page(struct inode *inode, u64 from,
+ bool cache_only)
{
unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ struct address_space *mapping = inode->i_mapping;
struct page *page;
- if (!offset)
+ if (!offset && !cache_only)
return 0;
- page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
- if (IS_ERR(page))
+ if (cache_only) {
+ page = grab_cache_page(mapping, index);
+ if (page && PageUptodate(page))
+ goto truncate_out;
+ f2fs_put_page(page, 1);
return 0;
+ }
- lock_page(page);
- if (unlikely(!PageUptodate(page) ||
- page->mapping != inode->i_mapping))
- goto out;
-
+ page = get_lock_data_page(inode, index);
+ if (IS_ERR(page))
+ return 0;
+truncate_out:
f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
- set_page_dirty(page);
-out:
+ if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
}
@@ -487,6 +520,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
pgoff_t free_from;
int count = 0, err = 0;
struct page *ipage;
+ bool truncate_page = false;
trace_f2fs_truncate_blocks_enter(inode, from);
@@ -502,7 +536,10 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
}
if (f2fs_has_inline_data(inode)) {
+ if (truncate_inline_inode(ipage, from))
+ set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
+ truncate_page = true;
goto out;
}
@@ -533,7 +570,7 @@ out:
/* lastly zero out the first data page */
if (!err)
- err = truncate_partial_data_page(inode, from);
+ err = truncate_partial_data_page(inode, from, truncate_page);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
@@ -548,7 +585,7 @@ void f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
/* we should check inline_data size */
- if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) {
+ if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
if (f2fs_convert_inline_inode(inode))
return;
}
@@ -562,7 +599,7 @@ void f2fs_truncate(struct inode *inode)
int f2fs_getattr(struct vfsmount *mnt,
struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
stat->blocks <<= 3;
return 0;
@@ -601,7 +638,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr)
int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
int err;
@@ -610,16 +647,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (attr->ia_valid & ATTR_SIZE) {
- if (attr->ia_size != i_size_read(inode)) {
+ if (f2fs_encrypted_inode(inode) &&
+ f2fs_get_encryption_info(inode))
+ return -EACCES;
+
+ if (attr->ia_size <= i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
f2fs_truncate(inode);
f2fs_balance_fs(F2FS_I_SB(inode));
} else {
/*
- * giving a chance to truncate blocks past EOF which
- * are fallocated with FALLOC_FL_KEEP_SIZE.
+ * do not trim all blocks after i_size if target size is
+ * larger than i_size.
*/
- f2fs_truncate(inode);
+ truncate_setsize(inode, attr->ia_size);
}
}
@@ -706,10 +747,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- /* skip punching hole beyond i_size */
- if (offset >= inode->i_size)
- return ret;
-
if (f2fs_has_inline_data(inode)) {
ret = f2fs_convert_inline_inode(inode);
if (ret)
@@ -753,6 +790,320 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int ret = 0;
+
+ f2fs_lock_op(sbi);
+
+ for (; end < nrpages; start++, end++) {
+ block_t new_addr, old_addr;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ new_addr = NULL_ADDR;
+ } else {
+ new_addr = dn.data_blkaddr;
+ truncate_data_blocks_range(&dn, 1);
+ f2fs_put_dnode(&dn);
+ }
+
+ if (new_addr == NULL_ADDR) {
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT)
+ goto out;
+ else if (ret == -ENOENT)
+ continue;
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_dnode(&dn);
+ continue;
+ } else {
+ truncate_data_blocks_range(&dn, 1);
+ }
+
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *ipage;
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ goto out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, start);
+ if (ret)
+ goto out;
+
+ old_addr = dn.data_blkaddr;
+ if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
+ dn.data_blkaddr = NULL_ADDR;
+ f2fs_update_extent_cache(&dn);
+ invalidate_blocks(sbi, old_addr);
+
+ dn.data_blkaddr = new_addr;
+ set_data_blkaddr(&dn);
+ } else if (new_addr != NEW_ADDR) {
+ struct node_info ni;
+
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, old_addr, new_addr,
+ ni.version, true);
+ }
+
+ f2fs_put_dnode(&dn);
+ }
+ }
+ ret = 0;
+out:
+ f2fs_unlock_op(sbi);
+ return ret;
+}
+
+static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ pgoff_t pg_start, pg_end;
+ loff_t new_size;
+ int ret;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (offset + len >= i_size_read(inode))
+ return -EINVAL;
+
+ /* collapse range should be aligned to block size of f2fs. */
+ if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ pg_start = offset >> PAGE_CACHE_SHIFT;
+ pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ truncate_pagecache(inode, offset);
+
+ ret = f2fs_do_collapse(inode, pg_start, pg_end);
+ if (ret)
+ return ret;
+
+ new_size = i_size_read(inode) - len;
+
+ ret = truncate_blocks(inode, new_size, true);
+ if (!ret)
+ i_size_write(inode, new_size);
+
+ return ret;
+}
+
+static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
+ int mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index, pg_start, pg_end;
+ loff_t new_size = i_size_read(inode);
+ loff_t off_start, off_end;
+ int ret = 0;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(sbi);
+
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ }
+
+ ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
+ if (ret)
+ return ret;
+
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
+ pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+ off_start = offset & (PAGE_CACHE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+ if (pg_start == pg_end) {
+ fill_zero(inode, pg_start, off_start, off_end - off_start);
+ if (offset + len > new_size)
+ new_size = offset + len;
+ new_size = max_t(loff_t, new_size, offset + len);
+ } else {
+ if (off_start) {
+ fill_zero(inode, pg_start++, off_start,
+ PAGE_CACHE_SIZE - off_start);
+ new_size = max_t(loff_t, new_size,
+ pg_start << PAGE_CACHE_SHIFT);
+ }
+
+ for (index = pg_start; index < pg_end; index++) {
+ struct dnode_of_data dn;
+ struct page *ipage;
+
+ f2fs_lock_op(sbi);
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ f2fs_unlock_op(sbi);
+ goto out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, index);
+ if (ret) {
+ f2fs_unlock_op(sbi);
+ goto out;
+ }
+
+ if (dn.data_blkaddr != NEW_ADDR) {
+ invalidate_blocks(sbi, dn.data_blkaddr);
+
+ dn.data_blkaddr = NEW_ADDR;
+ set_data_blkaddr(&dn);
+
+ dn.data_blkaddr = NULL_ADDR;
+ f2fs_update_extent_cache(&dn);
+ }
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+
+ new_size = max_t(loff_t, new_size,
+ (index + 1) << PAGE_CACHE_SHIFT);
+ }
+
+ if (off_end) {
+ fill_zero(inode, pg_end, 0, off_end);
+ new_size = max_t(loff_t, new_size, offset + len);
+ }
+ }
+
+out:
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) {
+ i_size_write(inode, new_size);
+ mark_inode_dirty(inode);
+ update_inode_page(inode);
+ }
+
+ return ret;
+}
+
+static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t pg_start, pg_end, delta, nrpages, idx;
+ loff_t new_size;
+ int ret;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ new_size = i_size_read(inode) + len;
+ if (new_size > inode->i_sb->s_maxbytes)
+ return -EFBIG;
+
+ if (offset >= i_size_read(inode))
+ return -EINVAL;
+
+ /* insert range should be aligned to block size of f2fs. */
+ if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
+ return -EINVAL;
+
+ f2fs_balance_fs(sbi);
+
+ ret = truncate_blocks(inode, i_size_read(inode), true);
+ if (ret)
+ return ret;
+
+ /* write out all dirty pages from offset */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ if (ret)
+ return ret;
+
+ truncate_pagecache(inode, offset);
+
+ pg_start = offset >> PAGE_CACHE_SHIFT;
+ pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+ delta = pg_end - pg_start;
+ nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
+ struct dnode_of_data dn;
+ struct page *ipage;
+ block_t new_addr, old_addr;
+
+ f2fs_lock_op(sbi);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ goto next;
+ } else if (dn.data_blkaddr == NULL_ADDR) {
+ f2fs_put_dnode(&dn);
+ goto next;
+ } else {
+ new_addr = dn.data_blkaddr;
+ truncate_data_blocks_range(&dn, 1);
+ f2fs_put_dnode(&dn);
+ }
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ goto out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, idx + delta);
+ if (ret)
+ goto out;
+
+ old_addr = dn.data_blkaddr;
+ f2fs_bug_on(sbi, old_addr != NEW_ADDR);
+
+ if (new_addr != NEW_ADDR) {
+ struct node_info ni;
+
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, old_addr, new_addr,
+ ni.version, true);
+ }
+ f2fs_put_dnode(&dn);
+next:
+ f2fs_unlock_op(sbi);
+ }
+
+ i_size_write(inode, new_size);
+ return 0;
+out:
+ f2fs_unlock_op(sbi);
+ return ret;
+}
+
static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t len, int mode)
{
@@ -818,23 +1169,40 @@ static long f2fs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ long ret = 0;
+
+ if (f2fs_encrypted_inode(inode) &&
+ (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
+ return -EOPNOTSUPP;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
mutex_lock(&inode->i_mutex);
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (offset >= inode->i_size)
+ goto out;
+
ret = punch_hole(inode, offset, len);
- else
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ ret = f2fs_collapse_range(inode, offset, len);
+ } else if (mode & FALLOC_FL_ZERO_RANGE) {
+ ret = f2fs_zero_range(inode, offset, len, mode);
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ ret = f2fs_insert_range(inode, offset, len);
+ } else {
ret = expand_inode_data(inode, offset, len, mode);
+ }
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
+out:
mutex_unlock(&inode->i_mutex);
trace_f2fs_fallocate(inode, mode, offset, len, ret);
@@ -963,12 +1331,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (ret)
return ret;
- if (f2fs_is_atomic_file(inode))
+ if (f2fs_is_atomic_file(inode)) {
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
commit_inmem_pages(inode, false);
+ }
ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
mnt_drop_write_file(filp);
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
return ret;
}
@@ -997,6 +1366,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
if (!f2fs_is_volatile_file(inode))
return 0;
+ if (!f2fs_is_first_block_written(inode))
+ return truncate_partial_data_page(inode, 0, true);
+
punch_hole(inode, 0, F2FS_BLKSIZE);
return 0;
}
@@ -1016,19 +1388,52 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
f2fs_balance_fs(F2FS_I_SB(inode));
if (f2fs_is_atomic_file(inode)) {
- commit_inmem_pages(inode, false);
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ commit_inmem_pages(inode, false);
}
- if (f2fs_is_volatile_file(inode)) {
+ if (f2fs_is_volatile_file(inode))
clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- filemap_fdatawrite(inode->i_mapping);
- set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- }
+
mnt_drop_write_file(filp);
return ret;
}
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct super_block *sb = sbi->sb;
+ __u32 in;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (get_user(in, (__u32 __user *)arg))
+ return -EFAULT;
+
+ switch (in) {
+ case F2FS_GOING_DOWN_FULLSYNC:
+ sb = freeze_bdev(sb->s_bdev);
+ if (sb && !IS_ERR(sb)) {
+ f2fs_stop_checkpoint(sbi);
+ thaw_bdev(sb->s_bdev, sb);
+ }
+ break;
+ case F2FS_GOING_DOWN_METASYNC:
+ /* do checkpoint only */
+ f2fs_sync_fs(sb, 1);
+ f2fs_stop_checkpoint(sbi);
+ break;
+ case F2FS_GOING_DOWN_NOSYNC:
+ f2fs_stop_checkpoint(sbi);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1059,6 +1464,86 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
return 0;
}
+static bool uuid_is_nonzero(__u8 u[16])
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ if (u[i])
+ return true;
+ return false;
+}
+
+static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ struct f2fs_encryption_policy policy;
+ struct inode *inode = file_inode(filp);
+
+ if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
+ sizeof(policy)))
+ return -EFAULT;
+
+ return f2fs_process_policy(&policy, inode);
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
+{
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ struct f2fs_encryption_policy policy;
+ struct inode *inode = file_inode(filp);
+ int err;
+
+ err = f2fs_get_policy(inode, &policy);
+ if (err)
+ return err;
+
+ if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy,
+ sizeof(policy)))
+ return -EFAULT;
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ int err;
+
+ if (!f2fs_sb_has_crypto(inode->i_sb))
+ return -EOPNOTSUPP;
+
+ if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
+ goto got_it;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ /* update superblock with uuid */
+ generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
+
+ err = f2fs_commit_super(sbi, false);
+
+ mnt_drop_write_file(filp);
+ if (err) {
+ /* undo new data */
+ memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+ return err;
+ }
+got_it:
+ if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
+ 16))
+ return -EFAULT;
+ return 0;
+}
+
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch (cmd) {
@@ -1078,13 +1563,33 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_release_volatile_write(filp);
case F2FS_IOC_ABORT_VOLATILE_WRITE:
return f2fs_ioc_abort_volatile_write(filp);
+ case F2FS_IOC_SHUTDOWN:
+ return f2fs_ioc_shutdown(filp, arg);
case FITRIM:
return f2fs_ioc_fitrim(filp, arg);
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ return f2fs_ioc_set_encryption_policy(filp, arg);
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ return f2fs_ioc_get_encryption_policy(filp, arg);
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ return f2fs_ioc_get_encryption_pwsalt(filp, arg);
default:
return -ENOTTY;
}
}
+static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (f2fs_encrypted_inode(inode) &&
+ !f2fs_has_encryption_key(inode) &&
+ f2fs_get_encryption_info(inode))
+ return -EACCES;
+
+ return generic_file_write_iter(iocb, from);
+}
+
#ifdef CONFIG_COMPAT
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -1104,11 +1609,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
const struct file_operations f2fs_file_operations = {
.llseek = f2fs_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .open = generic_file_open,
+ .write_iter = f2fs_file_write_iter,
+ .open = f2fs_file_open,
.release = f2fs_release_file,
.mmap = f2fs_file_mmap,
.fsync = f2fs_sync_file,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 76adbc3641f1..22fb5ef37966 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -435,7 +435,7 @@ next_step:
set_page_dirty(node_page);
}
f2fs_put_page(node_page, 1);
- stat_inc_node_blk_count(sbi, 1);
+ stat_inc_node_blk_count(sbi, 1, gc_type);
}
if (initial) {
@@ -518,12 +518,91 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return 1;
}
-static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+static void move_encrypted_block(struct inode *inode, block_t bidx)
{
struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
.type = DATA,
- .rw = WRITE_SYNC,
+ .rw = READ_SYNC,
+ .encrypted_page = NULL,
};
+ struct dnode_of_data dn;
+ struct f2fs_summary sum;
+ struct node_info ni;
+ struct page *page;
+ int err;
+
+ /* do not read out */
+ page = grab_cache_page(inode->i_mapping, bidx);
+ if (!page)
+ return;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
+ if (err)
+ goto out;
+
+ if (unlikely(dn.data_blkaddr == NULL_ADDR))
+ goto put_out;
+
+ get_node_info(fio.sbi, dn.nid, &ni);
+ set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+ /* read page */
+ fio.page = page;
+ fio.blk_addr = dn.data_blkaddr;
+
+ fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr);
+ if (!fio.encrypted_page)
+ goto put_out;
+
+ err = f2fs_submit_page_bio(&fio);
+ if (err)
+ goto put_page_out;
+
+ /* write page */
+ lock_page(fio.encrypted_page);
+
+ if (unlikely(!PageUptodate(fio.encrypted_page)))
+ goto put_page_out;
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi)))
+ goto put_page_out;
+
+ set_page_dirty(fio.encrypted_page);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+ if (clear_page_dirty_for_io(fio.encrypted_page))
+ dec_page_count(fio.sbi, F2FS_DIRTY_META);
+
+ set_page_writeback(fio.encrypted_page);
+
+ /* allocate block address */
+ f2fs_wait_on_page_writeback(dn.node_page, NODE);
+ allocate_data_block(fio.sbi, NULL, fio.blk_addr,
+ &fio.blk_addr, &sum, CURSEG_COLD_DATA);
+ fio.rw = WRITE_SYNC;
+ f2fs_submit_page_mbio(&fio);
+
+ dn.data_blkaddr = fio.blk_addr;
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+ if (page->index == 0)
+ set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+put_page_out:
+ f2fs_put_page(fio.encrypted_page, 1);
+put_out:
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_put_page(page, 1);
+}
+
+static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
+{
+ struct page *page;
+
+ page = get_lock_data_page(inode, bidx);
+ if (IS_ERR(page))
+ return;
if (gc_type == BG_GC) {
if (PageWriteback(page))
@@ -531,12 +610,19 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
set_page_dirty(page);
set_cold_data(page);
} else {
+ struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(inode),
+ .type = DATA,
+ .rw = WRITE_SYNC,
+ .page = page,
+ .encrypted_page = NULL,
+ };
+ set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA);
-
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
set_cold_data(page);
- do_write_data_page(page, &fio);
+ do_write_data_page(&fio);
clear_cold_data(page);
}
out:
@@ -599,10 +685,16 @@ next_step:
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ /* if encrypted inode, let's go phase 3 */
+ if (f2fs_encrypted_inode(inode) &&
+ S_ISREG(inode->i_mode)) {
+ add_gc_inode(gc_list, inode);
+ continue;
+ }
- data_page = find_data_page(inode,
- start_bidx + ofs_in_node, false);
+ start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ data_page = get_read_data_page(inode,
+ start_bidx + ofs_in_node, READA);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -616,13 +708,13 @@ next_step:
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
- data_page = get_lock_data_page(inode,
- start_bidx + ofs_in_node);
- if (IS_ERR(data_page))
- continue;
- move_data_page(inode, data_page, gc_type);
- stat_inc_data_blk_count(sbi, 1);
+ start_bidx = start_bidx_of_node(nofs, F2FS_I(inode))
+ + ofs_in_node;
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ move_encrypted_block(inode, start_bidx);
+ else
+ move_data_page(inode, start_bidx, gc_type);
+ stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
@@ -670,6 +762,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
sum = page_address(sum_page);
+ /*
+ * this is to avoid deadlock:
+ * - lock_page(sum_page) - f2fs_replace_block
+ * - check_valid_map() - mutex_lock(sentry_lock)
+ * - mutex_lock(sentry_lock) - change_curseg()
+ * - lock_page(sum_page)
+ */
+ unlock_page(sum_page);
+
switch (GET_SUM_TYPE((&sum->footer))) {
case SUM_TYPE_NODE:
gc_node_segment(sbi, sum->entries, segno, gc_type);
@@ -680,10 +781,10 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
}
blk_finish_plug(&plug);
- stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+ stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
stat_inc_call_count(sbi->stat_info);
- f2fs_put_page(sum_page, 1);
+ f2fs_put_page(sum_page, 0);
}
int f2fs_gc(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index a844fcfb9a8d..71b7206c431e 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -79,8 +79,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
const unsigned char *name = name_info->name;
size_t len = name_info->len;
- if ((len <= 2) && (name[0] == '.') &&
- (name[1] == '.' || name[1] == '\0'))
+ if (is_dot_dotdot(name_info))
return 0;
/* Initialize the default seed for the hash checksum functions */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1484c00133cd..a13ffcc32992 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -13,7 +13,7 @@
#include "f2fs.h"
-bool f2fs_may_inline(struct inode *inode)
+bool f2fs_may_inline_data(struct inode *inode)
{
if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
return false;
@@ -21,12 +21,26 @@ bool f2fs_may_inline(struct inode *inode)
if (f2fs_is_atomic_file(inode))
return false;
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
if (i_size_read(inode) > MAX_INLINE_DATA)
return false;
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return false;
+
+ return true;
+}
+
+bool f2fs_may_inline_dentry(struct inode *inode)
+{
+ if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY))
+ return false;
+
+ if (!S_ISDIR(inode->i_mode))
+ return false;
+
return true;
}
@@ -50,10 +64,19 @@ void read_inline_data(struct page *page, struct page *ipage)
SetPageUptodate(page);
}
-static void truncate_inline_data(struct page *ipage)
+bool truncate_inline_inode(struct page *ipage, u64 from)
{
+ void *addr;
+
+ if (from >= MAX_INLINE_DATA)
+ return false;
+
+ addr = inline_data_addr(ipage);
+
f2fs_wait_on_page_writeback(ipage, NODE);
- memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+ memset(addr + from, 0, MAX_INLINE_DATA - from);
+
+ return true;
}
int f2fs_read_inline_data(struct inode *inode, struct page *page)
@@ -86,8 +109,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
void *src_addr, *dst_addr;
struct f2fs_io_info fio = {
+ .sbi = F2FS_I_SB(dn->inode),
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
+ .page = page,
+ .encrypted_page = NULL,
};
int dirty, err;
@@ -115,14 +141,17 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
kunmap_atomic(dst_addr);
SetPageUptodate(page);
no_update:
+ set_page_dirty(page);
+
/* clear dirty state */
dirty = clear_page_dirty_for_io(page);
/* write data page to try to make data consistent */
set_page_writeback(page);
fio.blk_addr = dn->data_blkaddr;
- write_data_page(page, dn, &fio);
- update_extent_cache(dn);
+ write_data_page(dn, &fio);
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
f2fs_wait_on_page_writeback(page, DATA);
if (dirty)
inode_dec_dirty_pages(dn->inode);
@@ -131,7 +160,7 @@ no_update:
set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- truncate_inline_data(dn->inode_page);
+ truncate_inline_inode(dn->inode_page, 0);
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
@@ -245,7 +274,7 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- truncate_inline_data(ipage);
+ truncate_inline_inode(ipage, 0);
f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
@@ -257,23 +286,26 @@ process_inline:
}
struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
- struct qstr *name, struct page **res_page)
+ struct f2fs_filename *fname, struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct f2fs_inline_dentry *inline_dentry;
+ struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
struct page *ipage;
+ f2fs_hash_t namehash;
ipage = get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return NULL;
- inline_dentry = inline_data_addr(ipage);
+ namehash = f2fs_dentry_hash(&name);
- make_dentry_ptr(&d, (void *)inline_dentry, 2);
- de = find_target_dentry(name, NULL, &d);
+ inline_dentry = inline_data_addr(ipage);
+ make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2);
+ de = find_target_dentry(fname, namehash, NULL, &d);
unlock_page(ipage);
if (de)
*res_page = ipage;
@@ -315,7 +347,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
dentry_blk = inline_data_addr(ipage);
- make_dentry_ptr(&d, (void *)dentry_blk, 2);
+ make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
do_make_empty_dir(inode, parent, &d);
set_page_dirty(ipage);
@@ -363,7 +395,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
- truncate_inline_data(ipage);
+ truncate_inline_inode(ipage, 0);
stat_dec_inline_dir(dir);
clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
@@ -380,21 +412,18 @@ out:
}
int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
- struct inode *inode)
+ struct inode *inode, nid_t ino, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
unsigned int bit_pos;
f2fs_hash_t name_hash;
- struct f2fs_dir_entry *de;
size_t namelen = name->len;
struct f2fs_inline_dentry *dentry_blk = NULL;
+ struct f2fs_dentry_ptr d;
int slots = GET_DENTRY_SLOTS(namelen);
- struct page *page;
+ struct page *page = NULL;
int err = 0;
- int i;
-
- name_hash = f2fs_dentry_hash(name);
ipage = get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
@@ -410,32 +439,34 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
goto out;
}
- down_write(&F2FS_I(inode)->i_sem);
- page = init_inode_metadata(inode, dir, name, ipage);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
- goto fail;
+ if (inode) {
+ down_write(&F2FS_I(inode)->i_sem);
+ page = init_inode_metadata(inode, dir, name, ipage);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto fail;
+ }
}
f2fs_wait_on_page_writeback(ipage, NODE);
- de = &dentry_blk->dentry[bit_pos];
- de->hash_code = name_hash;
- de->name_len = cpu_to_le16(namelen);
- memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
- de->ino = cpu_to_le32(inode->i_ino);
- set_de_type(de, inode);
- for (i = 0; i < slots; i++)
- test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+ name_hash = f2fs_dentry_hash(name);
+ make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+
set_page_dirty(ipage);
/* we don't need to mark_inode_dirty now */
- F2FS_I(inode)->i_pino = dir->i_ino;
- update_inode(inode, page);
- f2fs_put_page(page, 1);
+ if (inode) {
+ F2FS_I(inode)->i_pino = dir->i_ino;
+ update_inode(inode, page);
+ f2fs_put_page(page, 1);
+ }
update_parent_metadata(dir, inode, 0);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ if (inode)
+ up_write(&F2FS_I(inode)->i_sem);
if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
update_inode(dir, ipage);
@@ -497,7 +528,8 @@ bool f2fs_empty_inline_dir(struct inode *dir)
return true;
}
-int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
+int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
+ struct f2fs_str *fstr)
{
struct inode *inode = file_inode(file);
struct f2fs_inline_dentry *inline_dentry = NULL;
@@ -513,9 +545,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx)
inline_dentry = inline_data_addr(ipage);
- make_dentry_ptr(&d, (void *)inline_dentry, 2);
+ make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
- if (!f2fs_fill_dentries(ctx, &d, 0))
+ if (!f2fs_fill_dentries(ctx, &d, 0, fstr))
ctx->pos = NR_INLINE_DENTRY;
f2fs_put_page(ipage, 1);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2d002e3738a7..2550868dc651 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -51,6 +51,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
}
}
+static bool __written_first_block(struct f2fs_inode *ri)
+{
+ block_t addr = le32_to_cpu(ri->i_addr[0]);
+
+ if (addr != NEW_ADDR && addr != NULL_ADDR)
+ return true;
+ return false;
+}
+
static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -130,7 +139,8 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- get_extent_info(&fi->ext, ri->i_ext);
+ f2fs_init_extent_cache(inode, &ri->i_ext);
+
get_inline_info(fi, ri);
/* check data exist */
@@ -140,6 +150,9 @@ static int do_read_inode(struct inode *inode)
/* get rdev by using inline_info */
__get_inode_rdev(inode, ri);
+ if (__written_first_block(ri))
+ set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+
f2fs_put_page(node_page, 1);
stat_inc_inline_inode(inode);
@@ -185,7 +198,10 @@ make_now:
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
} else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &f2fs_symlink_inode_operations;
+ if (f2fs_encrypted_inode(inode))
+ inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+ else
+ inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -220,7 +236,11 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_links = cpu_to_le32(inode->i_nlink);
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
+
+ read_lock(&F2FS_I(inode)->ext_lock);
set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+ read_unlock(&F2FS_I(inode)->ext_lock);
+
set_raw_inline(F2FS_I(inode), ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -328,6 +348,12 @@ void f2fs_evict_inode(struct inode *inode)
no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
+
+ /* update extent info in inode */
+ if (inode->i_nlink)
+ f2fs_preserve_extent_tree(inode);
+ f2fs_destroy_extent_tree(inode);
+
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
@@ -336,6 +362,10 @@ no_delete:
if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
out_clear:
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (F2FS_I(inode)->i_crypt_info)
+ f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info);
+#endif
clear_inode(inode);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e79639a9787a..fdbae21ee8fb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -14,6 +14,7 @@
#include <linux/sched.h>
#include <linux/ctype.h>
#include <linux/dcache.h>
+#include <linux/namei.h>
#include "f2fs.h"
#include "node.h"
@@ -55,11 +56,18 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
goto out;
}
- if (f2fs_may_inline(inode))
+ /* If the directory encrypted, then we should encrypt the inode. */
+ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
+ f2fs_set_encrypted_inode(inode);
+
+ if (f2fs_may_inline_data(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
- if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode))
+ if (f2fs_may_inline_dentry(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
trace_f2fs_new_inode(inode, 0);
mark_inode_dirty(inode);
return inode;
@@ -135,7 +143,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
alloc_nid_done(sbi, ino);
- stat_inc_inline_inode(inode);
d_instantiate(dentry, inode);
unlock_new_inode(inode);
@@ -150,10 +157,14 @@ out:
static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
int err;
+ if (f2fs_encrypted_inode(dir) &&
+ !f2fs_is_child_context_consistent_with_parent(dir, inode))
+ return -EPERM;
+
f2fs_balance_fs(sbi);
inode->i_ctime = CURRENT_TIME;
@@ -181,10 +192,48 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
struct qstr dotdot = QSTR_INIT("..", 2);
- unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot);
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot);
if (!ino)
return ERR_PTR(-ENOENT);
- return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino));
+}
+
+static int __recover_dot_dentries(struct inode *dir, nid_t pino)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct qstr dot = QSTR_INIT(".", 1);
+ struct qstr dotdot = QSTR_INIT("..", 2);
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ int err = 0;
+
+ f2fs_lock_op(sbi);
+
+ de = f2fs_find_entry(dir, &dot, &page);
+ if (de) {
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ } else {
+ err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+ if (err)
+ goto out;
+ }
+
+ de = f2fs_find_entry(dir, &dotdot, &page);
+ if (de) {
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ } else {
+ err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+ }
+out:
+ if (!err) {
+ clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
+ mark_inode_dirty(dir);
+ }
+
+ f2fs_unlock_op(sbi);
+ return err;
}
static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
@@ -193,28 +242,40 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct f2fs_dir_entry *de;
struct page *page;
+ nid_t ino;
+ int err = 0;
if (dentry->d_name.len > F2FS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
- if (de) {
- nid_t ino = le32_to_cpu(de->ino);
- f2fs_dentry_kunmap(dir, page);
- f2fs_put_page(page, 0);
+ if (!de)
+ return d_splice_alias(inode, dentry);
- inode = f2fs_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
- }
+ ino = le32_to_cpu(de->ino);
+ f2fs_dentry_kunmap(dir, page);
+ f2fs_put_page(page, 0);
+ inode = f2fs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ if (f2fs_has_inline_dots(inode)) {
+ err = __recover_dot_dentries(inode, dir->i_ino);
+ if (err)
+ goto err_out;
+ }
return d_splice_alias(inode, dentry);
+
+err_out:
+ iget_failed(inode);
+ return ERR_PTR(err);
}
static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct f2fs_dir_entry *de;
struct page *page;
int err = -ENOENT;
@@ -247,21 +308,42 @@ fail:
return err;
}
+static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+{
+ const char *link = page_follow_link_light(dentry, cookie);
+ if (!IS_ERR(link) && !*link) {
+ /* this is broken symlink case */
+ page_put_link(NULL, *cookie);
+ link = ERR_PTR(-ENOENT);
+ }
+ return link;
+}
+
static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
- size_t symlen = strlen(symname) + 1;
+ size_t len = strlen(symname);
+ size_t p_len;
+ char *p_str;
+ struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
+ struct f2fs_encrypted_symlink_data *sd = NULL;
int err;
+ if (len > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
f2fs_balance_fs(sbi);
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &f2fs_symlink_inode_operations;
+ if (f2fs_encrypted_inode(inode))
+ inode->i_op = &f2fs_encrypted_symlink_inode_operations;
+ else
+ inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
f2fs_lock_op(sbi);
@@ -269,15 +351,66 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out;
f2fs_unlock_op(sbi);
-
- err = page_symlink(inode, symname, symlen);
alloc_nid_done(sbi, inode->i_ino);
+ if (f2fs_encrypted_inode(dir)) {
+ struct qstr istr = QSTR_INIT(symname, len);
+
+ err = f2fs_get_encryption_info(inode);
+ if (err)
+ goto err_out;
+
+ err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link);
+ if (err)
+ goto err_out;
+
+ err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link);
+ if (err < 0)
+ goto err_out;
+
+ p_len = encrypted_symlink_data_len(disk_link.len) + 1;
+
+ if (p_len > dir->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto err_out;
+ }
+
+ sd = kzalloc(p_len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ memcpy(sd->encrypted_path, disk_link.name, disk_link.len);
+ sd->len = cpu_to_le16(disk_link.len);
+ p_str = (char *)sd;
+ } else {
+ p_len = len + 1;
+ p_str = (char *)symname;
+ }
+
+ err = page_symlink(inode, p_str, p_len);
+
+err_out:
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ /*
+ * Let's flush symlink data in order to avoid broken symlink as much as
+ * possible. Nevertheless, fsyncing is the best way, but there is no
+ * way to get a file descriptor in order to flush that.
+ *
+ * Note that, it needs to do dir->fsync to make this recoverable.
+ * If the symlink path is stored into inline_data, there is no
+ * performance regression.
+ */
+ if (!err)
+ filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
+
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
+
+ kfree(sd);
+ f2fs_fname_crypto_free_buffer(&disk_link);
return err;
out:
handle_failed_inode(inode);
@@ -308,7 +441,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_fail;
f2fs_unlock_op(sbi);
- stat_inc_inline_dir(inode);
alloc_nid_done(sbi, inode->i_ino);
d_instantiate(dentry, inode);
@@ -326,7 +458,7 @@ out_fail:
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (f2fs_empty_dir(inode))
return f2fs_unlink(dir, dentry);
return -ENOTEMPTY;
@@ -370,19 +502,101 @@ out:
return err;
}
+static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
+ umode_t mode, struct inode **whiteout)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ struct inode *inode;
+ int err;
+
+ if (!whiteout)
+ f2fs_balance_fs(sbi);
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (whiteout) {
+ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+ inode->i_op = &f2fs_special_inode_operations;
+ } else {
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ }
+
+ f2fs_lock_op(sbi);
+ err = acquire_orphan_inode(sbi);
+ if (err)
+ goto out;
+
+ err = f2fs_do_tmpfile(inode, dir);
+ if (err)
+ goto release_out;
+
+ /*
+ * add this non-linked tmpfile to orphan list, in this way we could
+ * remove all unused data of tmpfile after abnormal power-off.
+ */
+ add_orphan_inode(sbi, inode->i_ino);
+ f2fs_unlock_op(sbi);
+
+ alloc_nid_done(sbi, inode->i_ino);
+
+ if (whiteout) {
+ inode_dec_link_count(inode);
+ *whiteout = inode;
+ } else {
+ d_tmpfile(dentry, inode);
+ }
+ unlock_new_inode(inode);
+ return 0;
+
+release_out:
+ release_orphan_inode(sbi);
+out:
+ handle_failed_inode(inode);
+ return err;
+}
+
+static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ if (f2fs_encrypted_inode(dir)) {
+ int err = f2fs_get_encryption_info(dir);
+ if (err)
+ return err;
+ }
+
+ return __f2fs_tmpfile(dir, dentry, mode, NULL);
+}
+
+static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+{
+ return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+}
+
static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *whiteout = NULL;
struct page *old_dir_page;
- struct page *old_page, *new_page;
+ struct page *old_page, *new_page = NULL;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
int err = -ENOENT;
+ if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
+ !f2fs_is_child_context_consistent_with_parent(new_dir,
+ old_inode)) {
+ err = -EPERM;
+ goto out;
+ }
+
f2fs_balance_fs(sbi);
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -396,17 +610,23 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_old;
}
+ if (flags & RENAME_WHITEOUT) {
+ err = f2fs_create_whiteout(old_dir, &whiteout);
+ if (err)
+ goto out_dir;
+ }
+
if (new_inode) {
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
- goto out_dir;
+ goto out_whiteout;
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
&new_page);
if (!new_entry)
- goto out_dir;
+ goto out_whiteout;
f2fs_lock_op(sbi);
@@ -414,7 +634,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
goto put_out_dir;
- if (update_dent_inode(old_inode, &new_dentry->d_name)) {
+ if (update_dent_inode(old_inode, new_inode,
+ &new_dentry->d_name)) {
release_orphan_inode(sbi);
goto put_out_dir;
}
@@ -443,7 +664,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(new_dentry, old_inode);
if (err) {
f2fs_unlock_op(sbi);
- goto out_dir;
+ goto out_whiteout;
}
if (old_dir_entry) {
@@ -454,6 +675,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
down_write(&F2FS_I(old_inode)->i_sem);
file_lost_pino(old_inode);
+ if (new_inode && file_enc_name(new_inode))
+ file_set_enc_name(old_inode);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = CURRENT_TIME;
@@ -461,8 +684,18 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
+ if (whiteout) {
+ whiteout->i_state |= I_LINKABLE;
+ set_inode_flag(F2FS_I(whiteout), FI_INC_LINK);
+ err = f2fs_add_link(old_dentry, whiteout);
+ if (err)
+ goto put_out_dir;
+ whiteout->i_state &= ~I_LINKABLE;
+ iput(whiteout);
+ }
+
if (old_dir_entry) {
- if (old_dir != new_dir) {
+ if (old_dir != new_dir && !whiteout) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
update_inode_page(old_inode);
@@ -483,8 +716,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
put_out_dir:
f2fs_unlock_op(sbi);
- f2fs_dentry_kunmap(new_dir, new_page);
- f2fs_put_page(new_page, 0);
+ if (new_page) {
+ f2fs_dentry_kunmap(new_dir, new_page);
+ f2fs_put_page(new_page, 0);
+ }
+out_whiteout:
+ if (whiteout)
+ iput(whiteout);
out_dir:
if (old_dir_entry) {
f2fs_dentry_kunmap(old_inode, old_dir_page);
@@ -501,8 +739,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct page *old_dir_page, *new_dir_page;
struct page *old_page, *new_page;
struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
@@ -510,6 +748,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
int old_nlink = 0, new_nlink = 0;
int err = -ENOENT;
+ if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
+ (old_dir != new_dir) &&
+ (!f2fs_is_child_context_consistent_with_parent(new_dir,
+ old_inode) ||
+ !f2fs_is_child_context_consistent_with_parent(old_dir,
+ new_inode)))
+ return -EPERM;
+
f2fs_balance_fs(sbi);
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -556,13 +802,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_lock_op(sbi);
- err = update_dent_inode(old_inode, &new_dentry->d_name);
+ err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
if (err)
goto out_unlock;
+ if (file_enc_name(new_inode))
+ file_set_enc_name(old_inode);
- err = update_dent_inode(new_inode, &old_dentry->d_name);
+ err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name);
if (err)
goto out_undo;
+ if (file_enc_name(old_inode))
+ file_set_enc_name(new_inode);
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
@@ -620,8 +870,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_sync_fs(sbi->sb, 1);
return 0;
out_undo:
- /* Still we may fail to recover name info of f2fs_inode here */
- update_dent_inode(old_inode, &old_dentry->d_name);
+ /*
+ * Still we may fail to recover name info of f2fs_inode here
+ * Drop it, once its name is set as encrypted
+ */
+ update_dent_inode(old_inode, old_inode, &old_dentry->d_name);
out_unlock:
f2fs_unlock_op(sbi);
out_new_dir:
@@ -648,7 +901,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags)
{
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
if (flags & RENAME_EXCHANGE) {
@@ -659,51 +912,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
+ return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
-static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct inode *inode;
- int err;
-
- inode = f2fs_new_inode(dir, mode);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- inode->i_op = &f2fs_file_inode_operations;
- inode->i_fop = &f2fs_file_operations;
- inode->i_mapping->a_ops = &f2fs_dblock_aops;
-
- f2fs_lock_op(sbi);
- err = acquire_orphan_inode(sbi);
- if (err)
- goto out;
-
- err = f2fs_do_tmpfile(inode, dir);
- if (err)
- goto release_out;
-
- /*
- * add this non-linked tmpfile to orphan list, in this way we could
- * remove all unused data of tmpfile after abnormal power-off.
- */
- add_orphan_inode(sbi, inode->i_ino);
- f2fs_unlock_op(sbi);
-
- alloc_nid_done(sbi, inode->i_ino);
- d_tmpfile(dentry, inode);
- unlock_new_inode(inode);
- return 0;
+ struct page *cpage = NULL;
+ char *caddr, *paddr = NULL;
+ struct f2fs_str cstr;
+ struct f2fs_str pstr = FSTR_INIT(NULL, 0);
+ struct inode *inode = d_inode(dentry);
+ struct f2fs_encrypted_symlink_data *sd;
+ loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
+ u32 max_size = inode->i_sb->s_blocksize;
+ int res;
+
+ res = f2fs_get_encryption_info(inode);
+ if (res)
+ return ERR_PTR(res);
+
+ cpage = read_mapping_page(inode->i_mapping, 0, NULL);
+ if (IS_ERR(cpage))
+ return ERR_CAST(cpage);
+ caddr = kmap(cpage);
+ caddr[size] = 0;
+
+ /* Symlink is encrypted */
+ sd = (struct f2fs_encrypted_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
+ cstr.len = le16_to_cpu(sd->len);
+
+ /* this is broken symlink case */
+ if (cstr.name[0] == 0 && cstr.len == 0) {
+ res = -ENOENT;
+ goto errout;
+ }
-release_out:
- release_orphan_inode(sbi);
-out:
- handle_failed_inode(inode);
- return err;
+ if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) >
+ max_size) {
+ /* Symlink data on the disk is corrupted */
+ res = -EIO;
+ goto errout;
+ }
+ res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr);
+ if (res)
+ goto errout;
+
+ res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+ if (res < 0)
+ goto errout;
+
+ paddr = pstr.name;
+
+ /* Null-terminate the name */
+ paddr[res] = '\0';
+
+ kunmap(cpage);
+ page_cache_release(cpage);
+ return *cookie = paddr;
+errout:
+ f2fs_fname_crypto_free_buffer(&pstr);
+ kunmap(cpage);
+ page_cache_release(cpage);
+ return ERR_PTR(res);
}
+const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = f2fs_encrypted_follow_link,
+ .put_link = kfree_put_link,
+ .getattr = f2fs_getattr,
+ .setattr = f2fs_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+};
+#endif
+
const struct inode_operations f2fs_dir_inode_operations = {
.create = f2fs_create,
.lookup = f2fs_lookup,
@@ -729,7 +1016,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
+ .follow_link = f2fs_follow_link,
.put_link = page_put_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 97bd9d3db882..7dd63b794bfb 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -41,7 +41,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
/* only uses low memory */
avail_ram = val.totalram - val.totalhigh;
- /* give 25%, 25%, 50%, 50% memory for each components respectively */
+ /*
+ * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ */
if (type == FREE_NIDS) {
mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
PAGE_CACHE_SHIFT;
@@ -51,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DIRTY_DENTS) {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -62,8 +64,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
mem_size += (sbi->im[i].ino_num *
sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else if (type == EXTENT_CACHE) {
+ mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+ atomic_read(&sbi->total_ext_node) *
+ sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return false;
}
return res;
@@ -188,32 +195,35 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
-bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool is_cp = true;
+ bool need = false;
down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
- if (e && !get_nat_flag(e, IS_CHECKPOINTED))
- is_cp = false;
+ if (e) {
+ if (!get_nat_flag(e, IS_CHECKPOINTED) &&
+ !get_nat_flag(e, HAS_FSYNCED_INODE))
+ need = true;
+ }
up_read(&nm_i->nat_tree_lock);
- return is_cp;
+ return need;
}
-bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
+bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- bool fsynced = false;
+ bool is_cp = true;
down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ino);
- if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
- fsynced = true;
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e && !get_nat_flag(e, IS_CHECKPOINTED))
+ is_cp = false;
up_read(&nm_i->nat_tree_lock);
- return fsynced;
+ return is_cp;
}
bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -305,7 +315,8 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
__set_nat_cache_dirty(nm_i, e);
/* update fsync_mark if its inode nat entry is still alive */
- e = __lookup_nat_cache(nm_i, ni->ino);
+ if (ni->nid != ni->ino)
+ e = __lookup_nat_cache(nm_i, ni->ino);
if (e) {
if (fsync_done && ni->nid == ni->ino)
set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@ -494,7 +505,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
/* if inline_data is set, should not report any block indices */
if (f2fs_has_inline_data(dn->inode) && index) {
- err = -EINVAL;
+ err = -ENOENT;
f2fs_put_page(npage[0], 1);
goto release_out;
}
@@ -988,13 +999,17 @@ static int read_node_page(struct page *page, int rw)
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = NODE,
.rw = rw,
+ .page = page,
+ .encrypted_page = NULL,
};
get_node_info(sbi, page->index, &ni);
if (unlikely(ni.blk_addr == NULL_ADDR)) {
+ ClearPageUptodate(page);
f2fs_put_page(page, 1);
return -ENOENT;
}
@@ -1003,7 +1018,7 @@ static int read_node_page(struct page *page, int rw)
return LOCKED_PAGE;
fio.blk_addr = ni.blk_addr;
- return f2fs_submit_page_bio(sbi, page, &fio);
+ return f2fs_submit_page_bio(&fio);
}
/*
@@ -1196,13 +1211,9 @@ continue_unlock:
/* called by fsync() */
if (ino && IS_DNODE(page)) {
set_fsync_mark(page, 1);
- if (IS_INODE(page)) {
- if (!is_checkpointed_node(sbi, ino) &&
- !has_fsynced_inode(sbi, ino))
- set_dentry_mark(page, 1);
- else
- set_dentry_mark(page, 0);
- }
+ if (IS_INODE(page))
+ set_dentry_mark(page,
+ need_dentry_mark(sbi, ino));
nwritten++;
} else {
set_fsync_mark(page, 0);
@@ -1285,8 +1296,11 @@ static int f2fs_write_node_page(struct page *page,
nid_t nid;
struct node_info ni;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = NODE,
.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ .page = page,
+ .encrypted_page = NULL,
};
trace_f2fs_writepage(page, NODE);
@@ -1306,6 +1320,7 @@ static int f2fs_write_node_page(struct page *page,
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
+ ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
unlock_page(page);
return 0;
@@ -1320,7 +1335,7 @@ static int f2fs_write_node_page(struct page *page,
set_page_writeback(page);
fio.blk_addr = ni.blk_addr;
- write_node_page(sbi, page, nid, &fio);
+ write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
@@ -1821,6 +1836,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
/*
* there are two steps to flush nat entries:
@@ -1874,7 +1890,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, set->entry_cnt);
+ down_write(&nm_i->nat_tree_lock);
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
+ up_write(&nm_i->nat_tree_lock);
kmem_cache_free(nat_entry_set_slab, set);
}
@@ -1902,6 +1920,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
+ down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
@@ -1910,6 +1929,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
__adjust_nat_entry_set(setvec[idx], &sets,
MAX_NAT_JENTRIES(sum));
}
+ up_write(&nm_i->nat_tree_lock);
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index f405bbf2435a..7427e956ad81 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -120,6 +120,7 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
+ EXTENT_CACHE, /* indicates extent cache */
BASE_CHECK, /* check kernel status */
};
@@ -342,28 +343,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold node blocks in their node footer
* - Mark cold data pages in page cache
*/
-static inline int is_file(struct inode *inode, int type)
-{
- return F2FS_I(inode)->i_advise & type;
-}
-
-static inline void set_file(struct inode *inode, int type)
-{
- F2FS_I(inode)->i_advise |= type;
-}
-
-static inline void clear_file(struct inode *inode, int type)
-{
- F2FS_I(inode)->i_advise &= ~type;
-}
-
-#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
-#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
-#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
-#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
-#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
-
static inline int is_cold_data(struct page *page)
{
return PageChecked(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 41afb9534bbd..24a8c1d4f45f 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -83,6 +83,11 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
goto out;
}
+ if (file_enc_name(inode)) {
+ iput(dir);
+ return 0;
+ }
+
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
@@ -93,10 +98,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
}
retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino)) {
- clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ if (de && inode->i_ino == le32_to_cpu(de->ino))
goto out_unmap_put;
- }
+
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
@@ -115,7 +119,7 @@ retry:
iput(einode);
goto retry;
}
- err = __f2fs_add_link(dir, &name, inode);
+ err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
if (err)
goto out_err;
@@ -144,6 +148,7 @@ out:
static void recover_inode(struct inode *inode, struct page *page)
{
struct f2fs_inode *raw = F2FS_INODE(page);
+ char *name;
inode->i_mode = le16_to_cpu(raw->i_mode);
i_size_write(inode, le64_to_cpu(raw->i_size));
@@ -154,8 +159,13 @@ static void recover_inode(struct inode *inode, struct page *page)
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
+ if (file_enc_name(inode))
+ name = "<encrypted>";
+ else
+ name = F2FS_INODE(page)->i_name;
+
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
- ino_of_node(page), F2FS_INODE(page)->i_name);
+ ino_of_node(page), name);
}
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -175,7 +185,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
while (1) {
struct fsync_inode_entry *entry;
- if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
page = get_meta_page(sbi, blkaddr);
@@ -187,11 +197,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (entry) {
- if (IS_INODE(page) && is_dent_dnode(page))
- set_inode_flag(F2FS_I(entry->inode),
- FI_INC_LINK);
- } else {
+ if (!entry) {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -212,8 +218,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (IS_ERR(entry->inode)) {
err = PTR_ERR(entry->inode);
kmem_cache_free(fsync_entry_slab, entry);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ err = 0;
goto next;
+ }
break;
}
list_add_tail(&entry->list, head);
@@ -256,6 +264,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
struct page *sum_page, *node_page;
+ struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
unsigned int offset;
@@ -283,17 +292,15 @@ got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
if (dn->inode->i_ino == nid) {
- struct dnode_of_data tdn = *dn;
tdn.nid = nid;
+ if (!dn->inode_page_locked)
+ lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- truncate_data_blocks_range(&tdn, 1);
- return 0;
+ goto truncate_out;
} else if (dn->nid == nid) {
- struct dnode_of_data tdn = *dn;
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- truncate_data_blocks_range(&tdn, 1);
- return 0;
+ goto truncate_out;
}
/* Get the node page */
@@ -317,18 +324,33 @@ got_it:
bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
le16_to_cpu(sum.ofs_in_node);
- if (ino != dn->inode->i_ino) {
- truncate_hole(inode, bidx, bidx + 1);
+ /*
+ * if inode page is locked, unlock temporarily, but its reference
+ * count keeps alive.
+ */
+ if (ino == dn->inode->i_ino && dn->inode_page_locked)
+ unlock_page(dn->inode_page);
+
+ set_new_dnode(&tdn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
+ goto out;
+
+ if (tdn.data_blkaddr == blkaddr)
+ truncate_data_blocks_range(&tdn, 1);
+
+ f2fs_put_dnode(&tdn);
+out:
+ if (ino != dn->inode->i_ino)
iput(inode);
- } else {
- struct dnode_of_data tdn;
- set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
- if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
- return 0;
- if (tdn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&tdn, 1);
- f2fs_put_page(tdn.node_page, 1);
- }
+ else if (dn->inode_page_locked)
+ lock_page(dn->inode_page);
+ return 0;
+
+truncate_out:
+ if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
+ truncate_data_blocks_range(&tdn, 1);
+ if (dn->inode->i_ino == nid && !dn->inode_page_locked)
+ unlock_page(dn->inode_page);
return 0;
}
@@ -338,7 +360,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int start, end;
struct dnode_of_data dn;
- struct f2fs_summary sum;
struct node_info ni;
int err = 0, recovered = 0;
@@ -384,7 +405,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
src = datablock_addr(dn.node_page, dn.ofs_in_node);
dest = datablock_addr(page, dn.ofs_in_node);
- if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
+ if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
+ is_valid_blkaddr(sbi, dest, META_POR)) {
+
if (src == NULL_ADDR) {
err = reserve_new_block(&dn);
/* We should not get -ENOSPC */
@@ -396,19 +419,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (err)
goto err;
- set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
-
/* write dummy data page */
- recover_data_page(sbi, NULL, &sum, src, dest);
- dn.data_blkaddr = dest;
- update_extent_cache(&dn);
+ f2fs_replace_block(sbi, &dn, src, dest,
+ ni.version, false);
recovered++;
}
dn.ofs_in_node++;
}
- /* write node page in place */
- set_summary(&sum, dn.nid, 0, 0);
if (IS_INODE(dn.node_page))
sync_inode_page(&dn);
@@ -442,7 +460,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
while (1) {
struct fsync_inode_entry *entry;
- if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
+ if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
ra_meta_pages_cond(sbi, blkaddr);
@@ -552,7 +570,7 @@ out:
mutex_unlock(&sbi->cp_mutex);
} else if (need_writecp) {
struct cp_control cpc = {
- .reason = CP_SYNC,
+ .reason = CP_RECOVERY,
};
mutex_unlock(&sbi->cp_mutex);
write_checkpoint(sbi, &cpc);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index daee4ab913da..61b97f9cb9f6 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -75,6 +75,14 @@ static inline unsigned long __reverse_ffs(unsigned long word)
static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
+ while (!f2fs_test_bit(offset, (unsigned char *)addr))
+ offset++;
+
+ if (offset > size)
+ offset = size;
+
+ return offset;
+#if 0
const unsigned long *p = addr + BIT_WORD(offset);
unsigned long result = offset & ~(BITS_PER_LONG - 1);
unsigned long tmp;
@@ -121,11 +129,20 @@ found_first:
return result + size; /* Nope. */
found_middle:
return result + __reverse_ffs(tmp);
+#endif
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
+ while (f2fs_test_bit(offset, (unsigned char *)addr))
+ offset++;
+
+ if (offset > size)
+ offset = size;
+
+ return offset;
+#if 0
const unsigned long *p = addr + BIT_WORD(offset);
unsigned long result = offset & ~(BITS_PER_LONG - 1);
unsigned long tmp;
@@ -173,6 +190,7 @@ found_first:
return result + size; /* Nope. */
found_middle:
return result + __reverse_ffz(tmp);
+#endif
}
void register_inmem_page(struct inode *inode, struct page *page)
@@ -205,6 +223,8 @@ retry:
list_add_tail(&new->list, &fi->inmem_pages);
inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
mutex_unlock(&fi->inmem_lock);
+
+ trace_f2fs_register_inmem_page(page, INMEM);
}
void commit_inmem_pages(struct inode *inode, bool abort)
@@ -214,8 +234,10 @@ void commit_inmem_pages(struct inode *inode, bool abort)
struct inmem_pages *cur, *tmp;
bool submit_bio = false;
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
+ .encrypted_page = NULL,
};
/*
@@ -235,14 +257,18 @@ void commit_inmem_pages(struct inode *inode, bool abort)
if (!abort) {
lock_page(cur->page);
if (cur->page->mapping == inode->i_mapping) {
+ set_page_dirty(cur->page);
f2fs_wait_on_page_writeback(cur->page, DATA);
if (clear_page_dirty_for_io(cur->page))
inode_dec_dirty_pages(inode);
- do_write_data_page(cur->page, &fio);
+ trace_f2fs_commit_inmem_page(cur->page, INMEM);
+ fio.page = cur->page;
+ do_write_data_page(&fio);
submit_bio = true;
}
f2fs_put_page(cur->page, 1);
} else {
+ trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
put_page(cur->page);
}
radix_tree_delete(&fi->inmem_root, cur->page->index);
@@ -277,6 +303,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
{
+ /* try to shrink extent cache when there is no enough memory */
+ f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+
/* check the # of cached NAT entries and prefree segments */
if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
excess_prefree_segs(sbi) ||
@@ -459,22 +488,43 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
{
sector_t start = SECTOR_FROM_BLOCK(blkstart);
sector_t len = SECTOR_FROM_BLOCK(blklen);
+ struct seg_entry *se;
+ unsigned int offset;
+ block_t i;
+
+ for (i = blkstart; i < blkstart + blklen; i++) {
+ se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
+ offset = GET_BLKOFF_FROM_SEG0(sbi, i);
+
+ if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ sbi->discard_blks--;
+ }
trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
}
void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
{
- if (f2fs_issue_discard(sbi, blkaddr, 1)) {
- struct page *page = grab_meta_page(sbi, blkaddr);
- /* zero-filled page */
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ int err = -ENOTSUPP;
+
+ if (test_opt(sbi, DISCARD)) {
+ struct seg_entry *se = get_seg_entry(sbi,
+ GET_SEGNO(sbi, blkaddr));
+ unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+ if (f2fs_test_bit(offset, se->discard_map))
+ return;
+
+ err = f2fs_issue_discard(sbi, blkaddr, 1);
}
+
+ if (err)
+ update_meta_page(sbi, NULL, blkaddr);
}
static void __add_discard_entry(struct f2fs_sb_info *sbi,
- struct cp_control *cpc, unsigned int start, unsigned int end)
+ struct cp_control *cpc, struct seg_entry *se,
+ unsigned int start, unsigned int end)
{
struct list_head *head = &SM_I(sbi)->discard_list;
struct discard_entry *new, *last;
@@ -495,7 +545,6 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi,
list_add_tail(&new->list, head);
done:
SM_I(sbi)->nr_discards += end - start;
- cpc->trimmed += end - start;
}
static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
@@ -505,41 +554,24 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+ unsigned long *discard_map = (unsigned long *)se->discard_map;
unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
bool force = (cpc->reason == CP_DISCARD);
int i;
- if (!force && (!test_opt(sbi, DISCARD) ||
- SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
+ if (se->valid_blocks == max_blocks)
return;
- if (force && !se->valid_blocks) {
- struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- /*
- * if this segment is registered in the prefree list, then
- * we should skip adding a discard candidate, and let the
- * checkpoint do that later.
- */
- mutex_lock(&dirty_i->seglist_lock);
- if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
- mutex_unlock(&dirty_i->seglist_lock);
- cpc->trimmed += sbi->blocks_per_seg;
+ if (!force) {
+ if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
+ SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)
return;
- }
- mutex_unlock(&dirty_i->seglist_lock);
-
- __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg);
- return;
}
- /* zero block will be discarded through the prefree list */
- if (!se->valid_blocks || se->valid_blocks == max_blocks)
- return;
-
/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
for (i = 0; i < entries; i++)
- dmap[i] = force ? ~ckpt_map[i] :
+ dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
@@ -548,11 +580,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
-
- if (end - start < cpc->trim_minlen)
- continue;
-
- __add_discard_entry(sbi, cpc, start, end);
+ __add_discard_entry(sbi, cpc, se, start, end);
}
}
@@ -582,7 +610,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
mutex_unlock(&dirty_i->seglist_lock);
}
-void clear_prefree_segments(struct f2fs_sb_info *sbi)
+void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct list_head *head = &(SM_I(sbi)->discard_list);
struct discard_entry *entry, *this;
@@ -615,7 +643,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
/* send small discards */
list_for_each_entry_safe(entry, this, head, list) {
+ if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen)
+ goto skip;
f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
+ cpc->trimmed += entry->len;
+skip:
list_del(&entry->list);
SM_I(sbi)->nr_discards -= entry->len;
kmem_cache_free(discard_entry_slab, entry);
@@ -666,9 +698,13 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
if (del > 0) {
if (f2fs_test_and_set_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
+ if (!f2fs_test_and_set_bit(offset, se->discard_map))
+ sbi->discard_blks--;
} else {
if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map))
f2fs_bug_on(sbi, 1);
+ if (f2fs_test_and_clear_bit(offset, se->discard_map))
+ sbi->discard_blks++;
}
if (!f2fs_test_bit(offset, se->ckpt_valid_map))
se->ckpt_valid_blocks += del;
@@ -762,16 +798,25 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
}
-static void write_sum_page(struct f2fs_sb_info *sbi,
- struct f2fs_summary_block *sum_blk, block_t blk_addr)
+void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
{
struct page *page = grab_meta_page(sbi, blk_addr);
- void *kaddr = page_address(page);
- memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+ void *dst = page_address(page);
+
+ if (src)
+ memcpy(dst, src, PAGE_CACHE_SIZE);
+ else
+ memset(dst, 0, PAGE_CACHE_SIZE);
set_page_dirty(page);
f2fs_put_page(page, 1);
}
+static void write_sum_page(struct f2fs_sb_info *sbi,
+ struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+ update_meta_page(sbi, (void *)sum_blk, blk_addr);
+}
+
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -1053,8 +1098,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
unsigned int start_segno, end_segno;
struct cp_control cpc;
- if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
- range->len < sbi->blocksize)
+ if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
cpc.trimmed = 0;
@@ -1066,12 +1110,19 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
cpc.reason = CP_DISCARD;
- cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
+ cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
/* do checkpoint to issue discard commands safely */
for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
cpc.trim_start = start_segno;
- cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+
+ if (sbi->discard_blks == 0)
+ break;
+ else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi))
+ cpc.trim_end = end_segno;
+ else
+ cpc.trim_end = min_t(unsigned int,
+ rounddown(start_segno +
BATCHED_TRIM_SEGMENTS(sbi),
sbi->segs_per_sec) - 1, end_segno);
@@ -1164,6 +1215,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
/* direct_io'ed data is aligned to the segment for better performance */
if (direct_io && curseg->next_blkoff)
@@ -1178,7 +1230,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
*/
__add_sum_entry(sbi, type, sum);
- mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
stat_inc_block_count(sbi, curseg);
@@ -1199,84 +1250,95 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
}
-static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_summary *sum,
- struct f2fs_io_info *fio)
+static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
- int type = __get_segment_type(page, fio->type);
+ int type = __get_segment_type(fio->page, fio->type);
- allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
+ allocate_data_block(fio->sbi, fio->page, fio->blk_addr,
+ &fio->blk_addr, sum, type);
/* writeout dirty page into bdev */
- f2fs_submit_page_mbio(sbi, page, fio);
+ f2fs_submit_page_mbio(fio);
}
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
struct f2fs_io_info fio = {
+ .sbi = sbi,
.type = META,
.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
.blk_addr = page->index,
+ .page = page,
+ .encrypted_page = NULL,
};
set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, &fio);
+ f2fs_submit_page_mbio(&fio);
}
-void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
- unsigned int nid, struct f2fs_io_info *fio)
+void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
{
struct f2fs_summary sum;
+
set_summary(&sum, nid, 0, 0);
- do_write_page(sbi, page, &sum, fio);
+ do_write_page(&sum, fio);
}
-void write_data_page(struct page *page, struct dnode_of_data *dn,
- struct f2fs_io_info *fio)
+void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+ struct f2fs_sb_info *sbi = fio->sbi;
struct f2fs_summary sum;
struct node_info ni;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- do_write_page(sbi, page, &sum, fio);
+ do_write_page(&sum, fio);
dn->data_blkaddr = fio->blk_addr;
}
-void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
+void rewrite_data_page(struct f2fs_io_info *fio)
{
- stat_inc_inplace_blocks(F2FS_P_SB(page));
- f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
+ stat_inc_inplace_blocks(fio->sbi);
+ f2fs_submit_page_mbio(fio);
}
-void recover_data_page(struct f2fs_sb_info *sbi,
- struct page *page, struct f2fs_summary *sum,
- block_t old_blkaddr, block_t new_blkaddr)
+static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
+ struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr,
+ bool recover_curseg)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
unsigned int segno, old_cursegno;
struct seg_entry *se;
int type;
+ unsigned short old_blkoff;
segno = GET_SEGNO(sbi, new_blkaddr);
se = get_seg_entry(sbi, segno);
type = se->type;
- if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
- if (old_blkaddr == NULL_ADDR)
- type = CURSEG_COLD_DATA;
- else
+ if (!recover_curseg) {
+ /* for recovery flow */
+ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (old_blkaddr == NULL_ADDR)
+ type = CURSEG_COLD_DATA;
+ else
+ type = CURSEG_WARM_DATA;
+ }
+ } else {
+ if (!IS_CURSEG(sbi, segno))
type = CURSEG_WARM_DATA;
}
+
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
old_cursegno = curseg->segno;
+ old_blkoff = curseg->next_blkoff;
/* change the current segment */
if (segno != curseg->segno) {
@@ -1290,30 +1352,67 @@ void recover_data_page(struct f2fs_sb_info *sbi,
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
locate_dirty_segment(sbi, old_cursegno);
+ if (recover_curseg) {
+ if (old_cursegno != curseg->segno) {
+ curseg->next_segno = old_cursegno;
+ change_curseg(sbi, type, true);
+ }
+ curseg->next_blkoff = old_blkoff;
+ }
+
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
}
+void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
+ block_t old_addr, block_t new_addr,
+ unsigned char version, bool recover_curseg)
+{
+ struct f2fs_summary sum;
+
+ set_summary(&sum, dn->nid, dn->ofs_in_node, version);
+
+ __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+
+ dn->data_blkaddr = new_addr;
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
+}
+
static inline bool is_merged_page(struct f2fs_sb_info *sbi,
struct page *page, enum page_type type)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = &sbi->write_io[btype];
struct bio_vec *bvec;
+ struct page *target;
int i;
down_read(&io->io_rwsem);
- if (!io->bio)
- goto out;
+ if (!io->bio) {
+ up_read(&io->io_rwsem);
+ return false;
+ }
bio_for_each_segment_all(bvec, io->bio, i) {
- if (page == bvec->bv_page) {
+
+ if (bvec->bv_page->mapping) {
+ target = bvec->bv_page;
+ } else {
+ struct f2fs_crypto_ctx *ctx;
+
+ /* encrypted page */
+ ctx = (struct f2fs_crypto_ctx *)page_private(
+ bvec->bv_page);
+ target = ctx->w.control_page;
+ }
+
+ if (page == target) {
up_read(&io->io_rwsem);
return true;
}
}
-out:
up_read(&io->io_rwsem);
return false;
}
@@ -1730,6 +1829,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
+ if (!sit_i->dirty_sentries)
+ goto out;
+
/*
* add and account sit entries of dirty bitmap in sit entry
* set temporarily
@@ -1744,9 +1846,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
remove_sits_in_journal(sbi);
- if (!sit_i->dirty_sentries)
- goto out;
-
/*
* there are two steps to flush sit entries:
* #1, flush sit entries to journal in current cold data summary block.
@@ -1850,8 +1949,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
sit_i->sentries[start].ckpt_valid_map
= kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
- if (!sit_i->sentries[start].cur_valid_map
- || !sit_i->sentries[start].ckpt_valid_map)
+ sit_i->sentries[start].discard_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].cur_valid_map ||
+ !sit_i->sentries[start].ckpt_valid_map ||
+ !sit_i->sentries[start].discard_map)
return -ENOMEM;
}
@@ -1989,6 +2091,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
got_it:
check_block_count(sbi, start, &sit);
seg_info_from_raw_sit(se, &sit);
+
+ /* build discard map only one time */
+ memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+ sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks;
+
if (sbi->segs_per_sec > 1) {
struct sec_entry *e = get_sec_entry(sbi, start);
e->valid_blocks += se->valid_blocks;
@@ -2238,6 +2345,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
for (start = 0; start < MAIN_SEGS(sbi); start++) {
kfree(sit_i->sentries[start].cur_valid_map);
kfree(sit_i->sentries[start].ckpt_valid_map);
+ kfree(sit_i->sentries[start].discard_map);
}
}
kfree(sit_i->tmp_map);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7fd35111cf62..79e7b879a753 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -9,6 +9,7 @@
* published by the Free Software Foundation.
*/
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
/* constant macro */
#define NULL_SEGNO ((unsigned int)(~0))
@@ -163,6 +164,7 @@ struct seg_entry {
*/
unsigned short ckpt_valid_blocks;
unsigned char *ckpt_valid_map;
+ unsigned char *discard_map;
unsigned char type; /* segment type like CURSEG_XXX_TYPE */
unsigned long long mtime; /* modification time of the segment */
};
@@ -336,7 +338,8 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
clear_bit(segno, free_i->free_segmap);
free_i->free_segments++;
- next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
+ next = find_next_bit(free_i->free_segmap,
+ start_segno + sbi->segs_per_sec, start_segno);
if (next >= start_segno + sbi->segs_per_sec) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
@@ -712,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
*/
static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->sb->s_bdi->dirty_exceeded)
+ if (sbi->sb->s_bdi->wb.dirty_exceeded)
return 0;
if (type == DATA)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f2fe666a6ea9..a06b0b46fe69 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -57,6 +57,8 @@ enum {
Opt_flush_merge,
Opt_nobarrier,
Opt_fastboot,
+ Opt_extent_cache,
+ Opt_noinline_data,
Opt_err,
};
@@ -78,6 +80,8 @@ static match_table_t f2fs_tokens = {
{Opt_flush_merge, "flush_merge"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
+ {Opt_extent_cache, "extent_cache"},
+ {Opt_noinline_data, "noinline_data"},
{Opt_err, NULL},
};
@@ -254,6 +258,7 @@ static void init_once(void *foo)
static int parse_options(struct super_block *sb, char *options)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct request_queue *q;
substring_t args[MAX_OPT_ARGS];
char *p, *name;
int arg = 0;
@@ -298,7 +303,14 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
break;
case Opt_discard:
- set_opt(sbi, DISCARD);
+ q = bdev_get_queue(sb->s_bdev);
+ if (blk_queue_discard(q)) {
+ set_opt(sbi, DISCARD);
+ } else {
+ f2fs_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
break;
case Opt_noheap:
set_opt(sbi, NOHEAP);
@@ -367,6 +379,12 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_fastboot:
set_opt(sbi, FASTBOOT);
break;
+ case Opt_extent_cache:
+ set_opt(sbi, EXTENT_CACHE);
+ break;
+ case Opt_noinline_data:
+ clear_opt(sbi, INLINE_DATA);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -392,7 +410,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
atomic_set(&fi->dirty_pages, 0);
fi->i_current_depth = 1;
fi->i_advise = 0;
- rwlock_init(&fi->ext.ext_lock);
+ rwlock_init(&fi->ext_lock);
init_rwsem(&fi->i_sem);
INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
INIT_LIST_HEAD(&fi->inmem_pages);
@@ -406,6 +424,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ fi->i_crypt_info = NULL;
+#endif
return &fi->vfs_inode;
}
@@ -418,8 +439,31 @@ static int f2fs_drop_inode(struct inode *inode)
* - f2fs_gc -> iput -> evict
* - inode_wait_for_writeback(inode)
*/
- if (!inode_unhashed(inode) && inode->i_state & I_SYNC)
+ if (!inode_unhashed(inode) && inode->i_state & I_SYNC) {
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ spin_unlock(&inode->i_lock);
+
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode))
+ commit_inmem_pages(inode, true);
+
+ sb_start_intwrite(inode->i_sb);
+ i_size_write(inode, 0);
+
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ sb_end_intwrite(inode->i_sb);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ if (F2FS_I(inode)->i_crypt_info)
+ f2fs_free_encryption_info(inode,
+ F2FS_I(inode)->i_crypt_info);
+#endif
+ spin_lock(&inode->i_lock);
+ }
return 0;
+ }
return generic_drop_inode(inode);
}
@@ -510,7 +554,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
} else {
f2fs_balance_fs(sbi);
}
- f2fs_trace_ios(NULL, NULL, 1);
+ f2fs_trace_ios(NULL, 1);
return 0;
}
@@ -591,6 +635,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",disable_ext_identify");
if (test_opt(sbi, INLINE_DATA))
seq_puts(seq, ",inline_data");
+ else
+ seq_puts(seq, ",noinline_data");
if (test_opt(sbi, INLINE_DENTRY))
seq_puts(seq, ",inline_dentry");
if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
@@ -599,6 +645,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",nobarrier");
if (test_opt(sbi, FASTBOOT))
seq_puts(seq, ",fastboot");
+ if (test_opt(sbi, EXTENT_CACHE))
+ seq_puts(seq, ",extent_cache");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -644,6 +692,22 @@ static const struct file_operations f2fs_seq_segment_info_fops = {
.release = single_release,
};
+static void default_options(struct f2fs_sb_info *sbi)
+{
+ /* init some FS parameters */
+ sbi->active_logs = NR_CURSEG_TYPE;
+
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, INLINE_DATA);
+
+#ifdef CONFIG_F2FS_FS_XATTR
+ set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ set_opt(sbi, POSIX_ACL);
+#endif
+}
+
static int f2fs_remount(struct super_block *sb, int *flags, char *data)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -662,7 +726,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
active_logs = sbi->active_logs;
sbi->mount_opt.opt = 0;
- sbi->active_logs = NR_CURSEG_TYPE;
+ default_options(sbi);
/* parse mount options */
err = parse_options(sb, data);
@@ -915,29 +979,36 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
*/
static int read_raw_super_block(struct super_block *sb,
struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf)
+ struct buffer_head **raw_super_buf,
+ int *recovery)
{
int block = 0;
+ struct buffer_head *buffer;
+ struct f2fs_super_block *super;
+ int err = 0;
retry:
- *raw_super_buf = sb_bread(sb, block);
- if (!*raw_super_buf) {
+ buffer = sb_bread(sb, block);
+ if (!buffer) {
+ *recovery = 1;
f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
if (block == 0) {
block++;
goto retry;
} else {
- return -EIO;
+ err = -EIO;
+ goto out;
}
}
- *raw_super = (struct f2fs_super_block *)
- ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
+ super = (struct f2fs_super_block *)
+ ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
/* sanity checking of raw super */
- if (sanity_check_raw_super(sb, *raw_super)) {
- brelse(*raw_super_buf);
+ if (sanity_check_raw_super(sb, super)) {
+ brelse(buffer);
+ *recovery = 1;
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
@@ -945,25 +1016,76 @@ retry:
block++;
goto retry;
} else {
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
}
+ if (!*raw_super) {
+ *raw_super_buf = buffer;
+ *raw_super = super;
+ } else {
+ /* already have a valid superblock */
+ brelse(buffer);
+ }
+
+ /* check the validity of the second superblock */
+ if (block == 0) {
+ block++;
+ goto retry;
+ }
+
+out:
+ /* No valid superblock */
+ if (!*raw_super)
+ return err;
+
return 0;
}
+int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
+{
+ struct buffer_head *sbh = sbi->raw_super_buf;
+ sector_t block = sbh->b_blocknr;
+ int err;
+
+ /* write back-up superblock first */
+ sbh->b_blocknr = block ? 0 : 1;
+ mark_buffer_dirty(sbh);
+ err = sync_dirty_buffer(sbh);
+
+ sbh->b_blocknr = block;
+
+ /* if we are in recovery path, skip writing valid superblock */
+ if (recover || err)
+ goto out;
+
+ /* write current valid superblock */
+ mark_buffer_dirty(sbh);
+ err = sync_dirty_buffer(sbh);
+out:
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ return err;
+}
+
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
- struct f2fs_super_block *raw_super = NULL;
+ struct f2fs_super_block *raw_super;
struct buffer_head *raw_super_buf;
struct inode *root;
- long err = -EINVAL;
- bool retry = true;
+ long err;
+ bool retry = true, need_fsck = false;
char *options = NULL;
- int i;
+ int recovery, i;
try_onemore:
+ err = -EINVAL;
+ raw_super = NULL;
+ raw_super_buf = NULL;
+ recovery = 0;
+
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -975,22 +1097,12 @@ try_onemore:
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+ err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
if (err)
goto free_sbi;
sb->s_fs_info = sbi;
- /* init some FS parameters */
- sbi->active_logs = NR_CURSEG_TYPE;
-
- set_opt(sbi, BG_GC);
-
-#ifdef CONFIG_F2FS_FS_XATTR
- set_opt(sbi, XATTR_USER);
-#endif
-#ifdef CONFIG_F2FS_FS_POSIX_ACL
- set_opt(sbi, POSIX_ACL);
-#endif
+ default_options(sbi);
/* parse mount options */
options = kstrdup((const char *)data, GFP_KERNEL);
if (data && !options) {
@@ -1072,6 +1184,8 @@ try_onemore:
INIT_LIST_HEAD(&sbi->dir_inode_list);
spin_lock_init(&sbi->dir_inode_lock);
+ init_extent_cache_info(sbi);
+
init_ino_entry_info(sbi);
/* setup f2fs internal modules */
@@ -1131,14 +1245,6 @@ try_onemore:
proc_create_data("segment_info", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_info_fops, sb);
- if (test_opt(sbi, DISCARD)) {
- struct request_queue *q = bdev_get_queue(sb->s_bdev);
- if (!blk_queue_discard(q))
- f2fs_msg(sb, KERN_WARNING,
- "mounting with \"discard\" option, but "
- "the device does not support discard");
- }
-
sbi->s_kobj.kset = f2fs_kset;
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
@@ -1146,9 +1252,6 @@ try_onemore:
if (err)
goto free_proc;
- if (!retry)
- set_sbi_flag(sbi, SBI_NEED_FSCK);
-
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
/*
@@ -1160,8 +1263,13 @@ try_onemore:
err = -EROFS;
goto free_kobj;
}
+
+ if (need_fsck)
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+
err = recover_fsync_data(sbi);
if (err) {
+ need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%ld", err);
goto free_kobj;
@@ -1179,6 +1287,13 @@ try_onemore:
goto free_kobj;
}
kfree(options);
+
+ /* recover broken superblock */
+ if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
+ f2fs_msg(sb, KERN_INFO, "Recover invalid superblock");
+ f2fs_commit_super(sbi, true);
+ }
+
return 0;
free_kobj:
@@ -1212,7 +1327,7 @@ free_sbi:
/* give only one another chance */
if (retry) {
- retry = 0;
+ retry = false;
shrink_dcache_sb(sb);
goto try_onemore;
}
@@ -1278,20 +1393,30 @@ static int __init init_f2fs_fs(void)
err = create_checkpoint_caches();
if (err)
goto free_segment_manager_caches;
+ err = create_extent_cache();
+ if (err)
+ goto free_checkpoint_caches;
f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
if (!f2fs_kset) {
err = -ENOMEM;
- goto free_checkpoint_caches;
+ goto free_extent_cache;
}
- err = register_filesystem(&f2fs_fs_type);
+ err = f2fs_init_crypto();
if (err)
goto free_kset;
+ err = register_filesystem(&f2fs_fs_type);
+ if (err)
+ goto free_crypto;
f2fs_create_root_stats();
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
return 0;
+free_crypto:
+ f2fs_exit_crypto();
free_kset:
kset_unregister(f2fs_kset);
+free_extent_cache:
+ destroy_extent_cache();
free_checkpoint_caches:
destroy_checkpoint_caches();
free_segment_manager_caches:
@@ -1309,6 +1434,8 @@ static void __exit exit_f2fs_fs(void)
remove_proc_entry("fs/f2fs", NULL);
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
+ f2fs_exit_crypto();
+ destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
destroy_node_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 875aa8179bc1..145fb659ad44 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -80,7 +80,7 @@ out:
radix_tree_preload_end();
}
-void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
+void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
{
struct inode *inode;
pid_t pid;
@@ -91,8 +91,8 @@ void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
return;
}
- inode = page->mapping->host;
- pid = page_private(page);
+ inode = fio->page->mapping->host;
+ pid = page_private(fio->page);
major = MAJOR(inode->i_sb->s_dev);
minor = MINOR(inode->i_sb->s_dev);
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
index 1041dbeb52ae..67db24ac1e85 100644
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@@ -33,12 +33,12 @@ struct last_io_info {
};
extern void f2fs_trace_pid(struct page *);
-extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
+extern void f2fs_trace_ios(struct f2fs_io_info *, int);
extern void f2fs_build_trace_ios(void);
extern void f2fs_destroy_trace_ios(void);
#else
#define f2fs_trace_pid(p)
-#define f2fs_trace_ios(p, i, n)
+#define f2fs_trace_ios(i, n)
#define f2fs_build_trace_ios()
#define f2fs_destroy_trace_ios()
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 5072bf9ae0ef..07449b980acb 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL);
+ return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
}
static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -108,7 +108,7 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name,
+ return f2fs_setxattr(d_inode(dentry), type, name,
value, size, NULL, flags);
}
@@ -130,19 +130,20 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (strcmp(name, "") != 0)
return -EINVAL;
- *((char *)buffer) = F2FS_I(inode)->i_advise;
+ if (buffer)
+ *((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (strcmp(name, "") != 0)
return -EINVAL;
@@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
return -EINVAL;
F2FS_I(inode)->i_advise |= *(char *)value;
+ mark_inode_dirty(inode);
return 0;
}
@@ -442,7 +444,7 @@ cleanup:
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct f2fs_xattr_entry *entry;
void *base_addr;
int error = 0;
@@ -582,6 +584,9 @@ static int __f2fs_setxattr(struct inode *inode, int index,
inode->i_ctime = CURRENT_TIME;
clear_inode_flag(fi, FI_ACL_MODE);
}
+ if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
+ !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
+ f2fs_set_encrypted_inode(inode);
if (ipage)
update_inode(inode, ipage);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 969d792ca362..71a7100d5492 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -35,6 +35,10 @@
#define F2FS_XATTR_INDEX_LUSTRE 5
#define F2FS_XATTR_INDEX_SECURITY 6
#define F2FS_XATTR_INDEX_ADVISE 7
+/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */
+#define F2FS_XATTR_INDEX_ENCRYPTION 9
+
+#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c"
struct f2fs_xattr_header {
__le32 h_magic; /* magic number for identification */
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 91ad9e1c9441..93fc62232ec2 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -8,9 +8,7 @@
* May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers.
*/
-#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include "fat.h"
/* this must be > 0. */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c5d6bb939d19..4afc4d9d2e41 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -13,13 +13,9 @@
* Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de>
*/
-#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
#include <linux/compat.h>
#include <linux/uaccess.h>
-#include <linux/kernel.h>
#include "fat.h"
/*
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 64e295e8ff38..be5e15323bab 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -2,11 +2,8 @@
#define _FAT_H
#include <linux/buffer_head.h>
-#include <linux/string.h>
#include <linux/nls.h>
-#include <linux/fs.h>
#include <linux/hash.h>
-#include <linux/mutex.h>
#include <linux/ratelimit.h>
#include <linux/msdos_fs.h>
@@ -66,7 +63,7 @@ struct msdos_sb_info {
unsigned short sec_per_clus; /* sectors/cluster */
unsigned short cluster_bits; /* log2(cluster_size) */
unsigned int cluster_size; /* cluster size */
- unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */
+ unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */
unsigned short fat_start;
unsigned long fat_length; /* FAT start & length (sec.) */
unsigned long dir_start;
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 260705c58062..8226557130a2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -3,9 +3,6 @@
* Released under GPL v2.
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/msdos_fs.h>
#include <linux/blkdev.h>
#include "fat.h"
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 8429c68e3057..a08f1039909a 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -10,11 +10,8 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/mount.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/backing-dev.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include "fat.h"
@@ -170,8 +167,6 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
@@ -311,7 +306,7 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset)
int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size;
@@ -383,7 +378,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
int fat_setattr(struct dentry *dentry, struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
unsigned int ia_valid;
int error;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 497c7c5263c7..509411dd3698 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -11,22 +11,14 @@
*/
#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/slab.h>
-#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/mpage.h>
-#include <linux/buffer_head.h>
-#include <linux/mount.h>
-#include <linux/aio.h>
#include <linux/vfs.h>
+#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
-#include <linux/writeback.h>
-#include <linux/log2.h>
-#include <linux/hash.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <asm/unaligned.h>
#include "fat.h"
@@ -246,8 +238,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
return err;
}
-static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter,
+static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
struct file *file = iocb->ki_filp;
@@ -256,7 +247,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
- if (rw == WRITE) {
+ if (iov_iter_rw(iter) == WRITE) {
/*
* FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
* so we need to update the ->mmu_private to block boundary.
@@ -275,8 +266,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
* FAT need to use the DIO_LOCKING for avoiding the race
* condition of fat_get_block() and ->truncate().
*/
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
- if (ret < 0 && (rw & WRITE))
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block);
+ if (ret < 0 && iov_iter_rw(iter) == WRITE)
fat_write_failed(mapping, offset + count);
return ret;
@@ -1280,8 +1271,7 @@ out:
static int fat_read_root(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
- struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
int error;
MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d8da2d2e30ae..c4589e981760 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -6,10 +6,6 @@
* and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/time.h>
#include "fat.h"
/*
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index a783b0e1272a..b7e2b33aa793 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -7,8 +7,6 @@
*/
#include <linux/module.h>
-#include <linux/time.h>
-#include <linux/buffer_head.h>
#include "fat.h"
/* Characters that are undesirable in an MS-DOS file name */
@@ -310,7 +308,7 @@ out:
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct fat_slot_info sinfo;
int err;
@@ -404,7 +402,7 @@ out:
/***** Unlink a file */
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct fat_slot_info sinfo;
int err;
@@ -442,8 +440,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
int err, old_attrs, is_dir, update_dotdot, corrupt = 0;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
- old_inode = old_dentry->d_inode;
- new_inode = new_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
+ new_inode = d_inode(new_dentry);
err = fat_scan(old_dir, old_name, &old_sinfo);
if (err) {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b8b92c2f9683..7092584f424a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -16,10 +16,8 @@
*/
#include <linux/module.h>
-#include <linux/jiffies.h>
#include <linux/ctype.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/namei.h>
#include "fat.h"
@@ -35,7 +33,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
{
int ret = 1;
spin_lock(&dentry->d_lock);
- if (dentry->d_time != dentry->d_parent->d_inode->i_version)
+ if (dentry->d_time != d_inode(dentry->d_parent)->i_version)
ret = 0;
spin_unlock(&dentry->d_lock);
return ret;
@@ -47,7 +45,7 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
/* This is not negative dentry. Always valid. */
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
return 1;
return vfat_revalidate_shortname(dentry);
}
@@ -67,7 +65,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
* positive dentry isn't good idea. So it's unsupported like
* rename("filename", "FILENAME") for now.
*/
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
return 1;
/*
@@ -803,7 +801,7 @@ out:
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
@@ -834,7 +832,7 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
@@ -917,8 +915,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
- old_inode = old_dentry->d_inode;
- new_inode = new_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
+ new_inode = d_inode(new_dentry);
mutex_lock(&MSDOS_SB(sb)->s_lock);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index 93e14933dcb6..eb192656fba2 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -266,7 +266,7 @@ struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart)
* Find the parent for a directory that is not currently connected to
* the filesystem root.
*
- * On entry, the caller holds child_dir->d_inode->i_mutex.
+ * On entry, the caller holds d_inode(child_dir)->i_mutex.
*/
static struct dentry *fat_get_parent(struct dentry *child_dir)
{
@@ -276,7 +276,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir)
struct inode *parent_inode = NULL;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) {
+ if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) {
int parent_logstart = fat_get_start(sbi, de);
parent_inode = fat_dget(sb, parent_logstart);
if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 999ff5c3cab0..d59712dfa3e7 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -195,8 +195,9 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
goto out_err;
}
/* copy the full handle */
- if (copy_from_user(handle, ufh,
- sizeof(struct file_handle) +
+ *handle = f_handle;
+ if (copy_from_user(&handle->f_handle,
+ &ufh->f_handle,
f_handle.handle_bytes)) {
retval = -EFAULT;
goto out_handle;
diff --git a/fs/file.c b/fs/file.c
index ee738ea028fa..6c672ad329e9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr);
+
+ /* make sure all __fd_install() have seen resize_in_progress
+ * or have finished their rcu_read_lock_sched() section.
+ */
+ if (atomic_read(&files->count) > 1)
+ synchronize_sched();
+
spin_lock(&files->file_lock);
if (!new_fdt)
return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
__free_fdtable(new_fdt);
return -EMFILE;
}
- /*
- * Check again since another task may have expanded the fd table while
- * we dropped the lock
- */
cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds) {
- /* Continue as planned */
- copy_fdtable(new_fdt, cur_fdt);
- rcu_assign_pointer(files->fdt, new_fdt);
- if (cur_fdt != &files->fdtab)
- call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
- } else {
- /* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
- }
+ BUG_ON(nr < cur_fdt->max_fds);
+ copy_fdtable(new_fdt, cur_fdt);
+ rcu_assign_pointer(files->fdt, new_fdt);
+ if (cur_fdt != &files->fdtab)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+ /* coupled with smp_rmb() in __fd_install() */
+ smp_wmb();
return 1;
}
@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, int nr)
+ __releases(files->file_lock)
+ __acquires(files->file_lock)
{
struct fdtable *fdt;
+ int expanded = 0;
+repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
- return 0;
+ return expanded;
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
+ if (unlikely(files->resize_in_progress)) {
+ spin_unlock(&files->file_lock);
+ expanded = 1;
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ spin_lock(&files->file_lock);
+ goto repeat;
+ }
+
/* All good, so we try */
- return expand_fdtable(files, nr);
+ files->resize_in_progress = true;
+ expanded = expand_fdtable(files, nr);
+ files->resize_in_progress = false;
+
+ wake_up_all(&files->resize_wait);
+ return expanded;
}
static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
atomic_set(&newf->count, 1);
spin_lock_init(&newf->file_lock);
+ newf->resize_in_progress = false;
+ init_waitqueue_head(&newf->resize_wait);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
struct fdtable *fdt;
- spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
+
+ might_sleep();
+ rcu_read_lock_sched();
+
+ while (unlikely(files->resize_in_progress)) {
+ rcu_read_unlock_sched();
+ wait_event(files->resize_wait, !files->resize_in_progress);
+ rcu_read_lock_sched();
+ }
+ /* coupled with smp_wmb() in expand_fdtable() */
+ smp_rmb();
+ fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
- spin_unlock(&files->file_lock);
+ rcu_read_unlock_sched();
}
void fd_install(unsigned int fd, struct file *file)
@@ -635,12 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask)
struct file *file;
rcu_read_lock();
+loop:
file = fcheck_files(files, fd);
if (file) {
- /* File object ref couldn't be taken */
- if ((file->f_mode & mask) ||
- !atomic_long_inc_not_zero(&file->f_count))
+ /* File object ref couldn't be taken.
+ * dup2() atomicity guarantee is the reason
+ * we loop to catch the new file (or NULL pointer)
+ */
+ if (file->f_mode & mask)
file = NULL;
+ else if (!get_file_rcu(file))
+ goto loop;
}
rcu_read_unlock();
diff --git a/fs/file_table.c b/fs/file_table.c
index 3f85411b03ce..ad17e05ebf95 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,12 +20,12 @@
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
-#include <linux/lglock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/task_work.h>
#include <linux/ima.h>
+#include <linux/swap.h>
#include <linux/atomic.h>
@@ -168,10 +168,10 @@ struct file *alloc_file(struct path *path, fmode_t mode,
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
if ((mode & FMODE_READ) &&
- likely(fop->read || fop->aio_read || fop->read_iter))
+ likely(fop->read || fop->read_iter))
mode |= FMODE_CAN_READ;
if ((mode & FMODE_WRITE) &&
- likely(fop->write || fop->aio_write || fop->write_iter))
+ likely(fop->write || fop->write_iter))
mode |= FMODE_CAN_WRITE;
file->f_mode = mode;
file->f_op = fop;
@@ -309,19 +309,24 @@ void put_filp(struct file *file)
}
}
-void __init files_init(unsigned long mempages)
+void __init files_init(void)
{
- unsigned long n;
-
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ percpu_counter_init(&nr_files, 0, GFP_KERNEL);
+}
- /*
- * One file with associated inode and dcache is very roughly 1K.
- * Per default don't use more than 10% of our memory for files.
- */
+/*
+ * One file with associated inode and dcache is very roughly 1K. Per default
+ * do not use more than 10% of our memory for files.
+ */
+void __init files_maxfiles_init(void)
+{
+ unsigned long n;
+ unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2;
+
+ memreserve = min(memreserve, totalram_pages - 1);
+ n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
- n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
- percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index 881aa3d217f0..e3dcb4467d92 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -50,9 +50,6 @@ extern daddr_t vxfs_bmap1(struct inode *, long);
/* vxfs_fshead.c */
extern int vxfs_read_fshead(struct super_block *);
-/* vxfs_immed.c */
-extern const struct inode_operations vxfs_immed_symlink_iops;
-
/* vxfs_inode.c */
extern const struct address_space_operations vxfs_immed_aops;
extern struct kmem_cache *vxfs_inode_cachep;
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index c36aeaf92e41..cb84f0fcc72a 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -32,29 +32,15 @@
*/
#include <linux/fs.h>
#include <linux/pagemap.h>
-#include <linux/namei.h>
#include "vxfs.h"
#include "vxfs_extern.h"
#include "vxfs_inode.h"
-static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *);
-
static int vxfs_immed_readpage(struct file *, struct page *);
/*
- * Inode operations for immed symlinks.
- *
- * Unliked all other operations we do not go through the pagecache,
- * but do all work directly on the inode.
- */
-const struct inode_operations vxfs_immed_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = vxfs_immed_follow_link,
-};
-
-/*
* Address space operations for immed files and directories.
*/
const struct address_space_operations vxfs_immed_aops = {
@@ -62,26 +48,6 @@ const struct address_space_operations vxfs_immed_aops = {
};
/**
- * vxfs_immed_follow_link - follow immed symlink
- * @dp: dentry for the link
- * @np: pathname lookup data for the current path walk
- *
- * Description:
- * vxfs_immed_follow_link restarts the pathname lookup with
- * the data obtained from @dp.
- *
- * Returns:
- * Zero on success, else a negative error code.
- */
-static void *
-vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np)
-{
- struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode);
- nd_set_link(np, vip->vii_immed.vi_immed);
- return NULL;
-}
-
-/**
* vxfs_immed_readpage - read part of an immed inode into pagecache
* @file: file context (unused)
* @page: page frame to fill in.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 363e3ae25f6b..ef73ed674a27 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -35,6 +35,7 @@
#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/namei.h>
#include "vxfs.h"
#include "vxfs_inode.h"
@@ -327,8 +328,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
ip->i_op = &page_symlink_inode_operations;
ip->i_mapping->a_ops = &vxfs_aops;
} else {
- ip->i_op = &vxfs_immed_symlink_iops;
- vip->vii_immed.vi_immed[ip->i_size] = '\0';
+ ip->i_op = &simple_symlink_inode_operations;
+ ip->i_link = vip->vii_immed.vi_immed;
+ nd_terminate_link(ip->i_link, ip->i_size,
+ sizeof(vip->vii_immed.vi_immed) - 1);
}
} else
init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 99c7f0a37af4..484b32d3234a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = {
.iterate = vxfs_readdir,
};
-
-static inline u_long
-dir_pages(struct inode *inode)
-{
- return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
static inline u_long
dir_blocks(struct inode *ip)
{
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 32a8bbd7a9ad..518c6294bf6c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
+#include <linux/memcontrol.h>
#include "internal.h"
/*
@@ -34,6 +35,10 @@
*/
#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+struct wb_completion {
+ atomic_t cnt;
+};
+
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
@@ -47,13 +52,29 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+ unsigned int auto_free:1; /* free on completion */
+ unsigned int single_wait:1;
+ unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
- struct completion *done; /* set if the caller waits */
+ struct wb_completion *done; /* set if the caller waits */
};
/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro. Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion(). Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \
+ struct wb_completion cmpl = { \
+ .cnt = ATOMIC_INIT(1), \
+ }
+
+
+/*
* If an inode is constantly having its pages dirtied, but then the
* updates stop dirtytime_expire_interval seconds in the past, it's
* possible for the worst case time between when an inode has its
@@ -65,35 +86,6 @@ struct wb_writeback_work {
*/
unsigned int dirtytime_expire_interval = 12 * 60 * 60;
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
- return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
- struct super_block *sb;
-
- if (!inode)
- return &noop_backing_dev_info;
-
- sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
- if (sb_is_blkdev_sb(sb))
- return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
- return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
static inline struct inode *wb_inode(struct list_head *head)
{
return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head)
EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+ if (wb_has_dirty_io(wb)) {
+ return false;
+ } else {
+ set_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(!wb->avg_write_bandwidth);
+ atomic_long_add(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth);
+ return true;
+ }
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- spin_unlock_bh(&bdi->wb_lock);
+ if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+ list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+ clear_bit(WB_has_dirty_io, &wb->state);
+ WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+ &wb->bdi->tot_write_bandwidth) < 0);
+ }
}
-static void bdi_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+ struct bdi_writeback *wb,
+ struct list_head *head)
{
- trace_writeback_queue(bdi, work);
+ assert_spin_locked(&wb->list_lock);
+
+ list_move(&inode->i_wb_list, head);
- spin_lock_bh(&bdi->wb_lock);
- if (!test_bit(BDI_registered, &bdi->state)) {
- if (work->done)
- complete(work->done);
+ /* dirty_time doesn't count as dirty_io until expiration */
+ if (head != &wb->b_dirty_time)
+ return wb_io_lists_populated(wb);
+
+ wb_io_lists_depopulated(wb);
+ return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+ struct bdi_writeback *wb)
+{
+ assert_spin_locked(&wb->list_lock);
+
+ list_del_init(&inode->i_wb_list);
+ wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+static void wb_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *work)
+{
+ trace_writeback_queue(wb->bdi, work);
+
+ spin_lock_bh(&wb->work_lock);
+ if (!test_bit(WB_registered, &wb->state)) {
+ if (work->single_wait)
+ work->single_done = 1;
goto out_unlock;
}
- list_add_tail(&work->list, &bdi->work_list);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ if (work->done)
+ atomic_inc(&work->done->cnt);
+ list_add_tail(&work->list, &wb->work_list);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
out_unlock:
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such
+ * work items are completed. Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+ struct wb_completion *done)
+{
+ atomic_dec(&done->cnt); /* put down the initial count */
+ wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */
+
+#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+ /* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2)
+ /* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
+ /* one round can affect upto 5 slots */
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
+
+ if (inode_cgwb_enabled(inode)) {
+ struct cgroup_subsys_state *memcg_css;
+
+ if (page) {
+ memcg_css = mem_cgroup_css_from_page(page);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ } else {
+ /* must pin memcg_css, see wb_get_create() */
+ memcg_css = task_get_css(current, memory_cgrp_id);
+ wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ css_put(memcg_css);
+ }
+ }
+
+ if (!wb)
+ wb = &bdi->wb;
+
+ /*
+ * There may be multiple instances of this function racing to
+ * update the same inode. Use cmpxchg() to tell the winner.
+ */
+ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+ wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held. @inode->i_lock must be
+ * held on entry and is released on return. The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ while (true) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ /*
+ * inode_to_wb() association is protected by both
+ * @inode->i_lock and @wb->list_lock but list_lock nests
+ * outside i_lock. Drop i_lock and verify that the
+ * association hasn't changed after acquiring list_lock.
+ */
+ wb_get(wb);
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ wb_put(wb); /* not gonna deref it anymore */
+
+ /* i_wb may have changed inbetween, can't use inode_to_wb() */
+ if (likely(wb == inode->i_wb))
+ return wb; /* @inode already has ref */
+
+ spin_unlock(&wb->list_lock);
+ cpu_relax();
+ spin_lock(&inode->i_lock);
+ }
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ spin_lock(&inode->i_lock);
+ return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+ struct inode *inode;
+ struct bdi_writeback *new_wb;
+
+ struct rcu_head rcu_head;
+ struct work_struct work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(work, struct inode_switch_wbs_context, work);
+ struct inode *inode = isw->inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct bdi_writeback *old_wb = inode->i_wb;
+ struct bdi_writeback *new_wb = isw->new_wb;
+ struct radix_tree_iter iter;
+ bool switched = false;
+ void **slot;
+
+ /*
+ * By the time control reaches here, RCU grace period has passed
+ * since I_WB_SWITCH assertion and all wb stat update transactions
+ * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+ * synchronizing against mapping->tree_lock.
+ *
+ * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+ * gives us exclusion against all wb related operations on @inode
+ * including IO list manipulations and stat updates.
+ */
+ if (old_wb < new_wb) {
+ spin_lock(&old_wb->list_lock);
+ spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&new_wb->list_lock);
+ spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+ }
+ spin_lock(&inode->i_lock);
+ spin_lock_irq(&mapping->tree_lock);
+
+ /*
+ * Once I_FREEING is visible under i_lock, the eviction path owns
+ * the inode and we shouldn't modify ->i_wb_list.
+ */
+ if (unlikely(inode->i_state & I_FREEING))
+ goto skip_switch;
+
+ /*
+ * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points
+ * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+ * pages actually under underwriteback.
+ */
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_DIRTY) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page) && PageDirty(page)) {
+ __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+ __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ }
+ }
+
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+ PAGECACHE_TAG_WRITEBACK) {
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page)) {
+ WARN_ON_ONCE(!PageWriteback(page));
+ __dec_wb_stat(old_wb, WB_WRITEBACK);
+ __inc_wb_stat(new_wb, WB_WRITEBACK);
+ }
+ }
+
+ wb_get(new_wb);
+
+ /*
+ * Transfer to @new_wb's IO list if necessary. The specific list
+ * @inode was on is ignored and the inode is put on ->b_dirty which
+ * is always correct including from ->b_dirty_time. The transfer
+ * preserves @inode->dirtied_when ordering.
+ */
+ if (!list_empty(&inode->i_wb_list)) {
+ struct inode *pos;
+
+ inode_wb_list_del_locked(inode, old_wb);
+ inode->i_wb = new_wb;
+ list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ if (time_after_eq(inode->dirtied_when,
+ pos->dirtied_when))
+ break;
+ inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ } else {
+ inode->i_wb = new_wb;
+ }
+
+ /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+ switched = true;
+skip_switch:
+ /*
+ * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+ * ensures that the new wb is visible if they see !I_WB_SWITCH.
+ */
+ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&new_wb->list_lock);
+ spin_unlock(&old_wb->list_lock);
+
+ if (switched) {
+ wb_wakeup(new_wb);
+ wb_put(old_wb);
+ }
+ wb_put(new_wb);
+
+ iput(inode);
+ kfree(isw);
+}
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+ struct inode_switch_wbs_context *isw = container_of(rcu_head,
+ struct inode_switch_wbs_context, rcu_head);
+
+ /* needs to grab bh-unsafe locks, bounce to work item */
+ INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+ schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id. The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct cgroup_subsys_state *memcg_css;
+ struct inode_switch_wbs_context *isw;
+
+ /* noop if seems to be already in progress */
+ if (inode->i_state & I_WB_SWITCH)
+ return;
+
+ isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+ if (!isw)
+ return;
+
+ /* find and pin the new wb */
+ rcu_read_lock();
+ memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+ if (memcg_css)
+ isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+ rcu_read_unlock();
+ if (!isw->new_wb)
+ goto out_free;
+
+ /* while holding I_WB_SWITCH, no one else can update the association */
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
+ inode->i_state |= I_WB_SWITCH;
+ spin_unlock(&inode->i_lock);
+
+ ihold(inode);
+ isw->inode = inode;
+
+ /*
+ * In addition to synchronizing among switchers, I_WB_SWITCH tells
+ * the RCU protected stat update paths to grab the mapping's
+ * tree_lock so that stat transfer can synchronize against them.
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+ */
+ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+ return;
+
+out_free:
+ if (isw->new_wb)
+ wb_put(isw->new_wb);
+ kfree(isw);
+}
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock. On
+ * writeback completion, wbc_detach_inode() should be called. This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ if (!inode_cgwb_enabled(inode)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ wbc->wb = inode_to_wb(inode);
+ wbc->inode = inode;
+
+ wbc->wb_id = wbc->wb->memcg_css->id;
+ wbc->wb_lcand_id = inode->i_wb_frn_winner;
+ wbc->wb_tcand_id = 0;
+ wbc->wb_bytes = 0;
+ wbc->wb_lcand_bytes = 0;
+ wbc->wb_tcand_bytes = 0;
+
+ wb_get(wbc->wb);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * A dying wb indicates that the memcg-blkcg mapping has changed
+ * and a new wb is already serving the memcg. Switch immediately.
+ */
+ if (unlikely(wb_dying(wbc->wb)))
+ inode_switch_wbs(inode, wbc->wb_id);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode(). Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on first-use basis severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode. While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it. To avoid unnnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm. In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate). Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history. If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+ struct bdi_writeback *wb = wbc->wb;
+ struct inode *inode = wbc->inode;
+ unsigned long avg_time, max_bytes, max_time;
+ u16 history;
+ int max_id;
+
+ if (!wb)
+ return;
+
+ history = inode->i_wb_frn_history;
+ avg_time = inode->i_wb_frn_avg_time;
+
+ /* pick the winner of this round */
+ if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+ wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_id;
+ max_bytes = wbc->wb_bytes;
+ } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+ max_id = wbc->wb_lcand_id;
+ max_bytes = wbc->wb_lcand_bytes;
+ } else {
+ max_id = wbc->wb_tcand_id;
+ max_bytes = wbc->wb_tcand_bytes;
+ }
+
+ /*
+ * Calculate the amount of IO time the winner consumed and fold it
+ * into the running average kept per inode. If the consumed IO
+ * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for
+ * deciding whether to switch or not. This is to prevent one-off
+ * small dirtiers from skewing the verdict.
+ */
+ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+ wb->avg_write_bandwidth);
+ if (avg_time)
+ avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+ (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+ else
+ avg_time = max_time; /* immediate catch up on first run */
+
+ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+ int slots;
+
+ /*
+ * The switch verdict is reached if foreign wb's consume
+ * more than a certain proportion of IO time in a
+ * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot
+ * history mask where each bit represents one sixteenth of
+ * the period. Determine the number of slots to shift into
+ * history from @max_time.
+ */
+ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+ (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+ history <<= slots;
+ if (wbc->wb_id != max_id)
+ history |= (1U << slots) - 1;
+
+ /*
+ * Switch if the current wb isn't the consistent winner.
+ * If there are multiple closely competing dirtiers, the
+ * inode may switch across them repeatedly over time, which
+ * is okay. The main goal is avoiding keeping an inode on
+ * the wrong wb for an extended period of time.
+ */
+ if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+ inode_switch_wbs(inode, max_id);
+ }
+
+ /*
+ * Multiple instances of this function may race to update the
+ * following fields but we don't mind occassional inaccuracies.
+ */
+ inode->i_wb_frn_winner = max_id;
+ inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+ inode->i_wb_frn_history = history;
+
+ wb_put(wbc->wb);
+ wbc->wb = NULL;
+}
+
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to written out during the writeback
+ * controlled by @wbc. Keep the book for foreign inode detection. See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+ size_t bytes)
+{
+ int id;
+
+ /*
+ * pageout() path doesn't attach @wbc to the inode being written
+ * out. This is intentional as we don't want the function to block
+ * behind a slow cgroup. Ultimately, we want pageout() to kick off
+ * regular writeback instead of writing things out itself.
+ */
+ if (!wbc->wb)
+ return;
+
+ rcu_read_lock();
+ id = mem_cgroup_css_from_page(page)->id;
+ rcu_read_unlock();
+
+ if (id == wbc->wb_id) {
+ wbc->wb_bytes += bytes;
+ return;
+ }
+
+ if (id == wbc->wb_lcand_id)
+ wbc->wb_lcand_bytes += bytes;
+
+ /* Boyer-Moore majority vote algorithm */
+ if (!wbc->wb_tcand_bytes)
+ wbc->wb_tcand_id = id;
+ if (id == wbc->wb_tcand_id)
+ wbc->wb_tcand_bytes += bytes;
+ else
+ wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
}
+EXPORT_SYMBOL_GPL(wbc_account_io);
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- bool range_cyclic, enum wb_reason reason)
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested. @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+ /*
+ * Once set, ->i_wb never becomes NULL while the inode is alive.
+ * Start transaction iff ->i_wb is visible.
+ */
+ if (inode && inode_to_wb_is_valid(inode)) {
+ struct bdi_writeback *wb;
+ bool locked, congested;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+ congested = wb_congested(wb, cong_bits);
+ unlocked_inode_to_wb_end(inode, locked);
+ return congested;
+ }
+
+ return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's. The caller must have set @work->single_wait before
+ * issuing it. This wait operates independently fo
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+ struct wb_writeback_work *work)
+{
+ if (WARN_ON_ONCE(!work->single_wait))
+ return;
+
+ wait_event(bdi->wb_waitq, work->single_done);
+
+ /*
+ * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+ * modifications to @work prior to assertion of ->single_done is
+ * visible to the caller once this function returns.
+ */
+ smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+ if (nr_pages == LONG_MAX)
+ return LONG_MAX;
+
+ /*
+ * This may be called on clean wb's and proportional distribution
+ * may not make sense, just use the original @nr_pages in those
+ * cases. In general, we wanna err on the side of writing more.
+ */
+ if (!tot_bw || this_bw >= tot_bw)
+ return nr_pages;
+ else
+ return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb. If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned. In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion. @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+ struct wb_writeback_work *base_work)
{
struct wb_writeback_work *work;
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->auto_free = 1;
+ work->single_wait = 0;
+ } else {
+ work = base_work;
+ work->auto_free = 0;
+ work->single_wait = 1;
+ }
+ work->single_done = 0;
+ wb_queue_work(wb, work);
+ return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+ struct wb_writeback_work *base_work,
+ bool skip_if_busy)
+{
+ long nr_pages = base_work->nr_pages;
+ int next_blkcg_id = 0;
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
+ might_sleep();
+
+ if (!bdi_has_dirty_io(bdi))
+ return;
+restart:
+ rcu_read_lock();
+ bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ if (!wb_has_dirty_io(wb) ||
+ (skip_if_busy && writeback_in_progress(wb)))
+ continue;
+
+ base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+ if (!wb_clone_and_queue_work(wb, base_work)) {
+ next_blkcg_id = wb->blkcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_single_work(bdi, base_work);
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+ __releases(&inode->i_lock)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ spin_unlock(&inode->i_lock);
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+ __acquires(&wb->list_lock)
+{
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ spin_lock(&wb->list_lock);
+ return wb;
+}
+
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+ return nr_pages;
+}
+
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+ struct wb_writeback_work *base_work,
+ bool skip_if_busy)
+{
+ might_sleep();
+
+ if (bdi_has_dirty_io(bdi) &&
+ (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+ base_work->auto_free = 0;
+ base_work->single_wait = 0;
+ base_work->single_done = 0;
+ wb_queue_work(&bdi->wb, base_work);
+ }
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+ bool range_cyclic, enum wb_reason reason)
+{
+ struct wb_writeback_work *work;
+
+ if (!wb_has_dirty_io(wb))
+ return;
+
/*
* This is WB_SYNC_NONE writeback, so if allocation fails just
* wakeup the thread for old dirty data writeback
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_nowork(wb->bdi);
+ wb_wakeup(wb);
return;
}
@@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
work->reason = reason;
+ work->auto_free = 1;
- bdi_queue_work(bdi, work);
+ wb_queue_work(wb, work);
}
/**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- * This does WB_SYNC_NONE opportunistic writeback. The IO is only
- * started when this function returns, we make no guarantees on
- * completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
- enum wb_reason reason)
-{
- __bdi_start_writeback(bdi, nr_pages, true, reason);
-}
-
-/**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writback to write from
*
* Description:
* This makes sure WB_SYNC_NONE background writeback happens. When
- * this function returns, it is only guaranteed that for given BDI
+ * this function returns, it is only guaranteed that for given wb
* some IO is happening if we are over background dirty threshold.
* Caller need not hold sb s_umount semaphore.
*/
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
{
/*
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(bdi);
- bdi_wakeup_thread(bdi);
+ trace_writeback_wake_background(wb->bdi);
+ wb_wakeup(wb);
}
/*
@@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
*/
void inode_wb_list_del(struct inode *inode)
{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb;
- spin_lock(&bdi->wb.list_lock);
- list_del_init(&inode->i_wb_list);
- spin_unlock(&bdi->wb.list_lock);
+ wb = inode_to_wb_and_lock_list(inode);
+ inode_wb_list_del_locked(inode, wb);
+ spin_unlock(&wb->list_lock);
}
/*
@@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode)
*/
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_wb_list, &wb->b_dirty);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
@@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, &wb->b_more_io);
+ inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
EXPIRE_DIRTY_ATIME, work);
+ if (moved)
+ wb_io_lists_populated(wb);
trace_writeback_queue_io(wb, work, moved);
}
@@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &wb->b_dirty_time);
+ inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
}
}
@@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(wbc, inode);
ret = __writeback_single_inode(inode, wbc);
+ wbc_detach_inode(wbc);
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
/*
@@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- list_del_init(&inode->i_wb_list);
+ inode_wb_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -624,7 +1386,7 @@ out:
return ret;
}
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
long pages;
@@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
pages = LONG_MAX;
else {
- pages = min(bdi->avg_write_bandwidth / 2,
- global_dirty_limit / DIRTY_SCOPE);
+ pages = min(wb->avg_write_bandwidth / 2,
+ global_wb_domain.dirty_limit / DIRTY_SCOPE);
pages = min(pages, work->nr_pages);
pages = round_down(pages + MIN_WRITEBACK_PAGES,
MIN_WRITEBACK_PAGES);
@@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb,
continue;
}
inode->i_state |= I_SYNC;
- spin_unlock(&inode->i_lock);
+ wbc_attach_and_unlock_inode(&wbc, inode);
- write_chunk = writeback_chunk_size(wb->bdi, work);
+ write_chunk = writeback_chunk_size(wb, work);
wbc.nr_to_write = write_chunk;
wbc.pages_skipped = 0;
@@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb,
*/
__writeback_single_inode(inode, &wbc);
+ wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
@@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
return nr_pages - work.nr_pages;
}
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
- unsigned long background_thresh, dirty_thresh;
-
- global_dirty_limits(&background_thresh, &dirty_thresh);
-
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) > background_thresh)
- return true;
-
- if (bdi_stat(bdi, BDI_RECLAIMABLE) >
- bdi_dirty_limit(bdi, background_thresh))
- return true;
-
- return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
- unsigned long start_time)
-{
- __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
/*
* Explicit flushing or periodic writeback of "old" data.
*
@@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb,
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
- !list_empty(&wb->bdi->work_list))
+ !list_empty(&wb->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
- if (work->for_background && !over_bground_thresh(wb->bdi))
+ if (work->for_background && !wb_over_bg_thresh(wb))
break;
/*
@@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb,
/*
* Return the next wb_writeback_work struct that hasn't been processed yet.
*/
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
struct wb_writeback_work *work = NULL;
- spin_lock_bh(&bdi->wb_lock);
- if (!list_empty(&bdi->work_list)) {
- work = list_entry(bdi->work_list.next,
+ spin_lock_bh(&wb->work_lock);
+ if (!list_empty(&wb->work_list)) {
+ work = list_entry(wb->work_list.next,
struct wb_writeback_work, list);
list_del_init(&work->list);
}
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
return work;
}
@@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void)
static long wb_check_background_flush(struct bdi_writeback *wb)
{
- if (over_bground_thresh(wb->bdi)) {
+ if (wb_over_bg_thresh(wb)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
@@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
*/
static long wb_do_writeback(struct bdi_writeback *wb)
{
- struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
- set_bit(BDI_writeback_running, &wb->bdi->state);
- while ((work = get_next_work_item(bdi)) != NULL) {
+ set_bit(WB_writeback_running, &wb->state);
+ while ((work = get_next_work_item(wb)) != NULL) {
+ struct wb_completion *done = work->done;
+ bool need_wake_up = false;
- trace_writeback_exec(bdi, work);
+ trace_writeback_exec(wb->bdi, work);
wrote += wb_writeback(wb, work);
- /*
- * Notify the caller of completion if this is a synchronous
- * work item, otherwise just free it.
- */
- if (work->done)
- complete(work->done);
- else
+ if (work->single_wait) {
+ WARN_ON_ONCE(work->auto_free);
+ /* paired w/ rmb in wb_wait_for_single_work() */
+ smp_wmb();
+ work->single_done = 1;
+ need_wake_up = true;
+ } else if (work->auto_free) {
kfree(work);
+ }
+
+ if (done && atomic_dec_and_test(&done->cnt))
+ need_wake_up = true;
+
+ if (need_wake_up)
+ wake_up_all(&wb->bdi->wb_waitq);
}
/*
@@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
- clear_bit(BDI_writeback_running, &wb->bdi->state);
+ clear_bit(WB_writeback_running, &wb->state);
return wrote;
}
@@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
* Handle writeback of dirty data for the device backed by this bdi. Also
* reschedules periodically and does kupdated style flushing.
*/
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
struct bdi_writeback, dwork);
- struct backing_dev_info *bdi = wb->bdi;
long pages_written;
- set_worker_desc("flush-%s", dev_name(bdi->dev));
+ set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
- !test_bit(BDI_registered, &bdi->state))) {
+ !test_bit(WB_registered, &wb->state))) {
/*
- * The normal path. Keep writing back @bdi until its
+ * The normal path. Keep writing back @wb until its
* work_list is empty. Note that this path is also taken
- * if @bdi is shutting down even when we're running off the
+ * if @wb is shutting down even when we're running off the
* rescuer as work_list needs to be drained.
*/
do {
pages_written = wb_do_writeback(wb);
trace_writeback_pages_written(pages_written);
- } while (!list_empty(&bdi->work_list));
+ } while (!list_empty(&wb->work_list));
} else {
/*
* bdi_wq can't get enough workers and we're running off
* the emergency worker. Don't hog it. Hopefully, 1024 is
* enough for efficient IO.
*/
- pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+ pages_written = writeback_inodes_wb(wb, 1024,
WB_REASON_FORKER_THREAD);
trace_writeback_pages_written(pages_written);
}
- if (!list_empty(&bdi->work_list))
+ if (!list_empty(&wb->work_list))
mod_delayed_work(bdi_wq, &wb->dwork, 0);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
- bdi_wakeup_thread_delayed(bdi);
+ wb_wakeup_delayed(wb);
current->flags &= ~PF_SWAPWRITE;
}
@@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
if (!bdi_has_dirty_io(bdi))
continue;
- __bdi_start_writeback(bdi, nr_pages, false, reason);
+
+ bdi_for_each_wb(wb, bdi, &iter, 0)
+ wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+ false, reason);
}
rcu_read_unlock();
}
@@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
- if (list_empty(&bdi->wb.b_dirty_time))
- continue;
- bdi_wakeup_thread(bdi);
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
+
+ bdi_for_each_wb(wb, bdi, &iter, 0)
+ if (!list_empty(&bdi->wb.b_dirty_time))
+ wb_wakeup(&bdi->wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
- struct backing_dev_info *bdi = NULL;
int dirtytime;
trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ inode_attach_wb(inode, NULL);
+
if (flags & I_DIRTY_INODE)
inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
@@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
+ struct bdi_writeback *wb;
+ struct list_head *dirty_list;
bool wakeup_bdi = false;
- bdi = inode_to_bdi(inode);
- spin_unlock(&inode->i_lock);
- spin_lock(&bdi->wb.list_lock);
- if (bdi_cap_writeback_dirty(bdi)) {
- WARN(!test_bit(BDI_registered, &bdi->state),
- "bdi-%s not registered\n", bdi->name);
+ wb = locked_inode_to_wb_and_lock_list(inode);
- /*
- * If this is the first dirty inode for this
- * bdi, we have to wake-up the corresponding
- * bdi thread to make sure background
- * write-back happens later.
- */
- if (!wb_has_dirty_io(&bdi->wb))
- wakeup_bdi = true;
- }
+ WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+ !test_bit(WB_registered, &wb->state),
+ "bdi-%s not registered\n", wb->bdi->name);
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
+
if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ dirty_list = &wb->b_dirty;
else
- list_move(&inode->i_wb_list,
- &bdi->wb.b_dirty_time);
- spin_unlock(&bdi->wb.list_lock);
+ dirty_list = &wb->b_dirty_time;
+
+ wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ dirty_list);
+
+ spin_unlock(&wb->list_lock);
trace_writeback_dirty_inode_enqueue(inode);
- if (wakeup_bdi)
- bdi_wakeup_thread_delayed(bdi);
+ /*
+ * If this is the first dirty inode for this bdi,
+ * we have to wake-up the corresponding bdi thread
+ * to make sure background write-back happens
+ * later.
+ */
+ if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+ wb_wakeup_delayed(wb);
return;
}
}
@@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+ enum wb_reason reason, bool skip_if_busy)
+{
+ DEFINE_WB_COMPLETION_ONSTACK(done);
+ struct wb_writeback_work work = {
+ .sb = sb,
+ .sync_mode = WB_SYNC_NONE,
+ .tagged_writepages = 1,
+ .done = &done,
+ .nr_pages = nr,
+ .reason = reason,
+ };
+ struct backing_dev_info *bdi = sb->s_bdi;
+
+ if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+ return;
+ WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+ bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+ wb_wait_for_completion(bdi, &done);
+}
+
/**
* writeback_inodes_sb_nr - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
unsigned long nr,
enum wb_reason reason)
{
- DECLARE_COMPLETION_ONSTACK(done);
- struct wb_writeback_work work = {
- .sb = sb,
- .sync_mode = WB_SYNC_NONE,
- .tagged_writepages = 1,
- .done = &done,
- .nr_pages = nr,
- .reason = reason,
- };
-
- if (sb->s_bdi == &noop_backing_dev_info)
- return;
- WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
@@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
* Invoke writeback_inodes_sb_nr if no writeback is currently underway.
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
- unsigned long nr,
- enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+ enum wb_reason reason)
{
- if (writeback_in_progress(sb->s_bdi))
- return 1;
-
if (!down_read_trylock(&sb->s_umount))
- return 0;
+ return false;
- writeback_inodes_sb_nr(sb, nr, reason);
+ __writeback_inodes_sb_nr(sb, nr, reason, true);
up_read(&sb->s_umount);
- return 1;
+ return true;
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
@@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
* Implement by try_to_writeback_inodes_sb_nr()
* Returns 1 if writeback was started, 0 if not.
*/
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
}
@@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
*/
void sync_inodes_sb(struct super_block *sb)
{
- DECLARE_COMPLETION_ONSTACK(done);
+ DEFINE_WB_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
.sb = sb,
.sync_mode = WB_SYNC_ALL,
@@ -1516,14 +2273,15 @@ void sync_inodes_sb(struct super_block *sb)
.reason = WB_REASON_SYNC,
.for_sync = 1,
};
+ struct backing_dev_info *bdi = sb->s_bdi;
/* Nothing to do? */
- if (sb->s_bdi == &noop_backing_dev_info)
+ if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
return;
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- bdi_queue_work(sb->s_bdi, &work);
- wait_for_completion(&done);
+ bdi_split_work_to_wbs(bdi, &work, false);
+ wb_wait_for_completion(bdi, &done);
wait_sb_inodes(sb);
}
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c98796afb..611b5408f6ec 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
void pin_remove(struct fs_pin *pin)
{
spin_lock(&pin_lock);
- hlist_del(&pin->m_list);
- hlist_del(&pin->s_list);
+ hlist_del_init(&pin->m_list);
+ hlist_del_init(&pin->s_list);
spin_unlock(&pin_lock);
spin_lock_irq(&pin->wait.lock);
pin->done = 1;
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 89acec742e0b..d403c69bee08 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache,
object_already_extant:
ret = -ENOBUFS;
- if (fscache_object_is_dead(object)) {
+ if (fscache_object_is_dying(object) ||
+ fscache_cache_is_broken(object)) {
spin_unlock(&cookie->lock);
goto error;
}
@@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
if (!op)
return -ENOMEM;
- fscache_operation_init(op, NULL, NULL);
+ fscache_operation_init(op, NULL, NULL, NULL);
op->flags = FSCACHE_OP_MYTHREAD |
(1 << FSCACHE_OP_WAITING) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
/* the work queue now carries its own ref on the object */
spin_unlock(&cookie->lock);
- ret = fscache_wait_for_operation_activation(object, op,
- NULL, NULL, NULL);
+ ret = fscache_wait_for_operation_activation(object, op, NULL, NULL);
if (ret == 0) {
/* ask the cache to honour the operation */
ret = object->cache->ops->check_consistency(op);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 7872a62ef30c..97ec45110957 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
struct fscache_operation *);
extern int fscache_submit_op(struct fscache_object *,
struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *,
- void (*)(struct fscache_operation *));
+extern int fscache_cancel_op(struct fscache_operation *, bool);
extern void fscache_cancel_all_ops(struct fscache_object *);
extern void fscache_abort_object(struct fscache_object *);
extern void fscache_start_operations(struct fscache_object *);
@@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
extern int fscache_wait_for_operation_activation(struct fscache_object *,
struct fscache_operation *,
atomic_t *,
- atomic_t *,
- void (*)(struct fscache_operation *));
+ atomic_t *);
extern void fscache_invalidate_writes(struct fscache_cookie *);
/*
@@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend;
extern atomic_t fscache_n_op_run;
extern atomic_t fscache_n_op_enqueue;
extern atomic_t fscache_n_op_deferred_release;
+extern atomic_t fscache_n_op_initialised;
extern atomic_t fscache_n_op_release;
extern atomic_t fscache_n_op_gc;
extern atomic_t fscache_n_op_cancelled;
@@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page;
extern atomic_t fscache_n_cop_uncache_page;
extern atomic_t fscache_n_cop_dissociate_pages;
+extern atomic_t fscache_n_cache_no_space_reject;
+extern atomic_t fscache_n_cache_stale_objects;
+extern atomic_t fscache_n_cache_retired_objects;
+extern atomic_t fscache_n_cache_culled_objects;
+
static inline void fscache_stat(atomic_t *stat)
{
atomic_inc(stat);
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index da032daf0e0d..9e792e30f4db 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object,
EXPORT_SYMBOL(fscache_object_init);
/*
+ * Mark the object as no longer being live, making sure that we synchronise
+ * against op submission.
+ */
+static inline void fscache_mark_object_dead(struct fscache_object *object)
+{
+ spin_lock(&object->lock);
+ clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ spin_unlock(&object->lock);
+}
+
+/*
* Abort object initialisation before we start it.
*/
static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
@@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object
object->cache->ops->lookup_complete(object);
fscache_stat_d(&fscache_n_cop_lookup_complete);
+ set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags);
+
cookie = object->cookie;
set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
@@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob
_enter("{OBJ%x,%d,%d},%d",
object->debug_id, object->n_ops, object->n_children, event);
- clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_mark_object_dead(object);
object->oob_event_mask = 0;
if (list_empty(&object->dependents) &&
@@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
if (!op)
goto nomem;
- fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+ fscache_operation_init(op, object->cache->ops->invalidate_object,
+ NULL, NULL);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
return transit_to(UPDATE_OBJECT);
nomem:
- clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_mark_object_dead(object);
fscache_unuse_cookie(object);
_leave(" [ENOMEM]");
return transit_to(KILL_OBJECT);
submit_op_failed:
- clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+ fscache_mark_object_dead(object);
spin_unlock(&cookie->lock);
fscache_unuse_cookie(object);
kfree(op);
@@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object *
_leave("");
return transit_to(WAIT_FOR_CMD);
}
+
+/**
+ * fscache_object_retrying_stale - Note retrying stale object
+ * @object: The object that will be retried
+ *
+ * Note that an object lookup found an on-disk object that was adjudged to be
+ * stale and has been deleted. The lookup will be retried.
+ */
+void fscache_object_retrying_stale(struct fscache_object *object)
+{
+ fscache_stat(&fscache_n_cache_no_space_reject);
+}
+EXPORT_SYMBOL(fscache_object_retrying_stale);
+
+/**
+ * fscache_object_mark_killed - Note that an object was killed
+ * @object: The object that was culled
+ * @why: The reason the object was killed.
+ *
+ * Note that an object was killed. Returns true if the object was
+ * already marked killed, false if it wasn't.
+ */
+void fscache_object_mark_killed(struct fscache_object *object,
+ enum fscache_why_object_killed why)
+{
+ if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) {
+ pr_err("Error: Object already killed by cache [%s]\n",
+ object->cache->identifier);
+ return;
+ }
+
+ switch (why) {
+ case FSCACHE_OBJECT_NO_SPACE:
+ fscache_stat(&fscache_n_cache_no_space_reject);
+ break;
+ case FSCACHE_OBJECT_IS_STALE:
+ fscache_stat(&fscache_n_cache_stale_objects);
+ break;
+ case FSCACHE_OBJECT_WAS_RETIRED:
+ fscache_stat(&fscache_n_cache_retired_objects);
+ break;
+ case FSCACHE_OBJECT_WAS_CULLED:
+ fscache_stat(&fscache_n_cache_culled_objects);
+ break;
+ }
+}
+EXPORT_SYMBOL(fscache_object_mark_killed);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7b87a0e5185..de67745e1cd7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,35 @@
atomic_t fscache_op_debug_id;
EXPORT_SYMBOL(fscache_op_debug_id);
+static void fscache_operation_dummy_cancel(struct fscache_operation *op)
+{
+}
+
+/**
+ * fscache_operation_init - Do basic initialisation of an operation
+ * @op: The operation to initialise
+ * @release: The release function to assign
+ *
+ * Do basic initialisation of an operation. The caller must still set flags,
+ * object and processor if needed.
+ */
+void fscache_operation_init(struct fscache_operation *op,
+ fscache_operation_processor_t processor,
+ fscache_operation_cancel_t cancel,
+ fscache_operation_release_t release)
+{
+ INIT_WORK(&op->work, fscache_op_work_func);
+ atomic_set(&op->usage, 1);
+ op->state = FSCACHE_OP_ST_INITIALISED;
+ op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+ op->processor = processor;
+ op->cancel = cancel ?: fscache_operation_dummy_cancel;
+ op->release = release;
+ INIT_LIST_HEAD(&op->pend_link);
+ fscache_stat(&fscache_n_op_initialised);
+}
+EXPORT_SYMBOL(fscache_operation_init);
+
/**
* fscache_enqueue_operation - Enqueue an operation for processing
* @op: The operation to enqueue
@@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object,
}
/*
+ * report an unexpected submission
+ */
+static void fscache_report_unexpected_submission(struct fscache_object *object,
+ struct fscache_operation *op,
+ const struct fscache_state *ostate)
+{
+ static bool once_only;
+ struct fscache_operation *p;
+ unsigned n;
+
+ if (once_only)
+ return;
+ once_only = true;
+
+ kdebug("unexpected submission OP%x [OBJ%x %s]",
+ op->debug_id, object->debug_id, object->state->name);
+ kdebug("objstate=%s [%s]", object->state->name, ostate->name);
+ kdebug("objflags=%lx", object->flags);
+ kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
+ kdebug("ops=%u inp=%u exc=%u",
+ object->n_ops, object->n_in_progress, object->n_exclusive);
+
+ if (!list_empty(&object->pending_ops)) {
+ n = 0;
+ list_for_each_entry(p, &object->pending_ops, pend_link) {
+ ASSERTCMP(p->object, ==, object);
+ kdebug("%p %p", op->processor, op->release);
+ n++;
+ }
+
+ kdebug("n=%u", n);
+ }
+
+ dump_stack();
+}
+
+/*
* submit an exclusive operation for an object
* - other ops are excluded from running simultaneously with this one
* - this gets any extra refs it needs on an op
@@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object,
int fscache_submit_exclusive_op(struct fscache_object *object,
struct fscache_operation *op)
{
+ const struct fscache_state *ostate;
+ unsigned long flags;
int ret;
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
@@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
ASSERT(list_empty(&op->pend_link));
+ ostate = object->state;
+ smp_rmb();
+
op->state = FSCACHE_OP_ST_PENDING;
- if (fscache_object_is_active(object)) {
+ flags = READ_ONCE(object->flags);
+ if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+ fscache_stat(&fscache_n_op_rejected);
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -ENOBUFS;
+ } else if (unlikely(fscache_cache_is_broken(object))) {
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -EIO;
+ } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
/* need to issue a new write op after this */
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
ret = 0;
- } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+ } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
@@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
+ } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -ENOBUFS;
} else {
- /* If we're in any other state, there must have been an I/O
- * error of some nature.
- */
- ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
- ret = -EIO;
+ fscache_report_unexpected_submission(object, op, ostate);
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -ENOBUFS;
}
spin_unlock(&object->lock);
@@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
}
/*
- * report an unexpected submission
- */
-static void fscache_report_unexpected_submission(struct fscache_object *object,
- struct fscache_operation *op,
- const struct fscache_state *ostate)
-{
- static bool once_only;
- struct fscache_operation *p;
- unsigned n;
-
- if (once_only)
- return;
- once_only = true;
-
- kdebug("unexpected submission OP%x [OBJ%x %s]",
- op->debug_id, object->debug_id, object->state->name);
- kdebug("objstate=%s [%s]", object->state->name, ostate->name);
- kdebug("objflags=%lx", object->flags);
- kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
- kdebug("ops=%u inp=%u exc=%u",
- object->n_ops, object->n_in_progress, object->n_exclusive);
-
- if (!list_empty(&object->pending_ops)) {
- n = 0;
- list_for_each_entry(p, &object->pending_ops, pend_link) {
- ASSERTCMP(p->object, ==, object);
- kdebug("%p %p", op->processor, op->release);
- n++;
- }
-
- kdebug("n=%u", n);
- }
-
- dump_stack();
-}
-
-/*
* submit an operation for an object
* - objects may be submitted only in the following states:
* - during object creation (write ops may be submitted)
@@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object,
struct fscache_operation *op)
{
const struct fscache_state *ostate;
+ unsigned long flags;
int ret;
_enter("{OBJ%x OP%x},{%u}",
@@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object,
smp_rmb();
op->state = FSCACHE_OP_ST_PENDING;
- if (fscache_object_is_active(object)) {
+ flags = READ_ONCE(object->flags);
+ if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+ fscache_stat(&fscache_n_op_rejected);
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -ENOBUFS;
+ } else if (unlikely(fscache_cache_is_broken(object))) {
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ ret = -EIO;
+ } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
op->object = object;
object->n_ops++;
@@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object,
fscache_run_op(object, op);
}
ret = 0;
- } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+ } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
op->object = object;
object->n_ops++;
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
ret = 0;
- } else if (fscache_object_is_dying(object)) {
- fscache_stat(&fscache_n_op_rejected);
+ } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+ op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
- } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+ } else {
fscache_report_unexpected_submission(object, op, ostate);
ASSERT(!fscache_object_is_active(object));
- op->state = FSCACHE_OP_ST_CANCELLED;
- ret = -ENOBUFS;
- } else {
+ op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
}
@@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object)
* cancel an operation that's pending on an object
*/
int fscache_cancel_op(struct fscache_operation *op,
- void (*do_cancel)(struct fscache_operation *))
+ bool cancel_in_progress_op)
{
struct fscache_object *object = op->object;
+ bool put = false;
int ret;
_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
@@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op,
ret = -EBUSY;
if (op->state == FSCACHE_OP_ST_PENDING) {
ASSERT(!list_empty(&op->pend_link));
- fscache_stat(&fscache_n_op_cancelled);
list_del_init(&op->pend_link);
- if (do_cancel)
- do_cancel(op);
+ put = true;
+
+ fscache_stat(&fscache_n_op_cancelled);
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ ret = 0;
+ } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) {
+ ASSERTCMP(object->n_in_progress, >, 0);
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ object->n_in_progress--;
+ if (object->n_in_progress == 0)
+ fscache_start_operations(object);
+
+ fscache_stat(&fscache_n_op_cancelled);
+ op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
object->n_exclusive--;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
- fscache_put_operation(op);
ret = 0;
}
+ if (put)
+ fscache_put_operation(op);
spin_unlock(&object->lock);
_leave(" = %d", ret);
return ret;
@@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object)
list_del_init(&op->pend_link);
ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+ op->cancel(op);
op->state = FSCACHE_OP_ST_CANCELLED;
if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
@@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
spin_lock(&object->lock);
- op->state = cancelled ?
- FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+ if (!cancelled) {
+ op->state = FSCACHE_OP_ST_COMPLETE;
+ } else {
+ op->cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+ }
if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
object->n_exclusive--;
@@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op)
return;
_debug("PUT OP");
- ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
+ ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
+ op->state != FSCACHE_OP_ST_COMPLETE,
op->state, ==, FSCACHE_OP_ST_CANCELLED);
- op->state = FSCACHE_OP_ST_DEAD;
fscache_stat(&fscache_n_op_release);
@@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op)
op->release(op);
op->release = NULL;
}
+ op->state = FSCACHE_OP_ST_DEAD;
object = op->object;
+ if (likely(object)) {
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
+ atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
+ fscache_unuse_cookie(object);
+
+ /* now... we may get called with the object spinlock held, so we
+ * complete the cleanup here only if we can immediately acquire the
+ * lock, and defer it otherwise */
+ if (!spin_trylock(&object->lock)) {
+ _debug("defer put");
+ fscache_stat(&fscache_n_op_deferred_release);
+
+ cache = object->cache;
+ spin_lock(&cache->op_gc_list_lock);
+ list_add_tail(&op->pend_link, &cache->op_gc_list);
+ spin_unlock(&cache->op_gc_list_lock);
+ schedule_work(&cache->op_gc);
+ _leave(" [defer]");
+ return;
+ }
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
- atomic_dec(&object->n_reads);
- if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
- fscache_unuse_cookie(object);
-
- /* now... we may get called with the object spinlock held, so we
- * complete the cleanup here only if we can immediately acquire the
- * lock, and defer it otherwise */
- if (!spin_trylock(&object->lock)) {
- _debug("defer put");
- fscache_stat(&fscache_n_op_deferred_release);
+ ASSERTCMP(object->n_ops, >, 0);
+ object->n_ops--;
+ if (object->n_ops == 0)
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
- cache = object->cache;
- spin_lock(&cache->op_gc_list_lock);
- list_add_tail(&op->pend_link, &cache->op_gc_list);
- spin_unlock(&cache->op_gc_list_lock);
- schedule_work(&cache->op_gc);
- _leave(" [defer]");
- return;
+ spin_unlock(&object->lock);
}
- ASSERTCMP(object->n_ops, >, 0);
- object->n_ops--;
- if (object->n_ops == 0)
- fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
-
- spin_unlock(&object->lock);
-
kfree(op);
_leave(" [done]");
}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index de33b3fccca6..483bbc613bf0 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
return -ENOMEM;
}
- fscache_operation_init(op, fscache_attr_changed_op, NULL);
+ fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
op->flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_EXCLUSIVE) |
(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -239,7 +239,7 @@ nobufs_dec:
wake_cookie = __fscache_unuse_cookie(cookie);
nobufs:
spin_unlock(&cookie->lock);
- kfree(op);
+ fscache_put_operation(op);
if (wake_cookie)
__fscache_wake_unused_cookie(cookie);
fscache_stat(&fscache_n_attr_changed_nobufs);
@@ -249,6 +249,17 @@ nobufs:
EXPORT_SYMBOL(__fscache_attr_changed);
/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+ struct fscache_retrieval *op =
+ container_of(_op, struct fscache_retrieval, op);
+
+ atomic_set(&op->n_pages, 0);
+}
+
+/*
* release a retrieval op reference
*/
static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
- ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
+ ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
+ atomic_read(&op->n_pages), ==, 0);
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
- fscache_put_context(op->op.object->cookie, op->context);
+ fscache_put_context(op->cookie, op->context);
_leave("");
}
@@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
return NULL;
}
- fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
+ fscache_operation_init(&op->op, NULL,
+ fscache_do_cancel_retrieval,
+ fscache_release_retrieval_op);
op->op.flags = FSCACHE_OP_MYTHREAD |
(1UL << FSCACHE_OP_WAITING) |
(1UL << FSCACHE_OP_UNUSE_COOKIE);
+ op->cookie = cookie;
op->mapping = mapping;
op->end_io_func = end_io_func;
op->context = context;
op->start_time = jiffies;
INIT_LIST_HEAD(&op->to_do);
+
+ /* Pin the netfs read context in case we need to do the actual netfs
+ * read because we've encountered a cache read failure.
+ */
+ if (context)
+ fscache_get_context(op->cookie, context);
return op;
}
@@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
}
/*
- * Handle cancellation of a pending retrieval op
- */
-static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
-{
- struct fscache_retrieval *op =
- container_of(_op, struct fscache_retrieval, op);
-
- atomic_set(&op->n_pages, 0);
-}
-
-/*
* wait for an object to become active (or dead)
*/
int fscache_wait_for_operation_activation(struct fscache_object *object,
struct fscache_operation *op,
atomic_t *stat_op_waits,
- atomic_t *stat_object_dead,
- void (*do_cancel)(struct fscache_operation *))
+ atomic_t *stat_object_dead)
{
int ret;
@@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
fscache_stat(stat_op_waits);
if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
TASK_INTERRUPTIBLE) != 0) {
- ret = fscache_cancel_op(op, do_cancel);
+ ret = fscache_cancel_op(op, false);
if (ret == 0)
return -ERESTARTSYS;
@@ -377,11 +386,13 @@ check_if_dead:
_leave(" = -ENOBUFS [cancelled]");
return -ENOBUFS;
}
- if (unlikely(fscache_object_is_dead(object))) {
- pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
- fscache_cancel_op(op, do_cancel);
+ if (unlikely(fscache_object_is_dying(object) ||
+ fscache_cache_is_broken(object))) {
+ enum fscache_operation_state state = op->state;
+ fscache_cancel_op(op, true);
if (stat_object_dead)
fscache_stat(stat_object_dead);
+ _leave(" = -ENOBUFS [obj dead %d]", state);
return -ENOBUFS;
}
return 0;
@@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_retrieval_ops);
- /* pin the netfs read context in case we need to do the actual netfs
- * read because we've encountered a cache read failure */
- fscache_get_context(object->cookie, op->context);
-
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
ret = fscache_wait_for_operation_activation(
object, &op->op,
__fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead),
- fscache_do_cancel_retrieval);
+ __fscache_stat(&fscache_n_retrievals_object_dead));
if (ret < 0)
goto error;
@@ -503,7 +509,7 @@ nobufs_unlock:
spin_unlock(&cookie->lock);
if (wake_cookie)
__fscache_wake_unused_cookie(cookie);
- kfree(op);
+ fscache_put_retrieval(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
_leave(" = -ENOBUFS");
@@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_retrieval_ops);
- /* pin the netfs read context in case we need to do the actual netfs
- * read because we've encountered a cache read failure */
- fscache_get_context(object->cookie, op->context);
-
/* we wait for the operation to become active, and then process it
* *here*, in this thread, and not in the thread pool */
ret = fscache_wait_for_operation_activation(
object, &op->op,
__fscache_stat(&fscache_n_retrieval_op_waits),
- __fscache_stat(&fscache_n_retrievals_object_dead),
- fscache_do_cancel_retrieval);
+ __fscache_stat(&fscache_n_retrievals_object_dead));
if (ret < 0)
goto error;
@@ -632,7 +633,7 @@ nobufs_unlock_dec:
wake_cookie = __fscache_unuse_cookie(cookie);
nobufs_unlock:
spin_unlock(&cookie->lock);
- kfree(op);
+ fscache_put_retrieval(op);
if (wake_cookie)
__fscache_wake_unused_cookie(cookie);
nobufs:
@@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
ret = fscache_wait_for_operation_activation(
object, &op->op,
__fscache_stat(&fscache_n_alloc_op_waits),
- __fscache_stat(&fscache_n_allocs_object_dead),
- fscache_do_cancel_retrieval);
+ __fscache_stat(&fscache_n_allocs_object_dead));
if (ret < 0)
goto error;
@@ -726,7 +726,7 @@ nobufs_unlock_dec:
wake_cookie = __fscache_unuse_cookie(cookie);
nobufs_unlock:
spin_unlock(&cookie->lock);
- kfree(op);
+ fscache_put_retrieval(op);
if (wake_cookie)
__fscache_wake_unused_cookie(cookie);
nobufs:
@@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
if (!op)
goto nomem;
- fscache_operation_init(&op->op, fscache_write_op,
+ fscache_operation_init(&op->op, fscache_write_op, NULL,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC |
(1 << FSCACHE_OP_WAITING) |
@@ -1016,7 +1016,7 @@ already_pending:
spin_unlock(&object->lock);
spin_unlock(&cookie->lock);
radix_tree_preload_end();
- kfree(op);
+ fscache_put_operation(&op->op);
fscache_stat(&fscache_n_stores_ok);
_leave(" = 0");
return 0;
@@ -1036,7 +1036,7 @@ nobufs_unlock_obj:
nobufs:
spin_unlock(&cookie->lock);
radix_tree_preload_end();
- kfree(op);
+ fscache_put_operation(&op->op);
if (wake_cookie)
__fscache_wake_unused_cookie(cookie);
fscache_stat(&fscache_n_stores_nobufs);
@@ -1044,7 +1044,7 @@ nobufs:
return -ENOBUFS;
nomem_free:
- kfree(op);
+ fscache_put_operation(&op->op);
nomem:
fscache_stat(&fscache_n_stores_oom);
_leave(" = -ENOMEM");
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 40d13c70ef51..7cfa0aacdf6d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -23,6 +23,7 @@ atomic_t fscache_n_op_run;
atomic_t fscache_n_op_enqueue;
atomic_t fscache_n_op_requeue;
atomic_t fscache_n_op_deferred_release;
+atomic_t fscache_n_op_initialised;
atomic_t fscache_n_op_release;
atomic_t fscache_n_op_gc;
atomic_t fscache_n_op_cancelled;
@@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page;
atomic_t fscache_n_cop_uncache_page;
atomic_t fscache_n_cop_dissociate_pages;
+atomic_t fscache_n_cache_no_space_reject;
+atomic_t fscache_n_cache_stale_objects;
+atomic_t fscache_n_cache_retired_objects;
+atomic_t fscache_n_cache_culled_objects;
+
/*
* display the general statistics
*/
@@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_op_enqueue),
atomic_read(&fscache_n_op_cancelled),
atomic_read(&fscache_n_op_rejected));
- seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
+ seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n",
+ atomic_read(&fscache_n_op_initialised),
atomic_read(&fscache_n_op_deferred_release),
atomic_read(&fscache_n_op_release),
atomic_read(&fscache_n_op_gc));
@@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_cop_write_page),
atomic_read(&fscache_n_cop_uncache_page),
atomic_read(&fscache_n_cop_dissociate_pages));
+ seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n",
+ atomic_read(&fscache_n_cache_no_space_reject),
+ atomic_read(&fscache_n_cache_stale_objects),
+ atomic_read(&fscache_n_cache_retired_objects),
+ atomic_read(&fscache_n_cache_culled_objects));
return 0;
}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 205e0d5d5307..f863ac6647ac 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -244,7 +244,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
return 0;
parent = fuse_control_sb->s_root;
- inc_nlink(parent->d_inode);
+ inc_nlink(d_inode(parent));
sprintf(name, "%u", fc->dev);
parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
&simple_dir_inode_operations,
@@ -283,11 +283,11 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
struct dentry *dentry = fc->ctl_dentry[i];
- dentry->d_inode->i_private = NULL;
+ d_inode(dentry)->i_private = NULL;
d_drop(dentry);
dput(dentry);
}
- drop_nlink(fuse_control_sb->s_root->d_inode);
+ drop_nlink(d_inode(fuse_control_sb->s_root));
}
static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 28d0c7abba1c..eae2c11268bc 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,7 +38,6 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
@@ -48,6 +47,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/module.h>
+#include <linux/uio.h>
#include "fuse_i.h"
@@ -88,32 +88,23 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
* FUSE file.
*/
-static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
+static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
{
+ struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
loff_t pos = 0;
- struct iovec iov = { .iov_base = buf, .iov_len = count };
- struct fuse_io_priv io = { .async = 0, .file = file };
- struct iov_iter ii;
- iov_iter_init(&ii, READ, &iov, 1, count);
- return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE);
+ return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
}
-static ssize_t cuse_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
{
+ struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
loff_t pos = 0;
- struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
- struct fuse_io_priv io = { .async = 0, .file = file };
- struct iov_iter ii;
- iov_iter_init(&ii, WRITE, &iov, 1, count);
-
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(&io, &ii, &pos,
+ return fuse_direct_io(&io, from, &pos,
FUSE_DIO_WRITE | FUSE_DIO_CUSE);
}
@@ -186,8 +177,8 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations cuse_frontend_fops = {
.owner = THIS_MODULE,
- .read = cuse_read,
- .write = cuse_write,
+ .read_iter = cuse_read_iter,
+ .write_iter = cuse_write_iter,
.open = cuse_open,
.release = cuse_release,
.unlocked_ioctl = cuse_file_ioctl,
@@ -498,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
*/
static int cuse_channel_open(struct inode *inode, struct file *file)
{
+ struct fuse_dev *fud;
struct cuse_conn *cc;
int rc;
@@ -508,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
fuse_conn_init(&cc->fc);
+ fud = fuse_dev_alloc(&cc->fc);
+ if (!fud) {
+ kfree(cc);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&cc->list);
cc->fc.release = cuse_fc_release;
- cc->fc.connected = 1;
cc->fc.initialized = 1;
rc = cuse_send_init(cc);
if (rc) {
- fuse_conn_put(&cc->fc);
+ fuse_dev_free(fud);
return rc;
}
- file->private_data = &cc->fc; /* channel owns base reference to cc */
+ file->private_data = fud;
return 0;
}
@@ -536,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
*/
static int cuse_channel_release(struct inode *inode, struct file *file)
{
- struct cuse_conn *cc = fc_to_cc(file->private_data);
+ struct fuse_dev *fud = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(fud->fc);
int rc;
/* remove from the conntbl, no more access from this point on */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 39706c57ad3c..80cc1b35d460 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,20 +19,19 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
-#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
static struct kmem_cache *fuse_req_cachep;
-static struct fuse_conn *fuse_get_conn(struct file *file)
+static struct fuse_dev *fuse_get_dev(struct file *file)
{
/*
* Lockless access is OK, because file->private data is set
* once during mount and is valid until the file is released.
*/
- return file->private_data;
+ return ACCESS_ONCE(file->private_data);
}
static void fuse_request_init(struct fuse_req *req, struct page **pages,
@@ -49,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages,
req->pages = pages;
req->page_descs = page_descs;
req->max_pages = npages;
+ __set_bit(FR_PENDING, &req->flags);
}
static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
@@ -169,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
if (!fc->connected)
goto out;
+ err = -ECONNREFUSED;
+ if (fc->conn_error)
+ goto out;
+
req = fuse_request_alloc(npages);
err = -ENOMEM;
if (!req) {
@@ -178,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
}
fuse_req_init_context(req);
- req->waiting = 1;
- req->background = for_background;
+ __set_bit(FR_WAITING, &req->flags);
+ if (for_background)
+ __set_bit(FR_BACKGROUND, &req->flags);
+
return req;
out:
@@ -269,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
req = get_reserved_req(fc, file);
fuse_req_init_context(req);
- req->waiting = 1;
- req->background = 0;
+ __set_bit(FR_WAITING, &req->flags);
+ __clear_bit(FR_BACKGROUND, &req->flags);
return req;
}
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
if (atomic_dec_and_test(&req->count)) {
- if (unlikely(req->background)) {
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
/*
* We get here in the unlikely case that a background
* request was allocated but not sent
@@ -288,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
spin_unlock(&fc->lock);
}
- if (req->waiting)
+ if (test_bit(FR_WAITING, &req->flags)) {
+ __clear_bit(FR_WAITING, &req->flags);
atomic_dec(&fc->num_waiting);
+ }
if (req->stolen_file)
put_reserved_req(fc, req);
@@ -310,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
return nbytes;
}
-static u64 fuse_get_unique(struct fuse_conn *fc)
+static u64 fuse_get_unique(struct fuse_iqueue *fiq)
{
- fc->reqctr++;
- /* zero is special */
- if (fc->reqctr == 0)
- fc->reqctr = 1;
-
- return fc->reqctr;
+ return ++fiq->reqctr;
}
-static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req)
{
req->in.h.len = sizeof(struct fuse_in_header) +
len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
- list_add_tail(&req->list, &fc->pending);
- req->state = FUSE_REQ_PENDING;
- if (!req->waiting) {
- req->waiting = 1;
- atomic_inc(&fc->num_waiting);
- }
- wake_up(&fc->waitq);
- kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ list_add_tail(&req->list, &fiq->pending);
+ wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
}
void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
u64 nodeid, u64 nlookup)
{
+ struct fuse_iqueue *fiq = &fc->iq;
+
forget->forget_one.nodeid = nodeid;
forget->forget_one.nlookup = nlookup;
- spin_lock(&fc->lock);
- if (fc->connected) {
- fc->forget_list_tail->next = forget;
- fc->forget_list_tail = forget;
- wake_up(&fc->waitq);
- kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ spin_lock(&fiq->waitq.lock);
+ if (fiq->connected) {
+ fiq->forget_list_tail->next = forget;
+ fiq->forget_list_tail = forget;
+ wake_up_locked(&fiq->waitq);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
} else {
kfree(forget);
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
}
static void flush_bg_queue(struct fuse_conn *fc)
@@ -357,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc)
while (fc->active_background < fc->max_background &&
!list_empty(&fc->bg_queue)) {
struct fuse_req *req;
+ struct fuse_iqueue *fiq = &fc->iq;
req = list_entry(fc->bg_queue.next, struct fuse_req, list);
list_del(&req->list);
fc->active_background++;
- req->in.h.unique = fuse_get_unique(fc);
- queue_request(fc, req);
+ spin_lock(&fiq->waitq.lock);
+ req->in.h.unique = fuse_get_unique(fiq);
+ queue_request(fiq, req);
+ spin_unlock(&fiq->waitq.lock);
}
}
@@ -373,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc)
* was closed. The requester thread is woken up (if still waiting),
* the 'end' callback is called if given, else the reference to the
* request is released
- *
- * Called with fc->lock, unlocks it
*/
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
{
- void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
- req->end = NULL;
- list_del(&req->list);
- list_del(&req->intr_entry);
- req->state = FUSE_REQ_FINISHED;
- if (req->background) {
- req->background = 0;
+ struct fuse_iqueue *fiq = &fc->iq;
+ if (test_and_set_bit(FR_FINISHED, &req->flags))
+ return;
+
+ spin_lock(&fiq->waitq.lock);
+ list_del_init(&req->intr_entry);
+ spin_unlock(&fiq->waitq.lock);
+ WARN_ON(test_bit(FR_PENDING, &req->flags));
+ WARN_ON(test_bit(FR_SENT, &req->flags));
+ if (test_bit(FR_BACKGROUND, &req->flags)) {
+ spin_lock(&fc->lock);
+ clear_bit(FR_BACKGROUND, &req->flags);
if (fc->num_background == fc->max_background)
fc->blocked = 0;
@@ -402,122 +407,105 @@ __releases(fc->lock)
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
+ spin_unlock(&fc->lock);
}
- spin_unlock(&fc->lock);
wake_up(&req->waitq);
- if (end)
- end(fc, req);
+ if (req->end)
+ req->end(fc, req);
fuse_put_request(fc, req);
}
-static void wait_answer_interruptible(struct fuse_conn *fc,
- struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
- if (signal_pending(current))
- return;
-
- spin_unlock(&fc->lock);
- wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
- spin_lock(&fc->lock);
-}
-
-static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
{
- list_add_tail(&req->intr_entry, &fc->interrupts);
- wake_up(&fc->waitq);
- kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ spin_lock(&fiq->waitq.lock);
+ if (list_empty(&req->intr_entry)) {
+ list_add_tail(&req->intr_entry, &fiq->interrupts);
+ wake_up_locked(&fiq->waitq);
+ }
+ spin_unlock(&fiq->waitq.lock);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
}
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-__releases(fc->lock)
-__acquires(fc->lock)
{
+ struct fuse_iqueue *fiq = &fc->iq;
+ int err;
+
if (!fc->no_interrupt) {
/* Any signal may interrupt this */
- wait_answer_interruptible(fc, req);
-
- if (req->aborted)
- goto aborted;
- if (req->state == FUSE_REQ_FINISHED)
+ err = wait_event_interruptible(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
+ if (!err)
return;
- req->interrupted = 1;
- if (req->state == FUSE_REQ_SENT)
- queue_interrupt(fc, req);
+ set_bit(FR_INTERRUPTED, &req->flags);
+ /* matches barrier in fuse_dev_do_read() */
+ smp_mb__after_atomic();
+ if (test_bit(FR_SENT, &req->flags))
+ queue_interrupt(fiq, req);
}
- if (!req->force) {
+ if (!test_bit(FR_FORCE, &req->flags)) {
sigset_t oldset;
/* Only fatal signals may interrupt this */
block_sigs(&oldset);
- wait_answer_interruptible(fc, req);
+ err = wait_event_interruptible(req->waitq,
+ test_bit(FR_FINISHED, &req->flags));
restore_sigs(&oldset);
- if (req->aborted)
- goto aborted;
- if (req->state == FUSE_REQ_FINISHED)
+ if (!err)
return;
+ spin_lock(&fiq->waitq.lock);
/* Request is not yet in userspace, bail out */
- if (req->state == FUSE_REQ_PENDING) {
+ if (test_bit(FR_PENDING, &req->flags)) {
list_del(&req->list);
+ spin_unlock(&fiq->waitq.lock);
__fuse_put_request(req);
req->out.h.error = -EINTR;
return;
}
+ spin_unlock(&fiq->waitq.lock);
}
/*
* Either request is already in userspace, or it was forced.
* Wait it out.
*/
- spin_unlock(&fc->lock);
- wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
- spin_lock(&fc->lock);
-
- if (!req->aborted)
- return;
-
- aborted:
- BUG_ON(req->state != FUSE_REQ_FINISHED);
- if (req->locked) {
- /* This is uninterruptible sleep, because data is
- being copied to/from the buffers of req. During
- locked state, there mustn't be any filesystem
- operation (e.g. page fault), since that could lead
- to deadlock */
- spin_unlock(&fc->lock);
- wait_event(req->waitq, !req->locked);
- spin_lock(&fc->lock);
- }
+ wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
}
static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
- BUG_ON(req->background);
- spin_lock(&fc->lock);
- if (!fc->connected)
+ struct fuse_iqueue *fiq = &fc->iq;
+
+ BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
+ spin_lock(&fiq->waitq.lock);
+ if (!fiq->connected) {
+ spin_unlock(&fiq->waitq.lock);
req->out.h.error = -ENOTCONN;
- else if (fc->conn_error)
- req->out.h.error = -ECONNREFUSED;
- else {
- req->in.h.unique = fuse_get_unique(fc);
- queue_request(fc, req);
+ } else {
+ req->in.h.unique = fuse_get_unique(fiq);
+ queue_request(fiq, req);
/* acquire extra reference, since request is still needed
after request_end() */
__fuse_get_request(req);
+ spin_unlock(&fiq->waitq.lock);
request_wait_answer(fc, req);
+ /* Pairs with smp_wmb() in request_end() */
+ smp_rmb();
}
- spin_unlock(&fc->lock);
}
void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
{
- req->isreply = 1;
+ __set_bit(FR_ISREPLY, &req->flags);
+ if (!test_bit(FR_WAITING, &req->flags)) {
+ __set_bit(FR_WAITING, &req->flags);
+ atomic_inc(&fc->num_waiting);
+ }
__fuse_request_send(fc, req);
}
EXPORT_SYMBOL_GPL(fuse_request_send);
@@ -587,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
return ret;
}
-static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
- struct fuse_req *req)
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
{
- BUG_ON(!req->background);
+ BUG_ON(!test_bit(FR_BACKGROUND, &req->flags));
+ if (!test_bit(FR_WAITING, &req->flags)) {
+ __set_bit(FR_WAITING, &req->flags);
+ atomic_inc(&fc->num_waiting);
+ }
+ __set_bit(FR_ISREPLY, &req->flags);
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
@@ -603,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
flush_bg_queue(fc);
}
-static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
+ BUG_ON(!req->end);
spin_lock(&fc->lock);
if (fc->connected) {
- fuse_request_send_nowait_locked(fc, req);
+ fuse_request_send_background_locked(fc, req);
spin_unlock(&fc->lock);
} else {
+ spin_unlock(&fc->lock);
req->out.h.error = -ENOTCONN;
- request_end(fc, req);
+ req->end(fc, req);
+ fuse_put_request(fc, req);
}
}
-
-void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
-{
- req->isreply = 1;
- fuse_request_send_nowait(fc, req);
-}
EXPORT_SYMBOL_GPL(fuse_request_send_background);
static int fuse_request_send_notify_reply(struct fuse_conn *fc,
struct fuse_req *req, u64 unique)
{
int err = -ENODEV;
+ struct fuse_iqueue *fiq = &fc->iq;
- req->isreply = 0;
+ __clear_bit(FR_ISREPLY, &req->flags);
req->in.h.unique = unique;
- spin_lock(&fc->lock);
- if (fc->connected) {
- queue_request(fc, req);
+ spin_lock(&fiq->waitq.lock);
+ if (fiq->connected) {
+ queue_request(fiq, req);
err = 0;
}
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
return err;
}
-/*
- * Called under fc->lock
- *
- * fc->connected must have been checked previously
- */
-void fuse_request_send_background_locked(struct fuse_conn *fc,
- struct fuse_req *req)
-{
- req->isreply = 1;
- fuse_request_send_nowait_locked(fc, req);
-}
-
void fuse_force_forget(struct file *file, u64 nodeid)
{
struct inode *inode = file_inode(file);
@@ -666,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid)
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- req->isreply = 0;
+ __clear_bit(FR_ISREPLY, &req->flags);
__fuse_request_send(fc, req);
/* ignore errors */
fuse_put_request(fc, req);
@@ -677,62 +661,58 @@ void fuse_force_forget(struct file *file, u64 nodeid)
* anything that could cause a page-fault. If the request was already
* aborted bail out.
*/
-static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int lock_request(struct fuse_req *req)
{
int err = 0;
if (req) {
- spin_lock(&fc->lock);
- if (req->aborted)
+ spin_lock(&req->waitq.lock);
+ if (test_bit(FR_ABORTED, &req->flags))
err = -ENOENT;
else
- req->locked = 1;
- spin_unlock(&fc->lock);
+ set_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&req->waitq.lock);
}
return err;
}
/*
- * Unlock request. If it was aborted during being locked, the
- * requester thread is currently waiting for it to be unlocked, so
- * wake it up.
+ * Unlock request. If it was aborted while locked, caller is responsible
+ * for unlocking and ending the request.
*/
-static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+static int unlock_request(struct fuse_req *req)
{
+ int err = 0;
if (req) {
- spin_lock(&fc->lock);
- req->locked = 0;
- if (req->aborted)
- wake_up(&req->waitq);
- spin_unlock(&fc->lock);
+ spin_lock(&req->waitq.lock);
+ if (test_bit(FR_ABORTED, &req->flags))
+ err = -ENOENT;
+ else
+ clear_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&req->waitq.lock);
}
+ return err;
}
struct fuse_copy_state {
- struct fuse_conn *fc;
int write;
struct fuse_req *req;
- const struct iovec *iov;
+ struct iov_iter *iter;
struct pipe_buffer *pipebufs;
struct pipe_buffer *currbuf;
struct pipe_inode_info *pipe;
unsigned long nr_segs;
- unsigned long seglen;
- unsigned long addr;
struct page *pg;
unsigned len;
unsigned offset;
unsigned move_pages:1;
};
-static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
- int write,
- const struct iovec *iov, unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
- cs->fc = fc;
cs->write = write;
- cs->iov = iov;
- cs->nr_segs = nr_segs;
+ cs->iter = iter;
}
/* Unmap and put previous page of userspace buffer */
@@ -763,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
struct page *page;
int err;
- unlock_request(cs->fc, cs->req);
+ err = unlock_request(cs->req);
+ if (err)
+ return err;
+
fuse_copy_finish(cs);
if (cs->pipebufs) {
struct pipe_buffer *buf = cs->pipebufs;
@@ -800,25 +783,19 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->nr_segs++;
}
} else {
- if (!cs->seglen) {
- BUG_ON(!cs->nr_segs);
- cs->seglen = cs->iov[0].iov_len;
- cs->addr = (unsigned long) cs->iov[0].iov_base;
- cs->iov++;
- cs->nr_segs--;
- }
- err = get_user_pages_fast(cs->addr, 1, cs->write, &page);
+ size_t off;
+ err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
if (err < 0)
return err;
- BUG_ON(err != 1);
+ BUG_ON(!err);
+ cs->len = err;
+ cs->offset = off;
cs->pg = page;
- cs->offset = cs->addr % PAGE_SIZE;
- cs->len = min(PAGE_SIZE - cs->offset, cs->seglen);
- cs->seglen -= cs->len;
- cs->addr += cs->len;
+ cs->offset = off;
+ iov_iter_advance(cs->iter, err);
}
- return lock_request(cs->fc, cs->req);
+ return lock_request(cs->req);
}
/* Do as much copy to/from userspace buffer as we can */
@@ -869,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
struct page *newpage;
struct pipe_buffer *buf = cs->pipebufs;
- unlock_request(cs->fc, cs->req);
+ err = unlock_request(cs->req);
+ if (err)
+ return err;
+
fuse_copy_finish(cs);
err = buf->ops->confirm(cs->pipe, buf);
@@ -923,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
lru_cache_add_file(newpage);
err = 0;
- spin_lock(&cs->fc->lock);
- if (cs->req->aborted)
+ spin_lock(&cs->req->waitq.lock);
+ if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
*pagep = newpage;
- spin_unlock(&cs->fc->lock);
+ spin_unlock(&cs->req->waitq.lock);
if (err) {
unlock_page(newpage);
@@ -948,7 +928,7 @@ out_fallback:
cs->pg = buf->page;
cs->offset = buf->offset;
- err = lock_request(cs->fc, cs->req);
+ err = lock_request(cs->req);
if (err)
return err;
@@ -959,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
unsigned offset, unsigned count)
{
struct pipe_buffer *buf;
+ int err;
if (cs->nr_segs == cs->pipe->buffers)
return -EIO;
- unlock_request(cs->fc, cs->req);
+ err = unlock_request(cs->req);
+ if (err)
+ return err;
+
fuse_copy_finish(cs);
buf = cs->pipebufs;
@@ -1074,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
return err;
}
-static int forget_pending(struct fuse_conn *fc)
+static int forget_pending(struct fuse_iqueue *fiq)
{
- return fc->forget_list_head.next != NULL;
+ return fiq->forget_list_head.next != NULL;
}
-static int request_pending(struct fuse_conn *fc)
+static int request_pending(struct fuse_iqueue *fiq)
{
- return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
- forget_pending(fc);
-}
-
-/* Wait until a request is available on the pending list */
-static void request_wait(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&fc->waitq, &wait);
- while (fc->connected && !request_pending(fc)) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (signal_pending(current))
- break;
-
- spin_unlock(&fc->lock);
- schedule();
- spin_lock(&fc->lock);
- }
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&fc->waitq, &wait);
+ return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) ||
+ forget_pending(fiq);
}
/*
@@ -1112,11 +1075,12 @@ __acquires(fc->lock)
* Unlike other requests this is assembled on demand, without a need
* to allocate a separate fuse_req structure.
*
- * Called with fc->lock held, releases it
+ * Called with fiq->waitq.lock held, releases it
*/
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_interrupt(struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs,
size_t nbytes, struct fuse_req *req)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
{
struct fuse_in_header ih;
struct fuse_interrupt_in arg;
@@ -1124,7 +1088,7 @@ __releases(fc->lock)
int err;
list_del_init(&req->intr_entry);
- req->intr_unique = fuse_get_unique(fc);
+ req->intr_unique = fuse_get_unique(fiq);
memset(&ih, 0, sizeof(ih));
memset(&arg, 0, sizeof(arg));
ih.len = reqsize;
@@ -1132,7 +1096,7 @@ __releases(fc->lock)
ih.unique = req->intr_unique;
arg.unique = req->in.h.unique;
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
if (nbytes < reqsize)
return -EINVAL;
@@ -1144,21 +1108,21 @@ __releases(fc->lock)
return err ? err : reqsize;
}
-static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
unsigned max,
unsigned *countp)
{
- struct fuse_forget_link *head = fc->forget_list_head.next;
+ struct fuse_forget_link *head = fiq->forget_list_head.next;
struct fuse_forget_link **newhead = &head;
unsigned count;
for (count = 0; *newhead != NULL && count < max; count++)
newhead = &(*newhead)->next;
- fc->forget_list_head.next = *newhead;
+ fiq->forget_list_head.next = *newhead;
*newhead = NULL;
- if (fc->forget_list_head.next == NULL)
- fc->forget_list_tail = &fc->forget_list_head;
+ if (fiq->forget_list_head.next == NULL)
+ fiq->forget_list_tail = &fiq->forget_list_head;
if (countp != NULL)
*countp = count;
@@ -1166,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
return head;
}
-static int fuse_read_single_forget(struct fuse_conn *fc,
+static int fuse_read_single_forget(struct fuse_iqueue *fiq,
struct fuse_copy_state *cs,
size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
{
int err;
- struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+ struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL);
struct fuse_forget_in arg = {
.nlookup = forget->forget_one.nlookup,
};
struct fuse_in_header ih = {
.opcode = FUSE_FORGET,
.nodeid = forget->forget_one.nodeid,
- .unique = fuse_get_unique(fc),
+ .unique = fuse_get_unique(fiq),
.len = sizeof(ih) + sizeof(arg),
};
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
kfree(forget);
if (nbytes < ih.len)
return -EINVAL;
@@ -1199,9 +1163,9 @@ __releases(fc->lock)
return ih.len;
}
-static int fuse_read_batch_forget(struct fuse_conn *fc,
+static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
struct fuse_copy_state *cs, size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
{
int err;
unsigned max_forgets;
@@ -1210,18 +1174,18 @@ __releases(fc->lock)
struct fuse_batch_forget_in arg = { .count = 0 };
struct fuse_in_header ih = {
.opcode = FUSE_BATCH_FORGET,
- .unique = fuse_get_unique(fc),
+ .unique = fuse_get_unique(fiq),
.len = sizeof(ih) + sizeof(arg),
};
if (nbytes < ih.len) {
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
return -EINVAL;
}
max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
- head = dequeue_forget(fc, max_forgets, &count);
- spin_unlock(&fc->lock);
+ head = dequeue_forget(fiq, max_forgets, &count);
+ spin_unlock(&fiq->waitq.lock);
arg.count = count;
ih.len += count * sizeof(struct fuse_forget_one);
@@ -1248,14 +1212,15 @@ __releases(fc->lock)
return ih.len;
}
-static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
+ struct fuse_copy_state *cs,
size_t nbytes)
-__releases(fc->lock)
+__releases(fiq->waitq.lock)
{
- if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
- return fuse_read_single_forget(fc, cs, nbytes);
+ if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
+ return fuse_read_single_forget(fiq, cs, nbytes);
else
- return fuse_read_batch_forget(fc, cs, nbytes);
+ return fuse_read_batch_forget(fiq, cs, nbytes);
}
/*
@@ -1267,46 +1232,51 @@ __releases(fc->lock)
* request_end(). Otherwise add it to the processing list, and set
* the 'sent' flag.
*/
-static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_copy_state *cs, size_t nbytes)
{
- int err;
+ ssize_t err;
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_pqueue *fpq = &fud->pq;
struct fuse_req *req;
struct fuse_in *in;
unsigned reqsize;
restart:
- spin_lock(&fc->lock);
+ spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
- if ((file->f_flags & O_NONBLOCK) && fc->connected &&
- !request_pending(fc))
+ if ((file->f_flags & O_NONBLOCK) && fiq->connected &&
+ !request_pending(fiq))
goto err_unlock;
- request_wait(fc);
- err = -ENODEV;
- if (!fc->connected)
+ err = wait_event_interruptible_exclusive_locked(fiq->waitq,
+ !fiq->connected || request_pending(fiq));
+ if (err)
goto err_unlock;
- err = -ERESTARTSYS;
- if (!request_pending(fc))
+
+ err = -ENODEV;
+ if (!fiq->connected)
goto err_unlock;
- if (!list_empty(&fc->interrupts)) {
- req = list_entry(fc->interrupts.next, struct fuse_req,
+ if (!list_empty(&fiq->interrupts)) {
+ req = list_entry(fiq->interrupts.next, struct fuse_req,
intr_entry);
- return fuse_read_interrupt(fc, cs, nbytes, req);
+ return fuse_read_interrupt(fiq, cs, nbytes, req);
}
- if (forget_pending(fc)) {
- if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
- return fuse_read_forget(fc, cs, nbytes);
+ if (forget_pending(fiq)) {
+ if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0)
+ return fuse_read_forget(fc, fiq, cs, nbytes);
- if (fc->forget_batch <= -8)
- fc->forget_batch = 16;
+ if (fiq->forget_batch <= -8)
+ fiq->forget_batch = 16;
}
- req = list_entry(fc->pending.next, struct fuse_req, list);
- req->state = FUSE_REQ_READING;
- list_move(&req->list, &fc->io);
+ req = list_entry(fiq->pending.next, struct fuse_req, list);
+ clear_bit(FR_PENDING, &req->flags);
+ list_del_init(&req->list);
+ spin_unlock(&fiq->waitq.lock);
in = &req->in;
reqsize = in->h.len;
@@ -1319,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
request_end(fc, req);
goto restart;
}
- spin_unlock(&fc->lock);
+ spin_lock(&fpq->lock);
+ list_add(&req->list, &fpq->io);
+ spin_unlock(&fpq->lock);
cs->req = req;
err = fuse_copy_one(cs, &in->h, sizeof(in->h));
if (!err)
err = fuse_copy_args(cs, in->numargs, in->argpages,
(struct fuse_arg *) in->args, 0);
fuse_copy_finish(cs);
- spin_lock(&fc->lock);
- req->locked = 0;
- if (req->aborted) {
- request_end(fc, req);
- return -ENODEV;
+ spin_lock(&fpq->lock);
+ clear_bit(FR_LOCKED, &req->flags);
+ if (!fpq->connected) {
+ err = -ENODEV;
+ goto out_end;
}
if (err) {
req->out.h.error = -EIO;
- request_end(fc, req);
- return err;
+ goto out_end;
}
- if (!req->isreply)
- request_end(fc, req);
- else {
- req->state = FUSE_REQ_SENT;
- list_move_tail(&req->list, &fc->processing);
- if (req->interrupted)
- queue_interrupt(fc, req);
- spin_unlock(&fc->lock);
+ if (!test_bit(FR_ISREPLY, &req->flags)) {
+ err = reqsize;
+ goto out_end;
}
+ list_move_tail(&req->list, &fpq->processing);
+ spin_unlock(&fpq->lock);
+ set_bit(FR_SENT, &req->flags);
+ /* matches barrier in request_wait_answer() */
+ smp_mb__after_atomic();
+ if (test_bit(FR_INTERRUPTED, &req->flags))
+ queue_interrupt(fiq, req);
+
return reqsize;
+out_end:
+ if (!test_bit(FR_PRIVATE, &req->flags))
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+ request_end(fc, req);
+ return err;
+
err_unlock:
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
return err;
}
@@ -1364,18 +1345,21 @@ static int fuse_dev_open(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
{
struct fuse_copy_state cs;
struct file *file = iocb->ki_filp;
- struct fuse_conn *fc = fuse_get_conn(file);
- if (!fc)
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
return -EPERM;
- fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+ if (!iter_is_iovec(to))
+ return -EINVAL;
+
+ fuse_copy_init(&cs, 1, to);
- return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+ return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
}
static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
@@ -1387,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
int do_wakeup = 0;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
- struct fuse_conn *fc = fuse_get_conn(in);
- if (!fc)
+ struct fuse_dev *fud = fuse_get_dev(in);
+
+ if (!fud)
return -EPERM;
bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs)
return -ENOMEM;
- fuse_copy_init(&cs, fc, 1, NULL, 0);
+ fuse_copy_init(&cs, 1, NULL);
cs.pipebufs = bufs;
cs.pipe = pipe;
- ret = fuse_dev_do_read(fc, in, &cs, len);
+ ret = fuse_dev_do_read(fud, in, &cs, len);
if (ret < 0)
goto out;
@@ -1837,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
}
/* Look up request on processing list by unique ID */
-static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
{
struct fuse_req *req;
- list_for_each_entry(req, &fc->processing, list) {
+ list_for_each_entry(req, &fpq->processing, list) {
if (req->in.h.unique == unique || req->intr_unique == unique)
return req;
}
@@ -1878,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
* it from the list and copy the rest of the buffer to the request.
* The request is finished by calling request_end()
*/
-static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_copy_state *cs, size_t nbytes)
{
int err;
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_pqueue *fpq = &fud->pq;
struct fuse_req *req;
struct fuse_out_header oh;
@@ -1909,79 +1896,79 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
if (oh.error <= -1000 || oh.error > 0)
goto err_finish;
- spin_lock(&fc->lock);
+ spin_lock(&fpq->lock);
err = -ENOENT;
- if (!fc->connected)
- goto err_unlock;
+ if (!fpq->connected)
+ goto err_unlock_pq;
- req = request_find(fc, oh.unique);
+ req = request_find(fpq, oh.unique);
if (!req)
- goto err_unlock;
+ goto err_unlock_pq;
- if (req->aborted) {
- spin_unlock(&fc->lock);
- fuse_copy_finish(cs);
- spin_lock(&fc->lock);
- request_end(fc, req);
- return -ENOENT;
- }
/* Is it an interrupt reply? */
if (req->intr_unique == oh.unique) {
+ spin_unlock(&fpq->lock);
+
err = -EINVAL;
if (nbytes != sizeof(struct fuse_out_header))
- goto err_unlock;
+ goto err_finish;
if (oh.error == -ENOSYS)
fc->no_interrupt = 1;
else if (oh.error == -EAGAIN)
- queue_interrupt(fc, req);
+ queue_interrupt(&fc->iq, req);
- spin_unlock(&fc->lock);
fuse_copy_finish(cs);
return nbytes;
}
- req->state = FUSE_REQ_WRITING;
- list_move(&req->list, &fc->io);
+ clear_bit(FR_SENT, &req->flags);
+ list_move(&req->list, &fpq->io);
req->out.h = oh;
- req->locked = 1;
+ set_bit(FR_LOCKED, &req->flags);
+ spin_unlock(&fpq->lock);
cs->req = req;
if (!req->out.page_replace)
cs->move_pages = 0;
- spin_unlock(&fc->lock);
err = copy_out_args(cs, &req->out, nbytes);
fuse_copy_finish(cs);
- spin_lock(&fc->lock);
- req->locked = 0;
- if (!err) {
- if (req->aborted)
- err = -ENOENT;
- } else if (!req->aborted)
+ spin_lock(&fpq->lock);
+ clear_bit(FR_LOCKED, &req->flags);
+ if (!fpq->connected)
+ err = -ENOENT;
+ else if (err)
req->out.h.error = -EIO;
+ if (!test_bit(FR_PRIVATE, &req->flags))
+ list_del_init(&req->list);
+ spin_unlock(&fpq->lock);
+
request_end(fc, req);
return err ? err : nbytes;
- err_unlock:
- spin_unlock(&fc->lock);
+ err_unlock_pq:
+ spin_unlock(&fpq->lock);
err_finish:
fuse_copy_finish(cs);
return err;
}
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
{
struct fuse_copy_state cs;
- struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
- if (!fc)
+ struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp);
+
+ if (!fud)
return -EPERM;
- fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+ if (!iter_is_iovec(from))
+ return -EINVAL;
- return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+ fuse_copy_init(&cs, 0, from);
+
+ return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
}
static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
@@ -1992,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
unsigned idx;
struct pipe_buffer *bufs;
struct fuse_copy_state cs;
- struct fuse_conn *fc;
+ struct fuse_dev *fud;
size_t rem;
ssize_t ret;
- fc = fuse_get_conn(out);
- if (!fc)
+ fud = fuse_get_dev(out);
+ if (!fud)
return -EPERM;
bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
@@ -2044,14 +2031,15 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
}
pipe_unlock(pipe);
- fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+ fuse_copy_init(&cs, 0, NULL);
cs.pipebufs = bufs;
+ cs.nr_segs = nbuf;
cs.pipe = pipe;
if (flags & SPLICE_F_MOVE)
cs.move_pages = 1;
- ret = fuse_dev_do_write(fc, &cs, len);
+ ret = fuse_dev_do_write(fud, &cs, len);
for (idx = 0; idx < nbuf; idx++) {
struct pipe_buffer *buf = &bufs[idx];
@@ -2065,18 +2053,21 @@ out:
static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
unsigned mask = POLLOUT | POLLWRNORM;
- struct fuse_conn *fc = fuse_get_conn(file);
- if (!fc)
+ struct fuse_iqueue *fiq;
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
return POLLERR;
- poll_wait(file, &fc->waitq, wait);
+ fiq = &fud->fc->iq;
+ poll_wait(file, &fiq->waitq, wait);
- spin_lock(&fc->lock);
- if (!fc->connected)
+ spin_lock(&fiq->waitq.lock);
+ if (!fiq->connected)
mask = POLLERR;
- else if (request_pending(fc))
+ else if (request_pending(fiq))
mask |= POLLIN | POLLRDNORM;
- spin_unlock(&fc->lock);
+ spin_unlock(&fiq->waitq.lock);
return mask;
}
@@ -2087,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
* This function releases and reacquires fc->lock
*/
static void end_requests(struct fuse_conn *fc, struct list_head *head)
-__releases(fc->lock)
-__acquires(fc->lock)
{
while (!list_empty(head)) {
struct fuse_req *req;
req = list_entry(head->next, struct fuse_req, list);
req->out.h.error = -ECONNABORTED;
- request_end(fc, req);
- spin_lock(&fc->lock);
- }
-}
-
-/*
- * Abort requests under I/O
- *
- * The requests are set to aborted and finished, and the request
- * waiter is woken up. This will make request_wait_answer() wait
- * until the request is unlocked and then return.
- *
- * If the request is asynchronous, then the end function needs to be
- * called after waiting for the request to be unlocked (if it was
- * locked).
- */
-static void end_io_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
- while (!list_empty(&fc->io)) {
- struct fuse_req *req =
- list_entry(fc->io.next, struct fuse_req, list);
- void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-
- req->aborted = 1;
- req->out.h.error = -ECONNABORTED;
- req->state = FUSE_REQ_FINISHED;
+ clear_bit(FR_PENDING, &req->flags);
+ clear_bit(FR_SENT, &req->flags);
list_del_init(&req->list);
- wake_up(&req->waitq);
- if (end) {
- req->end = NULL;
- __fuse_get_request(req);
- spin_unlock(&fc->lock);
- wait_event(req->waitq, !req->locked);
- end(fc, req);
- fuse_put_request(fc, req);
- spin_lock(&fc->lock);
- }
+ request_end(fc, req);
}
}
-static void end_queued_requests(struct fuse_conn *fc)
-__releases(fc->lock)
-__acquires(fc->lock)
-{
- fc->max_background = UINT_MAX;
- flush_bg_queue(fc);
- end_requests(fc, &fc->pending);
- end_requests(fc, &fc->processing);
- while (forget_pending(fc))
- kfree(dequeue_forget(fc, 1, NULL));
-}
-
static void end_polls(struct fuse_conn *fc)
{
struct rb_node *p;
@@ -2166,82 +2108,171 @@ static void end_polls(struct fuse_conn *fc)
/*
* Abort all requests.
*
- * Emergency exit in case of a malicious or accidental deadlock, or
- * just a hung filesystem.
+ * Emergency exit in case of a malicious or accidental deadlock, or just a hung
+ * filesystem.
*
- * The same effect is usually achievable through killing the
- * filesystem daemon and all users of the filesystem. The exception
- * is the combination of an asynchronous request and the tricky
- * deadlock (see Documentation/filesystems/fuse.txt).
+ * The same effect is usually achievable through killing the filesystem daemon
+ * and all users of the filesystem. The exception is the combination of an
+ * asynchronous request and the tricky deadlock (see
+ * Documentation/filesystems/fuse.txt).
*
- * During the aborting, progression of requests from the pending and
- * processing lists onto the io list, and progression of new requests
- * onto the pending list is prevented by req->connected being false.
- *
- * Progression of requests under I/O to the processing list is
- * prevented by the req->aborted flag being true for these requests.
- * For this reason requests on the io list must be aborted first.
+ * Aborting requests under I/O goes as follows: 1: Separate out unlocked
+ * requests, they should be finished off immediately. Locked requests will be
+ * finished after unlock; see unlock_request(). 2: Finish off the unlocked
+ * requests. It is possible that some request will finish before we can. This
+ * is OK, the request will in that case be removed from the list before we touch
+ * it.
*/
void fuse_abort_conn(struct fuse_conn *fc)
{
+ struct fuse_iqueue *fiq = &fc->iq;
+
spin_lock(&fc->lock);
if (fc->connected) {
+ struct fuse_dev *fud;
+ struct fuse_req *req, *next;
+ LIST_HEAD(to_end1);
+ LIST_HEAD(to_end2);
+
fc->connected = 0;
fc->blocked = 0;
fuse_set_initialized(fc);
- end_io_requests(fc);
- end_queued_requests(fc);
+ list_for_each_entry(fud, &fc->devices, entry) {
+ struct fuse_pqueue *fpq = &fud->pq;
+
+ spin_lock(&fpq->lock);
+ fpq->connected = 0;
+ list_for_each_entry_safe(req, next, &fpq->io, list) {
+ req->out.h.error = -ECONNABORTED;
+ spin_lock(&req->waitq.lock);
+ set_bit(FR_ABORTED, &req->flags);
+ if (!test_bit(FR_LOCKED, &req->flags)) {
+ set_bit(FR_PRIVATE, &req->flags);
+ list_move(&req->list, &to_end1);
+ }
+ spin_unlock(&req->waitq.lock);
+ }
+ list_splice_init(&fpq->processing, &to_end2);
+ spin_unlock(&fpq->lock);
+ }
+ fc->max_background = UINT_MAX;
+ flush_bg_queue(fc);
+
+ spin_lock(&fiq->waitq.lock);
+ fiq->connected = 0;
+ list_splice_init(&fiq->pending, &to_end2);
+ while (forget_pending(fiq))
+ kfree(dequeue_forget(fiq, 1, NULL));
+ wake_up_all_locked(&fiq->waitq);
+ spin_unlock(&fiq->waitq.lock);
+ kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
end_polls(fc);
- wake_up_all(&fc->waitq);
wake_up_all(&fc->blocked_waitq);
- kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ spin_unlock(&fc->lock);
+
+ while (!list_empty(&to_end1)) {
+ req = list_first_entry(&to_end1, struct fuse_req, list);
+ __fuse_get_request(req);
+ list_del_init(&req->list);
+ request_end(fc, req);
+ }
+ end_requests(fc, &to_end2);
+ } else {
+ spin_unlock(&fc->lock);
}
- spin_unlock(&fc->lock);
}
EXPORT_SYMBOL_GPL(fuse_abort_conn);
int fuse_dev_release(struct inode *inode, struct file *file)
{
- struct fuse_conn *fc = fuse_get_conn(file);
- if (fc) {
- spin_lock(&fc->lock);
- fc->connected = 0;
- fc->blocked = 0;
- fuse_set_initialized(fc);
- end_queued_requests(fc);
- end_polls(fc);
- wake_up_all(&fc->blocked_waitq);
- spin_unlock(&fc->lock);
- fuse_conn_put(fc);
- }
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (fud) {
+ struct fuse_conn *fc = fud->fc;
+ struct fuse_pqueue *fpq = &fud->pq;
+ WARN_ON(!list_empty(&fpq->io));
+ end_requests(fc, &fpq->processing);
+ /* Are we the last open device? */
+ if (atomic_dec_and_test(&fc->dev_count)) {
+ WARN_ON(fc->iq.fasync != NULL);
+ fuse_abort_conn(fc);
+ }
+ fuse_dev_free(fud);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(fuse_dev_release);
static int fuse_dev_fasync(int fd, struct file *file, int on)
{
- struct fuse_conn *fc = fuse_get_conn(file);
- if (!fc)
+ struct fuse_dev *fud = fuse_get_dev(file);
+
+ if (!fud)
return -EPERM;
/* No locking - fasync_helper does its own locking */
- return fasync_helper(fd, file, on, &fc->fasync);
+ return fasync_helper(fd, file, on, &fud->fc->iq.fasync);
+}
+
+static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
+{
+ struct fuse_dev *fud;
+
+ if (new->private_data)
+ return -EINVAL;
+
+ fud = fuse_dev_alloc(fc);
+ if (!fud)
+ return -ENOMEM;
+
+ new->private_data = fud;
+ atomic_inc(&fc->dev_count);
+
+ return 0;
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int err = -ENOTTY;
+
+ if (cmd == FUSE_DEV_IOC_CLONE) {
+ int oldfd;
+
+ err = -EFAULT;
+ if (!get_user(oldfd, (__u32 __user *) arg)) {
+ struct file *old = fget(oldfd);
+
+ err = -EINVAL;
+ if (old) {
+ struct fuse_dev *fud = fuse_get_dev(old);
+
+ if (fud) {
+ mutex_lock(&fuse_mutex);
+ err = fuse_device_clone(fud->fc, file);
+ mutex_unlock(&fuse_mutex);
+ }
+ fput(old);
+ }
+ }
+ }
+ return err;
}
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
.llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = fuse_dev_read,
+ .read_iter = fuse_dev_read,
.splice_read = fuse_dev_splice_read,
- .write = do_sync_write,
- .aio_write = fuse_dev_write,
+ .write_iter = fuse_dev_write,
.splice_write = fuse_dev_splice_write,
.poll = fuse_dev_poll,
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
+ .unlocked_ioctl = fuse_dev_ioctl,
+ .compat_ioctl = fuse_dev_ioctl,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1545b711ddcf..5e2e08712d3b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -192,7 +192,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
struct fuse_inode *fi;
int ret;
- inode = ACCESS_ONCE(entry->d_inode);
+ inode = d_inode_rcu(entry);
if (inode && is_bad_inode(inode))
goto invalid;
else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
@@ -220,7 +220,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
attr_version = fuse_get_attr_version(fc);
parent = dget_parent(entry);
- fuse_lookup_init(fc, &args, get_node_id(parent->d_inode),
+ fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
&entry->d_name, &outarg);
ret = fuse_simple_request(fc, &args);
dput(parent);
@@ -254,7 +254,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
return -ECHILD;
} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
parent = dget_parent(entry);
- fuse_advise_use_readdirplus(parent->d_inode);
+ fuse_advise_use_readdirplus(d_inode(parent));
dput(parent);
}
}
@@ -487,7 +487,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
entry = res;
}
- if (!(flags & O_CREAT) || entry->d_inode)
+ if (!(flags & O_CREAT) || d_really_is_positive(entry))
goto no_open;
/* Only creates */
@@ -653,7 +653,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
args.in.args[0].value = entry->d_name.name;
err = fuse_simple_request(fc, &args);
if (!err) {
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fc->lock);
@@ -689,7 +689,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
args.in.args[0].value = entry->d_name.name;
err = fuse_simple_request(fc, &args);
if (!err) {
- clear_nlink(entry->d_inode);
+ clear_nlink(d_inode(entry));
fuse_invalidate_attr(dir);
fuse_invalidate_entry_cache(entry);
} else if (err == -EINTR)
@@ -721,12 +721,12 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
err = fuse_simple_request(fc, &args);
if (!err) {
/* ctime changes */
- fuse_invalidate_attr(oldent->d_inode);
- fuse_update_ctime(oldent->d_inode);
+ fuse_invalidate_attr(d_inode(oldent));
+ fuse_update_ctime(d_inode(oldent));
if (flags & RENAME_EXCHANGE) {
- fuse_invalidate_attr(newent->d_inode);
- fuse_update_ctime(newent->d_inode);
+ fuse_invalidate_attr(d_inode(newent));
+ fuse_update_ctime(d_inode(newent));
}
fuse_invalidate_attr(olddir);
@@ -734,10 +734,10 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
- fuse_invalidate_attr(newent->d_inode);
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
+ fuse_invalidate_attr(d_inode(newent));
fuse_invalidate_entry_cache(newent);
- fuse_update_ctime(newent->d_inode);
+ fuse_update_ctime(d_inode(newent));
}
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
@@ -746,7 +746,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
directory), then there can be inconsistency between
the dcache and the real filesystem. Tough luck. */
fuse_invalidate_entry(oldent);
- if (newent->d_inode)
+ if (d_really_is_positive(newent))
fuse_invalidate_entry(newent);
}
@@ -788,7 +788,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
{
int err;
struct fuse_link_in inarg;
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
@@ -961,9 +961,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_attr(parent);
fuse_invalidate_entry(entry);
- if (child_nodeid != 0 && entry->d_inode) {
- mutex_lock(&entry->d_inode->i_mutex);
- if (get_node_id(entry->d_inode) != child_nodeid) {
+ if (child_nodeid != 0 && d_really_is_positive(entry)) {
+ mutex_lock(&d_inode(entry)->i_mutex);
+ if (get_node_id(d_inode(entry)) != child_nodeid) {
err = -ENOENT;
goto badentry;
}
@@ -977,13 +977,13 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
err = -ENOTEMPTY;
goto badentry;
}
- entry->d_inode->i_flags |= S_DEAD;
+ d_inode(entry)->i_flags |= S_DEAD;
}
dont_mount(entry);
- clear_nlink(entry->d_inode);
+ clear_nlink(d_inode(entry));
err = 0;
badentry:
- mutex_unlock(&entry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(entry)->i_mutex);
if (!err)
d_delete(entry);
} else {
@@ -1169,7 +1169,7 @@ static int fuse_direntplus_link(struct file *file,
struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
struct dentry *dentry;
struct dentry *alias;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct fuse_conn *fc;
struct inode *inode;
@@ -1205,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file,
name.hash = full_name_hash(name.name, name.len);
dentry = d_lookup(parent, &name);
if (dentry) {
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (!inode) {
d_drop(dentry);
} else if (get_node_id(inode) != o->nodeid ||
@@ -1365,9 +1365,9 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
return err;
}
-static char *read_link(struct dentry *dentry)
+static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
char *link;
@@ -1389,28 +1389,12 @@ static char *read_link(struct dentry *dentry)
link = ERR_PTR(ret);
} else {
link[ret] = '\0';
+ *cookie = link;
}
fuse_invalidate_atime(inode);
return link;
}
-static void free_link(char *link)
-{
- if (!IS_ERR(link))
- free_page((unsigned long) link);
-}
-
-static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, read_link(dentry));
- return NULL;
-}
-
-static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
-{
- free_link(nd_get_link(nd));
-}
-
static int fuse_dir_open(struct inode *inode, struct file *file)
{
return fuse_open_common(inode, file, true);
@@ -1712,7 +1696,7 @@ error:
static int fuse_setattr(struct dentry *entry, struct iattr *attr)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
if (!fuse_allow_current_process(get_fuse_conn(inode)))
return -EACCES;
@@ -1726,7 +1710,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
struct kstat *stat)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
if (!fuse_allow_current_process(fc))
@@ -1738,7 +1722,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
static int fuse_setxattr(struct dentry *entry, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_setxattr_in inarg;
@@ -1774,7 +1758,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
void *value, size_t size)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
@@ -1815,7 +1799,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
struct fuse_getxattr_in inarg;
@@ -1857,7 +1841,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
static int fuse_removexattr(struct dentry *entry, const char *name)
{
- struct inode *inode = entry->d_inode;
+ struct inode *inode = d_inode(entry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
int err;
@@ -1926,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = {
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
.follow_link = fuse_follow_link,
- .put_link = fuse_put_link,
+ .put_link = free_page_put_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
.setxattr = fuse_setxattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c01ec3bdcfd8..f523f2f04c19 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,8 +15,8 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
+#include <linux/uio.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
* Drop the release request when client does not
* implement 'open'
*/
- req->background = 0;
+ __clear_bit(FR_BACKGROUND, &req->flags);
iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else if (sync) {
- req->background = 0;
+ __clear_bit(FR_BACKGROUND, &req->flags);
fuse_request_send(ff->fc, req);
iput(req->misc.release.inode);
fuse_put_request(ff->fc, req);
} else {
req->end = fuse_release_end;
- req->background = 1;
+ __set_bit(FR_BACKGROUND, &req->flags);
fuse_request_send_background(ff->fc, req);
}
kfree(ff);
@@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
{
WARN_ON(atomic_read(&ff->count) > 1);
fuse_prepare_release(ff, flags, FUSE_RELEASE);
- ff->reserved_req->force = 1;
- ff->reserved_req->background = 0;
+ __set_bit(FR_FORCE, &ff->reserved_req->flags);
+ __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags);
fuse_request_send(ff->fc, ff->reserved_req);
fuse_put_request(ff->fc, ff->reserved_req);
kfree(ff);
@@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
req->in.numargs = 1;
req->in.args[0].size = sizeof(inarg);
req->in.args[0].value = &inarg;
- req->force = 1;
+ __set_bit(FR_FORCE, &req->flags);
fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+ if (io->err)
+ return io->err;
+
+ if (io->bytes >= 0 && io->write)
+ return -EIO;
+
+ return io->bytes < 0 ? io->size : io->bytes;
+}
+
/**
* In case of short read, the caller sets 'pos' to the position of
* actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
*/
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
+ bool is_sync = is_sync_kiocb(io->iocb);
int left;
spin_lock(&io->lock);
@@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io->bytes = pos;
left = --io->reqs;
+ if (!left && is_sync)
+ complete(io->done);
spin_unlock(&io->lock);
- if (!left) {
- long res;
+ if (!left && !is_sync) {
+ ssize_t res = fuse_get_res_by_io(io);
- if (io->err)
- res = io->err;
- else if (io->bytes >= 0 && io->write)
- res = -EIO;
- else {
- res = io->bytes < 0 ? io->size : io->bytes;
-
- if (!is_sync_kiocb(io->iocb)) {
- struct inode *inode = file_inode(io->iocb->ki_filp);
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
+ if (res >= 0) {
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
- spin_unlock(&fc->lock);
- }
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ spin_unlock(&fc->lock);
}
- aio_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res, 0);
kfree(io);
}
}
@@ -1139,13 +1145,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- size_t count = iov_iter_count(from);
ssize_t written = 0;
ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
loff_t endbyte = 0;
- loff_t pos = iocb->ki_pos;
if (get_fuse_conn(inode)->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1161,15 +1165,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
goto out;
- if (count == 0)
- goto out;
-
- iov_iter_truncate(from, count);
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
@@ -1177,7 +1177,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- if (file->f_flags & O_DIRECT) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos = iocb->ki_pos;
written = generic_file_direct_write(iocb, from, pos);
if (written < 0 || !iov_iter_count(from))
goto out;
@@ -1203,9 +1204,9 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
} else {
- written = fuse_perform_write(file, mapping, from, pos);
+ written = fuse_perform_write(file, mapping, from, iocb->ki_pos);
if (written >= 0)
- iocb->ki_pos = pos + written;
+ iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
@@ -1395,55 +1396,30 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
return res;
}
-static ssize_t fuse_direct_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct fuse_io_priv io = { .async = 0, .file = file };
- struct iovec iov = { .iov_base = buf, .iov_len = count };
- struct iov_iter ii;
- iov_iter_init(&ii, READ, &iov, 1, count);
- return __fuse_direct_read(&io, &ii, ppos);
-}
-
-static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
- struct iov_iter *iter,
- loff_t *ppos)
+static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct file *file = io->file;
- struct inode *inode = file_inode(file);
- size_t count = iov_iter_count(iter);
- ssize_t res;
-
-
- res = generic_write_checks(file, ppos, &count, 0);
- if (!res) {
- iov_iter_truncate(iter, count);
- res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
- }
-
- fuse_invalidate_attr(inode);
-
- return res;
+ struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+ return __fuse_direct_read(&io, to, &iocb->ki_pos);
}
-static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- ssize_t res;
struct fuse_io_priv io = { .async = 0, .file = file };
- struct iov_iter ii;
- iov_iter_init(&ii, WRITE, &iov, 1, count);
+ ssize_t res;
if (is_bad_inode(inode))
return -EIO;
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(&io, &ii, ppos);
+ res = generic_write_checks(iocb, from);
if (res > 0)
- fuse_write_update_size(inode, *ppos);
+ res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ fuse_invalidate_attr(inode);
+ if (res > 0)
+ fuse_write_update_size(inode, iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
return res;
@@ -1469,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
}
@@ -1635,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page)
if (!req)
goto err;
- req->background = 1; /* writeback always goes to bg_queue */
+ /* writeback always goes to bg_queue */
+ __set_bit(FR_BACKGROUND, &req->flags);
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
goto err_free;
@@ -1658,7 +1635,7 @@ static int fuse_writepage_locked(struct page *page)
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
@@ -1766,16 +1743,15 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
}
}
- if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
- old_req->state == FUSE_REQ_PENDING)) {
+ if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) {
struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
- dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
goto out;
@@ -1854,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page,
req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
req->misc.write.next = NULL;
req->in.argpages = 1;
- req->background = 1;
+ __set_bit(FR_BACKGROUND, &req->flags);
req->num_pages = 0;
req->end = fuse_writepage_end;
req->inode = inode;
@@ -1872,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
@@ -2798,9 +2774,9 @@ static inline loff_t fuse_round_up(loff_t off)
}
static ssize_t
-fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
+ DECLARE_COMPLETION_ONSTACK(wait);
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
@@ -2815,15 +2791,15 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
inode = file->f_mapping->host;
i_size = i_size_read(inode);
- if ((rw == READ) && (offset > i_size))
+ if ((iov_iter_rw(iter) == READ) && (offset > i_size))
return 0;
/* optimization for short read */
- if (async_dio && rw != WRITE && offset + count > i_size) {
+ if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
if (offset >= i_size)
return 0;
- count = min_t(loff_t, count, fuse_round_up(i_size - offset));
- iov_iter_truncate(iter, count);
+ iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+ count = iov_iter_count(iter);
}
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2834,7 +2810,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
io->bytes = -1;
io->size = 0;
io->offset = offset;
- io->write = (rw == WRITE);
+ io->write = (iov_iter_rw(iter) == WRITE);
io->err = 0;
io->file = file;
/*
@@ -2849,13 +2825,19 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
* to wait on real async I/O requests, so we must submit this request
* synchronously.
*/
- if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+ if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+ iov_iter_rw(iter) == WRITE)
io->async = false;
- if (rw == WRITE)
- ret = __fuse_direct_write(io, iter, &pos);
- else
+ if (io->async && is_sync_kiocb(iocb))
+ io->done = &wait;
+
+ if (iov_iter_rw(iter) == WRITE) {
+ ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
+ fuse_invalidate_attr(inode);
+ } else {
ret = __fuse_direct_read(io, iter, &pos);
+ }
if (io->async) {
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
@@ -2864,12 +2846,13 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
- ret = wait_on_sync_kiocb(iocb);
- } else {
- kfree(io);
+ wait_for_completion(&wait);
+ ret = fuse_get_res_by_io(io);
}
- if (rw == WRITE) {
+ kfree(io);
+
+ if (iov_iter_rw(iter) == WRITE) {
if (ret > 0)
fuse_write_update_size(inode, pos);
else if (ret < 0 && offset + count > i_size)
@@ -2957,9 +2940,7 @@ out:
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
- .read = new_sync_read,
.read_iter = fuse_file_read_iter,
- .write = new_sync_write,
.write_iter = fuse_file_write_iter,
.mmap = fuse_file_mmap,
.open = fuse_open,
@@ -2977,8 +2958,8 @@ static const struct file_operations fuse_file_operations = {
static const struct file_operations fuse_direct_io_file_operations = {
.llseek = fuse_file_llseek,
- .read = fuse_direct_read,
- .write = fuse_direct_write,
+ .read_iter = fuse_direct_read_iter,
+ .write_iter = fuse_direct_write_iter,
.mmap = fuse_direct_mmap,
.open = fuse_open,
.flush = fuse_flush,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1cdfb07c1376..405113101db8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -241,16 +241,6 @@ struct fuse_args {
#define FUSE_ARGS(args) struct fuse_args args = {}
-/** The request state */
-enum fuse_req_state {
- FUSE_REQ_INIT = 0,
- FUSE_REQ_PENDING,
- FUSE_REQ_READING,
- FUSE_REQ_SENT,
- FUSE_REQ_WRITING,
- FUSE_REQ_FINISHED
-};
-
/** The request IO state (for asynchronous processing) */
struct fuse_io_priv {
int async;
@@ -263,10 +253,44 @@ struct fuse_io_priv {
int err;
struct kiocb *iocb;
struct file *file;
+ struct completion *done;
+};
+
+/**
+ * Request flags
+ *
+ * FR_ISREPLY: set if the request has reply
+ * FR_FORCE: force sending of the request even if interrupted
+ * FR_BACKGROUND: request is sent in the background
+ * FR_WAITING: request is counted as "waiting"
+ * FR_ABORTED: the request was aborted
+ * FR_INTERRUPTED: the request has been interrupted
+ * FR_LOCKED: data is being copied to/from the request
+ * FR_PENDING: request is not yet in userspace
+ * FR_SENT: request is in userspace, waiting for an answer
+ * FR_FINISHED: request is finished
+ * FR_PRIVATE: request is on private list
+ */
+enum fuse_req_flag {
+ FR_ISREPLY,
+ FR_FORCE,
+ FR_BACKGROUND,
+ FR_WAITING,
+ FR_ABORTED,
+ FR_INTERRUPTED,
+ FR_LOCKED,
+ FR_PENDING,
+ FR_SENT,
+ FR_FINISHED,
+ FR_PRIVATE,
};
/**
* A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ * - FR_ABORTED
+ * - FR_LOCKED (may also be modified under fc->lock, tested under both)
*/
struct fuse_req {
/** This can be on either pending processing or io lists in
@@ -282,35 +306,8 @@ struct fuse_req {
/** Unique ID for the interrupt request */
u64 intr_unique;
- /*
- * The following bitfields are either set once before the
- * request is queued or setting/clearing them is protected by
- * fuse_conn->lock
- */
-
- /** True if the request has reply */
- unsigned isreply:1;
-
- /** Force sending of the request even if interrupted */
- unsigned force:1;
-
- /** The request was aborted */
- unsigned aborted:1;
-
- /** Request is sent in the background */
- unsigned background:1;
-
- /** The request has been interrupted */
- unsigned interrupted:1;
-
- /** Data is being copied to/from the request */
- unsigned locked:1;
-
- /** Request is counted as "waiting" */
- unsigned waiting:1;
-
- /** State of the request */
- enum fuse_req_state state;
+ /* Request flags, updated with test/set/clear_bit() */
+ unsigned long flags;
/** The request input */
struct fuse_in in;
@@ -379,6 +376,61 @@ struct fuse_req {
struct file *stolen_file;
};
+struct fuse_iqueue {
+ /** Connection established */
+ unsigned connected;
+
+ /** Readers of the connection are waiting on this */
+ wait_queue_head_t waitq;
+
+ /** The next unique request id */
+ u64 reqctr;
+
+ /** The list of pending requests */
+ struct list_head pending;
+
+ /** Pending interrupts */
+ struct list_head interrupts;
+
+ /** Queue of pending forgets */
+ struct fuse_forget_link forget_list_head;
+ struct fuse_forget_link *forget_list_tail;
+
+ /** Batching of FORGET requests (positive indicates FORGET batch) */
+ int forget_batch;
+
+ /** O_ASYNC requests */
+ struct fasync_struct *fasync;
+};
+
+struct fuse_pqueue {
+ /** Connection established */
+ unsigned connected;
+
+ /** Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /** The list of requests being processed */
+ struct list_head processing;
+
+ /** The list of requests under I/O */
+ struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+ /** Fuse connection for this device */
+ struct fuse_conn *fc;
+
+ /** Processing queue */
+ struct fuse_pqueue pq;
+
+ /** list entry on fc->devices */
+ struct list_head entry;
+};
+
/**
* A Fuse connection.
*
@@ -393,6 +445,9 @@ struct fuse_conn {
/** Refcount */
atomic_t count;
+ /** Number of fuse_dev's */
+ atomic_t dev_count;
+
struct rcu_head rcu;
/** The user id for this mount */
@@ -410,17 +465,8 @@ struct fuse_conn {
/** Maximum write size */
unsigned max_write;
- /** Readers of the connection are waiting on this */
- wait_queue_head_t waitq;
-
- /** The list of pending requests */
- struct list_head pending;
-
- /** The list of requests being processed */
- struct list_head processing;
-
- /** The list of requests under I/O */
- struct list_head io;
+ /** Input queue */
+ struct fuse_iqueue iq;
/** The next unique kernel file handle */
u64 khctr;
@@ -443,16 +489,6 @@ struct fuse_conn {
/** The list of background requests set aside for later queuing */
struct list_head bg_queue;
- /** Pending interrupts */
- struct list_head interrupts;
-
- /** Queue of pending forgets */
- struct fuse_forget_link forget_list_head;
- struct fuse_forget_link *forget_list_tail;
-
- /** Batching of FORGET requests (positive indicates FORGET batch) */
- int forget_batch;
-
/** Flag indicating that INIT reply has been received. Allocating
* any fuse request will be suspended until the flag is set */
int initialized;
@@ -468,9 +504,6 @@ struct fuse_conn {
/** waitq for reserved requests */
wait_queue_head_t reserved_req_waitq;
- /** The next unique request id */
- u64 reqctr;
-
/** Connection established, cleared on umount, connection
abort and device release */
unsigned connected;
@@ -593,9 +626,6 @@ struct fuse_conn {
/** number of dentries used in the above array */
int ctl_ndents;
- /** O_ASYNC requests */
- struct fasync_struct *fasync;
-
/** Key for lock owner ID scrambling */
u32 scramble_key[4];
@@ -613,6 +643,9 @@ struct fuse_conn {
/** Read/write semaphore to hold when accessing sb. */
struct rw_semaphore killsb;
+
+ /** List of device instances belonging to this connection */
+ struct list_head devices;
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -825,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc);
*/
void fuse_conn_put(struct fuse_conn *fc);
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc);
+void fuse_dev_free(struct fuse_dev *fud);
+
/**
* Add connection to control filesystem
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e8799c11424b..2913db2a5b99 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc)
if (req && fc->conn_init) {
fc->destroy_req = NULL;
req->in.h.opcode = FUSE_DESTROY;
- req->force = 1;
- req->background = 0;
+ __set_bit(FR_FORCE, &req->flags);
+ __clear_bit(FR_BACKGROUND, &req->flags);
fuse_request_send(fc, req);
fuse_put_request(fc, req);
}
@@ -421,7 +421,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
memset(&outarg, 0, sizeof(outarg));
args.in.numargs = 0;
args.in.h.opcode = FUSE_STATFS;
- args.in.h.nodeid = get_node_id(dentry->d_inode);
+ args.in.h.nodeid = get_node_id(d_inode(dentry));
args.out.numargs = 1;
args.out.args[0].size = sizeof(outarg);
args.out.args[0].value = &outarg;
@@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void fuse_iqueue_init(struct fuse_iqueue *fiq)
+{
+ memset(fiq, 0, sizeof(struct fuse_iqueue));
+ init_waitqueue_head(&fiq->waitq);
+ INIT_LIST_HEAD(&fiq->pending);
+ INIT_LIST_HEAD(&fiq->interrupts);
+ fiq->forget_list_tail = &fiq->forget_list_head;
+ fiq->connected = 1;
+}
+
+static void fuse_pqueue_init(struct fuse_pqueue *fpq)
+{
+ memset(fpq, 0, sizeof(struct fuse_pqueue));
+ spin_lock_init(&fpq->lock);
+ INIT_LIST_HEAD(&fpq->processing);
+ INIT_LIST_HEAD(&fpq->io);
+ fpq->connected = 1;
+}
+
void fuse_conn_init(struct fuse_conn *fc)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
init_rwsem(&fc->killsb);
atomic_set(&fc->count, 1);
- init_waitqueue_head(&fc->waitq);
+ atomic_set(&fc->dev_count, 1);
init_waitqueue_head(&fc->blocked_waitq);
init_waitqueue_head(&fc->reserved_req_waitq);
- INIT_LIST_HEAD(&fc->pending);
- INIT_LIST_HEAD(&fc->processing);
- INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->interrupts);
+ fuse_iqueue_init(&fc->iq);
INIT_LIST_HEAD(&fc->bg_queue);
INIT_LIST_HEAD(&fc->entry);
- fc->forget_list_tail = &fc->forget_list_head;
+ INIT_LIST_HEAD(&fc->devices);
atomic_set(&fc->num_waiting, 0);
fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
fc->khctr = 0;
fc->polled_files = RB_ROOT;
- fc->reqctr = 0;
fc->blocked = 0;
fc->initialized = 0;
+ fc->connected = 1;
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
}
@@ -740,7 +756,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb,
static struct dentry *fuse_get_parent(struct dentry *child)
{
- struct inode *child_inode = child->d_inode;
+ struct inode *child_inode = d_inode(child);
struct fuse_conn *fc = get_fuse_conn(child_inode);
struct inode *inode;
struct dentry *parent;
@@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_free_conn(struct fuse_conn *fc)
{
+ WARN_ON(!list_empty(&fc->devices));
kfree_rcu(fc, rcu);
}
@@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
return 0;
}
+struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
+{
+ struct fuse_dev *fud;
+
+ fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL);
+ if (fud) {
+ fud->fc = fuse_conn_get(fc);
+ fuse_pqueue_init(&fud->pq);
+
+ spin_lock(&fc->lock);
+ list_add_tail(&fud->entry, &fc->devices);
+ spin_unlock(&fc->lock);
+ }
+
+ return fud;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_alloc);
+
+void fuse_dev_free(struct fuse_dev *fud)
+{
+ struct fuse_conn *fc = fud->fc;
+
+ if (fc) {
+ spin_lock(&fc->lock);
+ list_del(&fud->entry);
+ spin_unlock(&fc->lock);
+
+ fuse_conn_put(fc);
+ }
+ kfree(fud);
+}
+EXPORT_SYMBOL_GPL(fuse_dev_free);
+
static int fuse_fill_super(struct super_block *sb, void *data, int silent)
{
+ struct fuse_dev *fud;
struct fuse_conn *fc;
struct inode *root;
struct fuse_mount_data d;
@@ -1026,12 +1077,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
goto err_fput;
fuse_conn_init(fc);
+ fc->release = fuse_free_conn;
+
+ fud = fuse_dev_alloc(fc);
+ if (!fud)
+ goto err_put_conn;
fc->dev = sb->s_dev;
fc->sb = sb;
err = fuse_bdi_init(fc, sb);
if (err)
- goto err_put_conn;
+ goto err_dev_free;
sb->s_bdi = &fc->bdi;
@@ -1040,7 +1096,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->dont_mask = 1;
sb->s_flags |= MS_POSIXACL;
- fc->release = fuse_free_conn;
fc->flags = d.flags;
fc->user_id = d.user_id;
fc->group_id = d.group_id;
@@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
root = fuse_get_root_inode(sb, d.rootmode);
root_dentry = d_make_root(root);
if (!root_dentry)
- goto err_put_conn;
+ goto err_dev_free;
/* only now - we want root dentry with NULL ->d_op */
sb->s_d_op = &fuse_dentry_operations;
init_req = fuse_request_alloc(0);
if (!init_req)
goto err_put_root;
- init_req->background = 1;
+ __set_bit(FR_BACKGROUND, &init_req->flags);
if (is_bdev) {
fc->destroy_req = fuse_request_alloc(0);
@@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
list_add_tail(&fc->entry, &fuse_conn_list);
sb->s_root = root_dentry;
- fc->connected = 1;
- file->private_data = fuse_conn_get(fc);
+ file->private_data = fud;
mutex_unlock(&fuse_mutex);
/*
* atomic_dec_and_test() in fput() provides the necessary
@@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fuse_request_free(init_req);
err_put_root:
dput(root_dentry);
+ err_dev_free:
+ fuse_dev_free(fud);
err_put_conn:
fuse_bdi_destroy(fc);
fuse_conn_put(fc);
@@ -1238,7 +1294,6 @@ static void fuse_fs_cleanup(void)
}
static struct kobject *fuse_kobj;
-static struct kobject *connections_kobj;
static int fuse_sysfs_init(void)
{
@@ -1250,11 +1305,9 @@ static int fuse_sysfs_init(void)
goto out_err;
}
- connections_kobj = kobject_create_and_add("connections", fuse_kobj);
- if (!connections_kobj) {
- err = -ENOMEM;
+ err = sysfs_create_mount_point(fuse_kobj, "connections");
+ if (err)
goto out_fuse_unregister;
- }
return 0;
@@ -1266,7 +1319,7 @@ static int fuse_sysfs_init(void)
static void fuse_sysfs_cleanup(void)
{
- kobject_put(connections_kobj);
+ sysfs_remove_mount_point(fuse_kobj, "connections");
kobject_put(fuse_kobj);
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7b3143064af1..1be3b061c05c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -110,11 +110,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
if (error)
goto out;
-
- if (acl)
- set_cached_acl(inode, type, acl);
- else
- forget_cached_acl(inode, type);
+ set_cached_acl(inode, type, acl);
out:
kfree(data);
return error;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4ad4f94edebe..1caee0534587 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,7 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include <trace/events/writeback.h>
#include "gfs2.h"
@@ -171,6 +171,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
/**
* gfs2_jdata_writepage - Write complete page
* @page: Page to write
+ * @wbc: The writeback control
*
* Returns: errno
*
@@ -221,9 +222,10 @@ static int gfs2_writepages(struct address_space *mapping,
* gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages
* @mapping: The mapping
* @wbc: The writeback control
- * @writepage: The writepage function to call for each page
* @pvec: The vector of pages
* @nr_pages: The number of pages to write
+ * @end: End position
+ * @done_index: Page index
*
* Returns: non-zero if loop should terminate, zero otherwise
*/
@@ -333,8 +335,6 @@ continue_unlock:
* gfs2_write_cache_jdata - Like write_cache_pages but different
* @mapping: The mapping to write
* @wbc: The writeback control
- * @writepage: The writepage function to call
- * @data: The data to pass to writepage
*
* The reason that we use our own function here is that we need to
* start transactions before we grab page locks. This allows us
@@ -588,6 +588,10 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
/**
* gfs2_readpages - Read a bunch of pages at once
+ * @file: The file to read from
+ * @mapping: Address space info
+ * @pages: List of pages to read
+ * @nr_pages: Number of pages to read
*
* Some notes:
* 1. This is only for readahead, so we can simply ignore any things
@@ -671,12 +675,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
if (alloc_required) {
struct gfs2_alloc_parms ap = { .aflags = 0, };
- error = gfs2_quota_lock_check(ip);
+ requested = data_blocks + ind_blocks;
+ ap.target = requested;
+ error = gfs2_quota_lock_check(ip, &ap);
if (error)
goto out_unlock;
- requested = data_blocks + ind_blocks;
- ap.target = requested;
error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_qunlock;
@@ -853,7 +857,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
* @mapping: The address space to write to
* @pos: The file position
* @len: The length of the data
- * @copied:
+ * @copied: How much was actually copied by the VFS
* @page: The page that has been written
* @fsdata: The fsdata (unused in GFS2)
*
@@ -1016,13 +1020,12 @@ out:
/**
* gfs2_ok_for_dio - check that dio is valid on this file
* @ip: The inode
- * @rw: READ or WRITE
* @offset: The offset at which we are reading or writing
*
* Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
* 1 (to accept the i/o request)
*/
-static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
{
/*
* Should we return an error here? I can't see that O_DIRECT for
@@ -1039,8 +1042,8 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
-static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1061,7 +1064,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
rv = gfs2_glock_nq(&gh);
if (rv)
return rv;
- rv = gfs2_ok_for_dio(ip, rw, offset);
+ rv = gfs2_ok_for_dio(ip, offset);
if (rv != 1)
goto out; /* dio not valid, fall back to buffered i/o */
@@ -1091,13 +1094,12 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
rv = filemap_write_and_wait_range(mapping, lstart, end);
if (rv)
goto out;
- if (rw == WRITE)
+ if (iov_iter_rw(iter) == WRITE)
truncate_inode_pages_range(mapping, lstart, end);
}
- rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- gfs2_get_block_direct, NULL, NULL, 0);
+ rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+ offset, gfs2_get_block_direct, NULL, NULL, 0);
out:
gfs2_glock_dq(&gh);
gfs2_holder_uninit(&gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f0b945ab853e..61296ecbd0e2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
if (gfs2_is_stuffed(ip) &&
(size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
- error = gfs2_quota_lock_check(ip);
+ error = gfs2_quota_lock_check(ip, &ap);
if (error)
return error;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 589f4ea9381c..30822b148f3e 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -48,9 +48,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
parent = dget_parent(dentry);
- sdp = GFS2_SB(parent->d_inode);
- dip = GFS2_I(parent->d_inode);
- inode = dentry->d_inode;
+ sdp = GFS2_SB(d_inode(parent));
+ dip = GFS2_I(d_inode(parent));
+ inode = d_inode(dentry);
if (inode) {
if (is_bad_inode(inode))
@@ -68,7 +68,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
goto fail;
}
- error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip);
+ error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip);
switch (error) {
case 0:
if (!inode)
@@ -113,10 +113,10 @@ static int gfs2_dentry_delete(const struct dentry *dentry)
{
struct gfs2_inode *ginode;
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
return 0;
- ginode = GFS2_I(dentry->d_inode);
+ ginode = GFS2_I(d_inode(dentry));
if (!ginode->i_iopen_gh.gh_gl)
return 0;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index c41d255b6a7b..5d15e9498b48 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -49,7 +49,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
*len = GFS2_SMALL_FH_SIZE;
- if (!parent || inode == sb->s_root->d_inode)
+ if (!parent || inode == d_inode(sb->s_root))
return *len;
ip = GFS2_I(parent);
@@ -88,8 +88,8 @@ static int get_name_filldir(struct dir_context *ctx, const char *name,
static int gfs2_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *dir = parent->d_inode;
- struct inode *inode = child->d_inode;
+ struct inode *dir = d_inode(parent);
+ struct inode *inode = d_inode(child);
struct gfs2_inode *dip, *ip;
struct get_name_filldir gnfd = {
.ctx.actor = get_name_filldir,
@@ -128,7 +128,7 @@ static int gfs2_get_name(struct dentry *parent, char *name,
static struct dentry *gfs2_get_parent(struct dentry *child)
{
- return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
+ return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1));
}
static struct dentry *gfs2_get_dentry(struct super_block *sb,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 3e32bb8e2d7e..cf4ab89159f4 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,7 +25,6 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
-#include <linux/aio.h>
#include <linux/delay.h>
#include "gfs2.h"
@@ -181,7 +180,7 @@ void gfs2_set_inode_flags(struct inode *inode)
flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC);
if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode))
- inode->i_flags |= S_NOSEC;
+ flags |= S_NOSEC;
if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
flags |= S_IMMUTABLE;
if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
@@ -429,11 +428,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out_unlock;
- ret = gfs2_quota_lock_check(ip);
- if (ret)
- goto out_unlock;
gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
ap.target = data_blocks + ind_blocks;
+ ret = gfs2_quota_lock_check(ip, &ap);
+ if (ret)
+ goto out_unlock;
ret = gfs2_inplace_reserve(ip, &ap);
if (ret)
goto out_quota_unlock;
@@ -710,7 +709,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
- if (file->f_flags & O_APPEND) {
+ if (iocb->ki_flags & IOCB_APPEND) {
struct gfs2_holder gh;
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
@@ -765,22 +764,30 @@ out:
brelse(dibh);
return error;
}
-
-static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
- unsigned int *data_blocks, unsigned int *ind_blocks)
+/**
+ * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of
+ * blocks, determine how many bytes can be written.
+ * @ip: The inode in question.
+ * @len: Max cap of bytes. What we return in *len must be <= this.
+ * @data_blocks: Compute and return the number of data blocks needed
+ * @ind_blocks: Compute and return the number of indirect blocks needed
+ * @max_blocks: The total blocks available to work with.
+ *
+ * Returns: void, but @len, @data_blocks and @ind_blocks are filled in.
+ */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len,
+ unsigned int *data_blocks, unsigned int *ind_blocks,
+ unsigned int max_blocks)
{
+ loff_t max = *len;
const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int max_blocks = ip->i_rgd->rd_free_clone;
unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
for (tmp = max_data; tmp > sdp->sd_diptrs;) {
tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
max_data -= tmp;
}
- /* This calculation isn't the exact reverse of gfs2_write_calc_reserve,
- so it might end up with fewer data blocks */
- if (max_data <= *data_blocks)
- return;
+
*data_blocks = max_data;
*ind_blocks = max_blocks - max_data;
*len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
@@ -797,7 +804,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
- loff_t bytes, max_bytes;
+ loff_t bytes, max_bytes, max_blks = UINT_MAX;
int error;
const loff_t pos = offset;
const loff_t count = len;
@@ -819,6 +826,9 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
gfs2_size_hint(file, offset, len);
+ gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
+ ap.min_target = data_blocks + ind_blocks;
+
while (len > 0) {
if (len < bytes)
bytes = len;
@@ -827,27 +837,41 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
offset += bytes;
continue;
}
- error = gfs2_quota_lock_check(ip);
+
+ /* We need to determine how many bytes we can actually
+ * fallocate without exceeding quota or going over the
+ * end of the fs. We start off optimistically by assuming
+ * we can write max_bytes */
+ max_bytes = (len > max_chunk_size) ? max_chunk_size : len;
+
+ /* Since max_bytes is most likely a theoretical max, we
+ * calculate a more realistic 'bytes' to serve as a good
+ * starting point for the number of bytes we may be able
+ * to write */
+ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+ ap.target = data_blocks + ind_blocks;
+
+ error = gfs2_quota_lock_check(ip, &ap);
if (error)
return error;
-retry:
- gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+ /* ap.allowed tells us how many blocks quota will allow
+ * us to write. Check if this reduces max_blks */
+ if (ap.allowed && ap.allowed < max_blks)
+ max_blks = ap.allowed;
- ap.target = data_blocks + ind_blocks;
error = gfs2_inplace_reserve(ip, &ap);
- if (error) {
- if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
- bytes >>= 1;
- bytes &= bsize_mask;
- if (bytes == 0)
- bytes = sdp->sd_sb.sb_bsize;
- goto retry;
- }
+ if (error)
goto out_qunlock;
- }
- max_bytes = bytes;
- calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len,
- &max_bytes, &data_blocks, &ind_blocks);
+
+ /* check if the selected rgrp limits our max_blks further */
+ if (ap.allowed && ap.allowed < max_blks)
+ max_blks = ap.allowed;
+
+ /* Almost done. Calculate bytes that can be written using
+ * max_blks. We also recompute max_bytes, data_blocks and
+ * ind_blocks */
+ calc_max_reserv(ip, &max_bytes, &data_blocks,
+ &ind_blocks, max_blks);
rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks);
@@ -893,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
struct gfs2_holder gh;
int ret;
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
return -EOPNOTSUPP;
mutex_lock(&inode->i_mutex);
@@ -931,6 +955,22 @@ out_uninit:
return ret;
}
+static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ int error;
+ struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
+
+ error = gfs2_rs_alloc(ip);
+ if (error)
+ return (ssize_t)error;
+
+ gfs2_size_hint(out, *ppos, len);
+
+ return iter_file_splice_write(pipe, out, ppos, len, flags);
+}
+
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
/**
@@ -1065,9 +1105,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
@@ -1077,7 +1115,7 @@ const struct file_operations gfs2_file_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
};
@@ -1097,9 +1135,7 @@ const struct file_operations gfs2_dir_fops = {
const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
@@ -1107,7 +1143,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.release = gfs2_release,
.fsync = gfs2_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = iter_file_splice_write,
+ .splice_write = gfs2_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42dffba056a..a38e38f7b6fc 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
!test_bit(GLF_DEMOTE, &gl->gl_flags))
fast_path = 1;
}
- if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+ if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
+ (glops->go_flags & GLOF_LRU))
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
@@ -2047,34 +2048,41 @@ static const struct file_operations gfs2_sbstats_fops = {
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
{
- sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
- if (!sdp->debugfs_dir)
- return -ENOMEM;
- sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glocks_fops);
- if (!sdp->debugfs_dentry_glocks)
+ struct dentry *dent;
+
+ dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+ if (IS_ERR_OR_NULL(dent))
+ goto fail;
+ sdp->debugfs_dir = dent;
+
+ dent = debugfs_create_file("glocks",
+ S_IFREG | S_IRUGO,
+ sdp->debugfs_dir, sdp,
+ &gfs2_glocks_fops);
+ if (IS_ERR_OR_NULL(dent))
goto fail;
+ sdp->debugfs_dentry_glocks = dent;
- sdp->debugfs_dentry_glstats = debugfs_create_file("glstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_glstats_fops);
- if (!sdp->debugfs_dentry_glstats)
+ dent = debugfs_create_file("glstats",
+ S_IFREG | S_IRUGO,
+ sdp->debugfs_dir, sdp,
+ &gfs2_glstats_fops);
+ if (IS_ERR_OR_NULL(dent))
goto fail;
+ sdp->debugfs_dentry_glstats = dent;
- sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats",
- S_IFREG | S_IRUGO,
- sdp->debugfs_dir, sdp,
- &gfs2_sbstats_fops);
- if (!sdp->debugfs_dentry_sbstats)
+ dent = debugfs_create_file("sbstats",
+ S_IFREG | S_IRUGO,
+ sdp->debugfs_dir, sdp,
+ &gfs2_sbstats_fops);
+ if (IS_ERR_OR_NULL(dent))
goto fail;
+ sdp->debugfs_dentry_sbstats = dent;
return 0;
fail:
gfs2_delete_debugfs_file(sdp);
- return -ENOMEM;
+ return dent ? PTR_ERR(dent) : -ENOMEM;
}
void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
@@ -2100,6 +2108,8 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
int gfs2_register_debugfs(void)
{
gfs2_root = debugfs_create_dir("gfs2", NULL);
+ if (IS_ERR(gfs2_root))
+ return PTR_ERR(gfs2_root);
return gfs2_root ? 0 : -ENOMEM;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fe91951c3361..fa3fa5e94553 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -144,6 +144,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd;
int error;
+ spin_lock(&gl->gl_spin);
+ rgd = gl->gl_object;
+ if (rgd)
+ gfs2_rgrp_brelse(rgd);
+ spin_unlock(&gl->gl_spin);
+
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
@@ -175,15 +181,17 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct address_space *mapping = &sdp->sd_aspace;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
+
+ if (rgd)
+ gfs2_rgrp_brelse(rgd);
WARN_ON_ONCE(!(flags & DIO_METADATA));
gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
- if (gl->gl_object) {
- struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+ if (rgd)
rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
- }
}
/**
@@ -561,7 +569,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_lock = inode_go_lock,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_flags = GLOF_ASPACE,
+ .go_flags = GLOF_ASPACE | GLOF_LRU,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -584,10 +592,12 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
+ .go_flags = GLOF_LRU,
};
const struct gfs2_glock_operations gfs2_flock_glops = {
.go_type = LM_TYPE_FLOCK,
+ .go_flags = GLOF_LRU,
};
const struct gfs2_glock_operations gfs2_nondisk_glops = {
@@ -596,7 +606,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
const struct gfs2_glock_operations gfs2_quota_glops = {
.go_type = LM_TYPE_QUOTA,
- .go_flags = GLOF_LVB,
+ .go_flags = GLOF_LVB | GLOF_LRU,
};
const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7a2dbbc0d634..a1ec7c20e498 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -225,6 +225,7 @@ struct gfs2_glock_operations {
const unsigned long go_flags;
#define GLOF_ASPACE 1
#define GLOF_LVB 2
+#define GLOF_LRU 4
};
enum {
@@ -301,8 +302,10 @@ struct gfs2_blkreserv {
* to the allocation code.
*/
struct gfs2_alloc_parms {
- u32 target;
+ u64 target;
+ u32 min_target;
u32 aflags;
+ u64 allowed;
};
enum {
@@ -430,6 +433,7 @@ enum {
QDF_CHANGE = 1,
QDF_LOCKED = 2,
QDF_REFRESH = 3,
+ QDF_QMSG_QUIET = 4,
};
struct gfs2_quota_data {
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 73c72253faac..063fdfcf8275 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -295,7 +295,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
(name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
- dir == sb->s_root->d_inode)) {
+ dir == d_inode(sb->s_root))) {
igrab(dir);
return dir;
}
@@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
int error;
- error = gfs2_quota_lock_check(ip);
+ error = gfs2_quota_lock_check(ip, &ap);
if (error)
goto out;
@@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
int error;
if (da->nr_blocks) {
- error = gfs2_quota_lock_check(dip);
+ error = gfs2_quota_lock_check(dip, &ap);
if (error)
goto fail_quota_locks;
@@ -687,7 +687,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
}
gfs2_set_inode_flags(inode);
- if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
+ if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
(dip->i_diskflags & GFS2_DIF_TOPDIR))
aflags |= GFS2_AF_ORLOV;
@@ -888,7 +888,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
{
struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder ghs[2];
struct buffer_head *dibh;
@@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (da.nr_blocks) {
struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
- error = gfs2_quota_lock_check(dip);
+ error = gfs2_quota_lock_check(dip, &ap);
if (error)
goto out_gunlock;
@@ -1055,7 +1055,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
static int gfs2_unlink_inode(struct gfs2_inode *dip,
const struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
int error;
@@ -1091,7 +1091,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
{
struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder ghs[3];
struct gfs2_rgrpd *rgd;
@@ -1227,8 +1227,8 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
*/
static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
- struct file *file, unsigned flags,
- umode_t mode, int *opened)
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
{
struct dentry *d;
bool excl = !!(flags & O_EXCL);
@@ -1241,7 +1241,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
return PTR_ERR(d);
if (d != NULL)
dentry = d;
- if (dentry->d_inode) {
+ if (d_really_is_positive(dentry)) {
if (!(*opened & FILE_OPENED))
return finish_no_open(file, d);
dput(d);
@@ -1282,7 +1282,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
error = -EINVAL;
break;
}
- if (dir == sb->s_root->d_inode) {
+ if (dir == d_inode(sb->s_root)) {
error = 0;
break;
}
@@ -1307,6 +1307,35 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
}
/**
+ * update_moved_ino - Update an inode that's being moved
+ * @ip: The inode being moved
+ * @ndip: The parent directory of the new filename
+ * @dir_rename: True of ip is a directory
+ *
+ * Returns: errno
+ */
+
+static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
+ int dir_rename)
+{
+ int error;
+ struct buffer_head *dibh;
+
+ if (dir_rename)
+ return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+ ip->i_inode.i_ctime = CURRENT_TIME;
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ return 0;
+}
+
+
+/**
* gfs2_rename - Rename a file
* @odir: Parent directory of old file name
* @odentry: The old dentry of the file
@@ -1321,7 +1350,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
{
struct gfs2_inode *odip = GFS2_I(odir);
struct gfs2_inode *ndip = GFS2_I(ndir);
- struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(d_inode(odentry));
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
@@ -1332,8 +1361,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
unsigned int x;
int error;
- if (ndentry->d_inode) {
- nip = GFS2_I(ndentry->d_inode);
+ if (d_really_is_positive(ndentry)) {
+ nip = GFS2_I(d_inode(ndentry));
if (ip == nip)
return 0;
}
@@ -1354,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (S_ISDIR(ip->i_inode.i_mode)) {
dir_rename = 1;
- /* don't move a dirctory into it's subdir */
+ /* don't move a directory into its subdir */
error = gfs2_ok_to_move(ip, ndip);
if (error)
goto out_gunlock_r;
@@ -1457,7 +1486,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
/* Check out the dir to be renamed */
if (dir_rename) {
- error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+ error = gfs2_permission(d_inode(odentry), MAY_WRITE);
if (error)
goto out_gunlock;
}
@@ -1470,7 +1499,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (da.nr_blocks) {
struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
- error = gfs2_quota_lock_check(ndip);
+ error = gfs2_quota_lock_check(ndip, &ap);
if (error)
goto out_gunlock;
@@ -1494,20 +1523,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (nip)
error = gfs2_unlink_inode(ndip, ndentry);
- if (dir_rename) {
- error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
- if (error)
- goto out_end_trans;
- } else {
- struct buffer_head *dibh;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_end_trans;
- ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
+ error = update_moved_ino(ip, ndip, dir_rename);
+ if (error)
+ goto out_end_trans;
error = gfs2_dir_del(odip, odentry);
if (error)
@@ -1539,6 +1557,161 @@ out:
}
/**
+ * gfs2_exchange - exchange two files
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ * @flags: The rename flags
+ *
+ * Returns: errno
+ */
+
+static int gfs2_exchange(struct inode *odir, struct dentry *odentry,
+ struct inode *ndir, struct dentry *ndentry,
+ unsigned int flags)
+{
+ struct gfs2_inode *odip = GFS2_I(odir);
+ struct gfs2_inode *ndip = GFS2_I(ndir);
+ struct gfs2_inode *oip = GFS2_I(odentry->d_inode);
+ struct gfs2_inode *nip = GFS2_I(ndentry->d_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(odir);
+ struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+ unsigned int num_gh;
+ unsigned int x;
+ umode_t old_mode = oip->i_inode.i_mode;
+ umode_t new_mode = nip->i_inode.i_mode;
+ int error;
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
+ if (odip != ndip) {
+ error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+ 0, &r_gh);
+ if (error)
+ goto out;
+
+ if (S_ISDIR(old_mode)) {
+ /* don't move a directory into its subdir */
+ error = gfs2_ok_to_move(oip, ndip);
+ if (error)
+ goto out_gunlock_r;
+ }
+
+ if (S_ISDIR(new_mode)) {
+ /* don't move a directory into its subdir */
+ error = gfs2_ok_to_move(nip, odip);
+ if (error)
+ goto out_gunlock_r;
+ }
+ }
+
+ num_gh = 1;
+ gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ if (odip != ndip) {
+ gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+ }
+ gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+
+ gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+
+ for (x = 0; x < num_gh; x++) {
+ error = gfs2_glock_nq(ghs + x);
+ if (error)
+ goto out_gunlock;
+ }
+
+ error = -ENOENT;
+ if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0)
+ goto out_gunlock;
+
+ error = gfs2_unlink_ok(odip, &odentry->d_name, oip);
+ if (error)
+ goto out_gunlock;
+ error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+ if (error)
+ goto out_gunlock;
+
+ if (S_ISDIR(old_mode)) {
+ error = gfs2_permission(odentry->d_inode, MAY_WRITE);
+ if (error)
+ goto out_gunlock;
+ }
+ if (S_ISDIR(new_mode)) {
+ error = gfs2_permission(ndentry->d_inode, MAY_WRITE);
+ if (error)
+ goto out_gunlock;
+ }
+ error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 4 * RES_LEAF, 0);
+ if (error)
+ goto out_gunlock;
+
+ error = update_moved_ino(oip, ndip, S_ISDIR(old_mode));
+ if (error)
+ goto out_end_trans;
+
+ error = update_moved_ino(nip, odip, S_ISDIR(new_mode));
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_dir_mvino(ndip, &ndentry->d_name, oip,
+ IF2DT(old_mode));
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_dir_mvino(odip, &odentry->d_name, nip,
+ IF2DT(new_mode));
+ if (error)
+ goto out_end_trans;
+
+ if (odip != ndip) {
+ if (S_ISDIR(new_mode) && !S_ISDIR(old_mode)) {
+ inc_nlink(&odip->i_inode);
+ drop_nlink(&ndip->i_inode);
+ } else if (S_ISDIR(old_mode) && !S_ISDIR(new_mode)) {
+ inc_nlink(&ndip->i_inode);
+ drop_nlink(&odip->i_inode);
+ }
+ }
+ mark_inode_dirty(&ndip->i_inode);
+ if (odip != ndip)
+ mark_inode_dirty(&odip->i_inode);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_gunlock:
+ while (x--) {
+ gfs2_glock_dq(ghs + x);
+ gfs2_holder_uninit(ghs + x);
+ }
+out_gunlock_r:
+ if (r_gh.gh_gl)
+ gfs2_glock_dq_uninit(&r_gh);
+out:
+ return error;
+}
+
+static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
+ struct inode *ndir, struct dentry *ndentry,
+ unsigned int flags)
+{
+ flags &= ~RENAME_NOREPLACE;
+
+ if (flags & ~RENAME_EXCHANGE)
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE)
+ return gfs2_exchange(odir, odentry, ndir, ndentry, flags);
+
+ return gfs2_rename(odir, odentry, ndir, ndentry);
+}
+
+/**
* gfs2_follow_link - Follow a symbolic link
* @dentry: The dentry of the link
* @nd: Data that we pass to vfs_follow_link()
@@ -1548,9 +1721,9 @@ out:
* Returns: 0 on success or error code
*/
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
{
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_holder i_gh;
struct buffer_head *dibh;
unsigned int size;
@@ -1561,8 +1734,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
error = gfs2_glock_nq(&i_gh);
if (error) {
gfs2_holder_uninit(&i_gh);
- nd_set_link(nd, ERR_PTR(error));
- return NULL;
+ return ERR_PTR(error);
}
size = (unsigned int)i_size_read(&ip->i_inode);
@@ -1586,8 +1758,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
brelse(dibh);
out:
gfs2_glock_dq_uninit(&i_gh);
- nd_set_link(nd, buf);
- return NULL;
+ if (!IS_ERR(buf))
+ *cookie = buf;
+ return buf;
}
/**
@@ -1669,6 +1842,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
kuid_t ouid, nuid;
kgid_t ogid, ngid;
int error;
+ struct gfs2_alloc_parms ap;
ouid = inode->i_uid;
ogid = inode->i_gid;
@@ -1696,9 +1870,11 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (error)
goto out;
+ ap.target = gfs2_get_inode_blocks(&ip->i_inode);
+
if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
!gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
- error = gfs2_quota_check(ip, nuid, ngid);
+ error = gfs2_quota_check(ip, nuid, ngid, &ap);
if (error)
goto out_gunlock_q;
}
@@ -1713,9 +1889,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
!gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
- u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
- gfs2_quota_change(ip, -blocks, ouid, ogid);
- gfs2_quota_change(ip, blocks, nuid, ngid);
+ gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid);
+ gfs2_quota_change(ip, ap.target, nuid, ngid);
}
out_end_trans:
@@ -1740,7 +1915,7 @@ out:
static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
int error;
@@ -1796,7 +1971,7 @@ out:
static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int error;
@@ -1819,7 +1994,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
static int gfs2_setxattr(struct dentry *dentry, const char *name,
const void *data, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
@@ -1839,7 +2014,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
void *data, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
@@ -1860,7 +2035,7 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
static int gfs2_removexattr(struct dentry *dentry, const char *name)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int ret;
@@ -1941,7 +2116,7 @@ const struct inode_operations gfs2_dir_iops = {
.mkdir = gfs2_mkdir,
.rmdir = gfs2_unlink,
.mknod = gfs2_mknod,
- .rename = gfs2_rename,
+ .rename2 = gfs2_rename2,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index efc8e254787c..1e3a93f2f71d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -647,7 +647,7 @@ out_unlock:
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
- struct inode *master = sdp->sd_master_dir->d_inode;
+ struct inode *master = d_inode(sdp->sd_master_dir);
struct gfs2_holder ji_gh;
struct gfs2_inode *ip;
int jindex = 1;
@@ -756,6 +756,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
}
}
+ sdp->sd_log_idle = 1;
set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
gfs2_glock_dq_uninit(&ji_gh);
jindex = 0;
@@ -782,7 +783,7 @@ static struct lock_class_key gfs2_quota_imutex_key;
static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
int error = 0;
- struct inode *master = sdp->sd_master_dir->d_inode;
+ struct inode *master = d_inode(sdp->sd_master_dir);
if (undo)
goto fail_qinode;
@@ -848,7 +849,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
char buf[30];
int error = 0;
struct gfs2_inode *ip;
- struct inode *master = sdp->sd_master_dir->d_inode;
+ struct inode *master = d_inode(sdp->sd_master_dir);
if (sdp->sd_args.ar_spectator)
return 0;
@@ -1357,7 +1358,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
return ERR_PTR(error);
}
s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
- path.dentry->d_inode->i_sb->s_bdev);
+ d_inode(path.dentry)->i_sb->s_bdev);
path_put(&path);
if (IS_ERR(s)) {
pr_warn("gfs2 mount does not exist\n");
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3aa17d4d1cfc..9b61f92fcfdf 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -649,9 +649,117 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
slot_hold(qd);
}
+ if (change < 0) /* Reset quiet flag if we freed some blocks */
+ clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
mutex_unlock(&sdp->sd_quota_mutex);
}
+static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
+ unsigned off, void *buf, unsigned bytes)
+{
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ struct buffer_head *bh;
+ void *kaddr;
+ u64 blk;
+ unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
+ unsigned to_write = bytes, pg_off = off;
+ int done = 0;
+
+ blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
+ boff = off % bsize;
+
+ page = find_or_create_page(mapping, index, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, bsize, 0);
+
+ bh = page_buffers(page);
+ while (!done) {
+ /* Find the beginning block within the page */
+ if (pg_off >= ((bnum * bsize) + bsize)) {
+ bh = bh->b_this_page;
+ bnum++;
+ blk++;
+ continue;
+ }
+ if (!buffer_mapped(bh)) {
+ gfs2_block_map(inode, blk, bh, 1);
+ if (!buffer_mapped(bh))
+ goto unlock_out;
+ /* If it's a newly allocated disk block, zero it */
+ if (buffer_new(bh))
+ zero_user(page, bnum * bsize, bh->b_size);
+ }
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+ if (!buffer_uptodate(bh)) {
+ ll_rw_block(READ | REQ_META, 1, &bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ goto unlock_out;
+ }
+ gfs2_trans_add_data(ip->i_gl, bh);
+
+ /* If we need to write to the next block as well */
+ if (to_write > (bsize - boff)) {
+ pg_off += (bsize - boff);
+ to_write -= (bsize - boff);
+ boff = pg_off % bsize;
+ continue;
+ }
+ done = 1;
+ }
+
+ /* Write to the page, now that we have setup the buffer(s) */
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr + off, buf, bytes);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ unlock_page(page);
+ page_cache_release(page);
+
+ return 0;
+
+unlock_out:
+ unlock_page(page);
+ page_cache_release(page);
+ return -EIO;
+}
+
+static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
+ loff_t loc)
+{
+ unsigned long pg_beg;
+ unsigned pg_off, nbytes, overflow = 0;
+ int pg_oflow = 0, error;
+ void *ptr;
+
+ nbytes = sizeof(struct gfs2_quota);
+
+ pg_beg = loc >> PAGE_CACHE_SHIFT;
+ pg_off = loc % PAGE_CACHE_SIZE;
+
+ /* If the quota straddles a page boundary, split the write in two */
+ if ((pg_off + nbytes) > PAGE_CACHE_SIZE) {
+ pg_oflow = 1;
+ overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE;
+ }
+
+ ptr = qp;
+ error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr,
+ nbytes - overflow);
+ /* If there's an overflow, write the remaining bytes to the next page */
+ if (!error && pg_oflow)
+ error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0,
+ ptr + nbytes - overflow,
+ overflow);
+ return error;
+}
+
/**
* gfs2_adjust_quota - adjust record of current block usage
* @ip: The quota inode
@@ -672,15 +780,8 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
{
struct inode *inode = &ip->i_inode;
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct address_space *mapping = inode->i_mapping;
- unsigned long index = loc >> PAGE_CACHE_SHIFT;
- unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
- unsigned blocksize, iblock, pos;
- struct buffer_head *bh;
- struct page *page;
- void *kaddr, *ptr;
struct gfs2_quota q;
- int err, nbytes;
+ int err;
u64 size;
if (gfs2_is_stuffed(ip)) {
@@ -694,8 +795,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
if (err < 0)
return err;
+ loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */
err = -EIO;
be64_add_cpu(&q.qu_value, change);
+ if (((s64)be64_to_cpu(q.qu_value)) < 0)
+ q.qu_value = 0; /* Never go negative on quota usage */
qd->qd_qb.qb_value = q.qu_value;
if (fdq) {
if (fdq->d_fieldmask & QC_SPC_SOFT) {
@@ -712,79 +816,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
}
}
- /* Write the quota into the quota file on disk */
- ptr = &q;
- nbytes = sizeof(struct gfs2_quota);
-get_a_page:
- page = find_or_create_page(mapping, index, GFP_NOFS);
- if (!page)
- return -ENOMEM;
-
- blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
-
- bh = page_buffers(page);
- pos = blocksize;
- while (offset >= pos) {
- bh = bh->b_this_page;
- iblock++;
- pos += blocksize;
- }
-
- if (!buffer_mapped(bh)) {
- gfs2_block_map(inode, iblock, bh, 1);
- if (!buffer_mapped(bh))
- goto unlock_out;
- /* If it's a newly allocated disk block for quota, zero it */
- if (buffer_new(bh))
- zero_user(page, pos - blocksize, bh->b_size);
- }
-
- if (PageUptodate(page))
- set_buffer_uptodate(bh);
-
- if (!buffer_uptodate(bh)) {
- ll_rw_block(READ | REQ_META, 1, &bh);
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
- goto unlock_out;
- }
-
- gfs2_trans_add_data(ip->i_gl, bh);
-
- kaddr = kmap_atomic(page);
- if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE)
- nbytes = PAGE_CACHE_SIZE - offset;
- memcpy(kaddr + offset, ptr, nbytes);
- flush_dcache_page(page);
- kunmap_atomic(kaddr);
- unlock_page(page);
- page_cache_release(page);
-
- /* If quota straddles page boundary, we need to update the rest of the
- * quota at the beginning of the next page */
- if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
- ptr = ptr + nbytes;
- nbytes = sizeof(struct gfs2_quota) - nbytes;
- offset = 0;
- index++;
- goto get_a_page;
+ err = gfs2_write_disk_quota(ip, &q, loc);
+ if (!err) {
+ size = loc + sizeof(struct gfs2_quota);
+ if (size > inode->i_size)
+ i_size_write(inode, size);
+ inode->i_mtime = inode->i_atime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ set_bit(QDF_REFRESH, &qd->qd_flags);
}
- size = loc + sizeof(struct gfs2_quota);
- if (size > inode->i_size)
- i_size_write(inode, size);
- inode->i_mtime = inode->i_atime = CURRENT_TIME;
- mark_inode_dirty(inode);
- set_bit(QDF_REFRESH, &qd->qd_flags);
- return 0;
-
-unlock_out:
- unlock_page(page);
- page_cache_release(page);
return err;
}
@@ -923,6 +964,9 @@ restart:
if (error)
return error;
+ if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+ force_refresh = FORCE;
+
qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
@@ -974,11 +1018,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
sizeof(struct gfs2_quota_data *), sort_qd, NULL);
for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- int force = NO_FORCE;
qd = ip->i_res->rs_qa_qd[x];
- if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
- force = FORCE;
- error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]);
+ error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
if (error)
break;
}
@@ -1094,14 +1135,33 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
return 0;
}
-int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
+/**
+ * gfs2_quota_check - check if allocating new blocks will exceed quota
+ * @ip: The inode for which this check is being performed
+ * @uid: The uid to check against
+ * @gid: The gid to check against
+ * @ap: The allocation parameters. ap->target contains the requested
+ * blocks. ap->min_target, if set, contains the minimum blks
+ * requested.
+ *
+ * Returns: 0 on success.
+ * min_req = ap->min_target ? ap->min_target : ap->target;
+ * quota must allow atleast min_req blks for success and
+ * ap->allowed is set to the number of blocks allowed
+ *
+ * -EDQUOT otherwise, quota violation. ap->allowed is set to number
+ * of blocks available.
+ */
+int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+ struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
- s64 value;
+ s64 value, warn, limit;
unsigned int x;
int error = 0;
+ ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
return 0;
@@ -1115,30 +1175,40 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
qid_eq(qd->qd_id, make_kqid_gid(gid))))
continue;
+ warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn);
+ limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit);
value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
spin_lock(&qd_lock);
value += qd->qd_change;
spin_unlock(&qd_lock);
- if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
- print_message(qd, "exceeded");
- quota_send_warning(qd->qd_id,
- sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN);
-
- error = -EDQUOT;
- break;
- } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
- (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
+ if (limit > 0 && (limit - value) < ap->allowed)
+ ap->allowed = limit - value;
+ /* If we can't meet the target */
+ if (limit && limit < (value + (s64)ap->target)) {
+ /* If no min_target specified or we don't meet
+ * min_target, return -EDQUOT */
+ if (!ap->min_target || ap->min_target > ap->allowed) {
+ if (!test_and_set_bit(QDF_QMSG_QUIET,
+ &qd->qd_flags)) {
+ print_message(qd, "exceeded");
+ quota_send_warning(qd->qd_id,
+ sdp->sd_vfs->s_dev,
+ QUOTA_NL_BHARDWARN);
+ }
+ error = -EDQUOT;
+ break;
+ }
+ } else if (warn && warn < value &&
time_after_eq(jiffies, qd->qd_last_warn +
- gfs2_tune_get(sdp,
- gt_quota_warn_period) * HZ)) {
+ gfs2_tune_get(sdp, gt_quota_warn_period)
+ * HZ)) {
quota_send_warning(qd->qd_id,
sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
error = print_message(qd, "warning");
qd->qd_last_warn = jiffies;
}
}
-
return error;
}
@@ -1468,32 +1538,34 @@ int gfs2_quotad(void *data)
return 0;
}
-static int gfs2_quota_get_xstate(struct super_block *sb,
- struct fs_quota_stat *fqs)
+static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
- memset(fqs, 0, sizeof(struct fs_quota_stat));
- fqs->qs_version = FS_QSTAT_VERSION;
+ memset(state, 0, sizeof(*state));
switch (sdp->sd_args.ar_quota) {
case GFS2_QUOTA_ON:
- fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
+ state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
+ state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
/*FALLTHRU*/
case GFS2_QUOTA_ACCOUNT:
- fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
+ state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED |
+ QCI_SYSFILE;
+ state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED |
+ QCI_SYSFILE;
break;
case GFS2_QUOTA_OFF:
break;
}
-
if (sdp->sd_quota_inode) {
- fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr;
- fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks;
+ state->s_state[USRQUOTA].ino =
+ GFS2_I(sdp->sd_quota_inode)->i_no_addr;
+ state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks;
}
- fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
- fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
- fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru);
+ state->s_state[USRQUOTA].nextents = 1; /* unsupported */
+ state->s_state[GRPQUOTA] = state->s_state[USRQUOTA];
+ state->s_incoredqs = list_lru_count(&gfs2_qd_lru);
return 0;
}
@@ -1620,6 +1692,8 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
/* Apply changes */
error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+ if (!error)
+ clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
gfs2_trans_end(sdp);
out_release:
@@ -1638,7 +1712,7 @@ out_put:
const struct quotactl_ops gfs2_quotactl_ops = {
.quota_sync = gfs2_quota_sync,
- .get_xstate = gfs2_quota_get_xstate,
+ .get_state = gfs2_quota_get_state,
.get_dqblk = gfs2_get_dqblk,
.set_dqblk = gfs2_set_dqblk,
};
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 55d506eb3c4a..ad04b3acae2b 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -24,7 +24,8 @@ extern void gfs2_quota_unhold(struct gfs2_inode *ip);
extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unlock(struct gfs2_inode *ip);
-extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
+ struct gfs2_alloc_parms *ap);
extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
kuid_t uid, kgid_t gid);
@@ -37,7 +38,8 @@ extern int gfs2_quotad(void *data);
extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp);
-static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
+static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
+ struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
int ret;
@@ -48,7 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
+ ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
gfs2_quota_unlock(ip);
return ret;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9150207f365c..c6c62321dfd6 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -978,10 +978,10 @@ static void set_rgrp_preferences(struct gfs2_sbd *sdp)
rgd->rd_flags |= GFS2_RDF_PREFERRED;
for (i = 0; i < sdp->sd_journals; i++) {
rgd = gfs2_rgrpd_get_next(rgd);
- if (rgd == first)
+ if (!rgd || rgd == first)
break;
}
- } while (rgd != first);
+ } while (rgd && rgd != first);
}
/**
@@ -1244,14 +1244,13 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
}
/**
- * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
- * @gh: The glock holder for the resource group
+ * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: The resource group
*
*/
-void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
{
- struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
int x, length = rgd->rd_length;
for (x = 0; x < length; x++) {
@@ -1264,6 +1263,22 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
}
+/**
+ * gfs2_rgrp_go_unlock - Unlock a rgrp glock
+ * @gh: The glock holder for the resource group
+ *
+ */
+
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+{
+ struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
+ int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
+ test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
+
+ if (rgd && demote_requested)
+ gfs2_rgrp_brelse(rgd);
+}
+
int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
struct buffer_head *bh,
const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
@@ -1711,10 +1726,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
return ret;
bitmap_full: /* Mark bitmap as full and fall through */
- if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
- struct gfs2_bitmap *bi = rbm_bi(rbm);
+ if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
set_bit(GBF_FULL, &bi->bi_flags);
- }
next_bitmap: /* Find next bitmap in the rgrp */
rbm->offset = 0;
@@ -1850,14 +1863,23 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
const struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_lkstats *st;
s64 r_dcount, l_dcount;
- s64 r_srttb, l_srttb;
+ s64 l_srttb, a_srttb = 0;
s64 srttb_diff;
s64 sqr_diff;
s64 var;
+ int cpu, nonzero = 0;
preempt_disable();
+ for_each_present_cpu(cpu) {
+ st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
+ if (st->stats[GFS2_LKS_SRTTB]) {
+ a_srttb += st->stats[GFS2_LKS_SRTTB];
+ nonzero++;
+ }
+ }
st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
- r_srttb = st->stats[GFS2_LKS_SRTTB];
+ if (nonzero)
+ do_div(a_srttb, nonzero);
r_dcount = st->stats[GFS2_LKS_DCOUNT];
var = st->stats[GFS2_LKS_SRTTVARB] +
gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
@@ -1866,10 +1888,10 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
- if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
+ if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
return false;
- srttb_diff = r_srttb - l_srttb;
+ srttb_diff = a_srttb - l_srttb;
sqr_diff = srttb_diff * srttb_diff;
var *= 2;
@@ -1946,10 +1968,18 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
* @ip: the inode to reserve space for
* @ap: the allocation parameters
*
- * Returns: errno
+ * We try our best to find an rgrp that has at least ap->target blocks
+ * available. After a couple of passes (loops == 2), the prospects of finding
+ * such an rgrp diminish. At this stage, we return the first rgrp that has
+ * atleast ap->min_target blocks available. Either way, we set ap->allowed to
+ * the number of blocks available in the chosen rgrp.
+ *
+ * Returns: 0 on success,
+ * -ENOMEM if a suitable rgrp can't be found
+ * errno otherwise
*/
-int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
@@ -2012,7 +2042,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
/* Skip unuseable resource groups */
if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
GFS2_RDF_ERROR)) ||
- (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
+ (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb)
@@ -2027,11 +2057,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
goto check_rgrp;
/* If rgrp has enough free space, use it */
- if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
+ if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
+ (loops == 2 && ap->min_target &&
+ rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
ip->i_rgd = rs->rs_rbm.rgd;
+ ap->allowed = ip->i_rgd->rd_free_clone;
return 0;
}
-
check_rgrp:
/* Check for unlinked inodes which can be reclaimed */
if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b104f4af3afd..c0ab33fa3eed 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -36,12 +36,14 @@ extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
#define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip,
+ struct gfs2_alloc_parms *ap);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 1666382b198d..2982445947e1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
- if (bdi->dirty_exceeded)
+ if (bdi->wb.dirty_exceeded)
gfs2_ail1_flush(sdp, wbc);
else
filemap_fdatawrite(metamapping);
@@ -1171,7 +1171,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s
static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct super_block *sb = dentry->d_inode->i_sb;
+ struct super_block *sb = d_inode(dentry)->i_sb;
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_statfs_change_host sc;
int error;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ae8e8811f0e8..c9ff1cf7d4f3 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -101,8 +101,11 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
- int error;
- int n = simple_strtol(buf, NULL, 0);
+ int error, n;
+
+ error = kstrtoint(buf, 0, &n);
+ if (error)
+ return error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -134,10 +137,16 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
+ int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (simple_strtol(buf, NULL, 0) != 1)
+ error = kstrtoint(buf, 0, &val);
+ if (error)
+ return error;
+
+ if (val != 1)
return -EINVAL;
gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
@@ -148,10 +157,16 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (simple_strtol(buf, NULL, 0) != 1)
+ error = kstrtoint(buf, 0, &val);
+ if (error)
+ return error;
+
+ if (val != 1)
return -EINVAL;
gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -161,10 +176,16 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
+ int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (simple_strtol(buf, NULL, 0) != 1)
+ error = kstrtoint(buf, 0, &val);
+ if (error)
+ return error;
+
+ if (val != 1)
return -EINVAL;
gfs2_quota_sync(sdp->sd_vfs, 0);
@@ -181,7 +202,9 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- id = simple_strtoul(buf, NULL, 0);
+ error = kstrtou32(buf, 0, &id);
+ if (error)
+ return error;
qid = make_kqid(current_user_ns(), USRQUOTA, id);
if (!qid_valid(qid))
@@ -201,7 +224,9 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- id = simple_strtoul(buf, NULL, 0);
+ error = kstrtou32(buf, 0, &id);
+ if (error)
+ return error;
qid = make_kqid(current_user_ns(), GRPQUOTA, id);
if (!qid_valid(qid))
@@ -324,10 +349,11 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- ssize_t ret = len;
- int val;
+ int ret, val;
- val = simple_strtol(buf, NULL, 0);
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
if (val == 1)
set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
@@ -336,9 +362,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
smp_mb__after_atomic();
gfs2_glock_thaw(sdp);
} else {
- ret = -EINVAL;
+ return -EINVAL;
}
- return ret;
+ return len;
}
static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
@@ -350,17 +376,18 @@ static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
- ssize_t ret = len;
- int val;
+ int ret, val;
- val = simple_strtol(buf, NULL, 0);
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
if ((val == 1) &&
!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
complete(&sdp->sd_wdack);
else
- ret = -EINVAL;
- return ret;
+ return -EINVAL;
+ return len;
}
static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
@@ -553,11 +580,14 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
{
struct gfs2_tune *gt = &sdp->sd_tune;
unsigned int x;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- x = simple_strtoul(buf, NULL, 0);
+ error = kstrtouint(buf, 0, &x);
+ if (error)
+ return error;
if (check_zero && !x)
return -EINVAL;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 0b81f783f787..4c096fa9e2a1 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -420,7 +420,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_request er;
struct gfs2_holder i_gh;
int error;
@@ -586,7 +586,7 @@ out:
static int gfs2_xattr_get(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_location el;
int error;
@@ -732,7 +732,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
return error;
- error = gfs2_quota_lock_check(ip);
+ error = gfs2_quota_lock_check(ip, &ap);
if (error)
return error;
@@ -1230,7 +1230,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
static int gfs2_xattr_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
- return __gfs2_xattr_set(dentry->d_inode, name, value,
+ return __gfs2_xattr_set(d_inode(dentry), name, value,
size, flags, type);
}
diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c
index e057ec542a6a..8d931b157bbe 100644
--- a/fs/hfs/attr.c
+++ b/fs/hfs/attr.c
@@ -16,7 +16,7 @@
int hfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct hfs_cat_file *file;
@@ -59,7 +59,7 @@ out:
ssize_t hfs_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct hfs_cat_file *file;
@@ -105,7 +105,7 @@ out:
ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode))
return -EOPNOTSUPP;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 145566851e7a..70788e03820a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -197,7 +197,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode = hfs_new_inode(dir, &dentry->d_name, mode);
if (!inode)
- return -ENOSPC;
+ return -ENOMEM;
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
@@ -226,7 +226,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
if (!inode)
- return -ENOSPC;
+ return -ENOMEM;
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
@@ -253,7 +253,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
*/
static int hfs_remove(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int res;
if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
@@ -285,18 +285,18 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int res;
/* Unlink destination if it already exists */
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
res = hfs_remove(new_dir, new_dentry);
if (res)
return res;
}
- res = hfs_cat_move(old_dentry->d_inode->i_ino,
+ res = hfs_cat_move(d_inode(old_dentry)->i_ino,
old_dir, &old_dentry->d_name,
new_dir, &new_dentry->d_name);
if (!res)
hfs_cat_build_key(old_dir->i_sb,
- (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key,
+ (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key,
new_dir->i_ino, &new_dentry->d_name);
return res;
}
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 95d255219b1e..1f1c7dcbcc2f 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb);
#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60)
#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60)
-#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode))
+#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode))
#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info)
#define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) }
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d0929bc81782..b99ebddb10cb 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,7 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "hfs_fs.h"
#include "btree.h"
@@ -124,8 +124,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
return res ? try_to_free_buffers(page) : 0;
}
-static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -133,13 +133,13 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
@@ -600,7 +600,7 @@ static int hfs_file_release(struct inode *inode, struct file *file)
int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfs_sb_info *hsb = HFS_SB(inode->i_sb);
int error;
@@ -674,9 +674,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
static const struct file_operations hfs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eee7206c38d1..55c03b9e9070 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 91b91fd3a901..2875961fdc10 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -21,7 +21,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if(!inode)
return 1;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index c1422d91cd36..528e38b5af7f 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -118,9 +118,7 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
int b, e;
int res;
- if (!rec_found)
- BUG();
-
+ BUG_ON(!rec_found);
b = 0;
e = bnode->num_recs - 1;
res = -ENOENT;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 7892e6fddb66..022974ab6e3c 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -350,10 +350,11 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
&fd.search_key->cat.name.unicode,
off + 2, len);
fd.search_key->key_len = cpu_to_be16(6 + len);
- } else
+ } else {
err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str);
if (unlikely(err))
goto out;
+ }
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f0235c1640af..d0f39dcbb58e 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -81,7 +81,7 @@ again:
HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->
create_date ||
entry.file.create_date ==
- HFSPLUS_I(sb->s_root->d_inode)->
+ HFSPLUS_I(d_inode(sb->s_root))->
create_date) &&
HFSPLUS_SB(sb)->hidden_dir) {
struct qstr str;
@@ -296,8 +296,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
struct dentry *dst_dentry)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
- struct inode *inode = src_dentry->d_inode;
- struct inode *src_dir = src_dentry->d_parent->d_inode;
+ struct inode *inode = d_inode(src_dentry);
+ struct inode *src_dir = d_inode(src_dentry->d_parent);
struct qstr str;
char name[32];
u32 cnid, id;
@@ -353,7 +353,7 @@ out:
static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct qstr str;
char name[32];
u32 cnid;
@@ -410,7 +410,7 @@ out:
static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int res;
if (inode->i_size != 2)
@@ -434,7 +434,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode;
- int res = -ENOSPC;
+ int res = -ENOMEM;
mutex_lock(&sbi->vh_mutex);
inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
@@ -476,7 +476,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode;
- int res = -ENOSPC;
+ int res = -ENOMEM;
mutex_lock(&sbi->vh_mutex);
inode = hfsplus_new_inode(dir->i_sb, mode);
@@ -529,7 +529,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
int res;
/* Unlink destination if it already exists */
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
if (d_is_dir(new_dentry))
res = hfsplus_rmdir(new_dir, new_dentry);
else
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index b0441d65fa54..f91a1faf819e 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -263,7 +263,7 @@ struct hfsplus_inode_info {
static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
{
- return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
+ return container_of(inode, struct hfsplus_inode_info, vfs_inode);
}
/*
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0cf786f2d046..6dd107d7421e 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,7 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
@@ -122,8 +122,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
return res ? try_to_free_buffers(page) : 0;
}
-static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -131,14 +131,13 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
- hfsplus_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
@@ -244,7 +243,7 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, attr);
@@ -254,6 +253,12 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
+ if (attr->ia_size > inode->i_size) {
+ error = generic_cont_expand_simple(inode,
+ attr->ia_size);
+ if (error)
+ return error;
+ }
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
}
@@ -341,9 +346,7 @@ static const struct inode_operations hfsplus_file_inode_operations = {
static const struct file_operations hfsplus_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index d3ff5cc317d7..0624ce4e0702 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -26,7 +26,7 @@
static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
struct hfsplus_vh *vh = sbi->s_vhdr;
struct hfsplus_vh *bvh = sbi->s_backup_vhdr;
@@ -76,7 +76,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
{
struct inode *inode = file_inode(file);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
- unsigned int flags;
+ unsigned int flags, new_fl = 0;
int err = 0;
err = mnt_want_write_file(file);
@@ -110,14 +110,12 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
}
if (flags & FS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & FS_APPEND_FL)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
+ new_fl |= S_APPEND;
+
+ inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
if (flags & FS_NODUMP_FL)
hip->userflags |= HFSPLUS_FLG_NODUMP;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 593af2fdcc2d..7302d96ae8bf 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/vfs.h>
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index d98094a9f476..416b1dbafe51 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -44,7 +44,7 @@ static int strcmp_xattr_acl(const char *name)
return -1;
}
-static inline int is_known_namespace(const char *name)
+static bool is_known_namespace(const char *name)
{
if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) &&
strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
@@ -424,6 +424,28 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
return len;
}
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags,
+ const char *prefix, size_t prefixlen)
+{
+ char *xattr_name;
+ int res;
+
+ if (!strcmp(name, ""))
+ return -EINVAL;
+
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+ strcpy(xattr_name, prefix);
+ strcpy(xattr_name + prefixlen, name);
+ res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
+ flags);
+ kfree(xattr_name);
+ return res;
+}
+
static ssize_t hfsplus_getxattr_finder_info(struct inode *inode,
void *value, size_t size)
{
@@ -560,6 +582,30 @@ failed_getxattr_init:
return res;
}
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size,
+ const char *prefix, size_t prefixlen)
+{
+ int res;
+ char *xattr_name;
+
+ if (!strcmp(name, ""))
+ return -EINVAL;
+
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
+
+ strcpy(xattr_name, prefix);
+ strcpy(xattr_name + prefixlen, name);
+
+ res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size);
+ kfree(xattr_name);
+ return res;
+
+}
+
static inline int can_list(const char *xattr_name)
{
if (!xattr_name)
@@ -574,7 +620,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry,
char *buffer, size_t size)
{
ssize_t res = 0;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
u16 entry_type;
u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)];
@@ -642,7 +688,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
ssize_t err;
ssize_t res = 0;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hfs_find_data fd;
u16 key_len = 0;
struct hfsplus_attr_key attr_key;
@@ -806,9 +852,6 @@ end_removexattr:
static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
if (!strcmp(name, ""))
return -EINVAL;
@@ -818,24 +861,19 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
- + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
- strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ /*
+ * osx is the namespace we use to indicate an unprefixed
+ * attribute on the filesystem (like the ones that OS X
+ * creates), so we pass the name through unmodified (after
+ * ensuring it doesn't conflict with another namespace).
+ */
+ return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
}
static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
if (!strcmp(name, ""))
return -EINVAL;
@@ -845,16 +883,14 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
*/
if (is_known_namespace(name))
return -EOPNOTSUPP;
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
- + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
- strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ /*
+ * osx is the namespace we use to indicate an unprefixed
+ * attribute on the filesystem (like the ones that OS X
+ * creates), so we pass the name through unmodified (after
+ * ensuring it doesn't conflict with another namespace).
+ */
+ return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
}
static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 288530cf80b5..f9b0955b3d28 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,22 +21,16 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
int __hfsplus_setxattr(struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-static inline int hfsplus_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags);
-}
+int hfsplus_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags,
+ const char *prefix, size_t prefixlen);
ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
- void *value, size_t size);
-
-static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
- const char *name,
- void *value,
- size_t size)
-{
- return __hfsplus_getxattr(dentry->d_inode, name, value, size);
-}
+ void *value, size_t size);
+
+ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size,
+ const char *prefix, size_t prefixlen);
ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 6ec5e107691f..aacff00a9ff9 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -16,43 +16,17 @@
static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_SECURITY_PREFIX);
- strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
}
static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_SECURITY_PREFIX);
- strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN);
}
static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 3c5f27e4746a..bcf65089b7f7 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -14,43 +14,16 @@
static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
- strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN);
}
static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
- strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 2b625a538b64..5aa0e6dc4a1e 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -14,43 +14,16 @@
static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char *xattr_name;
- int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_USER_PREFIX);
- strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
-
- res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
- kfree(xattr_name);
- return res;
+ return hfsplus_getxattr(dentry, name, buffer, size,
+ XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char *xattr_name;
- int res;
-
- if (!strcmp(name, ""))
- return -EINVAL;
-
- xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
- GFP_KERNEL);
- if (!xattr_name)
- return -ENOMEM;
- strcpy(xattr_name, XATTR_USER_PREFIX);
- strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
-
- res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
- kfree(xattr_name);
- return res;
+ return hfsplus_setxattr(dentry, name, buffer, size, flags,
+ XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 4fcd40d6f308..91e19f9dffe5 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -66,7 +66,8 @@ extern int stat_file(const char *path, struct hostfs_stat *p, int fd);
extern int access_file(char *path, int r, int w, int x);
extern int open_file(char *path, int r, int w, int append);
extern void *open_dir(char *path, int *err_out);
-extern char *read_dir(void *stream, unsigned long long *pos,
+extern void seek_dir(void *stream, unsigned long long pos);
+extern char *read_dir(void *stream, unsigned long long *pos_out,
unsigned long long *ino_out, int *len_out,
unsigned int *type_out);
extern void close_file(void *stream);
@@ -77,8 +78,7 @@ extern int write_file(int fd, unsigned long long *offset, const char *buf,
int len);
extern int lseek_file(int fd, long long offset, int whence);
extern int fsync_file(int fd, int datasync);
-extern int file_create(char *name, int ur, int uw, int ux, int gr,
- int gw, int gx, int or, int ow, int ox);
+extern int file_create(char *name, int mode);
extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd);
extern int make_symlink(const char *from, const char *to);
extern int unlink_file(const char *file);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fd62cae0fdcb..059597b23f67 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -24,6 +24,7 @@ struct hostfs_inode_info {
int fd;
fmode_t mode;
struct inode vfs_inode;
+ struct mutex open_mutex;
};
static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
@@ -92,16 +93,22 @@ static char *__dentry_name(struct dentry *dentry, char *name)
__putname(name);
return NULL;
}
+
+ /*
+ * This function relies on the fact that dentry_path_raw() will place
+ * the path name at the end of the provided buffer.
+ */
+ BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
+
strlcpy(name, root, PATH_MAX);
if (len > p - name) {
__putname(name);
return NULL;
}
- if (p > name + len) {
- char *s = name + len;
- while ((*s++ = *p++) != '\0')
- ;
- }
+
+ if (p > name + len)
+ strcpy(name + len, p);
+
return name;
}
@@ -135,21 +142,19 @@ static char *follow_link(char *link)
int len, n;
char *name, *resolved, *end;
- len = 64;
- while (1) {
+ name = __getname();
+ if (!name) {
n = -ENOMEM;
- name = kmalloc(len, GFP_KERNEL);
- if (name == NULL)
- goto out;
-
- n = hostfs_do_readlink(link, name, len);
- if (n < len)
- break;
- len *= 2;
- kfree(name);
+ goto out_free;
}
+
+ n = hostfs_do_readlink(link, name, PATH_MAX);
if (n < 0)
goto out_free;
+ else if (n == PATH_MAX) {
+ n = -E2BIG;
+ goto out_free;
+ }
if (*name == '/')
return name;
@@ -168,13 +173,12 @@ static char *follow_link(char *link)
}
sprintf(resolved, "%s%s", link, name);
- kfree(name);
+ __putname(name);
kfree(link);
return resolved;
out_free:
- kfree(name);
- out:
+ __putname(name);
return ERR_PTR(n);
}
@@ -225,6 +229,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
hi->fd = -1;
hi->mode = 0;
inode_init_once(&hi->vfs_inode);
+ mutex_init(&hi->open_mutex);
return &hi->vfs_inode;
}
@@ -257,6 +262,9 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
if (strlen(root_path) > offset)
seq_printf(seq, ",%s", root_path + offset);
+ if (append)
+ seq_puts(seq, ",append");
+
return 0;
}
@@ -284,6 +292,7 @@ static int hostfs_readdir(struct file *file, struct dir_context *ctx)
if (dir == NULL)
return -error;
next = ctx->pos;
+ seek_dir(dir, next);
while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
if (!dir_emit(ctx, name, len, ino, type))
break;
@@ -293,13 +302,12 @@ static int hostfs_readdir(struct file *file, struct dir_context *ctx)
return 0;
}
-static int hostfs_file_open(struct inode *ino, struct file *file)
+static int hostfs_open(struct inode *ino, struct file *file)
{
- static DEFINE_MUTEX(open_mutex);
char *name;
- fmode_t mode = 0;
+ fmode_t mode;
int err;
- int r = 0, w = 0, fd;
+ int r, w, fd;
mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
if ((mode & HOSTFS_I(ino)->mode) == mode)
@@ -308,12 +316,12 @@ static int hostfs_file_open(struct inode *ino, struct file *file)
mode |= HOSTFS_I(ino)->mode;
retry:
+ r = w = 0;
+
if (mode & FMODE_READ)
r = 1;
if (mode & FMODE_WRITE)
- w = 1;
- if (w)
- r = 1;
+ r = w = 1;
name = dentry_name(file->f_path.dentry);
if (name == NULL)
@@ -324,15 +332,16 @@ retry:
if (fd < 0)
return fd;
- mutex_lock(&open_mutex);
+ mutex_lock(&HOSTFS_I(ino)->open_mutex);
/* somebody else had handled it first? */
if ((mode & HOSTFS_I(ino)->mode) == mode) {
- mutex_unlock(&open_mutex);
+ mutex_unlock(&HOSTFS_I(ino)->open_mutex);
+ close_file(&fd);
return 0;
}
if ((mode | HOSTFS_I(ino)->mode) != mode) {
mode |= HOSTFS_I(ino)->mode;
- mutex_unlock(&open_mutex);
+ mutex_unlock(&HOSTFS_I(ino)->open_mutex);
close_file(&fd);
goto retry;
}
@@ -342,12 +351,12 @@ retry:
err = replace_file(fd, HOSTFS_I(ino)->fd);
close_file(&fd);
if (err < 0) {
- mutex_unlock(&open_mutex);
+ mutex_unlock(&HOSTFS_I(ino)->open_mutex);
return err;
}
}
HOSTFS_I(ino)->mode = mode;
- mutex_unlock(&open_mutex);
+ mutex_unlock(&HOSTFS_I(ino)->open_mutex);
return 0;
}
@@ -378,13 +387,11 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
static const struct file_operations hostfs_file_fops = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.splice_read = generic_file_splice_read,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .write = new_sync_write,
.mmap = generic_file_mmap,
- .open = hostfs_file_open,
+ .open = hostfs_open,
.release = hostfs_file_release,
.fsync = hostfs_fsync,
};
@@ -393,6 +400,8 @@ static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
.iterate = hostfs_readdir,
.read = generic_read_dir,
+ .open = hostfs_open,
+ .fsync = hostfs_fsync,
};
static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -400,7 +409,7 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
char *buffer;
- unsigned long long base;
+ loff_t base = page_offset(page);
int count = PAGE_CACHE_SIZE;
int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
int err;
@@ -409,7 +418,6 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
count = inode->i_size & (PAGE_CACHE_SIZE-1);
buffer = kmap(page);
- base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
@@ -434,26 +442,29 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
static int hostfs_readpage(struct file *file, struct page *page)
{
char *buffer;
- long long start;
- int err = 0;
+ loff_t start = page_offset(page);
+ int bytes_read, ret = 0;
- start = (long long) page->index << PAGE_CACHE_SHIFT;
buffer = kmap(page);
- err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
+ bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
PAGE_CACHE_SIZE);
- if (err < 0)
+ if (bytes_read < 0) {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ ret = bytes_read;
goto out;
+ }
- memset(&buffer[err], 0, PAGE_CACHE_SIZE - err);
+ memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
- flush_dcache_page(page);
+ ClearPageError(page);
SetPageUptodate(page);
- if (PageError(page)) ClearPageError(page);
- err = 0;
+
out:
+ flush_dcache_page(page);
kunmap(page);
unlock_page(page);
- return err;
+ return ret;
}
static int hostfs_write_begin(struct file *file, struct address_space *mapping,
@@ -530,11 +541,13 @@ static int read_name(struct inode *ino, char *name)
init_special_inode(ino, st.mode & S_IFMT, rdev);
ino->i_op = &hostfs_iops;
break;
-
- default:
+ case S_IFREG:
ino->i_op = &hostfs_iops;
ino->i_fop = &hostfs_file_fops;
ino->i_mapping->a_ops = &hostfs_aops;
+ break;
+ default:
+ return -EIO;
}
ino->i_ino = st.ino;
@@ -568,10 +581,7 @@ static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (name == NULL)
goto out_put;
- fd = file_create(name,
- mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR,
- mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP,
- mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
+ fd = file_create(name, mode & 0777);
if (fd < 0)
error = fd;
else
@@ -797,7 +807,7 @@ static int hostfs_permission(struct inode *ino, int desired)
static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hostfs_iattr attrs;
char *name;
int err;
@@ -882,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = {
.setattr = hostfs_setattr,
};
-static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
{
char *link = __getname();
if (link) {
@@ -896,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
}
if (err < 0) {
__putname(link);
- link = ERR_PTR(err);
+ return ERR_PTR(err);
}
} else {
- link = ERR_PTR(-ENOMEM);
+ return ERR_PTR(-ENOMEM);
}
- nd_set_link(nd, link);
- return NULL;
+ return *cookie = link;
}
-static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void hostfs_put_link(struct inode *unused, void *cookie)
{
- char *s = nd_get_link(nd);
- if (!IS_ERR(s))
- __putname(s);
+ __putname(cookie);
}
static const struct inode_operations hostfs_link_iops = {
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 9765dab95cbd..9c1e0f019880 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -97,21 +97,27 @@ void *open_dir(char *path, int *err_out)
return dir;
}
-char *read_dir(void *stream, unsigned long long *pos,
+void seek_dir(void *stream, unsigned long long pos)
+{
+ DIR *dir = stream;
+
+ seekdir(dir, pos);
+}
+
+char *read_dir(void *stream, unsigned long long *pos_out,
unsigned long long *ino_out, int *len_out,
unsigned int *type_out)
{
DIR *dir = stream;
struct dirent *ent;
- seekdir(dir, *pos);
ent = readdir(dir);
if (ent == NULL)
return NULL;
*len_out = strlen(ent->d_name);
*ino_out = ent->d_ino;
*type_out = ent->d_type;
- *pos = telldir(dir);
+ *pos_out = ent->d_off;
return ent->d_name;
}
@@ -175,21 +181,10 @@ void close_dir(void *stream)
closedir(stream);
}
-int file_create(char *name, int ur, int uw, int ux, int gr,
- int gw, int gx, int or, int ow, int ox)
+int file_create(char *name, int mode)
{
- int mode, fd;
-
- mode = 0;
- mode |= ur ? S_IRUSR : 0;
- mode |= uw ? S_IWUSR : 0;
- mode |= ux ? S_IXUSR : 0;
- mode |= gr ? S_IRGRP : 0;
- mode |= gw ? S_IWGRP : 0;
- mode |= gx ? S_IXGRP : 0;
- mode |= or ? S_IROTH : 0;
- mode |= ow ? S_IWOTH : 0;
- mode |= ox ? S_IXOTH : 0;
+ int fd;
+
fd = open64(name, O_CREAT | O_RDWR, mode);
if (fd < 0)
return -errno;
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index f005046e1591..d6a4b55d2ab0 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a
a->btree.first_free = cpu_to_le16(8);
return a;
}
+
+static unsigned find_run(__le32 *bmp, unsigned *idx)
+{
+ unsigned len;
+ while (tstbits(bmp, *idx, 1)) {
+ (*idx)++;
+ if (unlikely(*idx >= 0x4000))
+ return 0;
+ }
+ len = 1;
+ while (!tstbits(bmp, *idx + len, 1))
+ len++;
+ return len;
+}
+
+static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result)
+{
+ int err;
+ secno end;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ end = start + len;
+ if (start < limit_start)
+ start = limit_start;
+ if (end > limit_end)
+ end = limit_end;
+ if (start >= end)
+ return 0;
+ if (end - start < minlen)
+ return 0;
+ err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0);
+ if (err)
+ return err;
+ *result += end - start;
+ return 0;
+}
+
+int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result)
+{
+ int err = 0;
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ unsigned idx, len, start_bmp, end_bmp;
+ __le32 *bmp;
+ struct quad_buffer_head qbh;
+
+ *result = 0;
+ if (!end || end > sbi->sb_fs_size)
+ end = sbi->sb_fs_size;
+ if (start >= sbi->sb_fs_size)
+ return 0;
+ if (minlen > 0x4000)
+ return 0;
+ if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) {
+ hpfs_lock(s);
+ if (s->s_flags & MS_RDONLY) {
+ err = -EROFS;
+ goto unlock_1;
+ }
+ if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
+ err = -EIO;
+ goto unlock_1;
+ }
+ idx = 0;
+ while ((len = find_run(bmp, &idx)) && !err) {
+ err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result);
+ idx += len;
+ }
+ hpfs_brelse4(&qbh);
+unlock_1:
+ hpfs_unlock(s);
+ }
+ start_bmp = start >> 14;
+ end_bmp = (end + 0x3fff) >> 14;
+ while (start_bmp < end_bmp && !err) {
+ hpfs_lock(s);
+ if (s->s_flags & MS_RDONLY) {
+ err = -EROFS;
+ goto unlock_2;
+ }
+ if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) {
+ err = -EIO;
+ goto unlock_2;
+ }
+ idx = 0;
+ while ((len = find_run(bmp, &idx)) && !err) {
+ err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result);
+ idx += len;
+ }
+ hpfs_brelse4(&qbh);
+unlock_2:
+ hpfs_unlock(s);
+ start_bmp++;
+ }
+ return err;
+}
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 2a8e07425de0..dc540bfcee1d 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops =
.iterate = hpfs_readdir,
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
+ .unlocked_ioctl = hpfs_ioctl,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 7f54e5f76cec..7ca28d604bf7 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -197,14 +197,13 @@ const struct address_space_operations hpfs_aops = {
const struct file_operations hpfs_file_ops =
{
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
.splice_read = generic_file_splice_read,
+ .unlocked_ioctl = hpfs_ioctl,
};
const struct inode_operations hpfs_file_iops =
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b63b75fa00e7..c4867b5116dd 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
#include <asm/unaligned.h>
#include "hpfs.h"
@@ -200,6 +202,7 @@ void hpfs_free_dnode(struct super_block *, secno);
struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *);
struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **);
struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **);
+int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *);
/* anode.c */
@@ -304,7 +307,7 @@ extern const struct address_space_operations hpfs_symlink_aops;
static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
{
- return list_entry(inode, struct hpfs_inode_info, vfs_inode);
+ return container_of(inode, struct hpfs_inode_info, vfs_inode);
}
static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
@@ -318,6 +321,7 @@ __printf(2, 3)
void hpfs_error(struct super_block *, const char *, ...);
int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
unsigned hpfs_get_free_dnodes(struct super_block *);
+long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg);
/*
* local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 7ce4b74234a1..933c73780813 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i)
int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error = -EINVAL;
hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index bdbc2c3080a4..a0872f239f04 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -359,7 +359,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
unsigned len = dentry->d_name.len;
struct quad_buffer_head qbh;
struct hpfs_dirent *de;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
dnode_secno dno;
int r;
int rep = 0;
@@ -433,7 +433,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
unsigned len = dentry->d_name.len;
struct quad_buffer_head qbh;
struct hpfs_dirent *de;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
dnode_secno dno;
int n_items = 0;
int err;
@@ -522,8 +522,8 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned old_len = old_dentry->d_name.len;
const unsigned char *new_name = new_dentry->d_name.name;
unsigned new_len = new_dentry->d_name.len;
- struct inode *i = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *i = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct quad_buffer_head qbh, qbh1;
struct hpfs_dirent *dep, *nde;
struct hpfs_dirent de;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 7cd00d3a7c9b..68a9bed05628 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -52,17 +52,20 @@ static void unmark_dirty(struct super_block *s)
}
/* Filesystem error... */
-static char err_buf[1024];
-
void hpfs_error(struct super_block *s, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("filesystem error: %pV", &vaf);
+
va_end(args);
- pr_err("filesystem error: %s", err_buf);
if (!hpfs_sb(s)->sb_was_error) {
if (hpfs_sb(s)->sb_err == 2) {
pr_cont("; crashing the system because you wanted it\n");
@@ -196,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+
+long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FITRIM: {
+ struct fstrim_range range;
+ secno n_trimmed;
+ int r;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range)))
+ return -EFAULT;
+ r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed);
+ if (r)
+ return r;
+ range.len = (u64)n_trimmed << 9;
+ if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range)))
+ return -EFAULT;
+ return 0;
+ }
+ default: {
+ return -ENOIOCTLCMD;
+ }
+ }
+}
+
+
static struct kmem_cache * hpfs_inode_cachep;
static struct inode *hpfs_alloc_inode(struct super_block *sb)
{
struct hpfs_inode_info *ei;
- ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+ ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
ei->vfs_inode.i_version = 1;
@@ -424,11 +454,14 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
int o;
struct hpfs_sb_info *sbi = hpfs_sb(s);
char *new_opts = kstrdup(data, GFP_KERNEL);
-
+
+ if (!new_opts)
+ return -ENOMEM;
+
sync_filesystem(s);
*flags |= MS_NOATIME;
-
+
hpfs_lock(s);
uid = sbi->sb_uid; gid = sbi->sb_gid;
umask = 0777 & ~sbi->sb_mode;
diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile
deleted file mode 100644
index 3a982bd975d2..000000000000
--- a/fs/hppfs/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-#
-# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
-# Licensed under the GPL
-#
-
-obj-$(CONFIG_HPPFS) += hppfs.o
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
deleted file mode 100644
index 043ac9d77262..000000000000
--- a/fs/hppfs/hppfs.c
+++ /dev/null
@@ -1,766 +0,0 @@
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/ctype.h>
-#include <linux/dcache.h>
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/statfs.h>
-#include <linux/types.h>
-#include <linux/pid_namespace.h>
-#include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <os.h>
-
-static struct inode *get_inode(struct super_block *, struct dentry *);
-
-struct hppfs_data {
- struct list_head list;
- char contents[PAGE_SIZE - sizeof(struct list_head)];
-};
-
-struct hppfs_private {
- struct file *proc_file;
- int host_fd;
- loff_t len;
- struct hppfs_data *contents;
-};
-
-struct hppfs_inode_info {
- struct dentry *proc_dentry;
- struct inode vfs_inode;
-};
-
-static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
-{
- return container_of(inode, struct hppfs_inode_info, vfs_inode);
-}
-
-#define HPPFS_SUPER_MAGIC 0xb00000ee
-
-static const struct super_operations hppfs_sbops;
-
-static int is_pid(struct dentry *dentry)
-{
- struct super_block *sb;
- int i;
-
- sb = dentry->d_sb;
- if (dentry->d_parent != sb->s_root)
- return 0;
-
- for (i = 0; i < dentry->d_name.len; i++) {
- if (!isdigit(dentry->d_name.name[i]))
- return 0;
- }
- return 1;
-}
-
-static char *dentry_name(struct dentry *dentry, int extra)
-{
- struct dentry *parent;
- char *root, *name;
- const char *seg_name;
- int len, seg_len, root_len;
-
- len = 0;
- parent = dentry;
- while (parent->d_parent != parent) {
- if (is_pid(parent))
- len += strlen("pid") + 1;
- else len += parent->d_name.len + 1;
- parent = parent->d_parent;
- }
-
- root = "proc";
- root_len = strlen(root);
- len += root_len;
- name = kmalloc(len + extra + 1, GFP_KERNEL);
- if (name == NULL)
- return NULL;
-
- name[len] = '\0';
- parent = dentry;
- while (parent->d_parent != parent) {
- if (is_pid(parent)) {
- seg_name = "pid";
- seg_len = strlen(seg_name);
- }
- else {
- seg_name = parent->d_name.name;
- seg_len = parent->d_name.len;
- }
-
- len -= seg_len + 1;
- name[len] = '/';
- memcpy(&name[len + 1], seg_name, seg_len);
- parent = parent->d_parent;
- }
- memcpy(name, root, root_len);
- return name;
-}
-
-static int file_removed(struct dentry *dentry, const char *file)
-{
- char *host_file;
- int extra, fd;
-
- extra = 0;
- if (file != NULL)
- extra += strlen(file) + 1;
-
- host_file = dentry_name(dentry, extra + strlen("/remove"));
- if (host_file == NULL) {
- printk(KERN_ERR "file_removed : allocation failed\n");
- return -ENOMEM;
- }
-
- if (file != NULL) {
- strcat(host_file, "/");
- strcat(host_file, file);
- }
- strcat(host_file, "/remove");
-
- fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
- kfree(host_file);
- if (fd > 0) {
- os_close_file(fd);
- return 1;
- }
- return 0;
-}
-
-static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
- unsigned int flags)
-{
- struct dentry *proc_dentry, *parent;
- struct qstr *name = &dentry->d_name;
- struct inode *inode;
- int err, deleted;
-
- deleted = file_removed(dentry, NULL);
- if (deleted < 0)
- return ERR_PTR(deleted);
- else if (deleted)
- return ERR_PTR(-ENOENT);
-
- parent = HPPFS_I(ino)->proc_dentry;
- mutex_lock(&parent->d_inode->i_mutex);
- proc_dentry = lookup_one_len(name->name, parent, name->len);
- mutex_unlock(&parent->d_inode->i_mutex);
-
- if (IS_ERR(proc_dentry))
- return proc_dentry;
-
- err = -ENOMEM;
- inode = get_inode(ino->i_sb, proc_dentry);
- if (!inode)
- goto out;
-
- d_add(dentry, inode);
- return NULL;
-
- out:
- return ERR_PTR(err);
-}
-
-static const struct inode_operations hppfs_file_iops = {
-};
-
-static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count,
- loff_t *ppos, int is_user)
-{
- ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
- ssize_t n;
-
- read = file_inode(file)->i_fop->read;
-
- if (!is_user)
- set_fs(KERNEL_DS);
-
- n = (*read)(file, buf, count, &file->f_pos);
-
- if (!is_user)
- set_fs(USER_DS);
-
- if (ppos)
- *ppos = file->f_pos;
- return n;
-}
-
-static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count)
-{
- ssize_t n;
- int cur, err;
- char *new_buf;
-
- n = -ENOMEM;
- new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (new_buf == NULL) {
- printk(KERN_ERR "hppfs_read_file : kmalloc failed\n");
- goto out;
- }
- n = 0;
- while (count > 0) {
- cur = min_t(ssize_t, count, PAGE_SIZE);
- err = os_read_file(fd, new_buf, cur);
- if (err < 0) {
- printk(KERN_ERR "hppfs_read : read failed, "
- "errno = %d\n", err);
- n = err;
- goto out_free;
- } else if (err == 0)
- break;
-
- if (copy_to_user(buf, new_buf, err)) {
- n = -EFAULT;
- goto out_free;
- }
- n += err;
- count -= err;
- }
- out_free:
- kfree(new_buf);
- out:
- return n;
-}
-
-static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct hppfs_private *hppfs = file->private_data;
- struct hppfs_data *data;
- loff_t off;
- int err;
-
- if (hppfs->contents != NULL) {
- int rem;
-
- if (*ppos >= hppfs->len)
- return 0;
-
- data = hppfs->contents;
- off = *ppos;
- while (off >= sizeof(data->contents)) {
- data = list_entry(data->list.next, struct hppfs_data,
- list);
- off -= sizeof(data->contents);
- }
-
- if (off + count > hppfs->len)
- count = hppfs->len - off;
- rem = copy_to_user(buf, &data->contents[off], count);
- *ppos += count - rem;
- if (rem > 0)
- return -EFAULT;
- } else if (hppfs->host_fd != -1) {
- err = os_seek_file(hppfs->host_fd, *ppos);
- if (err) {
- printk(KERN_ERR "hppfs_read : seek failed, "
- "errno = %d\n", err);
- return err;
- }
- err = hppfs_read_file(hppfs->host_fd, buf, count);
- if (err < 0) {
- printk(KERN_ERR "hppfs_read: read failed: %d\n", err);
- return err;
- }
- count = err;
- if (count > 0)
- *ppos += count;
- }
- else count = read_proc(hppfs->proc_file, buf, count, ppos, 1);
-
- return count;
-}
-
-static ssize_t hppfs_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
-
- write = file_inode(proc_file)->i_fop->write;
- return (*write)(proc_file, buf, len, ppos);
-}
-
-static int open_host_sock(char *host_file, int *filter_out)
-{
- char *end;
- int fd;
-
- end = &host_file[strlen(host_file)];
- strcpy(end, "/rw");
- *filter_out = 1;
- fd = os_connect_socket(host_file);
- if (fd > 0)
- return fd;
-
- strcpy(end, "/r");
- *filter_out = 0;
- fd = os_connect_socket(host_file);
- return fd;
-}
-
-static void free_contents(struct hppfs_data *head)
-{
- struct hppfs_data *data;
- struct list_head *ele, *next;
-
- if (head == NULL)
- return;
-
- list_for_each_safe(ele, next, &head->list) {
- data = list_entry(ele, struct hppfs_data, list);
- kfree(data);
- }
- kfree(head);
-}
-
-static struct hppfs_data *hppfs_get_data(int fd, int filter,
- struct file *proc_file,
- struct file *hppfs_file,
- loff_t *size_out)
-{
- struct hppfs_data *data, *new, *head;
- int n, err;
-
- err = -ENOMEM;
- data = kmalloc(sizeof(*data), GFP_KERNEL);
- if (data == NULL) {
- printk(KERN_ERR "hppfs_get_data : head allocation failed\n");
- goto failed;
- }
-
- INIT_LIST_HEAD(&data->list);
-
- head = data;
- *size_out = 0;
-
- if (filter) {
- while ((n = read_proc(proc_file, data->contents,
- sizeof(data->contents), NULL, 0)) > 0)
- os_write_file(fd, data->contents, n);
- err = os_shutdown_socket(fd, 0, 1);
- if (err) {
- printk(KERN_ERR "hppfs_get_data : failed to shut down "
- "socket\n");
- goto failed_free;
- }
- }
- while (1) {
- n = os_read_file(fd, data->contents, sizeof(data->contents));
- if (n < 0) {
- err = n;
- printk(KERN_ERR "hppfs_get_data : read failed, "
- "errno = %d\n", err);
- goto failed_free;
- } else if (n == 0)
- break;
-
- *size_out += n;
-
- if (n < sizeof(data->contents))
- break;
-
- new = kmalloc(sizeof(*data), GFP_KERNEL);
- if (new == 0) {
- printk(KERN_ERR "hppfs_get_data : data allocation "
- "failed\n");
- err = -ENOMEM;
- goto failed_free;
- }
-
- INIT_LIST_HEAD(&new->list);
- list_add(&new->list, &data->list);
- data = new;
- }
- return head;
-
- failed_free:
- free_contents(head);
- failed:
- return ERR_PTR(err);
-}
-
-static struct hppfs_private *hppfs_data(void)
-{
- struct hppfs_private *data;
-
- data = kmalloc(sizeof(*data), GFP_KERNEL);
- if (data == NULL)
- return data;
-
- *data = ((struct hppfs_private ) { .host_fd = -1,
- .len = -1,
- .contents = NULL } );
- return data;
-}
-
-static int file_mode(int fmode)
-{
- if (fmode == (FMODE_READ | FMODE_WRITE))
- return O_RDWR;
- if (fmode == FMODE_READ)
- return O_RDONLY;
- if (fmode == FMODE_WRITE)
- return O_WRONLY;
- return 0;
-}
-
-static int hppfs_open(struct inode *inode, struct file *file)
-{
- const struct cred *cred = file->f_cred;
- struct hppfs_private *data;
- struct path path;
- char *host_file;
- int err, fd, type, filter;
-
- err = -ENOMEM;
- data = hppfs_data();
- if (data == NULL)
- goto out;
-
- host_file = dentry_name(file->f_path.dentry, strlen("/rw"));
- if (host_file == NULL)
- goto out_free2;
-
- path.mnt = inode->i_sb->s_fs_info;
- path.dentry = HPPFS_I(inode)->proc_dentry;
-
- data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
- err = PTR_ERR(data->proc_file);
- if (IS_ERR(data->proc_file))
- goto out_free1;
-
- type = os_file_type(host_file);
- if (type == OS_TYPE_FILE) {
- fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
- if (fd >= 0)
- data->host_fd = fd;
- else
- printk(KERN_ERR "hppfs_open : failed to open '%s', "
- "errno = %d\n", host_file, -fd);
-
- data->contents = NULL;
- } else if (type == OS_TYPE_DIR) {
- fd = open_host_sock(host_file, &filter);
- if (fd > 0) {
- data->contents = hppfs_get_data(fd, filter,
- data->proc_file,
- file, &data->len);
- if (!IS_ERR(data->contents))
- data->host_fd = fd;
- } else
- printk(KERN_ERR "hppfs_open : failed to open a socket "
- "in '%s', errno = %d\n", host_file, -fd);
- }
- kfree(host_file);
-
- file->private_data = data;
- return 0;
-
- out_free1:
- kfree(host_file);
- out_free2:
- free_contents(data->contents);
- kfree(data);
- out:
- return err;
-}
-
-static int hppfs_dir_open(struct inode *inode, struct file *file)
-{
- const struct cred *cred = file->f_cred;
- struct hppfs_private *data;
- struct path path;
- int err;
-
- err = -ENOMEM;
- data = hppfs_data();
- if (data == NULL)
- goto out;
-
- path.mnt = inode->i_sb->s_fs_info;
- path.dentry = HPPFS_I(inode)->proc_dentry;
- data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred);
- err = PTR_ERR(data->proc_file);
- if (IS_ERR(data->proc_file))
- goto out_free;
-
- file->private_data = data;
- return 0;
-
- out_free:
- kfree(data);
- out:
- return err;
-}
-
-static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- loff_t (*llseek)(struct file *, loff_t, int);
- loff_t ret;
-
- llseek = file_inode(proc_file)->i_fop->llseek;
- if (llseek != NULL) {
- ret = (*llseek)(proc_file, off, where);
- if (ret < 0)
- return ret;
- }
-
- return default_llseek(file, off, where);
-}
-
-static int hppfs_release(struct inode *inode, struct file *file)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- if (proc_file)
- fput(proc_file);
- kfree(data);
- return 0;
-}
-
-static const struct file_operations hppfs_file_fops = {
- .owner = NULL,
- .llseek = hppfs_llseek,
- .read = hppfs_read,
- .write = hppfs_write,
- .open = hppfs_open,
- .release = hppfs_release,
-};
-
-struct hppfs_dirent {
- struct dir_context ctx;
- struct dir_context *caller;
- struct dentry *dentry;
-};
-
-static int hppfs_filldir(struct dir_context *ctx, const char *name, int size,
- loff_t offset, u64 inode, unsigned int type)
-{
- struct hppfs_dirent *dirent =
- container_of(ctx, struct hppfs_dirent, ctx);
-
- if (file_removed(dirent->dentry, name))
- return 0;
-
- dirent->caller->pos = dirent->ctx.pos;
- return !dir_emit(dirent->caller, name, size, inode, type);
-}
-
-static int hppfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct hppfs_private *data = file->private_data;
- struct file *proc_file = data->proc_file;
- struct hppfs_dirent d = {
- .ctx.actor = hppfs_filldir,
- .caller = ctx,
- .dentry = file->f_path.dentry
- };
- int err;
- proc_file->f_pos = ctx->pos;
- err = iterate_dir(proc_file, &d.ctx);
- ctx->pos = d.ctx.pos;
- return err;
-}
-
-static const struct file_operations hppfs_dir_fops = {
- .owner = NULL,
- .iterate = hppfs_readdir,
- .open = hppfs_dir_open,
- .llseek = default_llseek,
- .release = hppfs_release,
-};
-
-static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
-{
- sf->f_blocks = 0;
- sf->f_bfree = 0;
- sf->f_bavail = 0;
- sf->f_files = 0;
- sf->f_ffree = 0;
- sf->f_type = HPPFS_SUPER_MAGIC;
- return 0;
-}
-
-static struct inode *hppfs_alloc_inode(struct super_block *sb)
-{
- struct hppfs_inode_info *hi;
-
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
- if (!hi)
- return NULL;
-
- hi->proc_dentry = NULL;
- inode_init_once(&hi->vfs_inode);
- return &hi->vfs_inode;
-}
-
-void hppfs_evict_inode(struct inode *ino)
-{
- clear_inode(ino);
- dput(HPPFS_I(ino)->proc_dentry);
- mntput(ino->i_sb->s_fs_info);
-}
-
-static void hppfs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kfree(HPPFS_I(inode));
-}
-
-static void hppfs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, hppfs_i_callback);
-}
-
-static const struct super_operations hppfs_sbops = {
- .alloc_inode = hppfs_alloc_inode,
- .destroy_inode = hppfs_destroy_inode,
- .evict_inode = hppfs_evict_inode,
- .statfs = hppfs_statfs,
-};
-
-static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
- int buflen)
-{
- struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
- return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
- buflen);
-}
-
-static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-
- return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
-}
-
-static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
-{
- struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
-
- if (proc_dentry->d_inode->i_op->put_link)
- proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie);
-}
-
-static const struct inode_operations hppfs_dir_iops = {
- .lookup = hppfs_lookup,
-};
-
-static const struct inode_operations hppfs_link_iops = {
- .readlink = hppfs_readlink,
- .follow_link = hppfs_follow_link,
- .put_link = hppfs_put_link,
-};
-
-static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
-{
- struct inode *proc_ino = dentry->d_inode;
- struct inode *inode = new_inode(sb);
-
- if (!inode) {
- dput(dentry);
- return NULL;
- }
-
- if (d_is_dir(dentry)) {
- inode->i_op = &hppfs_dir_iops;
- inode->i_fop = &hppfs_dir_fops;
- } else if (d_is_symlink(dentry)) {
- inode->i_op = &hppfs_link_iops;
- inode->i_fop = &hppfs_file_fops;
- } else {
- inode->i_op = &hppfs_file_iops;
- inode->i_fop = &hppfs_file_fops;
- }
-
- HPPFS_I(inode)->proc_dentry = dentry;
-
- inode->i_uid = proc_ino->i_uid;
- inode->i_gid = proc_ino->i_gid;
- inode->i_atime = proc_ino->i_atime;
- inode->i_mtime = proc_ino->i_mtime;
- inode->i_ctime = proc_ino->i_ctime;
- inode->i_ino = proc_ino->i_ino;
- inode->i_mode = proc_ino->i_mode;
- set_nlink(inode, proc_ino->i_nlink);
- inode->i_size = proc_ino->i_size;
- inode->i_blocks = proc_ino->i_blocks;
-
- return inode;
-}
-
-static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
-{
- struct inode *root_inode;
- struct vfsmount *proc_mnt;
- int err = -ENOENT;
-
- proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
- if (IS_ERR(proc_mnt))
- goto out;
-
- sb->s_blocksize = 1024;
- sb->s_blocksize_bits = 10;
- sb->s_magic = HPPFS_SUPER_MAGIC;
- sb->s_op = &hppfs_sbops;
- sb->s_fs_info = proc_mnt;
-
- err = -ENOMEM;
- root_inode = get_inode(sb, dget(proc_mnt->mnt_root));
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root)
- goto out_mntput;
-
- return 0;
-
- out_mntput:
- mntput(proc_mnt);
- out:
- return(err);
-}
-
-static struct dentry *hppfs_read_super(struct file_system_type *type,
- int flags, const char *dev_name,
- void *data)
-{
- return mount_nodev(type, flags, data, hppfs_fill_super);
-}
-
-static struct file_system_type hppfs_type = {
- .owner = THIS_MODULE,
- .name = "hppfs",
- .mount = hppfs_read_super,
- .kill_sb = kill_anon_super,
- .fs_flags = 0,
-};
-MODULE_ALIAS_FS("hppfs");
-
-static int __init init_hppfs(void)
-{
- return register_filesystem(&hppfs_type);
-}
-
-static void __exit exit_hppfs(void)
-{
- unregister_filesystem(&hppfs_type);
-}
-
-module_init(init_hppfs)
-module_exit(exit_hppfs)
-MODULE_LICENSE("GPL");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c274aca8e8dc..973c24ce59ad 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,6 +34,7 @@
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
@@ -47,9 +48,10 @@ struct hugetlbfs_config {
kuid_t uid;
kgid_t gid;
umode_t mode;
- long nr_blocks;
+ long max_hpages;
long nr_inodes;
struct hstate *hstate;
+ long min_hpages;
};
struct hugetlbfs_inode_info {
@@ -67,7 +69,7 @@ int sysctl_hugetlb_shm_group;
enum {
Opt_size, Opt_nr_inodes,
Opt_mode, Opt_uid, Opt_gid,
- Opt_pagesize,
+ Opt_pagesize, Opt_min_size,
Opt_err,
};
@@ -78,6 +80,7 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_pagesize, "pagesize=%s"},
+ {Opt_min_size, "min_size=%s"},
{Opt_err, NULL},
};
@@ -127,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
goto out;
ret = 0;
- hugetlb_prefault_arch_hook(vma->vm_mm);
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
inode->i_size = len;
out:
@@ -179,42 +181,33 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
#endif
-static int
+static size_t
hugetlbfs_read_actor(struct page *page, unsigned long offset,
- char __user *buf, unsigned long count,
- unsigned long size)
+ struct iov_iter *to, unsigned long size)
{
- char *kaddr;
- unsigned long left, copied = 0;
+ size_t copied = 0;
int i, chunksize;
- if (size > count)
- size = count;
-
/* Find which 4k chunk and offset with in that chunk */
i = offset >> PAGE_CACHE_SHIFT;
offset = offset & ~PAGE_CACHE_MASK;
while (size) {
+ size_t n;
chunksize = PAGE_CACHE_SIZE;
if (offset)
chunksize -= offset;
if (chunksize > size)
chunksize = size;
- kaddr = kmap(&page[i]);
- left = __copy_to_user(buf, kaddr + offset, chunksize);
- kunmap(&page[i]);
- if (left) {
- copied += (chunksize - left);
- break;
- }
+ n = copy_page_to_iter(&page[i], offset, chunksize, to);
+ copied += n;
+ if (n != chunksize)
+ return copied;
offset = 0;
size -= chunksize;
- buf += chunksize;
- copied += chunksize;
i++;
}
- return copied ? copied : -EFAULT;
+ return copied;
}
/*
@@ -222,39 +215,34 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
* data. Its *very* similar to do_generic_mapping_read(), we can't use that
* since it has PAGE_CACHE_SIZE assumptions.
*/
-static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct hstate *h = hstate_file(filp);
- struct address_space *mapping = filp->f_mapping;
+ struct file *file = iocb->ki_filp;
+ struct hstate *h = hstate_file(file);
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- unsigned long index = *ppos >> huge_page_shift(h);
- unsigned long offset = *ppos & ~huge_page_mask(h);
+ unsigned long index = iocb->ki_pos >> huge_page_shift(h);
+ unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
unsigned long end_index;
loff_t isize;
ssize_t retval = 0;
- /* validate length */
- if (len == 0)
- goto out;
-
- for (;;) {
+ while (iov_iter_count(to)) {
struct page *page;
- unsigned long nr, ret;
- int ra;
+ size_t nr, copied;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
isize = i_size_read(inode);
if (!isize)
- goto out;
+ break;
end_index = (isize - 1) >> huge_page_shift(h);
- if (index >= end_index) {
- if (index > end_index)
- goto out;
+ if (index > end_index)
+ break;
+ if (index == end_index) {
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
if (nr <= offset)
- goto out;
+ break;
}
nr = nr - offset;
@@ -265,39 +253,27 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
* We have a HOLE, zero out the user-buffer for the
* length of the hole or request.
*/
- ret = len < nr ? len : nr;
- if (clear_user(buf, ret))
- ra = -EFAULT;
- else
- ra = 0;
+ copied = iov_iter_zero(nr, to);
} else {
unlock_page(page);
/*
* We have the page, copy it to user space buffer.
*/
- ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
- ret = ra;
+ copied = hugetlbfs_read_actor(page, offset, to, nr);
page_cache_release(page);
}
- if (ra < 0) {
- if (retval == 0)
- retval = ra;
- goto out;
+ offset += copied;
+ retval += copied;
+ if (copied != nr && iov_iter_count(to)) {
+ if (!retval)
+ retval = -EFAULT;
+ break;
}
-
- offset += ret;
- retval += ret;
- len -= ret;
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
-
- /* short read or no more work */
- if ((ret != nr) || (len == 0))
- break;
}
-out:
- *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
+ iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
return retval;
}
@@ -319,7 +295,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
static void truncate_huge_page(struct page *page)
{
- cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
+ ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
@@ -416,7 +392,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct hstate *h = hstate_inode(inode);
int error;
unsigned int ia_valid = attr->ia_valid;
@@ -610,7 +586,7 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
- struct hstate *h = hstate_inode(dentry->d_inode);
+ struct hstate *h = hstate_inode(d_inode(dentry));
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h);
@@ -721,7 +697,7 @@ static void init_once(void *foo)
}
const struct file_operations hugetlbfs_file_operations = {
- .read = hugetlbfs_read,
+ .read_iter = hugetlbfs_read_iter,
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
@@ -754,14 +730,38 @@ static const struct super_operations hugetlbfs_ops = {
.show_options = generic_show_options,
};
+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate. Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+ int val_type)
+{
+ if (val_type == NO_SIZE)
+ return -1;
+
+ if (val_type == SIZE_PERCENT) {
+ size_opt <<= huge_page_shift(h);
+ size_opt *= h->max_huge_pages;
+ do_div(size_opt, 100);
+ }
+
+ size_opt >>= huge_page_shift(h);
+ return size_opt;
+}
+
static int
hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
{
char *p, *rest;
substring_t args[MAX_OPT_ARGS];
int option;
- unsigned long long size = 0;
- enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
+ unsigned long long max_size_opt = 0, min_size_opt = 0;
+ int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
if (!options)
return 0;
@@ -799,10 +799,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
/* memparse() will accept a K/M/G without a digit */
if (!isdigit(*args[0].from))
goto bad_val;
- size = memparse(args[0].from, &rest);
- setsize = SIZE_STD;
+ max_size_opt = memparse(args[0].from, &rest);
+ max_val_type = SIZE_STD;
if (*rest == '%')
- setsize = SIZE_PERCENT;
+ max_val_type = SIZE_PERCENT;
break;
}
@@ -825,6 +825,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
break;
}
+ case Opt_min_size: {
+ /* memparse() will accept a K/M/G without a digit */
+ if (!isdigit(*args[0].from))
+ goto bad_val;
+ min_size_opt = memparse(args[0].from, &rest);
+ min_val_type = SIZE_STD;
+ if (*rest == '%')
+ min_val_type = SIZE_PERCENT;
+ break;
+ }
+
default:
pr_err("Bad mount option: \"%s\"\n", p);
return -EINVAL;
@@ -832,15 +843,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
}
}
- /* Do size after hstate is set up */
- if (setsize > NO_SIZE) {
- struct hstate *h = pconfig->hstate;
- if (setsize == SIZE_PERCENT) {
- size <<= huge_page_shift(h);
- size *= h->max_huge_pages;
- do_div(size, 100);
- }
- pconfig->nr_blocks = (size >> huge_page_shift(h));
+ /*
+ * Use huge page pool size (in hstate) to convert the size
+ * options to number of huge pages. If NO_SIZE, -1 is returned.
+ */
+ pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+ max_size_opt, max_val_type);
+ pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+ min_size_opt, min_val_type);
+
+ /*
+ * If max_size was specified, then min_size must be smaller
+ */
+ if (max_val_type > NO_SIZE &&
+ pconfig->min_hpages > pconfig->max_hpages) {
+ pr_err("minimum size can not be greater than maximum size\n");
+ return -EINVAL;
}
return 0;
@@ -859,12 +877,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
save_mount_options(sb, data);
- config.nr_blocks = -1; /* No limit on size by default */
+ config.max_hpages = -1; /* No limit on size by default */
config.nr_inodes = -1; /* No limit on number of inodes by default */
config.uid = current_fsuid();
config.gid = current_fsgid();
config.mode = 0755;
config.hstate = &default_hstate;
+ config.min_hpages = -1; /* No default minimum size */
ret = hugetlbfs_parse_options(data, &config);
if (ret)
return ret;
@@ -878,8 +897,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
sbinfo->spool = NULL;
- if (config.nr_blocks != -1) {
- sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+ /*
+ * Allocate and initialize subpool if maximum or minimum size is
+ * specified. Any needed reservations (for minimim size) are taken
+ * taken when the subpool is created.
+ */
+ if (config.max_hpages != -1 || config.min_hpages != -1) {
+ sbinfo->spool = hugepage_new_subpool(config.hstate,
+ config.max_hpages,
+ config.min_hpages);
if (!sbinfo->spool)
goto out_free;
}
@@ -984,6 +1010,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
goto out_dentry;
+ if (creat_flags == HUGETLB_SHMFS_INODE)
+ inode->i_flags |= S_PRIVATE;
file = ERR_PTR(-ENOMEM);
if (hugetlb_reserve_pages(inode, 0,
diff --git a/fs/inode.c b/fs/inode.c
index f00b16f45507..d30640f7a193 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -152,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_pipe = NULL;
inode->i_bdev = NULL;
inode->i_cdev = NULL;
+ inode->i_link = NULL;
inode->i_rdev = 0;
inode->dirtied_when = 0;
@@ -223,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
void __destroy_inode(struct inode *inode)
{
BUG_ON(inode_has_buffers(inode));
+ inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
locks_free_lock_context(inode->i_flctx);
@@ -839,7 +841,11 @@ unsigned int get_next_ino(void)
}
#endif
- *p = ++res;
+ res++;
+ /* get_next_ino should not provide a 0 inode number */
+ if (unlikely(!res))
+ res++;
+ *p = res;
put_cpu_var(last_ino);
return res;
}
@@ -1584,36 +1590,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
* This function automatically handles read only file systems and media,
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
-void touch_atime(const struct path *path)
+bool atime_needs_update(const struct path *path, struct inode *inode)
{
struct vfsmount *mnt = path->mnt;
- struct inode *inode = path->dentry->d_inode;
struct timespec now;
if (inode->i_flags & S_NOATIME)
- return;
+ return false;
if (IS_NOATIME(inode))
- return;
+ return false;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ return false;
if (mnt->mnt_flags & MNT_NOATIME)
- return;
+ return false;
if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ return false;
now = current_fs_time(inode->i_sb);
if (!relatime_need_update(mnt, inode, now))
- return;
+ return false;
if (timespec_equal(&inode->i_atime, &now))
+ return false;
+
+ return true;
+}
+
+void touch_atime(const struct path *path)
+{
+ struct vfsmount *mnt = path->mnt;
+ struct inode *inode = d_inode(path->dentry);
+ struct timespec now;
+
+ if (!atime_needs_update(path, inode))
return;
if (!sb_start_write_trylock(inode->i_sb))
return;
- if (__mnt_want_write(mnt))
+ if (__mnt_want_write(mnt) != 0)
goto skip_update;
/*
* File systems can error out when updating inodes if they need to
@@ -1624,6 +1641,7 @@ void touch_atime(const struct path *path)
* We may also fail on filesystems that have the ability to make parts
* of the fs read only, e.g. subvolumes in Btrfs.
*/
+ now = current_fs_time(inode->i_sb);
update_time(inode, &now, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
@@ -1639,7 +1657,7 @@ EXPORT_SYMBOL(touch_atime);
*/
int should_remove_suid(struct dentry *dentry)
{
- umode_t mode = dentry->d_inode->i_mode;
+ umode_t mode = d_inode(dentry)->i_mode;
int kill = 0;
/* suid always must be killed */
@@ -1660,7 +1678,31 @@ int should_remove_suid(struct dentry *dentry)
}
EXPORT_SYMBOL(should_remove_suid);
-static int __remove_suid(struct dentry *dentry, int kill)
+/*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
+ */
+int dentry_needs_remove_privs(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ int mask = 0;
+ int ret;
+
+ if (IS_NOSEC(inode))
+ return 0;
+
+ mask = should_remove_suid(dentry);
+ ret = security_inode_need_killpriv(dentry);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ mask |= ATTR_KILL_PRIV;
+ return mask;
+}
+EXPORT_SYMBOL(dentry_needs_remove_privs);
+
+static int __remove_privs(struct dentry *dentry, int kill)
{
struct iattr newattrs;
@@ -1672,33 +1714,32 @@ static int __remove_suid(struct dentry *dentry, int kill)
return notify_change(dentry, &newattrs, NULL);
}
-int file_remove_suid(struct file *file)
+/*
+ * Remove special file priviledges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+int file_remove_privs(struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- int killsuid;
- int killpriv;
+ struct inode *inode = d_inode(dentry);
+ int kill;
int error = 0;
/* Fast path for nothing security related */
if (IS_NOSEC(inode))
return 0;
- killsuid = should_remove_suid(dentry);
- killpriv = security_inode_need_killpriv(dentry);
-
- if (killpriv < 0)
- return killpriv;
- if (killpriv)
- error = security_inode_killpriv(dentry);
- if (!error && killsuid)
- error = __remove_suid(dentry, killsuid);
- if (!error && (inode->i_sb->s_flags & MS_NOSEC))
- inode->i_flags |= S_NOSEC;
+ kill = file_needs_remove_privs(file);
+ if (kill < 0)
+ return kill;
+ if (kill)
+ error = __remove_privs(dentry, kill);
+ if (!error)
+ inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL(file_remove_suid);
+EXPORT_SYMBOL(file_remove_privs);
/**
* file_update_time - update mtime and ctime time
@@ -1946,20 +1987,6 @@ void inode_dio_wait(struct inode *inode)
EXPORT_SYMBOL(inode_dio_wait);
/*
- * inode_dio_done - signal finish of a direct I/O requests
- * @inode: inode the direct I/O happens on
- *
- * This is called once we've finished processing a direct I/O request,
- * and is used to wake up callers waiting for direct I/O to be quiesced.
- */
-void inode_dio_done(struct inode *inode)
-{
- if (atomic_dec_and_test(&inode->i_dio_count))
- wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
-}
-EXPORT_SYMBOL(inode_dio_done);
-
-/*
* inode_set_flags - atomically set some inode flags
*
* Note: the caller should be holding i_mutex, or else be sure that
@@ -1967,9 +1994,8 @@ EXPORT_SYMBOL(inode_dio_done);
* inode is being instantiated). The reason for the cmpxchg() loop
* --- which wouldn't be necessary if all code paths which modify
* i_flags actually followed this rule, is that there is at least one
- * code path which doesn't today --- for example,
- * __generic_file_aio_write() calls file_remove_suid() without holding
- * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ * code path which doesn't today so we use cmpxchg() out of an abundance
+ * of caution.
*
* In the long run, i_mutex is overkill, and we should probably look
* at using the i_lock spinlock to protect i_flags, and then make sure
diff --git a/fs/internal.h b/fs/internal.h
index 01dce1d1476b..4d5af583ab03 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
extern long do_handle_open(int mountdirfd,
struct file_handle __user *ufh, int open_flag);
extern int open_check_o_direct(struct file *f);
+extern int vfs_open(const struct path *, struct file *, const struct cred *);
/*
* inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 12088d8de3fa..0c5f721b4e91 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -44,7 +44,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
{
unsigned long parent_block = 0;
unsigned long parent_offset = 0;
- struct inode *child_inode = child->d_inode;
+ struct inode *child_inode = d_inode(child);
struct iso_inode_info *e_child_inode = ISOFS_I(child_inode);
struct iso_directory_record *de = NULL;
struct buffer_head * bh = NULL;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 988b32ed4c87..4227dc4f7437 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
unsigned long blocknr;
if (is_journal_aborted(journal))
- return 1;
+ return -EIO;
if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
@@ -405,10 +405,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
- __jbd2_update_log_tail(journal, first_tid, blocknr);
- return 0;
+ return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b96bd8076b70..4ff3fad4e9e3 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
-retry_alloc:
- new_bh = alloc_buffer_head(GFP_NOFS);
- if (!new_bh) {
- /*
- * Failure is not an option, but __GFP_NOFAIL is going
- * away; so we retry ourselves here.
- */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto retry_alloc;
- }
+ new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
@@ -885,9 +876,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
*
* Requires j_checkpoint_mutex
*/
-void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
{
unsigned long freed;
+ int ret;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
@@ -897,7 +889,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
* space and if we lose sb update during power failure we'd replay
* old transaction with possibly newly overwritten data.
*/
- jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ if (ret)
+ goto out;
+
write_lock(&journal->j_state_lock);
freed = block - journal->j_tail;
if (block < journal->j_tail)
@@ -913,6 +908,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
journal->j_tail_sequence = tid;
journal->j_tail = block;
write_unlock(&journal->j_state_lock);
+
+out:
+ return ret;
}
/*
@@ -1137,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
{
journal_t *journal = journal_init_common();
struct buffer_head *bh;
- char *p;
int n;
if (!journal)
@@ -1150,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
journal->j_blk_offset = start;
journal->j_maxlen = len;
bdevname(journal->j_dev, journal->j_devname);
- p = journal->j_devname;
- while ((p = strchr(p, '/')))
- *p = '!';
+ strreplace(journal->j_devname, '/', '!');
jbd2_stats_proc_init(journal);
n = journal->j_blocksize / sizeof(journal_block_tag_t);
journal->j_wbufsize = n;
@@ -1204,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
journal->j_inode = inode;
bdevname(journal->j_dev, journal->j_devname);
- p = journal->j_devname;
- while ((p = strchr(p, '/')))
- *p = '!';
- p = journal->j_devname + strlen(journal->j_devname);
+ p = strreplace(journal->j_devname, '/', '!');
sprintf(p, "-%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
@@ -1331,7 +1323,7 @@ static int journal_reset(journal_t *journal)
return jbd2_journal_start_thread(journal);
}
-static void jbd2_write_superblock(journal_t *journal, int write_op)
+static int jbd2_write_superblock(journal_t *journal, int write_op)
{
struct buffer_head *bh = journal->j_sb_buffer;
journal_superblock_t *sb = journal->j_superblock;
@@ -1370,7 +1362,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
printk(KERN_ERR "JBD2: Error %d detected when updating "
"journal superblock for %s.\n", ret,
journal->j_devname);
+ jbd2_journal_abort(journal, ret);
}
+
+ return ret;
}
/**
@@ -1383,10 +1378,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
* Update a journal's superblock information about log tail and write it to
* disk, waiting for the IO to complete.
*/
-void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
unsigned long tail_block, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
+ int ret;
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
@@ -1395,13 +1391,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
sb->s_sequence = cpu_to_be32(tail_tid);
sb->s_start = cpu_to_be32(tail_block);
- jbd2_write_superblock(journal, write_op);
+ ret = jbd2_write_superblock(journal, write_op);
+ if (ret)
+ goto out;
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
+
+out:
+ return ret;
}
/**
@@ -1950,7 +1951,14 @@ int jbd2_journal_flush(journal_t *journal)
return -EIO;
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_cleanup_journal_tail(journal);
+ if (!err) {
+ err = jbd2_cleanup_journal_tail(journal);
+ if (err < 0) {
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ goto out;
+ }
+ err = 0;
+ }
/* Finally, mark the journal as really needing no recovery.
* This sets s_start==0 in the underlying superblock, which is
@@ -1966,7 +1974,8 @@ int jbd2_journal_flush(journal_t *journal)
J_ASSERT(journal->j_head == journal->j_tail);
J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
write_unlock(&journal->j_state_lock);
- return 0;
+out:
+ return err;
}
/**
@@ -2330,7 +2339,7 @@ static int jbd2_journal_init_journal_head_cache(void)
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
sizeof(struct journal_head),
0, /* offset */
- SLAB_TEMPORARY, /* flags */
+ SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
NULL); /* ctor */
retval = 0;
if (!jbd2_journal_head_cache) {
@@ -2362,10 +2371,8 @@ static struct journal_head *journal_alloc_journal_head(void)
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
- while (!ret) {
- yield();
- ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
- }
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache,
+ GFP_NOFS | __GFP_NOFAIL);
}
return ret;
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index b5128c6e63ad..a9079d035ae5 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -842,15 +842,23 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
{
jbd2_journal_revoke_header_t *header;
int offset, max;
+ int csum_size = 0;
+ __u32 rcount;
int record_len = 4;
header = (jbd2_journal_revoke_header_t *) bh->b_data;
offset = sizeof(jbd2_journal_revoke_header_t);
- max = be32_to_cpu(header->r_count);
+ rcount = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
return -EINVAL;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ if (rcount > journal->j_blocksize - csum_size)
+ return -EINVAL;
+ max = rcount;
+
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
record_len = 8;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index c6cbaef2bda1..0abf2e7f725b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
{
struct list_head *hash_list;
struct jbd2_revoke_record_s *record;
+ gfp_t gfp_mask = GFP_NOFS;
-repeat:
- record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+ if (journal_oom_retry)
+ gfp_mask |= __GFP_NOFAIL;
+ record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask);
if (!record)
- goto oom;
+ return -ENOMEM;
record->sequence = seq;
record->blocknr = blocknr;
@@ -154,13 +156,6 @@ repeat:
list_add(&record->hash, hash_list);
spin_unlock(&journal->j_revoke_lock);
return 0;
-
-oom:
- if (!journal_oom_retry)
- return -ENOMEM;
- jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
- yield();
- goto repeat;
}
/* Find a revoke record in the journal's hash table. */
@@ -577,7 +572,7 @@ static void write_one_revoke_record(journal_t *journal,
{
int csum_size = 0;
struct buffer_head *descriptor;
- int offset;
+ int sz, offset;
journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
@@ -594,9 +589,14 @@ static void write_one_revoke_record(journal_t *journal,
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ sz = 8;
+ else
+ sz = 4;
+
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
- if (offset >= journal->j_blocksize - csum_size) {
+ if (offset + sz > journal->j_blocksize - csum_size) {
flush_descriptor(journal, descriptor, offset, write_op);
descriptor = NULL;
}
@@ -619,16 +619,13 @@ static void write_one_revoke_record(journal_t *journal,
*descriptorp = descriptor;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
- offset += 8;
-
- } else {
+ else
* ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
- offset += 4;
- }
+ offset += sz;
*offsetp = offset;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5f09370c90a8..f3d06174b051 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
alloc_transaction:
if (!journal->j_running_transaction) {
+ /*
+ * If __GFP_FS is not present, then we may be being called from
+ * inside the fs writeback layer, so we MUST NOT fail.
+ */
+ if ((gfp_mask & __GFP_FS) == 0)
+ gfp_mask |= __GFP_NOFAIL;
new_transaction = kmem_cache_zalloc(transaction_cache,
gfp_mask);
- if (!new_transaction) {
- /*
- * If __GFP_FS is not present, then we may be
- * being called from inside the fs writeback
- * layer, so we MUST NOT fail. Since
- * __GFP_NOFAIL is going away, we will arrange
- * to retry the allocation ourselves.
- */
- if ((gfp_mask & __GFP_FS) == 0) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto alloc_transaction;
- }
+ if (!new_transaction)
return -ENOMEM;
- }
}
jbd_debug(3, "New handle %p going live.\n", handle);
@@ -551,7 +545,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
int result;
int wanted;
- WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
@@ -627,7 +620,6 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
tid_t tid;
int need_to_start, ret;
- WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
@@ -763,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
+static void jbd2_freeze_jh_data(struct journal_head *jh)
+{
+ struct page *page;
+ int offset;
+ char *source;
+ struct buffer_head *bh = jh2bh(jh);
+
+ J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
+ page = bh->b_page;
+ offset = offset_in_page(bh->b_data);
+ source = kmap_atomic(page);
+ /* Fire data frozen trigger just before we copy the data */
+ jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source + offset, bh->b_size);
+ kunmap_atomic(source);
+
+ /*
+ * Now that the frozen data is saved off, we need to store any matching
+ * triggers.
+ */
+ jh->b_frozen_triggers = jh->b_triggers;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -782,10 +798,8 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
journal_t *journal;
int error;
char *frozen_buffer = NULL;
- int need_copy = 0;
unsigned long start_lock, time_lock;
- WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
@@ -870,119 +884,96 @@ repeat:
jh->b_modified = 0;
/*
+ * If the buffer is not journaled right now, we need to make sure it
+ * doesn't get written to disk before the caller actually commits the
+ * new data
+ */
+ if (!jh->b_transaction) {
+ JBUFFER_TRACE(jh, "no transaction");
+ J_ASSERT_JH(jh, !jh->b_next_transaction);
+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ /*
+ * Make sure all stores to jh (b_modified, b_frozen_data) are
+ * visible before attaching it to the running transaction.
+ * Paired with barrier in jbd2_write_access_granted()
+ */
+ smp_wmb();
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+ spin_unlock(&journal->j_list_lock);
+ goto done;
+ }
+ /*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
if (jh->b_frozen_data) {
JBUFFER_TRACE(jh, "has frozen data");
J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- jh->b_next_transaction = transaction;
- goto done;
+ goto attach_next;
}
- /* Is there data here we need to preserve? */
+ JBUFFER_TRACE(jh, "owned by older transaction");
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
- if (jh->b_transaction && jh->b_transaction != transaction) {
- JBUFFER_TRACE(jh, "owned by older transaction");
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
+ /*
+ * There is one case we have to be very careful about. If the
+ * committing transaction is currently writing this buffer out to disk
+ * and has NOT made a copy-out, then we cannot modify the buffer
+ * contents at all right now. The essence of copy-out is that it is
+ * the extra copy, not the primary copy, which gets journaled. If the
+ * primary copy is already going to disk then we cannot do copy-out
+ * here.
+ */
+ if (buffer_shadow(bh)) {
+ JBUFFER_TRACE(jh, "on shadow: sleep");
+ jbd_unlock_bh_state(bh);
+ wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
+ goto repeat;
+ }
- /* There is one case we have to be very careful about.
- * If the committing transaction is currently writing
- * this buffer out to disk and has NOT made a copy-out,
- * then we cannot modify the buffer contents at all
- * right now. The essence of copy-out is that it is the
- * extra copy, not the primary copy, which gets
- * journaled. If the primary copy is already going to
- * disk then we cannot do copy-out here. */
-
- if (buffer_shadow(bh)) {
- JBUFFER_TRACE(jh, "on shadow: sleep");
+ /*
+ * Only do the copy if the currently-owning transaction still needs it.
+ * If buffer isn't on BJ_Metadata list, the committing transaction is
+ * past that stage (here we use the fact that BH_Shadow is set under
+ * bh_state lock together with refiling to BJ_Shadow list and at this
+ * point we know the buffer doesn't have BH_Shadow set).
+ *
+ * Subtle point, though: if this is a get_undo_access, then we will be
+ * relying on the frozen_data to contain the new value of the
+ * committed_data record after the transaction, so we HAVE to force the
+ * frozen_data copy in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
+ JBUFFER_TRACE(jh, "generate frozen data");
+ if (!frozen_buffer) {
+ JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- wait_on_bit_io(&bh->b_state, BH_Shadow,
- TASK_UNINTERRUPTIBLE);
- goto repeat;
- }
-
- /*
- * Only do the copy if the currently-owning transaction still
- * needs it. If buffer isn't on BJ_Metadata list, the
- * committing transaction is past that stage (here we use the
- * fact that BH_Shadow is set under bh_state lock together with
- * refiling to BJ_Shadow list and at this point we know the
- * buffer doesn't have BH_Shadow set).
- *
- * Subtle point, though: if this is a get_undo_access,
- * then we will be relying on the frozen_data to contain
- * the new value of the committed_data record after the
- * transaction, so we HAVE to force the frozen_data copy
- * in that case.
- */
- if (jh->b_jlist == BJ_Metadata || force_copy) {
- JBUFFER_TRACE(jh, "generate frozen data");
+ frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!frozen_buffer) {
- JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
- frozen_buffer =
- jbd2_alloc(jh2bh(jh)->b_size,
- GFP_NOFS);
- if (!frozen_buffer) {
- printk(KERN_ERR
- "%s: OOM for frozen_buffer\n",
- __func__);
- JBUFFER_TRACE(jh, "oom!");
- error = -ENOMEM;
- jbd_lock_bh_state(bh);
- goto done;
- }
- goto repeat;
+ printk(KERN_ERR "%s: OOM for frozen_buffer\n",
+ __func__);
+ JBUFFER_TRACE(jh, "oom!");
+ error = -ENOMEM;
+ goto out;
}
- jh->b_frozen_data = frozen_buffer;
- frozen_buffer = NULL;
- need_copy = 1;
+ goto repeat;
}
- jh->b_next_transaction = transaction;
+ jh->b_frozen_data = frozen_buffer;
+ frozen_buffer = NULL;
+ jbd2_freeze_jh_data(jh);
}
-
-
+attach_next:
/*
- * Finally, if the buffer is not journaled right now, we need to make
- * sure it doesn't get written to disk before the caller actually
- * commits the new data
+ * Make sure all stores to jh (b_modified, b_frozen_data) are visible
+ * before attaching it to the running transaction. Paired with barrier
+ * in jbd2_write_access_granted()
*/
- if (!jh->b_transaction) {
- JBUFFER_TRACE(jh, "no transaction");
- J_ASSERT_JH(jh, !jh->b_next_transaction);
- JBUFFER_TRACE(jh, "file as BJ_Reserved");
- spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
- spin_unlock(&journal->j_list_lock);
- }
+ smp_wmb();
+ jh->b_next_transaction = transaction;
done:
- if (need_copy) {
- struct page *page;
- int offset;
- char *source;
-
- J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
- "Possible IO failure.\n");
- page = jh2bh(jh)->b_page;
- offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page);
- /* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset,
- jh->b_triggers);
- memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source);
-
- /*
- * Now that the frozen data is saved off, we need to store
- * any matching triggers.
- */
- jh->b_frozen_triggers = jh->b_triggers;
- }
jbd_unlock_bh_state(bh);
/*
@@ -999,6 +990,55 @@ out:
return error;
}
+/* Fast check whether buffer is already attached to the required transaction */
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
+{
+ struct journal_head *jh;
+ bool ret = false;
+
+ /* Dirty buffers require special handling... */
+ if (buffer_dirty(bh))
+ return false;
+
+ /*
+ * RCU protects us from dereferencing freed pages. So the checks we do
+ * are guaranteed not to oops. However the jh slab object can get freed
+ * & reallocated while we work with it. So we have to be careful. When
+ * we see jh attached to the running transaction, we know it must stay
+ * so until the transaction is committed. Thus jh won't be freed and
+ * will be attached to the same bh while we run. However it can
+ * happen jh gets freed, reallocated, and attached to the transaction
+ * just after we get pointer to it from bh. So we have to be careful
+ * and recheck jh still belongs to our bh before we return success.
+ */
+ rcu_read_lock();
+ if (!buffer_jbd(bh))
+ goto out;
+ /* This should be bh2jh() but that doesn't work with inline functions */
+ jh = READ_ONCE(bh->b_private);
+ if (!jh)
+ goto out;
+ if (jh->b_transaction != handle->h_transaction &&
+ jh->b_next_transaction != handle->h_transaction)
+ goto out;
+ /*
+ * There are two reasons for the barrier here:
+ * 1) Make sure to fetch b_bh after we did previous checks so that we
+ * detect when jh went through free, realloc, attach to transaction
+ * while we were checking. Paired with implicit barrier in that path.
+ * 2) So that access to bh done after jbd2_write_access_granted()
+ * doesn't get reordered and see inconsistent state of concurrent
+ * do_get_write_access().
+ */
+ smp_mb();
+ if (unlikely(jh->b_bh != bh))
+ goto out;
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
/**
* int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
@@ -1012,9 +1052,13 @@ out:
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
- struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ struct journal_head *jh;
int rc;
+ if (jbd2_write_access_granted(handle, bh))
+ return 0;
+
+ jh = jbd2_journal_add_journal_head(bh);
/* We do not want to get caught playing with fields which the
* log thread also manipulates. Make sure that the buffer
* completes any outstanding IO before proceeding. */
@@ -1051,7 +1095,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
int err;
jbd_debug(5, "journal_head %p\n", jh);
- WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
@@ -1145,11 +1188,14 @@ out:
int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
{
int err;
- struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ struct journal_head *jh;
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
+ if (jbd2_write_access_granted(handle, bh))
+ return 0;
+ jh = jbd2_journal_add_journal_head(bh);
/*
* Do this first --- it can drop the journal lock, so we want to
* make sure that obtaining the committed_data is done
@@ -1266,7 +1312,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int ret = 0;
- WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
@@ -1397,7 +1442,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
int err = 0;
int was_modified = 0;
- WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
@@ -1530,8 +1574,22 @@ int jbd2_journal_stop(handle_t *handle)
tid_t tid;
pid_t pid;
- if (!transaction)
- goto free_and_exit;
+ if (!transaction) {
+ /*
+ * Handle is already detached from the transaction so
+ * there is nothing to do other than decrease a refcount,
+ * or free the handle if refcount drops to zero
+ */
+ if (--handle->h_ref > 0) {
+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ return err;
+ } else {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
+ goto free_and_exit;
+ }
+ }
journal = transaction->t_journal;
J_ASSERT(journal_current_handle() == handle);
@@ -2373,7 +2431,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
- WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
journal = transaction->t_journal;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index f21b6fb5e4c4..81180022923f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -224,14 +224,14 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb);
struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
- struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode);
+ struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry));
int ret;
uint32_t now = get_seconds();
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, dead_f, now);
if (dead_f->inocache)
- set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink);
+ set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
return ret;
@@ -241,8 +241,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry)
{
- struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb);
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
int ret;
uint8_t type;
@@ -256,7 +256,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
return -EPERM;
/* XXX: This is ugly */
- type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12;
+ type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12;
if (!type) type = DT_REG;
now = get_seconds();
@@ -264,11 +264,11 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
if (!ret) {
mutex_lock(&f->sem);
- set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink);
+ set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
- d_instantiate(dentry, old_dentry->d_inode);
+ d_instantiate(dentry, d_inode(old_dentry));
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
- ihold(old_dentry->d_inode);
+ ihold(d_inode(old_dentry));
}
return ret;
}
@@ -354,6 +354,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
ret = -ENOMEM;
goto fail;
}
+ inode->i_link = f->target;
jffs2_dbg(1, "%s(): symlink's target '%s' cached\n",
__func__, (char *)f->target);
@@ -585,7 +586,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
{
struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb);
struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry));
struct jffs2_full_dirent *fd;
int ret;
uint32_t now = get_seconds();
@@ -599,7 +600,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
dentry->d_name.len, f, now);
if (!ret) {
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
- clear_nlink(dentry->d_inode);
+ clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
return ret;
@@ -770,8 +771,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
* the VFS can't check whether the victim is empty. The filesystem
* needs to do that for itself.
*/
- if (new_dentry->d_inode) {
- victim_f = JFFS2_INODE_INFO(new_dentry->d_inode);
+ if (d_really_is_positive(new_dentry)) {
+ victim_f = JFFS2_INODE_INFO(d_inode(new_dentry));
if (d_is_dir(new_dentry)) {
struct jffs2_full_dirent *fd;
@@ -794,12 +795,12 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
/* Make a hard link */
/* XXX: This is ugly */
- type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12;
+ type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12;
if (!type) type = DT_REG;
now = get_seconds();
ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i),
- old_dentry->d_inode->i_ino, type,
+ d_inode(old_dentry)->i_ino, type,
new_dentry->d_name.name, new_dentry->d_name.len, now);
if (ret)
@@ -808,9 +809,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
if (victim_f) {
/* There was a victim. Kill it off nicely */
if (d_is_dir(new_dentry))
- clear_nlink(new_dentry->d_inode);
+ clear_nlink(d_inode(new_dentry));
else
- drop_nlink(new_dentry->d_inode);
+ drop_nlink(d_inode(new_dentry));
/* Don't oops if the victim was a dirent pointing to an
inode which didn't exist. */
if (victim_f->inocache) {
@@ -836,9 +837,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
if (ret) {
/* Oh shit. We really ought to make a single node which can do both atomically */
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
+ struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
mutex_lock(&f->sem);
- inc_nlink(old_dentry->d_inode);
+ inc_nlink(d_inode(old_dentry));
if (f->inocache && !d_is_dir(old_dentry))
f->inocache->pino_nlink++;
mutex_unlock(&f->sem);
@@ -846,8 +847,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
/* Might as well let the VFS know */
- d_instantiate(new_dentry, old_dentry->d_inode);
- ihold(old_dentry->d_inode);
+ d_instantiate(new_dentry, d_inode(old_dentry));
+ ihold(d_inode(old_dentry));
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 64989ca9ba90..f509f62e12f6 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -51,9 +51,7 @@ const struct file_operations jffs2_file_operations =
{
.llseek = generic_file_llseek,
.open = generic_file_open,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.unlocked_ioctl=jffs2_ioctl,
.mmap = generic_file_readonly_mmap,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 601afd1afddf..2caf1682036d 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,7 +190,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int rc;
rc = inode_change_ok(inode, iattr);
@@ -272,12 +272,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
mutex_lock(&f->sem);
ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node);
+ if (ret)
+ goto error;
- if (ret) {
- mutex_unlock(&f->sem);
- iget_failed(inode);
- return ERR_PTR(ret);
- }
inode->i_mode = jemode_to_cpu(latest_node.mode);
i_uid_write(inode, je16_to_cpu(latest_node.uid));
i_gid_write(inode, je16_to_cpu(latest_node.gid));
@@ -294,6 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
case S_IFLNK:
inode->i_op = &jffs2_symlink_inode_operations;
+ inode->i_link = f->target;
break;
case S_IFDIR:
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d200a9b8fd5e..824e61ede465 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -19,7 +19,7 @@
struct kstatfs;
struct kvec;
-#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode))
+#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode))
#define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode)
#define JFFS2_SB_INFO(sb) (sb->s_fs_info)
#define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv)
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index dddbde4f56f4..28e0aab42bc3 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1203,17 +1203,13 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n",
ret, retlen, sizeof(*latest_node));
/* FIXME: If this fails, there seems to be a memory leak. Find it. */
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
- return ret?ret:-EIO;
+ return ret ? ret : -EIO;
}
crc = crc32(0, latest_node, sizeof(*latest_node)-8);
if (crc != je32_to_cpu(latest_node->node_crc)) {
JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n",
f->inocache->ino, ref_offset(rii.latest_ref));
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return -EIO;
}
@@ -1250,16 +1246,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
* keep in RAM to facilitate quick follow symlink
* operation. */
uint32_t csize = je32_to_cpu(latest_node->csize);
- if (csize > JFFS2_MAX_NAME_LEN) {
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
+ if (csize > JFFS2_MAX_NAME_LEN)
return -ENAMETOOLONG;
- }
f->target = kmalloc(csize + 1, GFP_KERNEL);
if (!f->target) {
JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize);
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return -ENOMEM;
}
@@ -1271,8 +1262,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
ret = -EIO;
kfree(f->target);
f->target = NULL;
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return ret;
}
@@ -1289,15 +1278,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
if (f->metadata) {
JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n",
f->inocache->ino, jemode_to_cpu(latest_node->mode));
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return -EIO;
}
if (!frag_first(&f->fragtree)) {
JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n",
f->inocache->ino, jemode_to_cpu(latest_node->mode));
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return -EIO;
}
/* ASSERT: f->fraglist != NULL */
@@ -1305,8 +1290,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n",
f->inocache->ino, jemode_to_cpu(latest_node->mode));
/* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
return -EIO;
}
/* OK. We're happy */
@@ -1400,10 +1383,8 @@ int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
f->inocache = ic;
ret = jffs2_do_read_inode_internal(c, f, &n);
- if (!ret) {
- mutex_unlock(&f->sem);
- jffs2_do_clear_inode(c, f);
- }
+ mutex_unlock(&f->sem);
+ jffs2_do_clear_inode(c, f);
jffs2_xattr_do_crccheck_inode(c, ic);
kfree (f);
return ret;
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index aca97f35b292..d4b43fb7adb1 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -54,7 +54,7 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
+ return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
@@ -64,7 +64,7 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
+ return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 3d76f28a2ba9..d86c5e3176a1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -140,14 +140,14 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
BUG_ON(!d_is_dir(child));
- f = JFFS2_INODE_INFO(child->d_inode);
+ f = JFFS2_INODE_INFO(d_inode(child));
pino = f->inocache->pino_nlink;
JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
f->inocache->ino, pino);
- return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
+ return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino));
}
static const struct export_operations jffs2_export_ops = {
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index c7c77b0dfccd..8ce2f240125b 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -9,58 +9,15 @@
*
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/namei.h>
#include "nodelist.h"
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd);
-
const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
- .follow_link = jffs2_follow_link,
+ .follow_link = simple_follow_link,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
.listxattr = jffs2_listxattr,
.removexattr = jffs2_removexattr
};
-
-static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode);
- char *p = (char *)f->target;
-
- /*
- * We don't acquire the f->sem mutex here since the only data we
- * use is f->target.
- *
- * 1. If we are here the inode has already built and f->target has
- * to point to the target path.
- * 2. Nobody uses f->target (if the inode is symlink's inode). The
- * exception is inode freeing function which frees f->target. But
- * it can't be called while we are here and before VFS has
- * stopped using our f->target string which we provide by means of
- * nd_set_link() call.
- */
-
- if (!p) {
- pr_err("%s(): can't find symlink target\n", __func__);
- p = ERR_PTR(-EIO);
- }
- jffs2_dbg(1, "%s(): target path is '%s'\n",
- __func__, (char *)f->target);
-
- nd_set_link(nd, p);
-
- /*
- * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe
- * since the only way that may cause f->target to be changed is iput() operation.
- * But VFS will not use f->target after iput() has been called.
- */
- return NULL;
-}
-
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d72817ac51f6..f092fee5be50 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -195,7 +195,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
/* unchecked xdatum is chained with c->xattr_unchecked */
list_del_init(&xd->xindex);
- dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+ dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
xd->xid, xd->version);
return 0;
@@ -960,7 +960,7 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) {
ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_inode_cache *ic = f->inocache;
@@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
if (rc) {
JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
__func__, rc, totlen);
- rc = rc ? rc : -EBADFD;
goto out;
}
rc = save_xattr_ref(c, ref);
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 1c868194c504..ceaf9c693225 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -21,7 +21,7 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
{
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
+ return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
@@ -30,7 +30,7 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
{
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
+ return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 916b5c966039..a71391eba514 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -21,7 +21,7 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
{
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
+ return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size);
}
@@ -30,7 +30,7 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
{
if (!strcmp(name, ""))
return -EINVAL;
- return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
+ return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 10815f8dfd8b..b9dc23cd04f2 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file)
if (ji->active_ag == -1) {
struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb);
ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb);
- atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]);
+ atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]);
}
spin_unlock_irq(&ji->ag_lock);
}
@@ -100,7 +100,7 @@ static int jfs_release(struct inode *inode, struct file *file)
int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int rc;
rc = inode_change_ok(inode, iattr);
@@ -151,8 +151,6 @@ const struct inode_operations jfs_file_inode_operations = {
const struct file_operations jfs_file_operations = {
.open = jfs_open,
.llseek = generic_file_llseek,
- .write = new_sync_write,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index bd3df1ca3c9b..41aa3ca6a6a4 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,8 +22,8 @@
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
+#include <linux/uio.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -63,11 +63,12 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
inode->i_mapping->a_ops = &jfs_aops;
} else {
inode->i_op = &jfs_fast_symlink_inode_operations;
+ inode->i_link = JFS_IP(inode)->i_inline;
/*
* The inline data should be null-terminated, but
* don't let on-disk corruption crash the kernel
*/
- JFS_IP(inode)->i_inline[inode->i_size] = '\0';
+ inode->i_link[inode->i_size] = '\0';
}
} else {
inode->i_op = &jfs_file_inode_operations;
@@ -133,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* It has been committed since the last change, but was still
* on the dirty inode list.
*/
- if (!test_cflag(COMMIT_Dirty, inode)) {
+ if (!test_cflag(COMMIT_Dirty, inode)) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait);
return 0;
- }
+ }
if (jfs_commit_inode(inode, wait)) {
jfs_err("jfs_write_inode: jfs_commit_inode failed!");
@@ -330,8 +331,8 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, jfs_get_block);
}
-static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -339,13 +340,13 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block);
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 93a1232894f6..8db8b7d61e40 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case JFS_IOC_SETFLAGS32:
cmd = JFS_IOC_SETFLAGS;
break;
- case FITRIM:
- cmd = FITRIM;
- break;
}
return jfs_ioctl(filp, cmd, arg);
}
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index fa7e795bd8ae..1f26d1910409 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -206,7 +206,7 @@ struct jfs_sb_info {
static inline struct jfs_inode_info *JFS_IP(struct inode *inode)
{
- return list_entry(inode, struct jfs_inode_info, vfs_inode);
+ return container_of(inode, struct jfs_inode_info, vfs_inode);
}
static inline int jfs_dirtable_inline(struct inode *inode)
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 49ba7ff1bbb9..16a0922beb59 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -183,30 +183,23 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
#endif
-static void init_once(void *foo)
-{
- struct metapage *mp = (struct metapage *)foo;
-
- mp->lid = 0;
- mp->lsn = 0;
- mp->flag = 0;
- mp->data = NULL;
- mp->clsn = 0;
- mp->log = NULL;
- set_bit(META_free, &mp->flag);
- init_waitqueue_head(&mp->wait);
-}
-
static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
{
- return mempool_alloc(metapage_mempool, gfp_mask);
+ struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask);
+
+ if (mp) {
+ mp->lid = 0;
+ mp->lsn = 0;
+ mp->data = NULL;
+ mp->clsn = 0;
+ mp->log = NULL;
+ init_waitqueue_head(&mp->wait);
+ }
+ return mp;
}
static inline void free_metapage(struct metapage *mp)
{
- mp->flag = 0;
- set_bit(META_free, &mp->flag);
-
mempool_free(mp, metapage_mempool);
}
@@ -216,7 +209,7 @@ int __init metapage_init(void)
* Allocate the metapage structures
*/
metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage),
- 0, 0, init_once);
+ 0, 0, NULL);
if (metapage_cache == NULL)
return -ENOMEM;
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index a78beda85f68..337e9e51ac06 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -48,7 +48,6 @@ struct metapage {
/* metapage flag */
#define META_locked 0
-#define META_free 1
#define META_dirty 2
#define META_sync 3
#define META_discard 4
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 38fdc533f4ec..a5ac97b9a933 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -346,7 +346,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
{
int rc;
tid_t tid; /* transaction id */
- struct inode *ip = dentry->d_inode;
+ struct inode *ip = d_inode(dentry);
ino_t ino;
struct component_name dname;
struct inode *iplist[2];
@@ -472,7 +472,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
{
int rc;
tid_t tid; /* transaction id */
- struct inode *ip = dentry->d_inode;
+ struct inode *ip = d_inode(dentry);
ino_t ino;
struct component_name dname; /* object name */
struct inode *iplist[2];
@@ -791,7 +791,7 @@ static int jfs_link(struct dentry *old_dentry,
{
int rc;
tid_t tid;
- struct inode *ip = old_dentry->d_inode;
+ struct inode *ip = d_inode(old_dentry);
ino_t ino;
struct component_name dname;
struct btstack btstack;
@@ -879,8 +879,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
struct component_name dname;
int ssize; /* source pathname size */
struct btstack btstack;
- struct inode *ip = dentry->d_inode;
- unchar *i_fastsymlink;
+ struct inode *ip = d_inode(dentry);
s64 xlen = 0;
int bmask = 0, xsize;
s64 xaddr;
@@ -946,8 +945,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
if (ssize <= IDATASIZE) {
ip->i_op = &jfs_fast_symlink_inode_operations;
- i_fastsymlink = JFS_IP(ip)->i_inline;
- memcpy(i_fastsymlink, name, ssize);
+ ip->i_link = JFS_IP(ip)->i_inline;
+ memcpy(ip->i_link, name, ssize);
ip->i_size = ssize - 1;
/*
@@ -1086,8 +1085,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dquot_initialize(old_dir);
dquot_initialize(new_dir);
- old_ip = old_dentry->d_inode;
- new_ip = new_dentry->d_inode;
+ old_ip = d_inode(old_dentry);
+ new_ip = d_inode(new_dentry);
if ((rc = get_UCSname(&old_dname, old_dentry)))
goto out1;
@@ -1161,7 +1160,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rc = dtModify(tid, new_dir, &new_dname, &ino,
old_ip->i_ino, JFS_RENAME);
if (rc)
- goto out4;
+ goto out_tx;
drop_nlink(new_ip);
if (S_ISDIR(new_ip->i_mode)) {
drop_nlink(new_ip);
@@ -1186,7 +1185,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
txAbort(tid, 1); /* Marks FS Dirty */
rc = new_size;
- goto out4;
+ goto out_tx;
}
tblk = tid_to_tblock(tid);
tblk->xflag |= COMMIT_DELETE;
@@ -1204,7 +1203,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (rc) {
jfs_err("jfs_rename didn't expect dtSearch to fail "
"w/rc = %d", rc);
- goto out4;
+ goto out_tx;
}
ino = old_ip->i_ino;
@@ -1212,7 +1211,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (rc) {
if (rc == -EIO)
jfs_err("jfs_rename: dtInsert returned -EIO");
- goto out4;
+ goto out_tx;
}
if (S_ISDIR(old_ip->i_mode))
inc_nlink(new_dir);
@@ -1227,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_err("jfs_rename did not expect dtDelete to return rc = %d",
rc);
txAbort(tid, 1); /* Marks Filesystem dirty */
- goto out4;
+ goto out_tx;
}
if (S_ISDIR(old_ip->i_mode)) {
drop_nlink(old_dir);
@@ -1286,7 +1285,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
rc = txCommit(tid, ipcount, iplist, commit_flag);
- out4:
+ out_tx:
txEnd(tid);
if (new_ip)
mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
@@ -1309,13 +1308,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (new_ip && (new_ip->i_nlink == 0))
set_cflag(COMMIT_Nolink, new_ip);
- out3:
- free_UCSname(&new_dname);
- out2:
- free_UCSname(&old_dname);
- out1:
- if (new_ip && !S_ISDIR(new_ip->i_mode))
- IWRITE_UNLOCK(new_ip);
/*
* Truncating the directory index table is not guaranteed. It
* may need to be done iteratively
@@ -1326,7 +1318,13 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
clear_cflag(COMMIT_Stale, old_dir);
}
-
+ if (new_ip && !S_ISDIR(new_ip->i_mode))
+ IWRITE_UNLOCK(new_ip);
+ out3:
+ free_UCSname(&new_dname);
+ out2:
+ free_UCSname(&old_dname);
+ out1:
jfs_info("jfs_rename: returning %d", rc);
return rc;
}
@@ -1500,9 +1498,9 @@ struct dentry *jfs_get_parent(struct dentry *dentry)
unsigned long parent_ino;
parent_ino =
- le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot);
+ le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot);
- return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino));
+ return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino));
}
const struct inode_operations jfs_dir_inode_operations = {
@@ -1578,7 +1576,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags)
* positive dentry isn't good idea. So it's unsupported like
* rename("filename", "FILENAME") for now.
*/
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
return 1;
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 5d30c56ae075..4cd9798f4948 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ void jfs_error(struct super_block *sb, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("ERROR: (device %s): %pf: %pV\n",
+ pr_err("ERROR: (device %s): %ps: %pV\n",
sb->s_id, __builtin_return_address(0), &vaf);
va_end(args);
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 205b946d8e0d..5929e2363cb8 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -17,21 +17,13 @@
*/
#include <linux/fs.h>
-#include <linux/namei.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_xattr.h"
-static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- char *s = JFS_IP(dentry->d_inode)->i_inline;
- nd_set_link(nd, s);
- return NULL;
-}
-
const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = jfs_follow_link,
+ .follow_link = simple_follow_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 46325d5c34fc..48b15a6e5558 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -849,7 +849,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t value_len, int flags)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct jfs_inode_info *ji = JFS_IP(inode);
int rc;
tid_t tid;
@@ -872,7 +872,7 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
tid = txBegin(inode->i_sb, 0);
mutex_lock(&ji->commit_mutex);
- rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len,
+ rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len,
flags);
if (!rc)
rc = txCommit(tid, 1, &inode, 0);
@@ -959,7 +959,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
return -EOPNOTSUPP;
}
- err = __jfs_getxattr(dentry->d_inode, name, data, buf_size);
+ err = __jfs_getxattr(d_inode(dentry), name, data, buf_size);
return err;
}
@@ -976,7 +976,7 @@ static inline int can_list(struct jfs_ea *ea)
ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
char *buffer;
ssize_t size = 0;
int xattr_size;
@@ -1029,7 +1029,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
int jfs_removexattr(struct dentry *dentry, const char *name)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct jfs_inode_info *ji = JFS_IP(inode);
int rc;
tid_t tid;
@@ -1047,7 +1047,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
tid = txBegin(inode->i_sb, 0);
mutex_lock(&ji->commit_mutex);
- rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+ rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE);
if (!rc)
rc = txCommit(tid, 1, &inode, 0);
txEnd(tid);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 6acc9648f986..2d48d28e1640 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -444,7 +444,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
/* Always perform fresh lookup for negatives */
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
goto out_bad_unlocked;
kn = dentry->d_fsdata;
@@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+ /*
+ * If the ino of the sysfs entry created for a kmem cache gets
+ * allocated from an ida layer, which is accounted to the memcg that
+ * owns the cache, the memcg will get pinned forever. So do not account
+ * ino ida allocations.
+ */
+ ret = ida_simple_get(&root->ino_ida, 1, 0,
+ GFP_KERNEL | __GFP_NOACCOUNT);
if (ret < 0)
goto err_out2;
kn->ino = ret;
@@ -585,6 +592,9 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
ret = -ENOENT;
+ if (parent->flags & KERNFS_EMPTY_DIR)
+ goto out_unlock;
+
if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
goto out_unlock;
@@ -776,6 +786,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
return ERR_PTR(rc);
}
+/**
+ * kernfs_create_empty_dir - create an always empty directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+ const char *name)
+{
+ struct kernfs_node *kn;
+ int rc;
+
+ /* allocate */
+ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+ if (!kn)
+ return ERR_PTR(-ENOMEM);
+
+ kn->flags |= KERNFS_EMPTY_DIR;
+ kn->dir.root = parent->dir.root;
+ kn->ns = NULL;
+ kn->priv = NULL;
+
+ /* link in */
+ rc = kernfs_add_one(kn);
+ if (!rc)
+ return kn;
+
+ kernfs_put(kn);
+ return ERR_PTR(rc);
+}
+
static struct dentry *kernfs_iop_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
@@ -1247,7 +1289,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
mutex_lock(&kernfs_mutex);
error = -ENOENT;
- if (!kernfs_active(kn) || !kernfs_active(new_parent))
+ if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
+ (new_parent->flags & KERNFS_EMPTY_DIR))
goto out;
error = 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 2bacb9988566..7247252ee9b1 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -785,7 +785,6 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
struct kernfs_open_node *on = kn->attr.open;
- /* need parent for the kobj, grab both */
if (!kernfs_get_active(kn))
goto trigger;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 9000874a945b..756dd56aaf60 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -111,7 +111,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct kernfs_node *kn = dentry->d_fsdata;
int error;
@@ -172,11 +172,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
- error = security_inode_setsecurity(dentry->d_inode, suffix,
+ error = security_inode_setsecurity(d_inode(dentry), suffix,
value, size, flags);
if (error)
return error;
- error = security_inode_getsecctx(dentry->d_inode,
+ error = security_inode_getsecctx(d_inode(dentry),
&secdata, &secdata_len);
if (error)
return error;
@@ -271,7 +271,7 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct kernfs_node *kn = dentry->d_fsdata;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
mutex_lock(&kernfs_mutex);
kernfs_refresh_inode(kn, inode);
@@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
case KERNFS_DIR:
inode->i_op = &kernfs_dir_iops;
inode->i_fop = &kernfs_dir_fops;
+ if (kn->flags & KERNFS_EMPTY_DIR)
+ make_empty_dir_inode(inode);
break;
case KERNFS_FILE:
inode->i_size = kn->attr.size;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index af9fa7499919..6762bfbd8207 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
/*
* inode.c
*/
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct inode *inode, int mask);
int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 8a198898e39a..db272528ab5b 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,25 +112,18 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
return error;
}
-static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
{
int error = -ENOMEM;
unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (page) {
- error = kernfs_getlink(dentry, (char *) page);
- if (error < 0)
- free_page((unsigned long)page);
- }
- nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
- return NULL;
-}
-
-static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
-{
- char *page = nd_get_link(nd);
- if (!IS_ERR(page))
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+ error = kernfs_getlink(dentry, (char *)page);
+ if (unlikely(error < 0)) {
free_page((unsigned long)page);
+ return ERR_PTR(error);
+ }
+ return *cookie = (char *)page;
}
const struct inode_operations kernfs_symlink_iops = {
@@ -140,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = {
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
.follow_link = kernfs_iop_follow_link,
- .put_link = kernfs_iop_put_link,
+ .put_link = free_page_put_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.permission = kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index 0ab65122ee45..102edfd39000 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,15 +20,10 @@
#include "internal.h"
-static inline int simple_positive(struct dentry *dentry)
-{
- return dentry->d_inode && !d_unhashed(dentry);
-}
-
int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
return 0;
@@ -94,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close);
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry *dentry = file->f_path.dentry;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
switch (whence) {
case 1:
offset += file->f_pos;
@@ -102,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -129,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&dentry->d_lock);
}
}
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);
@@ -169,7 +164,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
spin_unlock(&next->d_lock);
spin_unlock(&dentry->d_lock);
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
- next->d_inode->i_ino, dt_type(next->d_inode)))
+ d_inode(next)->i_ino, dt_type(d_inode(next))))
return 0;
spin_lock(&dentry->d_lock);
spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
@@ -270,7 +265,7 @@ EXPORT_SYMBOL(simple_open);
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
@@ -304,7 +299,7 @@ EXPORT_SYMBOL(simple_empty);
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
drop_nlink(inode);
@@ -318,7 +313,7 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
if (!simple_empty(dentry))
return -ENOTEMPTY;
- drop_nlink(dentry->d_inode);
+ drop_nlink(d_inode(dentry));
simple_unlink(dir, dentry);
drop_nlink(dir);
return 0;
@@ -328,16 +323,16 @@ EXPORT_SYMBOL(simple_rmdir);
int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
simple_unlink(new_dir, new_dentry);
if (they_are_dirs) {
- drop_nlink(new_dentry->d_inode);
+ drop_nlink(d_inode(new_dentry));
drop_nlink(old_dir);
}
} else if (they_are_dirs) {
@@ -368,7 +363,7 @@ EXPORT_SYMBOL(simple_rename);
*/
int simple_setattr(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, iattr);
@@ -1024,15 +1019,18 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
+void kfree_put_link(struct inode *unused, void *cookie)
{
- char *s = nd_get_link(nd);
- if (!IS_ERR(s))
- kfree(s);
+ kfree(cookie);
}
EXPORT_SYMBOL(kfree_put_link);
+void free_page_put_link(struct inode *unused, void *cookie)
+{
+ free_page((unsigned long) cookie);
+}
+EXPORT_SYMBOL(free_page_put_link);
+
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
* anon inodes.
@@ -1093,3 +1091,110 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);
+
+const char *simple_follow_link(struct dentry *dentry, void **cookie)
+{
+ return d_inode(dentry)->i_link;
+}
+EXPORT_SYMBOL(simple_follow_link);
+
+const struct inode_operations simple_symlink_inode_operations = {
+ .follow_link = simple_follow_link,
+ .readlink = generic_readlink
+};
+EXPORT_SYMBOL(simple_symlink_inode_operations);
+
+/*
+ * Operations for a permanently empty directory.
+ */
+static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ return ERR_PTR(-ENOENT);
+}
+
+static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = d_inode(dentry);
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
+static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ return -EPERM;
+}
+
+static int empty_dir_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
+ void *value, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static int empty_dir_removexattr(struct dentry *dentry, const char *name)
+{
+ return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct inode_operations empty_dir_inode_operations = {
+ .lookup = empty_dir_lookup,
+ .permission = generic_permission,
+ .setattr = empty_dir_setattr,
+ .getattr = empty_dir_getattr,
+ .setxattr = empty_dir_setxattr,
+ .getxattr = empty_dir_getxattr,
+ .removexattr = empty_dir_removexattr,
+ .listxattr = empty_dir_listxattr,
+};
+
+static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ /* An empty directory has two entries . and .. at offsets 0 and 1 */
+ return generic_file_llseek_size(file, offset, whence, 2, 2);
+}
+
+static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+ dir_emit_dots(file, ctx);
+ return 0;
+}
+
+static const struct file_operations empty_dir_operations = {
+ .llseek = empty_dir_llseek,
+ .read = generic_read_dir,
+ .iterate = empty_dir_readdir,
+ .fsync = noop_fsync,
+};
+
+
+void make_empty_dir_inode(struct inode *inode)
+{
+ set_nlink(inode, 2);
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_rdev = 0;
+ inode->i_size = 2;
+ inode->i_blkbits = PAGE_SHIFT;
+ inode->i_blocks = 0;
+
+ inode->i_op = &empty_dir_inode_operations;
+ inode->i_fop = &empty_dir_operations;
+}
+
+bool is_empty_dir_inode(struct inode *inode)
+{
+ return (inode->i_fop == &empty_dir_operations) &&
+ (inode->i_op == &empty_dir_inode_operations);
+}
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 665ef5a05183..a563ddbc19e6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -31,7 +31,7 @@
static struct hlist_head nlm_files[FILE_NRHASH];
static DEFINE_MUTEX(nlm_file_mutex);
-#ifdef NFSD_DEBUG
+#ifdef CONFIG_SUNRPC_DEBUG
static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
{
u32 *fhp = (u32*)f->data;
diff --git a/fs/locks.c b/fs/locks.c
index 40bc384728c0..d3d558ba4da7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -203,11 +203,11 @@ static struct kmem_cache *flctx_cache __read_mostly;
static struct kmem_cache *filelock_cache __read_mostly;
static struct file_lock_context *
-locks_get_lock_context(struct inode *inode)
+locks_get_lock_context(struct inode *inode, int type)
{
struct file_lock_context *new;
- if (likely(inode->i_flctx))
+ if (likely(inode->i_flctx) || type == F_UNLCK)
goto out;
new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
@@ -223,14 +223,7 @@ locks_get_lock_context(struct inode *inode)
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
- spin_lock(&inode->i_lock);
- if (likely(!inode->i_flctx)) {
- inode->i_flctx = new;
- new = NULL;
- }
- spin_unlock(&inode->i_lock);
-
- if (new)
+ if (cmpxchg(&inode->i_flctx, NULL, new))
kmem_cache_free(flctx_cache, new);
out:
return inode->i_flctx;
@@ -276,8 +269,10 @@ void locks_release_private(struct file_lock *fl)
}
if (fl->fl_lmops) {
- if (fl->fl_lmops->lm_put_owner)
- fl->fl_lmops->lm_put_owner(fl);
+ if (fl->fl_lmops->lm_put_owner) {
+ fl->fl_lmops->lm_put_owner(fl->fl_owner);
+ fl->fl_owner = NULL;
+ }
fl->fl_lmops = NULL;
}
}
@@ -333,7 +328,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
if (fl->fl_lmops) {
if (fl->fl_lmops->lm_get_owner)
- fl->fl_lmops->lm_get_owner(new, fl);
+ fl->fl_lmops->lm_get_owner(fl->fl_owner);
}
}
EXPORT_SYMBOL(locks_copy_conflock);
@@ -592,11 +587,15 @@ posix_owner_key(struct file_lock *fl)
static void locks_insert_global_blocked(struct file_lock *waiter)
{
+ lockdep_assert_held(&blocked_lock_lock);
+
hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
}
static void locks_delete_global_blocked(struct file_lock *waiter)
{
+ lockdep_assert_held(&blocked_lock_lock);
+
hash_del(&waiter->fl_link);
}
@@ -730,7 +729,7 @@ static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *s
/* POSIX locks owned by the same process do not conflict with
* each other.
*/
- if (!IS_POSIX(sys_fl) || posix_same_owner(caller_fl, sys_fl))
+ if (posix_same_owner(caller_fl, sys_fl))
return (0);
/* Check whether they overlap */
@@ -748,7 +747,7 @@ static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *s
/* FLOCK locks referring to the same filp do not conflict with
* each other.
*/
- if (!IS_FLOCK(sys_fl) || (caller_fl->fl_file == sys_fl->fl_file))
+ if (caller_fl->fl_file == sys_fl->fl_file)
return (0);
if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND))
return 0;
@@ -838,6 +837,8 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
{
int i = 0;
+ lockdep_assert_held(&blocked_lock_lock);
+
/*
* This deadlock detector can't reasonably detect deadlocks with
* FL_OFDLCK locks, since they aren't owned by a process, per-se.
@@ -861,19 +862,21 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
* whether or not a lock was successfully freed by testing the return
* value for -ENOENT.
*/
-static int flock_lock_file(struct file *filp, struct file_lock *request)
+static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
struct file_lock *new_fl = NULL;
struct file_lock *fl;
struct file_lock_context *ctx;
- struct inode *inode = file_inode(filp);
int error = 0;
bool found = false;
LIST_HEAD(dispose);
- ctx = locks_get_lock_context(inode);
- if (!ctx)
- return -ENOMEM;
+ ctx = locks_get_lock_context(inode, request->fl_type);
+ if (!ctx) {
+ if (request->fl_type != F_UNLCK)
+ return -ENOMEM;
+ return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
+ }
if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
@@ -886,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
goto find_conflict;
list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
- if (filp != fl->fl_file)
+ if (request->fl_file != fl->fl_file)
continue;
if (request->fl_type == fl->fl_type)
goto out;
@@ -939,9 +942,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
bool added = false;
LIST_HEAD(dispose);
- ctx = locks_get_lock_context(inode);
+ ctx = locks_get_lock_context(inode, request->fl_type);
if (!ctx)
- return -ENOMEM;
+ return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
/*
* We may need two file_lock structures for this operation,
@@ -964,8 +967,6 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
*/
if (request->fl_type != F_UNLCK) {
list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
- if (!IS_POSIX(fl))
- continue;
if (!posix_locks_conflict(request, fl))
continue;
if (conflock)
@@ -1162,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl,
EXPORT_SYMBOL(posix_lock_file);
/**
- * posix_lock_file_wait - Apply a POSIX-style lock to a file
- * @filp: The file to apply the lock to
+ * posix_lock_inode_wait - Apply a POSIX-style lock to a file
+ * @inode: inode of file to which lock request should be applied
* @fl: The lock to be applied
*
- * Add a POSIX style lock to a file.
- * We merge adjacent & overlapping locks whenever possible.
- * POSIX locks are sorted by owner task, then by starting address
+ * Variant of posix_lock_file_wait that does not take a filp, and so can be
+ * used after the filp has already been torn down.
*/
-int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
+int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep ();
for (;;) {
- error = posix_lock_file(filp, fl, NULL);
+ error = __posix_lock_file(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1187,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(posix_lock_file_wait);
+EXPORT_SYMBOL(posix_lock_inode_wait);
/**
* locks_mandatory_locked - Check for an active lock
@@ -1605,7 +1605,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
lease = *flp;
trace_generic_add_lease(inode, lease);
- ctx = locks_get_lock_context(inode);
+ /* Note that arg is never F_UNLCK here */
+ ctx = locks_get_lock_context(inode, arg);
if (!ctx)
return -ENOMEM;
@@ -1848,18 +1849,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
}
/**
- * flock_lock_file_wait - Apply a FLOCK-style lock to a file
- * @filp: The file to apply the lock to
+ * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
+ * @inode: inode of the file to apply to
* @fl: The lock to be applied
*
- * Add a FLOCK style lock to a file.
+ * Apply a FLOCK style lock request to an inode.
*/
-int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
+int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep();
for (;;) {
- error = flock_lock_file(filp, fl);
+ error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1871,8 +1872,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
}
return error;
}
-
-EXPORT_SYMBOL(flock_lock_file_wait);
+EXPORT_SYMBOL(flock_lock_inode_wait);
/**
* sys_flock: - flock() system call.
@@ -2398,7 +2398,8 @@ locks_remove_flock(struct file *filp)
.fl_type = F_UNLCK,
.fl_end = OFFSET_MAX,
};
- struct file_lock_context *flctx = file_inode(filp)->i_flctx;
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *flctx = inode->i_flctx;
if (list_empty(&flctx->flc_flock))
return;
@@ -2406,7 +2407,7 @@ locks_remove_flock(struct file *filp)
if (filp->f_op->flock)
filp->f_op->flock(filp, F_SETLKW, &fl);
else
- flock_lock_file(filp, &fl);
+ flock_lock_inode(inode, &fl);
if (fl.fl_ops && fl.fl_ops->fl_release_private)
fl.fl_ops->fl_release_private(&fl);
@@ -2555,15 +2556,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
: (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
}
if (inode) {
-#ifdef WE_CAN_BREAK_LSLK_NOW
- seq_printf(f, "%d %s:%ld ", fl_pid,
- inode->i_sb->s_id, inode->i_ino);
-#else
- /* userspace relies on this representation of dev_t ;-( */
+ /* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
-#endif
} else {
seq_printf(f, "%d <none>:0 ", fl_pid);
}
@@ -2592,6 +2588,44 @@ static int locks_show(struct seq_file *f, void *v)
return 0;
}
+static void __show_fd_locks(struct seq_file *f,
+ struct list_head *head, int *id,
+ struct file *filp, struct files_struct *files)
+{
+ struct file_lock *fl;
+
+ list_for_each_entry(fl, head, fl_list) {
+
+ if (filp != fl->fl_file)
+ continue;
+ if (fl->fl_owner != files &&
+ fl->fl_owner != filp)
+ continue;
+
+ (*id)++;
+ seq_puts(f, "lock:\t");
+ lock_get_status(f, fl, *id, "");
+ }
+}
+
+void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files)
+{
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx;
+ int id = 0;
+
+ ctx = inode->i_flctx;
+ if (!ctx)
+ return;
+
+ spin_lock(&ctx->flc_lock);
+ __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
+ __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
+ __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
+ spin_unlock(&ctx->flc_lock);
+}
+
static void *locks_start(struct seq_file *f, loff_t *pos)
__acquires(&blocked_lock_lock)
{
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 6bdc347008f5..f9b45d46d4c4 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -213,7 +213,7 @@ static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
static int logfs_unlink(struct inode *dir, struct dentry *dentry)
{
struct logfs_super *super = logfs_super(dir->i_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct logfs_transaction *ta;
struct page *page;
pgoff_t index;
@@ -271,7 +271,7 @@ static inline int logfs_empty_dir(struct inode *dir)
static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (!logfs_empty_dir(inode))
return -ENOTEMPTY;
@@ -537,7 +537,7 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
static int logfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
ihold(inode);
@@ -607,7 +607,7 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
/* 2. write target dd */
mutex_lock(&super->s_dirop_mutex);
logfs_add_transaction(new_dir, ta);
- err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
+ err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry));
if (!err)
err = write_inode(new_dir);
@@ -658,8 +658,8 @@ static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct logfs_super *super = logfs_super(old_dir->i_sb);
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
int isdir = S_ISDIR(old_inode->i_mode);
struct logfs_disk_dentry dd;
struct logfs_transaction *ta;
@@ -719,7 +719,7 @@ out:
static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- if (new_dentry->d_inode)
+ if (d_really_is_positive(new_dentry))
return logfs_rename_target(old_dir, old_dentry,
new_dir, new_dentry);
return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
@@ -779,6 +779,7 @@ fail:
const struct inode_operations logfs_symlink_iops = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
+ .put_link = page_put_link,
};
const struct inode_operations logfs_dir_iops = {
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 8538752df2f6..1a6f0167b16a 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -241,7 +241,7 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err = 0;
err = inode_change_ok(inode, attr);
@@ -271,8 +271,6 @@ const struct file_operations logfs_reg_fops = {
.llseek = generic_file_llseek,
.mmap = generic_file_readonly_mmap,
.open = generic_file_open,
- .read = new_sync_read,
- .write = new_sync_write,
};
const struct address_space_operations logfs_reg_aops = {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index dfaf6fa9b7b5..d19ac258105a 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
@@ -156,7 +151,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
{
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- struct inode * dir = dentry->d_parent->d_inode;
+ struct inode * dir = d_inode(dentry->d_parent);
struct super_block * sb = dir->i_sb;
struct minix_sb_info * sbi = minix_sb(sb);
unsigned long n;
@@ -203,7 +198,7 @@ found:
int minix_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct super_block * sb = dir->i_sb;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index a967de085ac0..94f0eb9a6e2c 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -14,9 +14,7 @@
*/
const struct file_operations minix_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
@@ -25,7 +23,7 @@ const struct file_operations minix_file_operations = {
static int minix_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, attr);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 3f57af196a7d..086cd0a61e80 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
- ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -626,8 +626,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct super_block *sb = dentry->d_sb;
- generic_fillattr(dentry->d_inode, stat);
- if (INODE_VERSION(dentry->d_inode) == MINIX_V1)
+ generic_fillattr(d_inode(dentry), stat);
+ if (INODE_VERSION(d_inode(dentry)) == MINIX_V1)
stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
else
stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 1ebd11854622..01ad81dcacc5 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb)
static inline struct minix_inode_info *minix_i(struct inode *inode)
{
- return list_entry(inode, struct minix_inode_info, vfs_inode);
+ return container_of(inode, struct minix_inode_info, vfs_inode);
}
static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index cd950e2331b6..a795a11e50c7 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -104,7 +104,7 @@ out_fail:
static int minix_link(struct dentry * old_dentry, struct inode * dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
@@ -151,7 +151,7 @@ out_dir:
static int minix_unlink(struct inode * dir, struct dentry *dentry)
{
int err = -ENOENT;
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
struct page * page;
struct minix_dir_entry * de;
@@ -171,7 +171,7 @@ end_unlink:
static int minix_rmdir(struct inode * dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
int err = -ENOTEMPTY;
if (minix_empty_dir(inode)) {
@@ -187,8 +187,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct inode * new_dir, struct dentry *new_dentry)
{
- struct inode * old_inode = old_dentry->d_inode;
- struct inode * new_inode = new_dentry->d_inode;
+ struct inode * old_inode = d_inode(old_dentry);
+ struct inode * new_inode = d_inode(new_dentry);
struct page * dir_page = NULL;
struct minix_dir_entry * dir_de = NULL;
struct page * old_page;
diff --git a/fs/mount.h b/fs/mount.h
index 6a61c2b3e385..14db05d424f7 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -88,6 +88,7 @@ static inline int is_mounted(struct vfsmount *mnt)
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
+extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);
extern void __detach_mounts(struct dentry *dentry);
@@ -117,7 +118,6 @@ static inline void unlock_mount_hash(void)
}
struct proc_mounts {
- struct seq_file m;
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
@@ -126,8 +126,6 @@ struct proc_mounts {
loff_t cached_index;
};
-#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
-
extern const struct seq_operations mounts_op;
extern bool __is_local_mountpoint(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 3e79220babac..ca0244b69de8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,8 @@ alloc_new:
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
+
+ wbc_init_bio(wbc, bio);
}
/*
@@ -612,6 +614,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
+ wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/namei.c b/fs/namei.c
index c83145af4bfc..fbbcf0993312 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,15 +119,14 @@
* PATH_MAX includes the nul terminator --RR.
*/
-#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
+#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
- struct filename *result, *err;
- int len;
- long max;
+ struct filename *result;
char *kname;
+ int len;
result = audit_reusename(filename);
if (result)
@@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty)
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
- result->refcnt = 1;
/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
- kname = (char *)result + sizeof(*result);
+ kname = (char *)result->iname;
result->name = kname;
- result->separate = false;
- max = EMBEDDED_NAME_MAX;
-recopy:
- len = strncpy_from_user(kname, filename, max);
+ len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
if (unlikely(len < 0)) {
- err = ERR_PTR(len);
- goto error;
+ __putname(result);
+ return ERR_PTR(len);
}
/*
@@ -160,43 +155,49 @@ recopy:
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
- if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
+ if (unlikely(len == EMBEDDED_NAME_MAX)) {
+ const size_t size = offsetof(struct filename, iname[1]);
kname = (char *)result;
- result = kzalloc(sizeof(*result), GFP_KERNEL);
- if (!result) {
- err = ERR_PTR(-ENOMEM);
- result = (struct filename *)kname;
- goto error;
+ /*
+ * size is chosen that way we to guarantee that
+ * result->iname[0] is within the same object and that
+ * kname can't be equal to result->iname, no matter what.
+ */
+ result = kzalloc(size, GFP_KERNEL);
+ if (unlikely(!result)) {
+ __putname(kname);
+ return ERR_PTR(-ENOMEM);
}
result->name = kname;
- result->separate = true;
- result->refcnt = 1;
- max = PATH_MAX;
- goto recopy;
+ len = strncpy_from_user(kname, filename, PATH_MAX);
+ if (unlikely(len < 0)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(len);
+ }
+ if (unlikely(len == PATH_MAX)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
}
+ result->refcnt = 1;
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
*empty = 1;
- err = ERR_PTR(-ENOENT);
- if (!(flags & LOOKUP_EMPTY))
- goto error;
+ if (!(flags & LOOKUP_EMPTY)) {
+ putname(result);
+ return ERR_PTR(-ENOENT);
+ }
}
- err = ERR_PTR(-ENAMETOOLONG);
- if (unlikely(len >= PATH_MAX))
- goto error;
-
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
return result;
-
-error:
- putname(result);
- return err;
}
struct filename *
@@ -216,8 +217,7 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
if (len <= EMBEDDED_NAME_MAX) {
- result->name = (char *)(result) + sizeof(*result);
- result->separate = false;
+ result->name = (char *)result->iname;
} else if (len <= PATH_MAX) {
struct filename *tmp;
@@ -227,7 +227,6 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
}
tmp->name = (char *)result;
- tmp->separate = true;
result = tmp;
} else {
__putname(result);
@@ -249,7 +248,7 @@ void putname(struct filename *name)
if (--name->refcnt > 0)
return;
- if (name->separate) {
+ if (name->name != name->iname) {
__putname(name->name);
kfree(name);
} else
@@ -493,6 +492,7 @@ void path_put(const struct path *path)
}
EXPORT_SYMBOL(path_put);
+#define EMBEDDED_LEVELS 2
struct nameidata {
struct path path;
struct qstr last;
@@ -502,10 +502,139 @@ struct nameidata {
unsigned seq, m_seq;
int last_type;
unsigned depth;
- struct file *base;
- char *saved_names[MAX_NESTED_LINKS + 1];
+ int total_link_count;
+ struct saved {
+ struct path link;
+ void *cookie;
+ const char *name;
+ struct inode *inode;
+ unsigned seq;
+ } *stack, internal[EMBEDDED_LEVELS];
+ struct filename *name;
+ struct nameidata *saved;
+ unsigned root_seq;
+ int dfd;
};
+static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+{
+ struct nameidata *old = current->nameidata;
+ p->stack = p->internal;
+ p->dfd = dfd;
+ p->name = name;
+ p->total_link_count = old ? old->total_link_count : 0;
+ p->saved = old;
+ current->nameidata = p;
+}
+
+static void restore_nameidata(void)
+{
+ struct nameidata *now = current->nameidata, *old = now->saved;
+
+ current->nameidata = old;
+ if (old)
+ old->total_link_count = now->total_link_count;
+ if (now->stack != now->internal) {
+ kfree(now->stack);
+ now->stack = now->internal;
+ }
+}
+
+static int __nd_alloc_stack(struct nameidata *nd)
+{
+ struct saved *p;
+
+ if (nd->flags & LOOKUP_RCU) {
+ p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+ GFP_ATOMIC);
+ if (unlikely(!p))
+ return -ECHILD;
+ } else {
+ p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
+ GFP_KERNEL);
+ if (unlikely(!p))
+ return -ENOMEM;
+ }
+ memcpy(p, nd->internal, sizeof(nd->internal));
+ nd->stack = p;
+ return 0;
+}
+
+static inline int nd_alloc_stack(struct nameidata *nd)
+{
+ if (likely(nd->depth != EMBEDDED_LEVELS))
+ return 0;
+ if (likely(nd->stack != nd->internal))
+ return 0;
+ return __nd_alloc_stack(nd);
+}
+
+static void drop_links(struct nameidata *nd)
+{
+ int i = nd->depth;
+ while (i--) {
+ struct saved *last = nd->stack + i;
+ struct inode *inode = last->inode;
+ if (last->cookie && inode->i_op->put_link) {
+ inode->i_op->put_link(inode, last->cookie);
+ last->cookie = NULL;
+ }
+ }
+}
+
+static void terminate_walk(struct nameidata *nd)
+{
+ drop_links(nd);
+ if (!(nd->flags & LOOKUP_RCU)) {
+ int i;
+ path_put(&nd->path);
+ for (i = 0; i < nd->depth; i++)
+ path_put(&nd->stack[i].link);
+ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+ path_put(&nd->root);
+ nd->root.mnt = NULL;
+ }
+ } else {
+ nd->flags &= ~LOOKUP_RCU;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ }
+ nd->depth = 0;
+}
+
+/* path_put is needed afterwards regardless of success or failure */
+static bool legitimize_path(struct nameidata *nd,
+ struct path *path, unsigned seq)
+{
+ int res = __legitimize_mnt(path->mnt, nd->m_seq);
+ if (unlikely(res)) {
+ if (res > 0)
+ path->mnt = NULL;
+ path->dentry = NULL;
+ return false;
+ }
+ if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
+ path->dentry = NULL;
+ return false;
+ }
+ return !read_seqcount_retry(&path->dentry->d_seq, seq);
+}
+
+static bool legitimize_links(struct nameidata *nd)
+{
+ int i;
+ for (i = 0; i < nd->depth; i++) {
+ struct saved *last = nd->stack + i;
+ if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
+ drop_links(nd);
+ nd->depth = i + 1;
+ return false;
+ }
+ }
+ return true;
+}
+
/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
@@ -521,35 +650,28 @@ struct nameidata {
* unlazy_walk - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
* @dentry: child of nd->path.dentry or NULL
+ * @seq: seq number to check dentry against
* Returns: 0 on success, -ECHILD on failure
*
* unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
* for ref-walk mode. @dentry must be a path found by a do_lookup call on
* @nd or NULL. Must be called from rcu-walk context.
+ * Nothing should touch nameidata between unlazy_walk() failure and
+ * terminate_walk().
*/
-static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
- struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
BUG_ON(!(nd->flags & LOOKUP_RCU));
- /*
- * After legitimizing the bastards, terminate_walk()
- * will do the right thing for non-RCU mode, and all our
- * subsequent exit cases should rcu_read_unlock()
- * before returning. Do vfsmount first; if dentry
- * can't be legitimized, just set nd->path.dentry to NULL
- * and rely on dput(NULL) being a no-op.
- */
- if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
- return -ECHILD;
nd->flags &= ~LOOKUP_RCU;
-
- if (!lockref_get_not_dead(&parent->d_lockref)) {
- nd->path.dentry = NULL;
- goto out;
- }
+ if (unlikely(!legitimize_links(nd)))
+ goto out2;
+ if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
+ goto out2;
+ if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
+ goto out1;
/*
* For a negative lookup, the lookup sequence point is the parents
@@ -569,7 +691,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
} else {
if (!lockref_get_not_dead(&dentry->d_lockref))
goto out;
- if (read_seqcount_retry(&dentry->d_seq, nd->seq))
+ if (read_seqcount_retry(&dentry->d_seq, seq))
goto drop_dentry;
}
@@ -578,22 +700,24 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
* still valid and get it if required.
*/
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- spin_lock(&fs->lock);
- if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry)
- goto unlock_and_drop_dentry;
- path_get(&nd->root);
- spin_unlock(&fs->lock);
+ if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
+ rcu_read_unlock();
+ dput(dentry);
+ return -ECHILD;
+ }
}
rcu_read_unlock();
return 0;
-unlock_and_drop_dentry:
- spin_unlock(&fs->lock);
drop_dentry:
rcu_read_unlock();
dput(dentry);
goto drop_root_mnt;
+out2:
+ nd->path.mnt = NULL;
+out1:
+ nd->path.dentry = NULL;
out:
rcu_read_unlock();
drop_root_mnt:
@@ -602,6 +726,24 @@ drop_root_mnt:
return -ECHILD;
}
+static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
+{
+ if (unlikely(!legitimize_path(nd, link, seq))) {
+ drop_links(nd);
+ nd->depth = 0;
+ nd->flags &= ~LOOKUP_RCU;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) {
+ return 0;
+ }
+ path_put(link);
+ return -ECHILD;
+}
+
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
return dentry->d_op->d_revalidate(dentry, flags);
@@ -623,26 +765,10 @@ static int complete_walk(struct nameidata *nd)
int status;
if (nd->flags & LOOKUP_RCU) {
- nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
-
- if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
- rcu_read_unlock();
- return -ECHILD;
- }
- if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
- rcu_read_unlock();
- mntput(nd->path.mnt);
- return -ECHILD;
- }
- if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
- rcu_read_unlock();
- dput(dentry);
- mntput(nd->path.mnt);
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
return -ECHILD;
- }
- rcu_read_unlock();
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -658,28 +784,24 @@ static int complete_walk(struct nameidata *nd)
if (!status)
status = -ESTALE;
- path_put(&nd->path);
return status;
}
-static __always_inline void set_root(struct nameidata *nd)
+static void set_root(struct nameidata *nd)
{
get_fs_root(current->fs, &nd->root);
}
-static int link_path_walk(const char *, struct nameidata *);
-
-static __always_inline unsigned set_root_rcu(struct nameidata *nd)
+static void set_root_rcu(struct nameidata *nd)
{
struct fs_struct *fs = current->fs;
- unsigned seq, res;
+ unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root;
- res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
- return res;
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -705,8 +827,9 @@ static inline void path_to_nameidata(const struct path *path,
* Helper to directly jump to a known parsed path from ->follow_link,
* caller must have taken a reference to path beforehand.
*/
-void nd_jump_link(struct nameidata *nd, struct path *path)
+void nd_jump_link(struct path *path)
{
+ struct nameidata *nd = current->nameidata;
path_put(&nd->path);
nd->path = *path;
@@ -714,24 +837,14 @@ void nd_jump_link(struct nameidata *nd, struct path *path)
nd->flags |= LOOKUP_JUMPED;
}
-void nd_set_link(struct nameidata *nd, char *path)
-{
- nd->saved_names[nd->depth] = path;
-}
-EXPORT_SYMBOL(nd_set_link);
-
-char *nd_get_link(struct nameidata *nd)
-{
- return nd->saved_names[nd->depth];
-}
-EXPORT_SYMBOL(nd_get_link);
-
-static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+static inline void put_link(struct nameidata *nd)
{
- struct inode *inode = link->dentry->d_inode;
- if (inode->i_op->put_link)
- inode->i_op->put_link(link->dentry, nd, cookie);
- path_put(link);
+ struct saved *last = nd->stack + --nd->depth;
+ struct inode *inode = last->inode;
+ if (last->cookie && inode->i_op->put_link)
+ inode->i_op->put_link(inode, last->cookie);
+ if (!(nd->flags & LOOKUP_RCU))
+ path_put(&last->link);
}
int sysctl_protected_symlinks __read_mostly = 0;
@@ -739,7 +852,6 @@ int sysctl_protected_hardlinks __read_mostly = 0;
/**
* may_follow_link - Check symlink following for unsafe situations
- * @link: The path of the symlink
* @nd: nameidata pathwalk data
*
* In the case of the sysctl_protected_symlinks sysctl being enabled,
@@ -753,7 +865,7 @@ int sysctl_protected_hardlinks __read_mostly = 0;
*
* Returns 0 if following the symlink is allowed, -ve on error.
*/
-static inline int may_follow_link(struct path *link, struct nameidata *nd)
+static inline int may_follow_link(struct nameidata *nd)
{
const struct inode *inode;
const struct inode *parent;
@@ -762,7 +874,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
return 0;
/* Allowed if owner and follower match. */
- inode = link->dentry->d_inode;
+ inode = nd->stack[0].inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
@@ -775,9 +887,10 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd)
if (uid_eq(parent->i_uid, inode->i_uid))
return 0;
- audit_log_link_denied("follow_link", link);
- path_put_conditional(link, nd);
- path_put(&nd->path);
+ if (nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ audit_log_link_denied("follow_link", &nd->stack[0].link);
return -EACCES;
}
@@ -850,82 +963,68 @@ static int may_linkat(struct path *link)
return -EPERM;
}
-static __always_inline int
-follow_link(struct path *link, struct nameidata *nd, void **p)
+static __always_inline
+const char *get_link(struct nameidata *nd)
{
- struct dentry *dentry = link->dentry;
+ struct saved *last = nd->stack + nd->depth - 1;
+ struct dentry *dentry = last->link.dentry;
+ struct inode *inode = last->inode;
int error;
- char *s;
+ const char *res;
- BUG_ON(nd->flags & LOOKUP_RCU);
-
- if (link->mnt == nd->path.mnt)
- mntget(link->mnt);
-
- error = -ELOOP;
- if (unlikely(current->total_link_count >= 40))
- goto out_put_nd_path;
-
- cond_resched();
- current->total_link_count++;
-
- touch_atime(link);
- nd_set_link(nd, NULL);
+ if (!(nd->flags & LOOKUP_RCU)) {
+ touch_atime(&last->link);
+ cond_resched();
+ } else if (atime_needs_update(&last->link, inode)) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
+ touch_atime(&last->link);
+ }
- error = security_inode_follow_link(link->dentry, nd);
- if (error)
- goto out_put_nd_path;
+ error = security_inode_follow_link(dentry, inode,
+ nd->flags & LOOKUP_RCU);
+ if (unlikely(error))
+ return ERR_PTR(error);
nd->last_type = LAST_BIND;
- *p = dentry->d_inode->i_op->follow_link(dentry, nd);
- error = PTR_ERR(*p);
- if (IS_ERR(*p))
- goto out_put_nd_path;
-
- error = 0;
- s = nd_get_link(nd);
- if (s) {
- if (unlikely(IS_ERR(s))) {
- path_put(&nd->path);
- put_link(nd, link, *p);
- return PTR_ERR(s);
+ res = inode->i_link;
+ if (!res) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
}
- if (*s == '/') {
+ res = inode->i_op->follow_link(dentry, &last->cookie);
+ if (IS_ERR_OR_NULL(res)) {
+ last->cookie = NULL;
+ return res;
+ }
+ }
+ if (*res == '/') {
+ if (nd->flags & LOOKUP_RCU) {
+ struct dentry *d;
+ if (!nd->root.mnt)
+ set_root_rcu(nd);
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ nd->inode = d->d_inode;
+ nd->seq = nd->root_seq;
+ if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ return ERR_PTR(-ECHILD);
+ } else {
if (!nd->root.mnt)
set_root(nd);
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->root);
- nd->flags |= LOOKUP_JUMPED;
+ nd->inode = nd->path.dentry->d_inode;
}
- nd->inode = nd->path.dentry->d_inode;
- error = link_path_walk(s, nd);
- if (unlikely(error))
- put_link(nd, link, *p);
+ nd->flags |= LOOKUP_JUMPED;
+ while (unlikely(*++res == '/'))
+ ;
}
-
- return error;
-
-out_put_nd_path:
- *p = NULL;
- path_put(&nd->path);
- path_put(link);
- return error;
-}
-
-static int follow_up_rcu(struct path *path)
-{
- struct mount *mnt = real_mount(path->mnt);
- struct mount *parent;
- struct dentry *mountpoint;
-
- parent = mnt->mnt_parent;
- if (&parent->mnt == path->mnt)
- return 0;
- mountpoint = mnt->mnt_mountpoint;
- path->dentry = mountpoint;
- path->mnt = &parent->mnt;
- return 1;
+ if (!*res)
+ res = NULL;
+ return res;
}
/*
@@ -966,7 +1065,7 @@ EXPORT_SYMBOL(follow_up);
* - return -EISDIR to tell follow_managed() to stop and return the path we
* were called with.
*/
-static int follow_automount(struct path *path, unsigned flags,
+static int follow_automount(struct path *path, struct nameidata *nd,
bool *need_mntput)
{
struct vfsmount *mnt;
@@ -986,13 +1085,13 @@ static int follow_automount(struct path *path, unsigned flags,
* as being automount points. These will need the attentions
* of the daemon to instantiate them before they can be used.
*/
- if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
- LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+ LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
path->dentry->d_inode)
return -EISDIR;
- current->total_link_count++;
- if (current->total_link_count >= 40)
+ nd->total_link_count++;
+ if (nd->total_link_count >= 40)
return -ELOOP;
mnt = path->dentry->d_op->d_automount(path);
@@ -1006,7 +1105,7 @@ static int follow_automount(struct path *path, unsigned flags,
* the path being looked up; if it wasn't then the remainder of
* the path is inaccessible and we should say so.
*/
- if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
+ if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
return -EREMOTE;
return PTR_ERR(mnt);
}
@@ -1046,7 +1145,7 @@ static int follow_automount(struct path *path, unsigned flags,
*
* Serialization is taken care of in namespace.c
*/
-static int follow_managed(struct path *path, unsigned flags)
+static int follow_managed(struct path *path, struct nameidata *nd)
{
struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
unsigned managed;
@@ -1090,7 +1189,7 @@ static int follow_managed(struct path *path, unsigned flags)
/* Handle an automount point */
if (managed & DCACHE_NEED_AUTOMOUNT) {
- ret = follow_automount(path, flags, &need_mntput);
+ ret = follow_automount(path, nd, &need_mntput);
if (ret < 0)
break;
continue;
@@ -1104,7 +1203,11 @@ static int follow_managed(struct path *path, unsigned flags)
mntput(path->mnt);
if (ret == -EISDIR)
ret = 0;
- return ret < 0 ? ret : need_mntput;
+ if (need_mntput)
+ nd->flags |= LOOKUP_JUMPED;
+ if (unlikely(ret < 0))
+ path_put_conditional(path, nd);
+ return ret;
}
int follow_down_one(struct path *path)
@@ -1134,7 +1237,7 @@ static inline int managed_dentry_rcu(struct dentry *dentry)
* we meet a managed dentry that would need blocking.
*/
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
- struct inode **inode)
+ struct inode **inode, unsigned *seqp)
{
for (;;) {
struct mount *mounted;
@@ -1161,7 +1264,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
path->mnt = &mounted->mnt;
path->dentry = mounted->mnt.mnt_root;
nd->flags |= LOOKUP_JUMPED;
- nd->seq = read_seqcount_begin(&path->dentry->d_seq);
+ *seqp = read_seqcount_begin(&path->dentry->d_seq);
/*
* Update the inode too. We don't need to re-check the
* dentry sequence number here after this d_inode read,
@@ -1180,10 +1283,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
set_root_rcu(nd);
while (1) {
- if (nd->path.dentry == nd->root.dentry &&
- nd->path.mnt == nd->root.mnt) {
+ if (path_equal(&nd->path, &nd->root))
break;
- }
if (nd->path.dentry != nd->path.mnt->mnt_root) {
struct dentry *old = nd->path.dentry;
struct dentry *parent = old->d_parent;
@@ -1191,38 +1292,42 @@ static int follow_dotdot_rcu(struct nameidata *nd)
inode = parent->d_inode;
seq = read_seqcount_begin(&parent->d_seq);
- if (read_seqcount_retry(&old->d_seq, nd->seq))
- goto failed;
+ if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+ return -ECHILD;
nd->path.dentry = parent;
nd->seq = seq;
break;
+ } else {
+ struct mount *mnt = real_mount(nd->path.mnt);
+ struct mount *mparent = mnt->mnt_parent;
+ struct dentry *mountpoint = mnt->mnt_mountpoint;
+ struct inode *inode2 = mountpoint->d_inode;
+ unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
+ if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ return -ECHILD;
+ if (&mparent->mnt == nd->path.mnt)
+ break;
+ /* we know that mountpoint was pinned */
+ nd->path.dentry = mountpoint;
+ nd->path.mnt = &mparent->mnt;
+ inode = inode2;
+ nd->seq = seq;
}
- if (!follow_up_rcu(&nd->path))
- break;
- inode = nd->path.dentry->d_inode;
- nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
}
- while (d_mountpoint(nd->path.dentry)) {
+ while (unlikely(d_mountpoint(nd->path.dentry))) {
struct mount *mounted;
mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+ if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+ return -ECHILD;
if (!mounted)
break;
nd->path.mnt = &mounted->mnt;
nd->path.dentry = mounted->mnt.mnt_root;
inode = nd->path.dentry->d_inode;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- if (read_seqretry(&mount_lock, nd->m_seq))
- goto failed;
}
nd->inode = inode;
return 0;
-
-failed:
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- return -ECHILD;
}
/*
@@ -1401,7 +1506,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
* It _is_ time-critical.
*/
static int lookup_fast(struct nameidata *nd,
- struct path *path, struct inode **inode)
+ struct path *path, struct inode **inode,
+ unsigned *seqp)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
@@ -1416,6 +1522,7 @@ static int lookup_fast(struct nameidata *nd,
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
+ bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (!dentry)
goto unlazy;
@@ -1424,9 +1531,12 @@ static int lookup_fast(struct nameidata *nd,
* This sequence count validates that the inode matches
* the dentry name information from lookup.
*/
- *inode = dentry->d_inode;
+ *inode = d_backing_inode(dentry);
+ negative = d_is_negative(dentry);
if (read_seqcount_retry(&dentry->d_seq, seq))
return -ECHILD;
+ if (negative)
+ return -ENOENT;
/*
* This sequence count validates that the parent had no
@@ -1437,8 +1547,8 @@ static int lookup_fast(struct nameidata *nd,
*/
if (__read_seqcount_retry(&parent->d_seq, nd->seq))
return -ECHILD;
- nd->seq = seq;
+ *seqp = seq;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
@@ -1449,10 +1559,10 @@ static int lookup_fast(struct nameidata *nd,
}
path->mnt = mnt;
path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode)))
+ if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
return 0;
unlazy:
- if (unlazy_walk(nd, dentry))
+ if (unlazy_walk(nd, dentry, seq))
return -ECHILD;
} else {
dentry = __d_lookup(parent, &nd->last);
@@ -1473,17 +1583,16 @@ unlazy:
goto need_lookup;
}
+ if (unlikely(d_is_negative(dentry))) {
+ dput(dentry);
+ return -ENOENT;
+ }
path->mnt = mnt;
path->dentry = dentry;
- err = follow_managed(path, nd->flags);
- if (unlikely(err < 0)) {
- path_put_conditional(path, nd);
- return err;
- }
- if (err)
- nd->flags |= LOOKUP_JUMPED;
- *inode = path->dentry->d_inode;
- return 0;
+ err = follow_managed(path, nd);
+ if (likely(!err))
+ *inode = d_backing_inode(path->dentry);
+ return err;
need_lookup:
return 1;
@@ -1493,7 +1602,6 @@ need_lookup:
static int lookup_slow(struct nameidata *nd, struct path *path)
{
struct dentry *dentry, *parent;
- int err;
parent = nd->path.dentry;
BUG_ON(nd->inode != parent->d_inode);
@@ -1505,14 +1613,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
return PTR_ERR(dentry);
path->mnt = nd->path.mnt;
path->dentry = dentry;
- err = follow_managed(path, nd->flags);
- if (unlikely(err < 0)) {
- path_put_conditional(path, nd);
- return err;
- }
- if (err)
- nd->flags |= LOOKUP_JUMPED;
- return 0;
+ return follow_managed(path, nd);
}
static inline int may_lookup(struct nameidata *nd)
@@ -1521,7 +1622,7 @@ static inline int may_lookup(struct nameidata *nd)
int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
- if (unlazy_walk(nd, NULL))
+ if (unlazy_walk(nd, NULL, 0))
return -ECHILD;
}
return inode_permission(nd->inode, MAY_EXEC);
@@ -1531,24 +1632,45 @@ static inline int handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
if (nd->flags & LOOKUP_RCU) {
- if (follow_dotdot_rcu(nd))
- return -ECHILD;
+ return follow_dotdot_rcu(nd);
} else
follow_dotdot(nd);
}
return 0;
}
-static void terminate_walk(struct nameidata *nd)
+static int pick_link(struct nameidata *nd, struct path *link,
+ struct inode *inode, unsigned seq)
{
+ int error;
+ struct saved *last;
+ if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
+ path_to_nameidata(link, nd);
+ return -ELOOP;
+ }
if (!(nd->flags & LOOKUP_RCU)) {
- path_put(&nd->path);
- } else {
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
+ if (link->mnt == nd->path.mnt)
+ mntget(link->mnt);
}
+ error = nd_alloc_stack(nd);
+ if (unlikely(error)) {
+ if (error == -ECHILD) {
+ if (unlikely(unlazy_link(nd, link, seq)))
+ return -ECHILD;
+ error = nd_alloc_stack(nd);
+ }
+ if (error) {
+ path_put(link);
+ return error;
+ }
+ }
+
+ last = nd->stack + nd->depth++;
+ last->link = *link;
+ last->cookie = NULL;
+ last->inode = inode;
+ last->seq = seq;
+ return 1;
}
/*
@@ -1557,97 +1679,68 @@ static void terminate_walk(struct nameidata *nd)
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
-static inline int should_follow_link(struct dentry *dentry, int follow)
+static inline int should_follow_link(struct nameidata *nd, struct path *link,
+ int follow,
+ struct inode *inode, unsigned seq)
{
- return unlikely(d_is_symlink(dentry)) ? follow : 0;
+ if (likely(!d_is_symlink(link->dentry)))
+ return 0;
+ if (!follow)
+ return 0;
+ return pick_link(nd, link, inode, seq);
}
-static inline int walk_component(struct nameidata *nd, struct path *path,
- int follow)
+enum {WALK_GET = 1, WALK_PUT = 2};
+
+static int walk_component(struct nameidata *nd, int flags)
{
+ struct path path;
struct inode *inode;
+ unsigned seq;
int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
- if (unlikely(nd->last_type != LAST_NORM))
- return handle_dots(nd, nd->last_type);
- err = lookup_fast(nd, path, &inode);
+ if (unlikely(nd->last_type != LAST_NORM)) {
+ err = handle_dots(nd, nd->last_type);
+ if (flags & WALK_PUT)
+ put_link(nd);
+ return err;
+ }
+ err = lookup_fast(nd, &path, &inode, &seq);
if (unlikely(err)) {
if (err < 0)
- goto out_err;
+ return err;
- err = lookup_slow(nd, path);
+ err = lookup_slow(nd, &path);
if (err < 0)
- goto out_err;
+ return err;
- inode = path->dentry->d_inode;
+ inode = d_backing_inode(path.dentry);
+ seq = 0; /* we are already out of RCU mode */
+ err = -ENOENT;
+ if (d_is_negative(path.dentry))
+ goto out_path_put;
}
- err = -ENOENT;
- if (!inode || d_is_negative(path->dentry))
- goto out_path_put;
- if (should_follow_link(path->dentry, follow)) {
- if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, path->dentry))) {
- err = -ECHILD;
- goto out_err;
- }
- }
- BUG_ON(inode != path->dentry->d_inode);
- return 1;
- }
- path_to_nameidata(path, nd);
+ if (flags & WALK_PUT)
+ put_link(nd);
+ err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq);
+ if (unlikely(err))
+ return err;
+ path_to_nameidata(&path, nd);
nd->inode = inode;
+ nd->seq = seq;
return 0;
out_path_put:
- path_to_nameidata(path, nd);
-out_err:
- terminate_walk(nd);
+ path_to_nameidata(&path, nd);
return err;
}
/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups.
- */
-static inline int nested_symlink(struct path *path, struct nameidata *nd)
-{
- int res;
-
- if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
- path_put_conditional(path, nd);
- path_put(&nd->path);
- return -ELOOP;
- }
- BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-
- nd->depth++;
- current->link_count++;
-
- do {
- struct path link = *path;
- void *cookie;
-
- res = follow_link(&link, nd, &cookie);
- if (res)
- break;
- res = walk_component(nd, path, LOOKUP_FOLLOW);
- put_link(nd, &link, cookie);
- } while (res > 0);
-
- current->link_count--;
- nd->depth--;
- return res;
-}
-
-/*
* We can do the critical dentry name comparison and hashing
* operations one word at a time, but we are limited to:
*
@@ -1773,9 +1866,8 @@ static inline u64 hash_name(const char *name)
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
- struct path next;
int err;
-
+
while (*name=='/')
name++;
if (!*name)
@@ -1788,7 +1880,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
err = may_lookup(nd);
if (err)
- break;
+ return err;
hash_len = hash_name(name);
@@ -1810,7 +1902,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
struct qstr this = { { .hash_len = hash_len }, .name = name };
err = parent->d_op->d_hash(parent, &this);
if (err < 0)
- break;
+ return err;
hash_len = this.hash_len;
name = this.name;
}
@@ -1822,7 +1914,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
name += hashlen_len(hash_len);
if (!*name)
- return 0;
+ goto OK;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
@@ -1830,71 +1922,94 @@ static int link_path_walk(const char *name, struct nameidata *nd)
do {
name++;
} while (unlikely(*name == '/'));
- if (!*name)
- return 0;
-
- err = walk_component(nd, &next, LOOKUP_FOLLOW);
+ if (unlikely(!*name)) {
+OK:
+ /* pathname body, done */
+ if (!nd->depth)
+ return 0;
+ name = nd->stack[nd->depth - 1].name;
+ /* trailing symlink, done */
+ if (!name)
+ return 0;
+ /* last component of nested symlink */
+ err = walk_component(nd, WALK_GET | WALK_PUT);
+ } else {
+ err = walk_component(nd, WALK_GET);
+ }
if (err < 0)
return err;
if (err) {
- err = nested_symlink(&next, nd);
- if (err)
- return err;
+ const char *s = get_link(nd);
+
+ if (unlikely(IS_ERR(s)))
+ return PTR_ERR(s);
+ err = 0;
+ if (unlikely(!s)) {
+ /* jumped */
+ put_link(nd);
+ } else {
+ nd->stack[nd->depth - 1].name = name;
+ name = s;
+ continue;
+ }
}
- if (!d_can_lookup(nd->path.dentry)) {
- err = -ENOTDIR;
- break;
+ if (unlikely(!d_can_lookup(nd->path.dentry))) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlazy_walk(nd, NULL, 0))
+ return -ECHILD;
+ }
+ return -ENOTDIR;
}
}
- terminate_walk(nd);
- return err;
}
-static int path_init(int dfd, const char *name, unsigned int flags,
- struct nameidata *nd)
+static const char *path_init(struct nameidata *nd, unsigned flags)
{
int retval = 0;
+ const char *s = nd->name->name;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
- nd->base = NULL;
+ nd->total_link_count = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(root))
- return -ENOTDIR;
+ return ERR_PTR(-ENOTDIR);
retval = inode_permission(inode, MAY_EXEC);
if (retval)
- return retval;
+ return ERR_PTR(retval);
}
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ nd->root_seq = nd->seq;
nd->m_seq = read_seqbegin(&mount_lock);
} else {
path_get(&nd->path);
}
- goto done;
+ return s;
}
nd->root.mnt = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
- if (*name=='/') {
+ if (*s == '/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
- nd->seq = set_root_rcu(nd);
+ set_root_rcu(nd);
+ nd->seq = nd->root_seq;
} else {
set_root(nd);
path_get(&nd->root);
}
nd->path = nd->root;
- } else if (dfd == AT_FDCWD) {
+ } else if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
@@ -1911,188 +2026,205 @@ static int path_init(int dfd, const char *name, unsigned int flags,
}
} else {
/* Caller must check execute permissions on the starting path component */
- struct fd f = fdget_raw(dfd);
+ struct fd f = fdget_raw(nd->dfd);
struct dentry *dentry;
if (!f.file)
- return -EBADF;
+ return ERR_PTR(-EBADF);
dentry = f.file->f_path.dentry;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(dentry)) {
fdput(f);
- return -ENOTDIR;
+ return ERR_PTR(-ENOTDIR);
}
}
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
- if (f.flags & FDPUT_FPUT)
- nd->base = f.file;
- nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
rcu_read_lock();
+ nd->inode = nd->path.dentry->d_inode;
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
- fdput(f);
+ nd->inode = nd->path.dentry->d_inode;
}
+ fdput(f);
+ return s;
}
nd->inode = nd->path.dentry->d_inode;
if (!(flags & LOOKUP_RCU))
- goto done;
+ return s;
if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
- goto done;
+ return s;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
rcu_read_unlock();
- return -ECHILD;
-done:
- current->total_link_count = 0;
- return link_path_walk(name, nd);
+ return ERR_PTR(-ECHILD);
}
-static void path_cleanup(struct nameidata *nd)
+static const char *trailing_symlink(struct nameidata *nd)
{
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- path_put(&nd->root);
- nd->root.mnt = NULL;
- }
- if (unlikely(nd->base))
- fput(nd->base);
+ const char *s;
+ int error = may_follow_link(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
+ nd->flags |= LOOKUP_PARENT;
+ nd->stack[0].name = NULL;
+ s = get_link(nd);
+ return s ? s : "";
}
-static inline int lookup_last(struct nameidata *nd, struct path *path)
+static inline int lookup_last(struct nameidata *nd)
{
if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
nd->flags &= ~LOOKUP_PARENT;
- return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW);
+ return walk_component(nd,
+ nd->flags & LOOKUP_FOLLOW
+ ? nd->depth
+ ? WALK_PUT | WALK_GET
+ : WALK_GET
+ : 0);
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const char *name,
- unsigned int flags, struct nameidata *nd)
+static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
- struct path path;
+ const char *s = path_init(nd, flags);
int err;
- /*
- * Path walking is largely split up into 2 different synchronisation
- * schemes, rcu-walk and ref-walk (explained in
- * Documentation/filesystems/path-lookup.txt). These share much of the
- * path walk code, but some things particularly setup, cleanup, and
- * following mounts are sufficiently divergent that functions are
- * duplicated. Typically there is a function foo(), and its RCU
- * analogue, foo_rcu().
- *
- * -ECHILD is the error number of choice (just to avoid clashes) that
- * is returned if some aspect of an rcu-walk fails. Such an error must
- * be handled by restarting a traditional ref-walk (which will always
- * be able to complete).
- */
- err = path_init(dfd, name, flags, nd);
- if (!err && !(flags & LOOKUP_PARENT)) {
- err = lookup_last(nd, &path);
- while (err > 0) {
- void *cookie;
- struct path link = path;
- err = may_follow_link(&link, nd);
- if (unlikely(err))
- break;
- nd->flags |= LOOKUP_PARENT;
- err = follow_link(&link, nd, &cookie);
- if (err)
- break;
- err = lookup_last(nd, &path);
- put_link(nd, &link, cookie);
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+ while (!(err = link_path_walk(s, nd))
+ && ((err = lookup_last(nd)) > 0)) {
+ s = trailing_symlink(nd);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ break;
}
}
-
if (!err)
err = complete_walk(nd);
- if (!err && nd->flags & LOOKUP_DIRECTORY) {
- if (!d_can_lookup(nd->path.dentry)) {
- path_put(&nd->path);
+ if (!err && nd->flags & LOOKUP_DIRECTORY)
+ if (!d_can_lookup(nd->path.dentry))
err = -ENOTDIR;
- }
+ if (!err) {
+ *path = nd->path;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
}
-
- path_cleanup(nd);
+ terminate_walk(nd);
return err;
}
-static int filename_lookup(int dfd, struct filename *name,
- unsigned int flags, struct nameidata *nd)
+static int filename_lookup(int dfd, struct filename *name, unsigned flags,
+ struct path *path, struct path *root)
{
- int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
+ int retval;
+ struct nameidata nd;
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+ if (unlikely(root)) {
+ nd.root = *root;
+ flags |= LOOKUP_ROOT;
+ }
+ set_nameidata(&nd, dfd, name);
+ retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
if (unlikely(retval == -ECHILD))
- retval = path_lookupat(dfd, name->name, flags, nd);
+ retval = path_lookupat(&nd, flags, path);
if (unlikely(retval == -ESTALE))
- retval = path_lookupat(dfd, name->name,
- flags | LOOKUP_REVAL, nd);
+ retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
if (likely(!retval))
- audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
+ audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
+ restore_nameidata();
+ putname(name);
return retval;
}
-static int do_path_lookup(int dfd, const char *name,
- unsigned int flags, struct nameidata *nd)
+/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int path_parentat(struct nameidata *nd, unsigned flags,
+ struct path *parent)
{
- struct filename *filename = getname_kernel(name);
- int retval = PTR_ERR(filename);
+ const char *s = path_init(nd, flags);
+ int err;
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+ err = link_path_walk(s, nd);
+ if (!err)
+ err = complete_walk(nd);
+ if (!err) {
+ *parent = nd->path;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
+ }
+ terminate_walk(nd);
+ return err;
+}
- if (!IS_ERR(filename)) {
- retval = filename_lookup(dfd, filename, flags, nd);
- putname(filename);
+static struct filename *filename_parentat(int dfd, struct filename *name,
+ unsigned int flags, struct path *parent,
+ struct qstr *last, int *type)
+{
+ int retval;
+ struct nameidata nd;
+
+ if (IS_ERR(name))
+ return name;
+ set_nameidata(&nd, dfd, name);
+ retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
+ if (unlikely(retval == -ECHILD))
+ retval = path_parentat(&nd, flags, parent);
+ if (unlikely(retval == -ESTALE))
+ retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
+ if (likely(!retval)) {
+ *last = nd.last;
+ *type = nd.last_type;
+ audit_inode(name, parent->dentry, LOOKUP_PARENT);
+ } else {
+ putname(name);
+ name = ERR_PTR(retval);
}
- return retval;
+ restore_nameidata();
+ return name;
}
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
- struct filename *filename = getname_kernel(name);
- struct nameidata nd;
+ struct filename *filename;
struct dentry *d;
- int err;
+ struct qstr last;
+ int type;
+ filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
+ &last, &type);
if (IS_ERR(filename))
return ERR_CAST(filename);
-
- err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
- if (err) {
- d = ERR_PTR(err);
- goto out;
- }
- if (nd.last_type != LAST_NORM) {
- path_put(&nd.path);
- d = ERR_PTR(-EINVAL);
- goto out;
+ if (unlikely(type != LAST_NORM)) {
+ path_put(path);
+ putname(filename);
+ return ERR_PTR(-EINVAL);
}
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- d = __lookup_hash(&nd.last, nd.path.dentry, 0);
+ mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ d = __lookup_hash(&last, path->dentry, 0);
if (IS_ERR(d)) {
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- path_put(&nd.path);
- goto out;
+ mutex_unlock(&path->dentry->d_inode->i_mutex);
+ path_put(path);
}
- *path = nd.path;
-out:
putname(filename);
return d;
}
int kern_path(const char *name, unsigned int flags, struct path *path)
{
- struct nameidata nd;
- int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
- if (!res)
- *path = nd.path;
- return res;
+ return filename_lookup(AT_FDCWD, getname_kernel(name),
+ flags, path, NULL);
}
EXPORT_SYMBOL(kern_path);
@@ -2108,29 +2240,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
const char *name, unsigned int flags,
struct path *path)
{
- struct nameidata nd;
- int err;
- nd.root.dentry = dentry;
- nd.root.mnt = mnt;
- BUG_ON(flags & LOOKUP_PARENT);
- /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
- err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
- if (!err)
- *path = nd.path;
- return err;
+ struct path root = {.mnt = mnt, .dentry = dentry};
+ /* the first argument of filename_lookup() is ignored with root */
+ return filename_lookup(AT_FDCWD, getname_kernel(name),
+ flags , path, &root);
}
EXPORT_SYMBOL(vfs_path_lookup);
-/*
- * Restricted form of lookup. Doesn't follow links, single-component only,
- * needs parent already locked. Doesn't follow mounts.
- * SMP-safe.
- */
-static struct dentry *lookup_hash(struct nameidata *nd)
-{
- return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
-}
-
/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
@@ -2138,9 +2254,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code. Also note that by using this function the
- * nameidata argument is passed to the filesystem methods and a filesystem
- * using this helper needs to be prepared for that.
+ * not be called by generic code.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2187,27 +2301,10 @@ EXPORT_SYMBOL(lookup_one_len);
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
- struct nameidata nd;
- struct filename *tmp = getname_flags(name, flags, empty);
- int err = PTR_ERR(tmp);
- if (!IS_ERR(tmp)) {
-
- BUG_ON(flags & LOOKUP_PARENT);
-
- err = filename_lookup(dfd, tmp, flags, &nd);
- putname(tmp);
- if (!err)
- *path = nd.path;
- }
- return err;
+ return filename_lookup(dfd, getname_flags(name, flags, empty),
+ flags, path, NULL);
}
-
-int user_path_at(int dfd, const char __user *name, unsigned flags,
- struct path *path)
-{
- return user_path_at_empty(dfd, name, flags, path, NULL);
-}
-EXPORT_SYMBOL(user_path_at);
+EXPORT_SYMBOL(user_path_at_empty);
/*
* NB: most callers don't do anything directly with the reference to the
@@ -2215,26 +2312,16 @@ EXPORT_SYMBOL(user_path_at);
* allocated by getname. So we must hold the reference to it until all
* path-walking is complete.
*/
-static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+static inline struct filename *
+user_path_parent(int dfd, const char __user *path,
+ struct path *parent,
+ struct qstr *last,
+ int *type,
unsigned int flags)
{
- struct filename *s = getname(path);
- int error;
-
/* only LOOKUP_REVAL is allowed in extra flags */
- flags &= LOOKUP_REVAL;
-
- if (IS_ERR(s))
- return s;
-
- error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
- if (error) {
- putname(s);
- return ERR_PTR(error);
- }
-
- return s;
+ return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL,
+ parent, last, type);
}
/**
@@ -2273,10 +2360,8 @@ mountpoint_last(struct nameidata *nd, struct path *path)
/* If we're in rcuwalk, drop out of it to handle last component */
if (nd->flags & LOOKUP_RCU) {
- if (unlazy_walk(nd, NULL)) {
- error = -ECHILD;
- goto out;
- }
+ if (unlazy_walk(nd, NULL, 0))
+ return -ECHILD;
}
nd->flags &= ~LOOKUP_PARENT;
@@ -2284,7 +2369,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
if (unlikely(nd->last_type != LAST_NORM)) {
error = handle_dots(nd, nd->last_type);
if (error)
- goto out;
+ return error;
dentry = dget(nd->path.dentry);
goto done;
}
@@ -2299,91 +2384,81 @@ mountpoint_last(struct nameidata *nd, struct path *path)
*/
dentry = d_alloc(dir, &nd->last);
if (!dentry) {
- error = -ENOMEM;
mutex_unlock(&dir->d_inode->i_mutex);
- goto out;
+ return -ENOMEM;
}
dentry = lookup_real(dir->d_inode, dentry, nd->flags);
- error = PTR_ERR(dentry);
if (IS_ERR(dentry)) {
mutex_unlock(&dir->d_inode->i_mutex);
- goto out;
+ return PTR_ERR(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
done:
- if (!dentry->d_inode || d_is_negative(dentry)) {
- error = -ENOENT;
+ if (d_is_negative(dentry)) {
dput(dentry);
- goto out;
+ return -ENOENT;
}
+ if (nd->depth)
+ put_link(nd);
path->dentry = dentry;
path->mnt = nd->path.mnt;
- if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
- return 1;
+ error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW,
+ d_backing_inode(dentry), 0);
+ if (unlikely(error))
+ return error;
mntget(path->mnt);
follow_mount(path);
- error = 0;
-out:
- terminate_walk(nd);
- return error;
+ return 0;
}
/**
* path_mountpoint - look up a path to be umounted
- * @dfd: directory file descriptor to start walk from
- * @name: full pathname to walk
- * @path: pointer to container for result
+ * @nameidata: lookup context
* @flags: lookup flags
+ * @path: pointer to container for result
*
* Look up the given name, but don't attempt to revalidate the last component.
* Returns 0 and "path" will be valid on success; Returns error otherwise.
*/
static int
-path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
+path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
{
- struct nameidata nd;
+ const char *s = path_init(nd, flags);
int err;
-
- err = path_init(dfd, name, flags, &nd);
- if (unlikely(err))
- goto out;
-
- err = mountpoint_last(&nd, path);
- while (err > 0) {
- void *cookie;
- struct path link = *path;
- err = may_follow_link(&link, &nd);
- if (unlikely(err))
- break;
- nd.flags |= LOOKUP_PARENT;
- err = follow_link(&link, &nd, &cookie);
- if (err)
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+ while (!(err = link_path_walk(s, nd)) &&
+ (err = mountpoint_last(nd, path)) > 0) {
+ s = trailing_symlink(nd);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
break;
- err = mountpoint_last(&nd, path);
- put_link(&nd, &link, cookie);
+ }
}
-out:
- path_cleanup(&nd);
+ terminate_walk(nd);
return err;
}
static int
-filename_mountpoint(int dfd, struct filename *s, struct path *path,
+filename_mountpoint(int dfd, struct filename *name, struct path *path,
unsigned int flags)
{
+ struct nameidata nd;
int error;
- if (IS_ERR(s))
- return PTR_ERR(s);
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+ set_nameidata(&nd, dfd, name);
+ error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
if (unlikely(error == -ECHILD))
- error = path_mountpoint(dfd, s->name, path, flags);
+ error = path_mountpoint(&nd, flags, path);
if (unlikely(error == -ESTALE))
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
+ error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
if (likely(!error))
- audit_inode(s, path->dentry, 0);
- putname(s);
+ audit_inode(name, path->dentry, 0);
+ restore_nameidata();
+ putname(name);
return error;
}
@@ -2449,7 +2524,7 @@ EXPORT_SYMBOL(__check_sticky);
*/
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
{
- struct inode *inode = victim->d_inode;
+ struct inode *inode = d_backing_inode(victim);
int error;
if (d_is_negative(victim))
@@ -2915,18 +2990,19 @@ out_dput:
/*
* Handle the last step of open()
*/
-static int do_last(struct nameidata *nd, struct path *path,
+static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op,
- int *opened, struct filename *name)
+ int *opened)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
int acc_mode = op->acc_mode;
+ unsigned seq;
struct inode *inode;
- bool symlink_ok = false;
struct path save_parent = { .dentry = NULL, .mnt = NULL };
+ struct path path;
bool retried = false;
int error;
@@ -2935,7 +3011,7 @@ static int do_last(struct nameidata *nd, struct path *path,
if (nd->last_type != LAST_NORM) {
error = handle_dots(nd, nd->last_type);
- if (error)
+ if (unlikely(error))
return error;
goto finish_open;
}
@@ -2943,15 +3019,13 @@ static int do_last(struct nameidata *nd, struct path *path,
if (!(open_flag & O_CREAT)) {
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
- if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
- symlink_ok = true;
/* we _can_ be in RCU mode here */
- error = lookup_fast(nd, path, &inode);
+ error = lookup_fast(nd, &path, &inode, &seq);
if (likely(!error))
goto finish_lookup;
if (error < 0)
- goto out;
+ return error;
BUG_ON(nd->inode != dir->d_inode);
} else {
@@ -2965,11 +3039,10 @@ static int do_last(struct nameidata *nd, struct path *path,
if (error)
return error;
- audit_inode(name, dir, LOOKUP_PARENT);
- error = -EISDIR;
+ audit_inode(nd->name, dir, LOOKUP_PARENT);
/* trailing slashes? */
- if (nd->last.name[nd->last.len])
- goto out;
+ if (unlikely(nd->last.name[nd->last.len]))
+ return -EISDIR;
}
retry_lookup:
@@ -2984,7 +3057,7 @@ retry_lookup:
*/
}
mutex_lock(&dir->d_inode->i_mutex);
- error = lookup_open(nd, path, file, op, got_write, opened);
+ error = lookup_open(nd, &path, file, op, got_write, opened);
mutex_unlock(&dir->d_inode->i_mutex);
if (error <= 0) {
@@ -2995,7 +3068,7 @@ retry_lookup:
!S_ISREG(file_inode(file)->i_mode))
will_truncate = false;
- audit_inode(name, file->f_path.dentry, 0);
+ audit_inode(nd->name, file->f_path.dentry, 0);
goto opened;
}
@@ -3004,15 +3077,15 @@ retry_lookup:
open_flag &= ~O_TRUNC;
will_truncate = false;
acc_mode = MAY_OPEN;
- path_to_nameidata(path, nd);
+ path_to_nameidata(&path, nd);
goto finish_open_created;
}
/*
* create/update audit record if it already exists.
*/
- if (d_is_positive(path->dentry))
- audit_inode(name, path->dentry, 0);
+ if (d_is_positive(path.dentry))
+ audit_inode(nd->name, path.dentry, 0);
/*
* If atomic_open() acquired write access it is dropped now due to
@@ -3024,47 +3097,45 @@ retry_lookup:
got_write = false;
}
- error = -EEXIST;
- if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
- goto exit_dput;
-
- error = follow_managed(path, nd->flags);
- if (error < 0)
- goto exit_dput;
+ if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
+ path_to_nameidata(&path, nd);
+ return -EEXIST;
+ }
- if (error)
- nd->flags |= LOOKUP_JUMPED;
+ error = follow_managed(&path, nd);
+ if (unlikely(error < 0))
+ return error;
BUG_ON(nd->flags & LOOKUP_RCU);
- inode = path->dentry->d_inode;
-finish_lookup:
- /* we _can_ be in RCU mode here */
- error = -ENOENT;
- if (!inode || d_is_negative(path->dentry)) {
- path_to_nameidata(path, nd);
- goto out;
+ inode = d_backing_inode(path.dentry);
+ seq = 0; /* out of RCU mode, so the value doesn't matter */
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
}
+finish_lookup:
+ if (nd->depth)
+ put_link(nd);
+ error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW,
+ inode, seq);
+ if (unlikely(error))
+ return error;
- if (should_follow_link(path->dentry, !symlink_ok)) {
- if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, path->dentry))) {
- error = -ECHILD;
- goto out;
- }
- }
- BUG_ON(inode != path->dentry->d_inode);
- return 1;
+ if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
+ path_to_nameidata(&path, nd);
+ return -ELOOP;
}
- if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
- path_to_nameidata(path, nd);
+ if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
+ path_to_nameidata(&path, nd);
} else {
save_parent.dentry = nd->path.dentry;
- save_parent.mnt = mntget(path->mnt);
- nd->path.dentry = path->dentry;
+ save_parent.mnt = mntget(path.mnt);
+ nd->path.dentry = path.dentry;
}
nd->inode = inode;
+ nd->seq = seq;
/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
finish_open:
error = complete_walk(nd);
@@ -3072,14 +3143,14 @@ finish_open:
path_put(&save_parent);
return error;
}
- audit_inode(name, nd->path.dentry, 0);
+ audit_inode(nd->name, nd->path.dentry, 0);
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
error = -ENOTDIR;
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
- if (!S_ISREG(nd->inode->i_mode))
+ if (!d_is_reg(nd->path.dentry))
will_truncate = false;
if (will_truncate) {
@@ -3119,12 +3190,8 @@ out:
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
- terminate_walk(nd);
return error;
-exit_dput:
- path_put_conditional(path, nd);
- goto out;
exit_fput:
fput(file);
goto out;
@@ -3148,50 +3215,46 @@ stale_open:
goto retry_lookup;
}
-static int do_tmpfile(int dfd, struct filename *pathname,
- struct nameidata *nd, int flags,
+static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
struct file *file, int *opened)
{
static const struct qstr name = QSTR_INIT("/", 1);
- struct dentry *dentry, *child;
+ struct dentry *child;
struct inode *dir;
- int error = path_lookupat(dfd, pathname->name,
- flags | LOOKUP_DIRECTORY, nd);
+ struct path path;
+ int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);
if (unlikely(error))
return error;
- error = mnt_want_write(nd->path.mnt);
+ error = mnt_want_write(path.mnt);
if (unlikely(error))
goto out;
+ dir = path.dentry->d_inode;
/* we want directory to be writable */
- error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out2;
- dentry = nd->path.dentry;
- dir = dentry->d_inode;
if (!dir->i_op->tmpfile) {
error = -EOPNOTSUPP;
goto out2;
}
- child = d_alloc(dentry, &name);
+ child = d_alloc(path.dentry, &name);
if (unlikely(!child)) {
error = -ENOMEM;
goto out2;
}
- nd->flags &= ~LOOKUP_DIRECTORY;
- nd->flags |= op->intent;
- dput(nd->path.dentry);
- nd->path.dentry = child;
- error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+ dput(path.dentry);
+ path.dentry = child;
+ error = dir->i_op->tmpfile(dir, child, op->mode);
if (error)
goto out2;
- audit_inode(pathname, nd->path.dentry, 0);
+ audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
- error = may_open(&nd->path, MAY_OPEN, op->open_flag);
+ error = may_open(&path, MAY_OPEN, op->open_flag);
if (error)
goto out2;
- file->f_path.mnt = nd->path.mnt;
- error = finish_open(file, nd->path.dentry, NULL, opened);
+ file->f_path.mnt = path.mnt;
+ error = finish_open(file, child, NULL, opened);
if (error)
goto out2;
error = open_check_o_direct(file);
@@ -3204,17 +3267,17 @@ static int do_tmpfile(int dfd, struct filename *pathname,
spin_unlock(&inode->i_lock);
}
out2:
- mnt_drop_write(nd->path.mnt);
+ mnt_drop_write(path.mnt);
out:
- path_put(&nd->path);
+ path_put(&path);
return error;
}
-static struct file *path_openat(int dfd, struct filename *pathname,
- struct nameidata *nd, const struct open_flags *op, int flags)
+static struct file *path_openat(struct nameidata *nd,
+ const struct open_flags *op, unsigned flags)
{
+ const char *s;
struct file *file;
- struct path path;
int opened = 0;
int error;
@@ -3225,37 +3288,26 @@ static struct file *path_openat(int dfd, struct filename *pathname,
file->f_flags = op->open_flag;
if (unlikely(file->f_flags & __O_TMPFILE)) {
- error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
- goto out;
+ error = do_tmpfile(nd, flags, op, file, &opened);
+ goto out2;
}
- error = path_init(dfd, pathname->name, flags, nd);
- if (unlikely(error))
- goto out;
-
- error = do_last(nd, &path, file, op, &opened, pathname);
- while (unlikely(error > 0)) { /* trailing symlink */
- struct path link = path;
- void *cookie;
- if (!(nd->flags & LOOKUP_FOLLOW)) {
- path_put_conditional(&path, nd);
- path_put(&nd->path);
- error = -ELOOP;
- break;
- }
- error = may_follow_link(&link, nd);
- if (unlikely(error))
- break;
- nd->flags |= LOOKUP_PARENT;
+ s = path_init(nd, flags);
+ if (IS_ERR(s)) {
+ put_filp(file);
+ return ERR_CAST(s);
+ }
+ while (!(error = link_path_walk(s, nd)) &&
+ (error = do_last(nd, file, op, &opened)) > 0) {
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
- error = follow_link(&link, nd, &cookie);
- if (unlikely(error))
+ s = trailing_symlink(nd);
+ if (IS_ERR(s)) {
+ error = PTR_ERR(s);
break;
- error = do_last(nd, &path, file, op, &opened, pathname);
- put_link(nd, &link, cookie);
+ }
}
-out:
- path_cleanup(nd);
+ terminate_walk(nd);
+out2:
if (!(opened & FILE_OPENED)) {
BUG_ON(!error);
put_filp(file);
@@ -3279,11 +3331,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
int flags = op->lookup_flags;
struct file *filp;
- filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
+ set_nameidata(&nd, dfd, pathname);
+ filp = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(filp == ERR_PTR(-ECHILD)))
- filp = path_openat(dfd, pathname, &nd, op, flags);
+ filp = path_openat(&nd, op, flags);
if (unlikely(filp == ERR_PTR(-ESTALE)))
- filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
+ filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+ restore_nameidata();
return filp;
}
@@ -3305,11 +3359,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
if (unlikely(IS_ERR(filename)))
return ERR_CAST(filename);
- file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
+ set_nameidata(&nd, -1, filename);
+ file = path_openat(&nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
- file = path_openat(-1, filename, &nd, op, flags);
+ file = path_openat(&nd, op, flags);
if (unlikely(file == ERR_PTR(-ESTALE)))
- file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+ file = path_openat(&nd, op, flags | LOOKUP_REVAL);
+ restore_nameidata();
putname(filename);
return file;
}
@@ -3318,7 +3374,8 @@ static struct dentry *filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
- struct nameidata nd;
+ struct qstr last;
+ int type;
int err2;
int error;
bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
@@ -3329,26 +3386,25 @@ static struct dentry *filename_create(int dfd, struct filename *name,
*/
lookup_flags &= LOOKUP_REVAL;
- error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
- if (error)
- return ERR_PTR(error);
+ name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
+ if (IS_ERR(name))
+ return ERR_CAST(name);
/*
* Yucky last component or no last component at all?
* (foo/., foo/.., /////)
*/
- if (nd.last_type != LAST_NORM)
+ if (unlikely(type != LAST_NORM))
goto out;
- nd.flags &= ~LOOKUP_PARENT;
- nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
/* don't fail immediately if it's r/o, at least try to report other errors */
- err2 = mnt_want_write(nd.path.mnt);
+ err2 = mnt_want_write(path->mnt);
/*
* Do the final lookup.
*/
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- dentry = lookup_hash(&nd);
+ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+ mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = __lookup_hash(&last, path->dentry, lookup_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3362,7 +3418,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
+ if (unlikely(!is_dir && last.name[last.len])) {
error = -ENOENT;
goto fail;
}
@@ -3370,31 +3426,26 @@ static struct dentry *filename_create(int dfd, struct filename *name,
error = err2;
goto fail;
}
- *path = nd.path;
+ putname(name);
return dentry;
fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ mutex_unlock(&path->dentry->d_inode->i_mutex);
if (!err2)
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path->mnt);
out:
- path_put(&nd.path);
+ path_put(path);
+ putname(name);
return dentry;
}
struct dentry *kern_path_create(int dfd, const char *pathname,
struct path *path, unsigned int lookup_flags)
{
- struct filename *filename = getname_kernel(pathname);
- struct dentry *res;
-
- if (IS_ERR(filename))
- return ERR_CAST(filename);
- res = filename_create(dfd, filename, path, lookup_flags);
- putname(filename);
- return res;
+ return filename_create(dfd, getname_kernel(pathname),
+ path, lookup_flags);
}
EXPORT_SYMBOL(kern_path_create);
@@ -3407,16 +3458,10 @@ void done_path_create(struct path *path, struct dentry *dentry)
}
EXPORT_SYMBOL(done_path_create);
-struct dentry *user_path_create(int dfd, const char __user *pathname,
+inline struct dentry *user_path_create(int dfd, const char __user *pathname,
struct path *path, unsigned int lookup_flags)
{
- struct filename *tmp = getname(pathname);
- struct dentry *res;
- if (IS_ERR(tmp))
- return ERR_CAST(tmp);
- res = filename_create(dfd, tmp, path, lookup_flags);
- putname(tmp);
- return res;
+ return filename_create(dfd, getname(pathname), path, lookup_flags);
}
EXPORT_SYMBOL(user_path_create);
@@ -3637,14 +3682,17 @@ static long do_rmdir(int dfd, const char __user *pathname)
int error = 0;
struct filename *name;
struct dentry *dentry;
- struct nameidata nd;
+ struct path path;
+ struct qstr last;
+ int type;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+ name = user_path_parent(dfd, pathname,
+ &path, &last, &type, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
- switch(nd.last_type) {
+ switch (type) {
case LAST_DOTDOT:
error = -ENOTEMPTY;
goto exit1;
@@ -3656,13 +3704,12 @@ retry:
goto exit1;
}
- nd.flags &= ~LOOKUP_PARENT;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto exit1;
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- dentry = lookup_hash(&nd);
+ mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
@@ -3670,17 +3717,17 @@ retry:
error = -ENOENT;
goto exit3;
}
- error = security_path_rmdir(&nd.path, dentry);
+ error = security_path_rmdir(&path, dentry);
if (error)
goto exit3;
- error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+ error = vfs_rmdir(path.dentry->d_inode, dentry);
exit3:
dput(dentry);
exit2:
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- mnt_drop_write(nd.path.mnt);
+ mutex_unlock(&path.dentry->d_inode->i_mutex);
+ mnt_drop_write(path.mnt);
exit1:
- path_put(&nd.path);
+ path_put(&path);
putname(name);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -3763,43 +3810,45 @@ static long do_unlinkat(int dfd, const char __user *pathname)
int error;
struct filename *name;
struct dentry *dentry;
- struct nameidata nd;
+ struct path path;
+ struct qstr last;
+ int type;
struct inode *inode = NULL;
struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
- name = user_path_parent(dfd, pathname, &nd, lookup_flags);
+ name = user_path_parent(dfd, pathname,
+ &path, &last, &type, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
error = -EISDIR;
- if (nd.last_type != LAST_NORM)
+ if (type != LAST_NORM)
goto exit1;
- nd.flags &= ~LOOKUP_PARENT;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto exit1;
retry_deleg:
- mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- dentry = lookup_hash(&nd);
+ mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (nd.last.name[nd.last.len])
+ if (last.name[last.len])
goto slashes;
inode = dentry->d_inode;
if (d_is_negative(dentry))
goto slashes;
ihold(inode);
- error = security_path_unlink(&nd.path, dentry);
+ error = security_path_unlink(&path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
+ error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
exit2:
dput(dentry);
}
- mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ mutex_unlock(&path.dentry->d_inode->i_mutex);
if (inode)
iput(inode); /* truncate the inode here */
inode = NULL;
@@ -3808,9 +3857,9 @@ exit2:
if (!error)
goto retry_deleg;
}
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
exit1:
- path_put(&nd.path);
+ path_put(&path);
putname(name);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
@@ -4240,14 +4289,15 @@ EXPORT_SYMBOL(vfs_rename);
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
int, newdfd, const char __user *, newname, unsigned int, flags)
{
- struct dentry *old_dir, *new_dir;
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
- struct nameidata oldnd, newnd;
+ struct path old_path, new_path;
+ struct qstr old_last, new_last;
+ int old_type, new_type;
struct inode *delegated_inode = NULL;
struct filename *from;
struct filename *to;
- unsigned int lookup_flags = 0;
+ unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error;
@@ -4261,47 +4311,45 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
return -EPERM;
+ if (flags & RENAME_EXCHANGE)
+ target_flags = 0;
+
retry:
- from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
+ from = user_path_parent(olddfd, oldname,
+ &old_path, &old_last, &old_type, lookup_flags);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
}
- to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
+ to = user_path_parent(newdfd, newname,
+ &new_path, &new_last, &new_type, lookup_flags);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
}
error = -EXDEV;
- if (oldnd.path.mnt != newnd.path.mnt)
+ if (old_path.mnt != new_path.mnt)
goto exit2;
- old_dir = oldnd.path.dentry;
error = -EBUSY;
- if (oldnd.last_type != LAST_NORM)
+ if (old_type != LAST_NORM)
goto exit2;
- new_dir = newnd.path.dentry;
if (flags & RENAME_NOREPLACE)
error = -EEXIST;
- if (newnd.last_type != LAST_NORM)
+ if (new_type != LAST_NORM)
goto exit2;
- error = mnt_want_write(oldnd.path.mnt);
+ error = mnt_want_write(old_path.mnt);
if (error)
goto exit2;
- oldnd.flags &= ~LOOKUP_PARENT;
- newnd.flags &= ~LOOKUP_PARENT;
- if (!(flags & RENAME_EXCHANGE))
- newnd.flags |= LOOKUP_RENAME_TARGET;
-
retry_deleg:
- trap = lock_rename(new_dir, old_dir);
+ trap = lock_rename(new_path.dentry, old_path.dentry);
- old_dentry = lookup_hash(&oldnd);
+ old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
@@ -4309,7 +4357,7 @@ retry_deleg:
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
- new_dentry = lookup_hash(&newnd);
+ new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
@@ -4323,16 +4371,16 @@ retry_deleg:
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
- if (newnd.last.name[newnd.last.len])
+ if (new_last.name[new_last.len])
goto exit5;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
- if (oldnd.last.name[oldnd.last.len])
+ if (old_last.name[old_last.len])
goto exit5;
- if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+ if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
goto exit5;
}
/* source should not be ancestor of target */
@@ -4345,32 +4393,32 @@ retry_deleg:
if (new_dentry == trap)
goto exit5;
- error = security_path_rename(&oldnd.path, old_dentry,
- &newnd.path, new_dentry, flags);
+ error = security_path_rename(&old_path, old_dentry,
+ &new_path, new_dentry, flags);
if (error)
goto exit5;
- error = vfs_rename(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry,
+ error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+ new_path.dentry->d_inode, new_dentry,
&delegated_inode, flags);
exit5:
dput(new_dentry);
exit4:
dput(old_dentry);
exit3:
- unlock_rename(new_dir, old_dir);
+ unlock_rename(new_path.dentry, old_path.dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
goto retry_deleg;
}
- mnt_drop_write(oldnd.path.mnt);
+ mnt_drop_write(old_path.mnt);
exit2:
if (retry_estale(error, lookup_flags))
should_retry = true;
- path_put(&newnd.path);
+ path_put(&new_path);
putname(to);
exit1:
- path_put(&oldnd.path);
+ path_put(&old_path);
putname(from);
if (should_retry) {
should_retry = false;
@@ -4429,18 +4477,19 @@ EXPORT_SYMBOL(readlink_copy);
*/
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct nameidata nd;
void *cookie;
+ struct inode *inode = d_inode(dentry);
+ const char *link = inode->i_link;
int res;
- nd.depth = 0;
- cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
- if (IS_ERR(cookie))
- return PTR_ERR(cookie);
-
- res = readlink_copy(buffer, buflen, nd_get_link(&nd));
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+ if (!link) {
+ link = inode->i_op->follow_link(dentry, &cookie);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+ res = readlink_copy(buffer, buflen, link);
+ if (inode->i_op->put_link)
+ inode->i_op->put_link(inode, cookie);
return res;
}
EXPORT_SYMBOL(generic_readlink);
@@ -4472,22 +4521,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
}
EXPORT_SYMBOL(page_readlink);
-void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+const char *page_follow_link_light(struct dentry *dentry, void **cookie)
{
struct page *page = NULL;
- nd_set_link(nd, page_getlink(dentry, &page));
- return page;
+ char *res = page_getlink(dentry, &page);
+ if (!IS_ERR(res))
+ *cookie = page;
+ return res;
}
EXPORT_SYMBOL(page_follow_link_light);
-void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+void page_put_link(struct inode *unused, void *cookie)
{
struct page *page = cookie;
-
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
+ kunmap(page);
+ page_cache_release(page);
}
EXPORT_SYMBOL(page_put_link);
diff --git a/fs/namespace.c b/fs/namespace.c
index 82ef1405260e..2b8aa15fd6df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -590,24 +590,35 @@ static void delayed_free_vfsmnt(struct rcu_head *head)
}
/* call under rcu_read_lock */
-bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
struct mount *mnt;
if (read_seqretry(&mount_lock, seq))
- return false;
+ return 1;
if (bastard == NULL)
- return true;
+ return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
if (likely(!read_seqretry(&mount_lock, seq)))
- return true;
+ return 0;
if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
mnt_add_count(mnt, -1);
- return false;
+ return 1;
+ }
+ return -1;
+}
+
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+ int res = __legitimize_mnt(bastard, seq);
+ if (likely(!res))
+ return true;
+ if (unlikely(res < 0)) {
+ rcu_read_unlock();
+ mntput(bastard);
+ rcu_read_lock();
}
- rcu_read_unlock();
- mntput(bastard);
- rcu_read_lock();
return false;
}
@@ -632,14 +643,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
*/
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
- struct mount *p, *res;
- res = p = __lookup_mnt(mnt, dentry);
+ struct mount *p, *res = NULL;
+ p = __lookup_mnt(mnt, dentry);
if (!p)
goto out;
+ if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+ res = p;
hlist_for_each_entry_continue(p, mnt_hash) {
if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
break;
- res = p;
+ if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+ res = p;
}
out:
return res;
@@ -795,10 +809,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
/*
* vfsmount lock must be held for write
*/
-static void detach_mnt(struct mount *mnt, struct path *old_path)
+static void unhash_mnt(struct mount *mnt)
{
- old_path->dentry = mnt->mnt_mountpoint;
- old_path->mnt = &mnt->mnt_parent->mnt;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
@@ -811,6 +823,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
/*
* vfsmount lock must be held for write
*/
+static void detach_mnt(struct mount *mnt, struct path *old_path)
+{
+ old_path->dentry = mnt->mnt_mountpoint;
+ old_path->mnt = &mnt->mnt_parent->mnt;
+ unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void umount_mnt(struct mount *mnt)
+{
+ /* old mountpoint will be dropped when we can do that */
+ mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
+ unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
void mnt_set_mountpoint(struct mount *mnt,
struct mountpoint *mp,
struct mount *child_mnt)
@@ -1078,6 +1110,13 @@ static void mntput_no_expire(struct mount *mnt)
rcu_read_unlock();
list_del(&mnt->mnt_instance);
+
+ if (unlikely(!list_empty(&mnt->mnt_mounts))) {
+ struct mount *p, *tmp;
+ list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
+ umount_mnt(p);
+ }
+ }
unlock_mount_hash();
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1187,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options);
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
down_read(&namespace_sem);
if (p->cached_event == p->ns->event) {
@@ -1208,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
p->cached_mount = seq_list_next(v, &p->ns->list, pos);
p->cached_index = *pos;
@@ -1222,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v)
static int m_show(struct seq_file *m, void *v)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = list_entry(v, struct mount, mnt_list);
return p->show(m, &r->mnt);
}
@@ -1298,17 +1337,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static void namespace_unlock(void)
{
- struct hlist_head head = unmounted;
+ struct hlist_head head;
- if (likely(hlist_empty(&head))) {
- up_write(&namespace_sem);
- return;
- }
+ hlist_move_list(&unmounted, &head);
- head.first->pprev = &head.first;
- INIT_HLIST_HEAD(&unmounted);
up_write(&namespace_sem);
+ if (likely(hlist_empty(&head)))
+ return;
+
synchronize_rcu();
group_pin_kill(&head);
@@ -1319,49 +1356,90 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
+enum umount_tree_flags {
+ UMOUNT_SYNC = 1,
+ UMOUNT_PROPAGATE = 2,
+ UMOUNT_CONNECTED = 4,
+};
+
+static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
+{
+ /* Leaving mounts connected is only valid for lazy umounts */
+ if (how & UMOUNT_SYNC)
+ return true;
+
+ /* A mount without a parent has nothing to be connected to */
+ if (!mnt_has_parent(mnt))
+ return true;
+
+ /* Because the reference counting rules change when mounts are
+ * unmounted and connected, umounted mounts may not be
+ * connected to mounted mounts.
+ */
+ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
+ return true;
+
+ /* Has it been requested that the mount remain connected? */
+ if (how & UMOUNT_CONNECTED)
+ return false;
+
+ /* Is the mount locked such that it needs to remain connected? */
+ if (IS_MNT_LOCKED(mnt))
+ return false;
+
+ /* By default disconnect the mount */
+ return true;
+}
+
/*
* mount_lock must be held
* namespace_sem must be held for write
- * how = 0 => just this tree, don't propagate
- * how = 1 => propagate; we know that nobody else has reference to any victims
- * how = 2 => lazy umount
*/
-void umount_tree(struct mount *mnt, int how)
+static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
- HLIST_HEAD(tmp_list);
+ LIST_HEAD(tmp_list);
struct mount *p;
+ if (how & UMOUNT_PROPAGATE)
+ propagate_mount_unlock(mnt);
+
+ /* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
- hlist_del_init_rcu(&p->mnt_hash);
- hlist_add_head(&p->mnt_hash, &tmp_list);
+ p->mnt.mnt_flags |= MNT_UMOUNT;
+ list_move(&p->mnt_list, &tmp_list);
}
- hlist_for_each_entry(p, &tmp_list, mnt_hash)
+ /* Hide the mounts from mnt_mounts */
+ list_for_each_entry(p, &tmp_list, mnt_list) {
list_del_init(&p->mnt_child);
+ }
- if (how)
+ /* Add propogated mounts to the tmp_list */
+ if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
- while (!hlist_empty(&tmp_list)) {
- p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
- hlist_del_init_rcu(&p->mnt_hash);
+ while (!list_empty(&tmp_list)) {
+ bool disconnect;
+ p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
- if (how < 2)
+ if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
- pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
+ disconnect = disconnect_mount(p, how);
+
+ pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+ disconnect ? &unmounted : NULL);
if (mnt_has_parent(p)) {
- hlist_del_init(&p->mnt_mp_list);
- put_mountpoint(p->mnt_mp);
mnt_add_count(p->mnt_parent, -1);
- /* old mountpoint will be dropped when we can do that */
- p->mnt_ex_mountpoint = p->mnt_mountpoint;
- p->mnt_mountpoint = p->mnt.mnt_root;
- p->mnt_parent = p;
- p->mnt_mp = NULL;
+ if (!disconnect) {
+ /* Don't forget about p */
+ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+ } else {
+ umount_mnt(p);
+ }
}
change_mnt_propagation(p, MS_PRIVATE);
}
@@ -1447,14 +1525,14 @@ static int do_umount(struct mount *mnt, int flags)
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 2);
+ umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 1);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
@@ -1480,13 +1558,17 @@ void __detach_mounts(struct dentry *dentry)
namespace_lock();
mp = lookup_mountpoint(dentry);
- if (!mp)
+ if (IS_ERR_OR_NULL(mp))
goto out_unlock;
lock_mount_hash();
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
- umount_tree(mnt, 2);
+ if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
+ hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
+ umount_mnt(mnt);
+ }
+ else umount_tree(mnt, UMOUNT_CONNECTED);
}
unlock_mount_hash();
put_mountpoint(mp);
@@ -1648,7 +1730,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
out:
if (res) {
lock_mount_hash();
- umount_tree(res, 0);
+ umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
return q;
@@ -1660,8 +1742,11 @@ struct vfsmount *collect_mounts(struct path *path)
{
struct mount *tree;
namespace_lock();
- tree = copy_tree(real_mount(path->mnt), path->dentry,
- CL_COPY_ALL | CL_PRIVATE);
+ if (!check_mnt(real_mount(path->mnt)))
+ tree = ERR_PTR(-EINVAL);
+ else
+ tree = copy_tree(real_mount(path->mnt), path->dentry,
+ CL_COPY_ALL | CL_PRIVATE);
namespace_unlock();
if (IS_ERR(tree))
return ERR_CAST(tree);
@@ -1672,7 +1757,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
lock_mount_hash();
- umount_tree(real_mount(mnt), 0);
+ umount_tree(real_mount(mnt), UMOUNT_SYNC);
unlock_mount_hash();
namespace_unlock();
}
@@ -1855,7 +1940,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
- umount_tree(child, 0);
+ umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2120,7 @@ static int do_loopback(struct path *path, const char *old_name,
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
- umount_tree(mnt, 0);
+ umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
@@ -2282,6 +2367,8 @@ unlock:
return err;
}
+static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+
/*
* create a new mount for userspace and request it to be added into the
* namespace's tree
@@ -2313,6 +2400,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
flags |= MS_NODEV;
mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
}
+ if (type->fs_flags & FS_USERNS_VISIBLE) {
+ if (!fs_fully_visible(type, &mnt_flags))
+ return -EPERM;
+ }
}
mnt = vfs_kern_mount(type, flags, name, data);
@@ -2406,7 +2497,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
- umount_tree(mnt, 1);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
unlock_mount_hash();
namespace_unlock();
@@ -2477,7 +2568,7 @@ static void shrink_submounts(struct mount *mnt)
m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
- umount_tree(m, 1);
+ umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
}
}
@@ -3114,9 +3205,10 @@ bool current_chrooted(void)
return chrooted;
}
-bool fs_fully_visible(struct file_system_type *type)
+static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
@@ -3129,16 +3221,42 @@ bool fs_fully_visible(struct file_system_type *type)
if (mnt->mnt.mnt_sb->s_type != type)
continue;
- /* This mount is not fully visible if there are any child mounts
- * that cover anything except for empty directories.
+ /* This mount is not fully visible if it's root directory
+ * is not the root directory of the filesystem.
+ */
+ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
+ continue;
+
+ /* Verify the mount flags are equal to or more permissive
+ * than the proposed new mount.
+ */
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+ !(new_flags & MNT_READONLY))
+ continue;
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+ !(new_flags & MNT_NODEV))
+ continue;
+ if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
+ ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
+ continue;
+
+ /* This mount is not fully visible if there are any
+ * locked child mounts that cover anything except for
+ * empty directories.
*/
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
- if (!S_ISDIR(inode->i_mode))
- goto next;
- if (inode->i_nlink > 2)
+ /* Only worry about locked mounts */
+ if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
+ continue;
+ /* Is the directory permanetly empty? */
+ if (!is_empty_dir_inode(inode))
goto next;
}
+ /* Preserve the locked attributes */
+ *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
+ MNT_LOCK_NODEV | \
+ MNT_LOCK_ATIME);
visible = true;
goto found;
next: ;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e7ca827d7694..93575e91a7aa 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -127,7 +127,7 @@ static inline int ncp_case_sensitive(const struct inode *i)
static int
ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
{
- struct inode *inode = ACCESS_ONCE(dentry->d_inode);
+ struct inode *inode = d_inode_rcu(dentry);
if (!inode)
return 0;
@@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
if (len != name->len)
return 1;
- pinode = ACCESS_ONCE(parent->d_inode);
+ pinode = d_inode_rcu(parent);
if (!pinode)
return 1;
@@ -180,7 +180,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
static int
ncp_delete_dentry(const struct dentry * dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (inode) {
if (is_bad_inode(inode))
@@ -224,7 +224,7 @@ ncp_force_unlink(struct inode *dir, struct dentry* dentry)
memset(&info, 0, sizeof(info));
/* remove the Read-Only flag on the NW server */
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
old_nwattr = NCP_FINFO(inode)->nwattr;
info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT);
@@ -254,7 +254,7 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na
{
struct nw_modify_dos_info info;
int res=0x90,res2;
- struct inode *old_inode = old_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
__le32 old_nwattr = NCP_FINFO(old_inode)->nwattr;
__le32 new_nwattr = 0; /* shut compiler warning */
int old_nwattr_changed = 0;
@@ -268,8 +268,8 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na
res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info);
if (!res2)
old_nwattr_changed = 1;
- if (new_dentry && new_dentry->d_inode) {
- new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr;
+ if (new_dentry && d_really_is_positive(new_dentry)) {
+ new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr;
info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT);
res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info);
if (!res2)
@@ -324,9 +324,9 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
return -ECHILD;
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
goto finished;
server = NCP_SERVER(dir);
@@ -367,7 +367,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
* what we remember, it's not valid any more.
*/
if (!res) {
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
mutex_lock(&inode->i_mutex);
if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
@@ -388,7 +388,7 @@ finished:
static time_t ncp_obtain_mtime(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ncp_server *server = NCP_SERVER(inode);
struct nw_info_struct i;
@@ -404,7 +404,7 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
static inline void
ncp_invalidate_dircache_entries(struct dentry *parent)
{
- struct ncp_server *server = NCP_SERVER(parent->d_inode);
+ struct ncp_server *server = NCP_SERVER(d_inode(parent));
struct dentry *dentry;
spin_lock(&parent->d_lock);
@@ -418,7 +418,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent)
static int ncp_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct page *page = NULL;
struct ncp_server *server = NCP_SERVER(inode);
union ncp_dir_cache *cache = NULL;
@@ -491,13 +491,13 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
goto invalid_cache;
}
spin_unlock(&dentry->d_lock);
- if (!dent->d_inode) {
+ if (d_really_is_negative(dent)) {
dput(dent);
goto invalid_cache;
}
over = !dir_emit(ctx, dent->d_name.name,
dent->d_name.len,
- dent->d_inode->i_ino, DT_UNKNOWN);
+ d_inode(dent)->i_ino, DT_UNKNOWN);
dput(dent);
if (over)
goto finished;
@@ -571,7 +571,7 @@ static void ncp_d_prune(struct dentry *dentry)
{
if (!dentry->d_fsdata) /* not referenced from page cache */
return;
- NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE;
}
static int
@@ -580,7 +580,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
int inval_childs)
{
struct dentry *newdent, *dentry = file->f_path.dentry;
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
struct ncp_cache_control ctl = *ctrl;
struct qstr qname;
int valid = 0;
@@ -621,7 +621,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
dentry_update_name_case(newdent, &qname);
}
- if (!newdent->d_inode) {
+ if (d_really_is_negative(newdent)) {
struct inode *inode;
entry->opened = 0;
@@ -637,7 +637,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
spin_unlock(&dentry->d_lock);
}
} else {
- struct inode *inode = newdent->d_inode;
+ struct inode *inode = d_inode(newdent);
mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
ncp_update_inode2(inode, entry);
@@ -659,10 +659,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
ctl.cache = kmap(ctl.page);
}
if (ctl.cache) {
- if (newdent->d_inode) {
+ if (d_really_is_positive(newdent)) {
newdent->d_fsdata = newdent;
ctl.cache->dentry[ctl.idx] = newdent;
- ino = newdent->d_inode->i_ino;
+ ino = d_inode(newdent)->i_ino;
ncp_new_dentry(newdent);
}
valid = 1;
@@ -807,7 +807,7 @@ int ncp_conn_logged_in(struct super_block *sb)
}
dent = sb->s_root;
if (dent) {
- struct inode* ino = dent->d_inode;
+ struct inode* ino = d_inode(dent);
if (ino) {
ncp_update_known_namespace(server, volNumber, NULL);
NCP_FINFO(ino)->volNumber = volNumber;
@@ -815,7 +815,7 @@ int ncp_conn_logged_in(struct super_block *sb)
NCP_FINFO(ino)->DosDirNum = DosDirNum;
result = 0;
} else {
- ncp_dbg(1, "sb->s_root->d_inode == NULL!\n");
+ ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n");
}
} else {
ncp_dbg(1, "sb->s_root == NULL!\n");
@@ -1055,7 +1055,7 @@ out:
static int ncp_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ncp_server *server;
int error;
@@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
case 0x00:
ncp_dbg(1, "renamed %pd -> %pd\n",
old_dentry, new_dentry);
+ ncp_d_prune(old_dentry);
+ ncp_d_prune(new_dentry);
break;
case 0x9E:
error = -ENAMETOOLONG;
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1dd7007f974d..011324ce9df2 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -98,30 +98,24 @@ out:
}
static ssize_t
-ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+ncp_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
size_t already_read = 0;
- off_t pos;
+ off_t pos = iocb->ki_pos;
size_t bufsize;
int error;
- void* freepage;
+ void *freepage;
size_t freelen;
ncp_dbg(1, "enter %pD2\n", file);
- pos = *ppos;
-
- if ((ssize_t) count < 0) {
- return -EINVAL;
- }
- if (!count)
+ if (!iov_iter_count(to))
return 0;
if (pos > inode->i_sb->s_maxbytes)
return 0;
- if (pos + count > inode->i_sb->s_maxbytes) {
- count = inode->i_sb->s_maxbytes - pos;
- }
+ iov_iter_truncate(to, inode->i_sb->s_maxbytes - pos);
error = ncp_make_open(inode, O_RDONLY);
if (error) {
@@ -138,31 +132,29 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
goto outrel;
error = 0;
/* First read in as much as possible for each bufsize. */
- while (already_read < count) {
+ while (iov_iter_count(to)) {
int read_this_time;
- size_t to_read = min_t(unsigned int,
+ size_t to_read = min_t(size_t,
bufsize - (pos % bufsize),
- count - already_read);
+ iov_iter_count(to));
error = ncp_read_bounce(NCP_SERVER(inode),
NCP_FINFO(inode)->file_handle,
- pos, to_read, buf, &read_this_time,
+ pos, to_read, to, &read_this_time,
freepage, freelen);
if (error) {
error = -EIO; /* NW errno -> Linux errno */
break;
}
pos += read_this_time;
- buf += read_this_time;
already_read += read_this_time;
- if (read_this_time != to_read) {
+ if (read_this_time != to_read)
break;
- }
}
vfree(freepage);
- *ppos = pos;
+ iocb->ki_pos = pos;
file_accessed(file);
@@ -173,42 +165,21 @@ outrel:
}
static ssize_t
-ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
size_t already_written = 0;
- off_t pos;
size_t bufsize;
int errno;
- void* bouncebuffer;
+ void *bouncebuffer;
+ off_t pos;
ncp_dbg(1, "enter %pD2\n", file);
- if ((ssize_t) count < 0)
- return -EINVAL;
- pos = *ppos;
- if (file->f_flags & O_APPEND) {
- pos = i_size_read(inode);
- }
+ errno = generic_write_checks(iocb, from);
+ if (errno <= 0)
+ return errno;
- if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
- if (pos >= MAX_NON_LFS) {
- return -EFBIG;
- }
- if (count > MAX_NON_LFS - (u32)pos) {
- count = MAX_NON_LFS - (u32)pos;
- }
- }
- if (pos >= inode->i_sb->s_maxbytes) {
- if (count || pos > inode->i_sb->s_maxbytes) {
- return -EFBIG;
- }
- }
- if (pos + count > inode->i_sb->s_maxbytes) {
- count = inode->i_sb->s_maxbytes - pos;
- }
-
- if (!count)
- return 0;
errno = ncp_make_open(inode, O_WRONLY);
if (errno) {
ncp_dbg(1, "open failed, error=%d\n", errno);
@@ -216,8 +187,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
}
bufsize = NCP_SERVER(inode)->buffer_size;
- already_written = 0;
-
errno = file_update_time(file);
if (errno)
goto outrel;
@@ -227,13 +196,14 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
errno = -EIO; /* -ENOMEM */
goto outrel;
}
- while (already_written < count) {
+ pos = iocb->ki_pos;
+ while (iov_iter_count(from)) {
int written_this_time;
- size_t to_write = min_t(unsigned int,
+ size_t to_write = min_t(size_t,
bufsize - (pos % bufsize),
- count - already_written);
+ iov_iter_count(from));
- if (copy_from_user(bouncebuffer, buf, to_write)) {
+ if (copy_from_iter(bouncebuffer, to_write, from) != to_write) {
errno = -EFAULT;
break;
}
@@ -244,16 +214,14 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
break;
}
pos += written_this_time;
- buf += written_this_time;
already_written += written_this_time;
- if (written_this_time != to_write) {
+ if (written_this_time != to_write)
break;
- }
}
vfree(bouncebuffer);
- *ppos = pos;
+ iocb->ki_pos = pos;
if (pos > i_size_read(inode)) {
mutex_lock(&inode->i_mutex);
@@ -277,8 +245,8 @@ static int ncp_release(struct inode *inode, struct file *file) {
const struct file_operations ncp_file_operations =
{
.llseek = generic_file_llseek,
- .read = ncp_file_read,
- .write = ncp_file_write,
+ .read_iter = ncp_file_read_iter,
+ .write_iter = ncp_file_write_iter,
.unlocked_ioctl = ncp_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ncp_compat_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 01a9e16e9782..9605a2f63549 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -812,7 +812,7 @@ static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
if (!d) {
goto dflt;
}
- i = d->d_inode;
+ i = d_inode(d);
if (!i) {
goto dflt;
}
@@ -865,7 +865,7 @@ dflt:;
int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int result = 0;
__le32 info_mask;
struct nw_modify_dos_info info;
@@ -878,7 +878,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
goto out;
result = -EPERM;
- if (IS_DEADDIR(dentry->d_inode))
+ if (IS_DEADDIR(d_inode(dentry)))
goto out;
/* ageing the dentry to force validation */
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index cf7e043a9447..79b113048eac 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -376,7 +376,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
struct dentry* dentry = inode->i_sb->s_root;
if (dentry) {
- struct inode* s_inode = dentry->d_inode;
+ struct inode* s_inode = d_inode(dentry);
if (s_inode) {
sr.volNumber = NCP_FINFO(s_inode)->volNumber;
@@ -384,7 +384,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
sr.namespace = server->name_space[sr.volNumber];
result = 0;
} else
- ncp_dbg(1, "s_root->d_inode==NULL\n");
+ ncp_dbg(1, "d_inode(s_root)==NULL\n");
} else
ncp_dbg(1, "s_root==NULL\n");
} else {
@@ -431,7 +431,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
if (result == 0) {
dentry = inode->i_sb->s_root;
if (dentry) {
- struct inode* s_inode = dentry->d_inode;
+ struct inode* s_inode = d_inode(dentry);
if (s_inode) {
NCP_FINFO(s_inode)->volNumber = vnum;
@@ -439,7 +439,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
NCP_FINFO(s_inode)->DosDirNum = dosde;
server->root_setuped = 1;
} else {
- ncp_dbg(1, "s_root->d_inode==NULL\n");
+ ncp_dbg(1, "d_inode(s_root)==NULL\n");
result = -EIO;
}
} else {
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 482387532f54..88dbbc9fcf4d 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -727,7 +727,7 @@ int
ncp_del_file_or_subdir2(struct ncp_server *server,
struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
__u8 volnum;
__le32 dirent;
@@ -1001,8 +1001,8 @@ out:
*/
int
ncp_read_bounce(struct ncp_server *server, const char *file_id,
- __u32 offset, __u16 to_read, char __user *target, int *bytes_read,
- void* bounce, __u32 bufsize)
+ __u32 offset, __u16 to_read, struct iov_iter *to,
+ int *bytes_read, void *bounce, __u32 bufsize)
{
int result;
@@ -1025,7 +1025,7 @@ ncp_read_bounce(struct ncp_server *server, const char *file_id,
(offset & 1);
*bytes_read = len;
result = 0;
- if (copy_to_user(target, source, len))
+ if (copy_to_iter(source, len, to) != len)
result = -EFAULT;
}
}
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 250e443a07f3..5233fbc1747a 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -53,7 +53,7 @@ static inline int ncp_read_bounce_size(__u32 size) {
return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8;
};
int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16,
- char __user *, int *, void* bounce, __u32 bouncelen);
+ struct iov_iter *, int *, void *bounce, __u32 bouncelen);
int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16,
char *, int *);
int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16,
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 1a63bfdb4a65..421b6f91e8ec 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -156,7 +156,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
goto failfree;
}
- inode=dentry->d_inode;
+ inode=d_inode(dentry);
if (ncp_make_open(inode, O_WRONLY))
goto failfree;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c7abc10279af..f31fd0dd92c6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
config NFS_FS
tristate "NFS client support"
- depends on INET && FILE_LOCKING
+ depends on INET && FILE_LOCKING && MULTIUSER
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 1e987acf20c9..8664417955a2 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
obj-$(CONFIG_NFS_V4) += nfsv4.o
CFLAGS_nfs4trace.o += -I$(src)
nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
- delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
+ delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \
nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \
dns_resolve.o nfs4trace.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 1cac3c175d18..d2554fe140a3 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
};
static int __init nfs4blocklayout_init(void)
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 5aed4f98df41..e535599a0719 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d)
container_of(d, struct pnfs_block_dev, node);
bl_free_device(dev);
- kfree(dev);
+ kfree_rcu(dev, node.rcu);
}
static int
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 351be9205bf8..682529c00996 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp)
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
- /* schedule_timeout to game the hung task watchdog */
- schedule_timeout(60 * HZ);
+ schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
+ flush_signals(current);
}
return 0;
}
@@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
* pg_authenticate method for nfsv4 callback threads.
*
* The authflavor has been negotiated, so an incorrect flavor is a server
- * bug. Drop packets with incorrect authflavor.
+ * bug. Deny packets with incorrect authflavor.
*
* All other checking done after NFS decoding where the nfs_client can be
* found in nfs4_callback_compound
@@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
if (rqstp->rq_proc != CB_NULL)
- return SVC_DROP;
+ return SVC_DENIED;
break;
case RPC_AUTH_GSS:
/* No RPC_AUTH_GSS support yet in NFSv4.1 */
if (svc_is_backchannel(rqstp))
- return SVC_DROP;
+ return SVC_DENIED;
}
return SVC_OK;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 197806fb87ff..29e3c1b011b7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
/* Normal */
- if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
- slot->seq_nr++;
+ if (likely(args->csa_sequenceid == slot->seq_nr + 1))
goto out_ok;
- }
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
@@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_process_state *cps)
{
struct nfs4_slot_table *tbl;
+ struct nfs4_slot *slot;
struct nfs_client *clp;
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
@@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
goto out;
+
tbl = &clp->cl_session->bc_slot_table;
+ slot = tbl->slots + args->csa_slotid;
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
- spin_unlock(&tbl->slot_tbl_lock);
status = htonl(NFS4ERR_DELAY);
/* Return NFS4ERR_BADSESSION if we're draining the session
* in order to reset it.
*/
if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
status = htonl(NFS4ERR_BADSESSION);
- goto out;
+ goto out_unlock;
}
- status = validate_seqid(&clp->cl_session->bc_slot_table, args);
- spin_unlock(&tbl->slot_tbl_lock);
+ memcpy(&res->csr_sessionid, &args->csa_sessionid,
+ sizeof(res->csr_sessionid));
+ res->csr_sequenceid = args->csa_sequenceid;
+ res->csr_slotid = args->csa_slotid;
+ res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+
+ status = validate_seqid(tbl, args);
if (status)
- goto out;
+ goto out_unlock;
cps->slotid = args->csa_slotid;
@@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
*/
if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
status = htonl(NFS4ERR_DELAY);
- goto out;
+ goto out_unlock;
}
- memcpy(&res->csr_sessionid, &args->csa_sessionid,
- sizeof(res->csr_sessionid));
- res->csr_sequenceid = args->csa_sequenceid;
- res->csr_slotid = args->csa_slotid;
- res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
- res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ /*
+ * RFC5661 20.9.3
+ * If CB_SEQUENCE returns an error, then the state of the slot
+ * (sequence ID, cached reply) MUST NOT change.
+ */
+ slot->seq_nr++;
+out_unlock:
+ spin_unlock(&tbl->slot_tbl_lock);
out:
cps->clp = clp; /* put in nfs4_callback_compound */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 19ca95cdfd9b..6b1697a01dde 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
- if (status == __constant_htonl(NFS4ERR_RESOURCE))
+ if (status == htonl(NFS4ERR_RESOURCE))
return rpc_garbage_args;
if (hdr_arg.minorversion == 0) {
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 19874151e95c..4a90c9bb3135 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -31,7 +31,6 @@
#include <linux/lockd/bind.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
-#include <linux/nfs_idmap.h>
#include <linux/vfs.h>
#include <linux/inet.h>
#include <linux/in6.h>
@@ -776,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server,
server->options = data->options;
server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
- NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR;
+ NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -826,7 +825,6 @@ error:
* Load up the server record from information gained in an fsinfo record
*/
static void nfs_server_set_fsinfo(struct nfs_server *server,
- struct nfs_fh *mntfh,
struct nfs_fsinfo *fsinfo)
{
unsigned long max_rpc_payload;
@@ -902,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
if (error < 0)
goto out_error;
- nfs_server_set_fsinfo(server, mntfh, &fsinfo);
+ nfs_server_set_fsinfo(server, &fsinfo);
/* Get some general file system info */
if (server->namelen == 0) {
@@ -1194,8 +1192,6 @@ void nfs_clients_init(struct net *net)
}
#ifdef CONFIG_PROC_FS
-static struct proc_dir_entry *proc_fs_nfs;
-
static int nfs_server_list_open(struct inode *inode, struct file *file);
static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@@ -1365,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
{
struct nfs_server *server;
struct nfs_client *clp;
- char dev[8], fsid[17];
+ char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
+ char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0'
struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_volume_list) {
- seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
+ seq_puts(m, "NV SERVER PORT DEV FSID"
+ " FSC\n");
return 0;
}
/* display one transport per line on subsequent lines */
server = list_entry(v, struct nfs_server, master_link);
clp = server->nfs_client;
- snprintf(dev, 8, "%u:%u",
+ snprintf(dev, sizeof(dev), "%u:%u",
MAJOR(server->s_dev), MINOR(server->s_dev));
- snprintf(fsid, 17, "%llx:%llx",
+ snprintf(fsid, sizeof(fsid), "%llx:%llx",
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
rcu_read_lock();
- seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
+ seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
@@ -1435,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
*/
int __init nfs_fs_proc_init(void)
{
- struct proc_dir_entry *p;
-
- proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
- if (!proc_fs_nfs)
+ if (!proc_mkdir("fs/nfsfs", NULL))
goto error_0;
/* a file of servers with which we're dealing */
- p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
- if (!p)
+ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
goto error_1;
/* a file of volumes that we have mounted */
- p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
- if (!p)
- goto error_2;
- return 0;
+ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
+ goto error_1;
-error_2:
- remove_proc_entry("servers", proc_fs_nfs);
+ return 0;
error_1:
- remove_proc_entry("fs/nfsfs", NULL);
+ remove_proc_subtree("fs/nfsfs", NULL);
error_0:
return -ENOMEM;
}
@@ -1465,9 +1456,7 @@ error_0:
*/
void nfs_fs_proc_exit(void)
{
- remove_proc_entry("volumes", proc_fs_nfs);
- remove_proc_entry("servers", proc_fs_nfs);
- remove_proc_entry("fs/nfsfs", NULL);
+ remove_proc_subtree("fs/nfsfs", NULL);
}
#endif /* CONFIG_PROC_FS */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index a6ad68865880..029d688a969f 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
if (freeme == NULL)
goto out;
}
- list_add_rcu(&delegation->super_list, &server->delegations);
+ list_add_tail_rcu(&delegation->super_list, &server->delegations);
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
@@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
delegation = nfs_inode_detach_delegation(inode);
if (delegation != NULL)
- nfs_do_return_delegation(inode, delegation, 0);
+ nfs_do_return_delegation(inode, delegation, 1);
}
/**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c19e16f0b2d0..547308a5ec6f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -416,15 +416,14 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
struct nfs_inode *nfsi;
- if (dentry->d_inode == NULL)
- goto different;
+ if (d_really_is_negative(dentry))
+ return 0;
- nfsi = NFS_I(dentry->d_inode);
+ nfsi = NFS_I(d_inode(dentry));
if (entry->fattr->fileid == nfsi->fileid)
return 1;
if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
return 1;
-different:
return 0;
}
@@ -473,7 +472,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct qstr filename = QSTR_INIT(entry->name, entry->len);
struct dentry *dentry;
struct dentry *alias;
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct inode *inode;
int status;
@@ -497,9 +496,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
goto out;
if (nfs_same_file(dentry, entry)) {
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
+ status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
if (!status)
- nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
+ nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label);
goto out;
} else {
d_invalidate(dentry);
@@ -544,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (scratch == NULL)
return -ENOMEM;
+ if (buflen == 0)
+ goto out_nopages;
+
xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
@@ -565,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
break;
} while (!entry->eof);
+out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
array = nfs_readdir_get_array(page);
if (!IS_ERR(array)) {
@@ -870,7 +873,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
@@ -1118,15 +1121,15 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU) {
parent = ACCESS_ONCE(dentry->d_parent);
- dir = ACCESS_ONCE(parent->d_inode);
+ dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
} else {
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
}
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (!inode) {
if (nfs_neg_need_reval(dir, dentry, flags)) {
@@ -1242,7 +1245,7 @@ out_error:
}
/*
- * A weaker form of d_revalidate for revalidating just the dentry->d_inode
+ * A weaker form of d_revalidate for revalidating just the d_inode(dentry)
* when we don't really care about the dentry name. This is called when a
* pathwalk ends on a dentry that was not found via a normal lookup in the
* parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals).
@@ -1253,7 +1256,7 @@ out_error:
static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags)
{
int error;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
/*
* I believe we can only get a negative dentry here in the case of a
@@ -1287,7 +1290,7 @@ static int nfs_dentry_delete(const struct dentry *dentry)
dentry, dentry->d_flags);
/* Unhash any dentry with a stale inode */
- if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode))
+ if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry)))
return 1;
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
@@ -1467,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
{
int err;
- if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
- *opened |= FILE_CREATED;
-
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;
@@ -1491,7 +1491,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
int err;
/* Expect a negative dentry */
- BUG_ON(dentry->d_inode);
+ BUG_ON(d_inode(dentry));
dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
@@ -1587,7 +1587,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1)
goto no_open;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
@@ -1598,12 +1598,12 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU) {
parent = ACCESS_ONCE(dentry->d_parent);
- dir = ACCESS_ONCE(parent->d_inode);
+ dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
} else {
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
}
if (!nfs_neg_need_reval(dir, dentry, flags))
ret = 1;
@@ -1643,14 +1643,14 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
struct nfs4_label *label)
{
struct dentry *parent = dget_parent(dentry);
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct inode *inode;
int error = -EACCES;
d_drop(dentry);
/* We may have been initialized further down */
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
goto out;
if (fhandle->size == 0) {
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
@@ -1768,7 +1768,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir);
static void nfs_dentry_handle_enoent(struct dentry *dentry)
{
- if (dentry->d_inode != NULL && !d_unhashed(dentry))
+ if (simple_positive(dentry))
d_delete(dentry);
}
@@ -1780,13 +1780,13 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
dir->i_sb->s_id, dir->i_ino, dentry);
trace_nfs_rmdir_enter(dir, dentry);
- if (dentry->d_inode) {
+ if (d_really_is_positive(dentry)) {
nfs_wait_on_sillyrename(dentry);
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
switch (error) {
case 0:
- clear_nlink(dentry->d_inode);
+ clear_nlink(d_inode(dentry));
break;
case -ENOENT:
nfs_dentry_handle_enoent(dentry);
@@ -1808,8 +1808,8 @@ EXPORT_SYMBOL_GPL(nfs_rmdir);
*/
static int nfs_safe_remove(struct dentry *dentry)
{
- struct inode *dir = dentry->d_parent->d_inode;
- struct inode *inode = dentry->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = d_inode(dentry);
int error = -EBUSY;
dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry);
@@ -1853,7 +1853,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
if (d_count(dentry) > 1) {
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
- write_inode_now(dentry->d_inode, 0);
+ write_inode_now(d_inode(dentry), 0);
error = nfs_sillyrename(dir, dentry);
goto out;
}
@@ -1931,7 +1931,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
* No big deal if we can't add this page to the page cache here.
* READLINK will get the missing page from the server if needed.
*/
- if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
+ if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0,
GFP_KERNEL)) {
SetPageUptodate(page);
unlock_page(page);
@@ -1950,7 +1950,7 @@ EXPORT_SYMBOL_GPL(nfs_symlink);
int
nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int error;
dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n",
@@ -1997,8 +1997,8 @@ EXPORT_SYMBOL_GPL(nfs_link);
int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct dentry *dentry = NULL, *rehash = NULL;
struct rpc_task *task;
int error = -EBUSY;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..38678d9a5cc4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
int i;
ssize_t count;
- WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
-
- count = dreq->mirrors[hdr->pgio_mirror_idx].count;
- if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
- count = hdr->io_start + hdr->good_bytes - dreq->io_start;
- dreq->mirrors[hdr->pgio_mirror_idx].count = count;
- }
-
- /* update the dreq->count by finding the minimum agreed count from all
- * mirrors */
- count = dreq->mirrors[0].count;
+ if (dreq->mirror_count == 1) {
+ dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
+ dreq->count += hdr->good_bytes;
+ } else {
+ /* mirrored writes */
+ count = dreq->mirrors[hdr->pgio_mirror_idx].count;
+ if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
+ count = hdr->io_start + hdr->good_bytes - dreq->io_start;
+ dreq->mirrors[hdr->pgio_mirror_idx].count = count;
+ }
+ /* update the dreq->count by finding the minimum agreed count from all
+ * mirrors */
+ count = dreq->mirrors[0].count;
- for (i = 1; i < dreq->mirror_count; i++)
- count = min(count, dreq->mirrors[i].count);
+ for (i = 1; i < dreq->mirror_count; i++)
+ count = min(count, dreq->mirrors[i].count);
- dreq->count = count;
+ dreq->count = count;
+ }
}
/*
@@ -240,7 +243,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
/**
* nfs_direct_IO - NFS address space operation for direct I/O
- * @rw: direction (read or write)
* @iocb: target I/O control block
* @iov: array of vectors that define I/O buffer
* @pos: offset in file to begin the operation
@@ -251,7 +253,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
* shunt off direct read and write requests before the VFS gets them,
* so this method is only ever called for swap.
*/
-ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -259,18 +261,11 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
if (!IS_SWAPFILE(inode))
return 0;
-#ifndef CONFIG_NFS_SWAP
- dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
- iocb->ki_filp, (long long) pos, iter->nr_segs);
+ VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
- return -EINVAL;
-#else
- VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
-
- if (rw == READ)
+ if (iov_iter_rw(iter) == READ)
return nfs_file_direct_read(iocb, iter, pos);
- return nfs_file_direct_write(iocb, iter, pos);
-#endif /* CONFIG_NFS_SWAP */
+ return nfs_file_direct_write(iocb, iter);
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -387,13 +382,13 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
if (write)
nfs_zap_mapping(inode, inode->i_mapping);
- inode_dio_done(inode);
+ inode_dio_end(inode);
if (dreq->iocb) {
long res = (long) dreq->error;
if (!res)
res = (long) dreq->count;
- aio_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res, 0);
}
complete_all(&dreq->completion);
@@ -404,8 +399,8 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
static void nfs_direct_readpage_release(struct nfs_page *req)
{
dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
- req->wb_context->dentry->d_inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ d_inode(req->wb_context->dentry)->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
req->wb_bytes,
(long long)req_offset(req));
nfs_release_request(req);
@@ -487,7 +482,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
&nfs_direct_read_completion_ops);
get_dreq(dreq);
desc.pg_dreq = dreq;
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
while (iov_iter_count(iter)) {
struct page **pagevec;
@@ -539,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- inode_dio_done(inode);
+ inode_dio_end(inode);
nfs_direct_req_release(dreq);
return result < 0 ? result : -EIO;
}
@@ -873,7 +868,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
- atomic_inc(&inode->i_dio_count);
+ inode_dio_begin(inode);
NFS_I(inode)->write_io += iov_iter_count(iter);
while (iov_iter_count(iter)) {
@@ -929,7 +924,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
- inode_dio_done(inode);
+ inode_dio_end(inode);
nfs_direct_req_release(dreq);
return result < 0 ? result : -EIO;
}
@@ -960,8 +955,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
- loff_t pos)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
@@ -969,25 +963,16 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
- loff_t end;
- size_t count = iov_iter_count(iter);
- end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
-
- nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
+ loff_t pos, end;
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
- file, count, (long long) pos);
+ file, iov_iter_count(iter), (long long) iocb->ki_pos);
- result = generic_write_checks(file, &pos, &count, 0);
- if (result)
- goto out;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
+ iov_iter_count(iter));
- result = -EINVAL;
- if ((ssize_t) count < 0)
- goto out;
- result = 0;
- if (!count)
- goto out;
+ pos = iocb->ki_pos;
+ end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
mutex_lock(&inode->i_mutex);
@@ -1002,7 +987,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock;
}
- task_io_account_write(count);
+ task_io_account_write(iov_iter_count(iter));
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
@@ -1010,7 +995,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock;
dreq->inode = inode;
- dreq->bytes_left = count;
+ dreq->bytes_left = iov_iter_count(iter);
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1041,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
if (i_size_read(inode) < iocb->ki_pos)
i_size_write(inode, iocb->ki_pos);
spin_unlock(&inode->i_lock);
+ generic_write_sync(file, pos, result);
}
}
nfs_direct_req_release(dreq);
@@ -1050,7 +1036,6 @@ out_release:
nfs_direct_req_release(dreq);
out_unlock:
mutex_unlock(&inode->i_mutex);
-out:
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e679d24c39d3..cc4fa1ed61fc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
#include <linux/nfs_mount.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
-#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/swap.h>
@@ -171,7 +170,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_read(iocb, to, iocb->ki_pos);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
@@ -281,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
+ nfs_inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
return nfs_wb_page(inode, page);
}
-#ifdef CONFIG_NFS_SWAP
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
- int ret;
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
*span = sis->pages;
- rcu_read_lock();
- ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
- rcu_read_unlock();
-
- return ret;
+ return rpc_clnt_swap_activate(clnt);
}
static void nfs_swap_deactivate(struct file *file)
{
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
- rcu_read_lock();
- xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
- rcu_read_unlock();
+ rpc_clnt_swap_deactivate(clnt);
}
-#endif
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
.launder_page = nfs_launder_page,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
-#endif
};
/*
@@ -675,17 +664,20 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
unsigned long written = 0;
ssize_t result;
size_t count = iov_iter_count(from);
- loff_t pos = iocb->ki_pos;
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
- if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, from, pos);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ result = generic_write_checks(iocb, from);
+ if (result <= 0)
+ return result;
+ return nfs_file_direct_write(iocb, from);
+ }
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
- file, count, (long long) pos);
+ file, count, (long long) iocb->ki_pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -693,7 +685,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
/*
* O_APPEND implies that we must revalidate the file length.
*/
- if (file->f_flags & O_APPEND) {
+ if (iocb->ki_flags & IOCB_APPEND) {
result = nfs_revalidate_file_size(inode, file);
if (result)
goto out;
@@ -780,7 +772,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* Flush all pending writes before doing anything
* with locks..
*/
- nfs_sync_mapping(filp->f_mapping);
+ vfs_fsync(filp, 0);
l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
if (!IS_ERR(l_ctx)) {
@@ -927,8 +919,6 @@ EXPORT_SYMBOL_GPL(nfs_flock);
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 91e88a7ecef0..b34f2e228601 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -32,6 +32,7 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include <linux/sunrpc/metrics.h>
@@ -258,7 +259,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
hdr->res.verf->committed != NFS_DATA_SYNC)
return;
- pnfs_set_layoutcommit(hdr);
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+ hdr->mds_offset + hdr->res.count);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -373,7 +375,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
}
if (data->verf.committed == NFS_UNSTABLE)
- pnfs_commit_set_layoutcommit(data);
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
@@ -1086,7 +1088,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server,
}
static void
-filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+filelayout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
}
@@ -1137,7 +1139,8 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
.alloc_deviceid_node = filelayout_alloc_deviceid_node,
- .free_deviceid_node = filelayout_free_deveiceid_node,
+ .free_deviceid_node = filelayout_free_deviceid_node,
+ .sync = pnfs_nfs_generic_sync,
};
static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4f372e224603..4946ef40ba87 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
nfs4_pnfs_ds_put(ds);
}
kfree(dsaddr->stripe_indices);
- kfree(dsaddr);
+ kfree_rcu(dsaddr, id_node.rcu);
}
/* Decode opaque device data and return the result */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 315cc68945b9..b3289d701eea 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -11,15 +11,16 @@
#include <linux/module.h>
#include <linux/sunrpc/metrics.h>
-#include <linux/nfs_idmap.h>
#include "flexfilelayout.h"
#include "../nfs4session.h"
+#include "../nfs4idmap.h"
#include "../internal.h"
#include "../delegation.h"
#include "../nfs4trace.h"
#include "../iostat.h"
#include "../nfs.h"
+#include "../nfs42.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
- struct nfs4_ff_layout_mirror *tmp;
int i, j;
for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
for (j = i + 1; j < fls->mirror_array_cnt; j++)
if (fls->mirror_array[i]->efficiency <
- fls->mirror_array[j]->efficiency) {
- tmp = fls->mirror_array[i];
- fls->mirror_array[i] = fls->mirror_array[j];
- fls->mirror_array[j] = tmp;
- }
+ fls->mirror_array[j]->efficiency)
+ swap(fls->mirror_array[i],
+ fls->mirror_array[j]);
}
}
@@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
spin_lock_init(&fls->mirror_array[i]->lock);
fls->mirror_array[i]->ds_count = ds_count;
+ fls->mirror_array[i]->lseg = &fls->generic_hdr;
/* deviceid */
rc = decode_deviceid(&stream, &devid);
@@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->gid);
}
+ p = xdr_inline_decode(&stream, 4);
+ if (p)
+ fls->flags = be32_to_cpup(p);
+
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
@@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
return 1;
}
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+ /* first IO request? */
+ if (atomic_inc_return(&timer->n_ops) == 1) {
+ timer->start_time = ktime_get();
+ }
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+{
+ ktime_t start, now;
+
+ if (atomic_dec_return(&timer->n_ops) < 0)
+ WARN_ON_ONCE(1);
+
+ now = ktime_get();
+ start = timer->start_time;
+ timer->start_time = now;
+ return ktime_sub(now, start);
+}
+
+static ktime_t
+nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
+{
+ return ktime_sub(ktime_get(), task->tk_start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs4_ff_layoutstat *layoutstat)
+{
+ static const ktime_t notime = {0};
+ ktime_t now = ktime_get();
+
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+ if (ktime_equal(mirror->start_time, notime))
+ mirror->start_time = now;
+ if (ktime_equal(mirror->last_report_time, notime))
+ mirror->last_report_time = now;
+ if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+ FF_LAYOUTSTATS_REPORT_INTERVAL) {
+ mirror->last_report_time = now;
+ return true;
+ }
+
+ return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+ iostat->ops_requested++;
+ iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+ __u64 requested,
+ __u64 completed,
+ ktime_t time_completed)
+{
+ struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t timer;
+
+ iostat->ops_completed++;
+ iostat->bytes_completed += completed;
+ iostat->bytes_not_delivered += requested - completed;
+
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+ iostat->total_busy_time =
+ ktime_add(iostat->total_busy_time, timer);
+ iostat->aggregate_completion_time =
+ ktime_add(iostat->aggregate_completion_time, time_completed);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed)
+{
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+ requested, completed,
+ nfs4_ff_layout_calc_completion_time(task));
+ spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested)
+{
+ bool report;
+
+ spin_lock(&mirror->lock);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+ nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+ spin_unlock(&mirror->lock);
+
+ if (report)
+ pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested,
+ __u64 completed,
+ enum nfs3_stable_how committed)
+{
+ if (committed == NFS_UNSTABLE)
+ requested = completed = 0;
+
+ spin_lock(&mirror->lock);
+ nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+ requested, completed,
+ nfs4_ff_layout_calc_completion_time(task));
+ spin_unlock(&mirror->lock);
+}
+
static int
ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo,
@@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
nfs_direct_set_resched_writes(hdr->dreq);
/* fake unstable write to let common nfs resend pages */
hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = 0;
+ hdr->good_bytes = hdr->args.count;
}
return;
}
@@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return 0;
}
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+ return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
+}
+
/*
* We reference the rpc_cred of the first WRITE that triggers the need for
* a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@@ -891,7 +1040,11 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
static void
ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
- pnfs_set_layoutcommit(hdr);
+ if (!ff_layout_need_layoutcommit(hdr->lseg))
+ return;
+
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+ hdr->mds_offset + hdr->res.count);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -908,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
static int ff_layout_read_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ nfs4_ff_layout_stat_io_start_read(
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count);
+
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -961,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- if (ff_layout_read_prepare_common(task, hdr))
- return;
-
if (ff_layout_setup_sequence(hdr->ds_clp,
&hdr->args.seq_args,
&hdr->res.seq_res,
task))
return;
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context, FMODE_READ) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -981,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count);
+
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1073,8 +1234,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE)
- pnfs_commit_set_layoutcommit(data);
+ if (data->verf.committed == NFS_UNSTABLE
+ && ff_layout_need_layoutcommit(data->lseg))
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
@@ -1082,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
static int ff_layout_write_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ nfs4_ff_layout_stat_io_start_write(
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count);
+
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1115,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- if (ff_layout_write_prepare_common(task, hdr))
- return;
-
if (ff_layout_setup_sequence(hdr->ds_clp,
&hdr->args.seq_args,
&hdr->res.seq_res,
task))
return;
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context, FMODE_WRITE) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1133,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1151,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ nfs4_ff_layout_stat_io_start_write(
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ 0);
+}
+
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
+ ff_layout_commit_prepare_common(task, data);
rpc_call_start(task);
}
@@ -1160,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_commit_data *wdata = data;
- ff_layout_setup_sequence(wdata->ds_clp,
+ if (ff_layout_setup_sequence(wdata->ds_clp,
&wdata->args.seq_args,
&wdata->res.seq_res,
- task);
+ task))
+ return;
+ ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+
+ pnfs_generic_write_commit_done(task, data);
}
static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1204,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
- .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
.rpc_release = pnfs_generic_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
- .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
.rpc_release = pnfs_generic_commit_release,
};
@@ -1255,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
if (fh)
hdr->args.fh = fh;
-
/*
* Note that if we ever decide to split across DSes,
* then we may need to handle dense-like offsets.
@@ -1384,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
data->args.fh = fh;
+
return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
@@ -1414,7 +1614,7 @@ ff_layout_get_ds_info(struct inode *inode)
}
static void
-ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
id_node));
@@ -1487,6 +1687,247 @@ out:
dprintk("%s: Return\n", __func__);
}
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+ return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+ const int buflen)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr *addr = &sin6->sin6_addr;
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded ANY address
+ */
+ if (ipv6_addr_any(addr))
+ return snprintf(buf, buflen, "::");
+
+ /*
+ * RFC 4291, Section 2.2.2
+ *
+ * Shorthanded loopback address
+ */
+ if (ipv6_addr_loopback(addr))
+ return snprintf(buf, buflen, "::1");
+
+ /*
+ * RFC 4291, Section 2.2.3
+ *
+ * Special presentation address format for mapped v4
+ * addresses.
+ */
+ if (ipv6_addr_v4mapped(addr))
+ return snprintf(buf, buflen, "::ffff:%pI4",
+ &addr->s6_addr32[3]);
+
+ /*
+ * RFC 4291, Section 2.2.1
+ */
+ return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+ struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+ char portbuf[RPCBIND_MAXUADDRPLEN];
+ char addrbuf[RPCBIND_MAXUADDRLEN];
+ char *netid;
+ unsigned short port;
+ int len, netid_len;
+ __be32 *p;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+ netid = "tcp";
+ netid_len = 3;
+ break;
+ case AF_INET6:
+ if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+ return;
+ port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+ netid = "tcp6";
+ netid_len = 4;
+ break;
+ default:
+ /* we only support tcp and tcp6 */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+ len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+ p = xdr_reserve_space(xdr, 4 + netid_len);
+ xdr_encode_opaque(p, netid, netid_len);
+
+ p = xdr_reserve_space(xdr, 4 + len);
+ xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+ ktime_t t)
+{
+ struct timespec64 ts;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 12);
+ ts = ktime_to_timespec64(t);
+ p = xdr_encode_hyper(p, ts.tv_sec);
+ *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+ struct nfs4_ff_io_stat *stat)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 5 * 8);
+ p = xdr_encode_hyper(p, stat->ops_requested);
+ p = xdr_encode_hyper(p, stat->bytes_requested);
+ p = xdr_encode_hyper(p, stat->ops_completed);
+ p = xdr_encode_hyper(p, stat->bytes_completed);
+ p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+ ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+ ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+ struct nfs42_layoutstat_args *args,
+ struct nfs42_layoutstat_devinfo *devinfo)
+{
+ struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+ struct nfs4_pnfs_ds_addr *da;
+ struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+ struct nfs_fh *fh = &mirror->fh_versions[0];
+ __be32 *p, *start;
+
+ da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+ dprintk("%s: DS %s: encoding address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+ /* layoutupdate length */
+ start = xdr_reserve_space(xdr, 4);
+ /* netaddr4 */
+ ff_layout_encode_netaddr(xdr, da);
+ /* nfs_fh4 */
+ p = xdr_reserve_space(xdr, 4 + fh->size);
+ xdr_encode_opaque(p, fh->data, fh->size);
+ /* ff_io_latency4 read */
+ spin_lock(&mirror->lock);
+ ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+ /* ff_io_latency4 write */
+ ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+ spin_unlock(&mirror->lock);
+ /* nfstime4 */
+ ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+ /* bool */
+ p = xdr_reserve_space(xdr, 4);
+ *p = cpu_to_be32(false);
+
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static bool
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+ struct pnfs_layout_segment *pls,
+ int *dev_count, int dev_limit)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *dev;
+ struct nfs42_layoutstat_devinfo *devinfo;
+ int i;
+
+ for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
+ if (*dev_count >= dev_limit)
+ break;
+ mirror = FF_LAYOUT_COMP(pls, i);
+ if (!mirror || !mirror->mirror_ds)
+ continue;
+ dev = FF_LAYOUT_DEVID_NODE(pls, i);
+ devinfo = &args->devinfo[*dev_count];
+ memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+ devinfo->offset = pls->pls_range.offset;
+ devinfo->length = pls->pls_range.length;
+ /* well, we don't really know if IO is continuous or not! */
+ devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+ devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+ devinfo->layout_type = LAYOUT_FLEX_FILES;
+ devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+ devinfo->layout_private = mirror;
+ /* lseg refcount put in cleanup_layoutstats */
+ pnfs_get_lseg(pls);
+
+ ++(*dev_count);
+ }
+
+ return *dev_count < dev_limit;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+ struct pnfs_layout_segment *pls;
+ int dev_count = 0;
+
+ spin_lock(&args->inode->i_lock);
+ list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+ dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+ }
+ spin_unlock(&args->inode->i_lock);
+ /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+ if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+ dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+ __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+ dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+ }
+ args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+ if (!args->devinfo)
+ return -ENOMEM;
+
+ dev_count = 0;
+ spin_lock(&args->inode->i_lock);
+ list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
+ if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
+ PNFS_LAYOUTSTATS_MAXDEV)) {
+ break;
+ }
+ }
+ spin_unlock(&args->inode->i_lock);
+ args->num_dev = dev_count;
+
+ return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ int i;
+
+ for (i = 0; i < data->args.num_dev; i++) {
+ mirror = data->args.devinfo[i].layout_private;
+ data->args.devinfo[i].layout_private = NULL;
+ pnfs_put_lseg(mirror->lseg);
+ }
+}
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -1498,7 +1939,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.pg_read_ops = &ff_layout_pg_read_ops,
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
- .free_deviceid_node = ff_layout_free_deveiceid_node,
+ .free_deviceid_node = ff_layout_free_deviceid_node,
.mark_request_commit = pnfs_layout_mark_request_commit,
.clear_request_commit = pnfs_generic_clear_request_commit,
.scan_commit_lists = pnfs_generic_scan_commit_lists,
@@ -1508,6 +1949,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
.encode_layoutreturn = ff_layout_encode_layoutreturn,
+ .sync = pnfs_nfs_generic_sync,
+ .prepare_layoutstats = ff_layout_prepare_layoutstats,
+ .cleanup_layoutstats = ff_layout_cleanup_layoutstats,
};
static int __init nfs4flexfilelayout_init(void)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 070f20445b2d..f92f9a0a856b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -9,12 +9,17 @@
#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
#define FS_NFS_NFS4FLEXFILELAYOUT_H
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+
#include "../pnfs.h"
/* XXX: Let's filter out insanely large mirror count for now to avoid oom
* due to network error etc. */
#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+/* LAYOUTSTATS report interval in ms */
+#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
+
struct nfs4_ff_ds_version {
u32 version;
u32 minor_version;
@@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
struct nfs4_deviceid deviceid;
};
+struct nfs4_ff_io_stat {
+ __u64 ops_requested;
+ __u64 bytes_requested;
+ __u64 ops_completed;
+ __u64 bytes_completed;
+ __u64 bytes_not_delivered;
+ ktime_t total_busy_time;
+ ktime_t aggregate_completion_time;
+};
+
+struct nfs4_ff_busy_timer {
+ ktime_t start_time;
+ atomic_t n_ops;
+};
+
+struct nfs4_ff_layoutstat {
+ struct nfs4_ff_io_stat io_stat;
+ struct nfs4_ff_busy_timer busy_timer;
+};
+
struct nfs4_ff_layout_mirror {
+ struct pnfs_layout_segment *lseg; /* back pointer */
u32 ds_count;
u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
nfs4_stateid stateid;
- struct nfs4_string user_name;
- struct nfs4_string group_name;
u32 uid;
u32 gid;
struct rpc_cred *cred;
spinlock_t lock;
+ struct nfs4_ff_layoutstat read_stat;
+ struct nfs4_ff_layoutstat write_stat;
+ ktime_t start_time;
+ ktime_t last_report_time;
};
struct nfs4_ff_layout_segment {
struct pnfs_layout_segment generic_hdr;
u64 stripe_unit;
+ u32 flags;
u32 mirror_array_cnt;
struct nfs4_ff_layout_mirror **mirror_array;
};
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e2c01f204a95..f13e1969eedd 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
{
nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
nfs4_pnfs_ds_put(mirror_ds->ds);
- kfree(mirror_ds);
+ kfree_rcu(mirror_ds, id_node.rcu);
}
/* Decode opaque device data and construct new_ds using it */
@@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
__func__, PTR_ERR(cred));
return PTR_ERR(cred);
} else {
- mirror->cred = cred;
+ if (cmpxchg(&mirror->cred, NULL, cred))
+ put_rpccred(cred);
}
}
return 0;
@@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
if (ds->ds_clp)
- goto out;
+ goto out_update_creds;
flavor = nfs4_ff_layout_choose_authflavor(mirror);
@@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
}
}
-
+out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
ds = NULL;
out:
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 9ac3846cb59e..a608ffd28acc 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -56,11 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
* This again causes shrink_dcache_for_umount_subtree() to
* Oops, since the test for IS_ROOT() will fail.
*/
- spin_lock(&sb->s_root->d_inode->i_lock);
+ spin_lock(&d_inode(sb->s_root)->i_lock);
spin_lock(&sb->s_root->d_lock);
hlist_del_init(&sb->s_root->d_u.d_alias);
spin_unlock(&sb->s_root->d_lock);
- spin_unlock(&sb->s_root->d_inode->i_lock);
+ spin_unlock(&d_inode(sb->s_root)->i_lock);
}
return 0;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d42dff6d5e98..0adc7d245b3d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode)
nfs_clear_inode(inode);
}
+int nfs_sync_inode(struct inode *inode)
+{
+ nfs_inode_dio_wait(inode);
+ return nfs_wb_all(inode);
+}
+EXPORT_SYMBOL_GPL(nfs_sync_inode);
+
/**
* nfs_sync_mapping - helper to flush all mmapped dirty data to disk
*/
@@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode)
nfs_zap_caches_locked(inode);
spin_unlock(&inode->i_lock);
}
-EXPORT_SYMBOL_GPL(nfs_zap_caches);
void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
{
@@ -436,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
inode->i_version = fattr->change_attr;
- else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
+ else
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE);
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
else
@@ -495,7 +502,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget);
int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
int error = -ENOMEM;
@@ -525,10 +532,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
trace_nfs_setattr_enter(inode);
/* Write all dirty data */
- if (S_ISREG(inode->i_mode)) {
- nfs_inode_dio_wait(inode);
- nfs_wb_all(inode);
- }
+ if (S_ISREG(inode->i_mode))
+ nfs_sync_inode(inode);
fattr = nfs_alloc_fattr();
if (fattr == NULL)
@@ -621,7 +626,7 @@ static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
struct dentry *parent;
parent = dget_parent(dentry);
- nfs_force_use_readdirplus(parent->d_inode);
+ nfs_force_use_readdirplus(d_inode(parent));
dput(parent);
}
@@ -637,15 +642,16 @@ static bool nfs_need_revalidate_inode(struct inode *inode)
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- nfs_inode_dio_wait(inode);
- err = filemap_write_and_wait(inode->i_mapping);
+ mutex_lock(&inode->i_mutex);
+ err = nfs_sync_inode(inode);
+ mutex_unlock(&inode->i_mutex);
if (err)
goto out;
}
@@ -673,6 +679,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
+ if (S_ISDIR(inode->i_mode))
+ stat->blksize = NFS_SERVER(inode)->dtsize;
}
out:
trace_nfs_getattr_exit(inode, err);
@@ -708,7 +716,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context
struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
{
struct nfs_lock_context *res, *new = NULL;
- struct inode *inode = ctx->dentry->d_inode;
+ struct inode *inode = d_inode(ctx->dentry);
spin_lock(&inode->i_lock);
res = __nfs_find_lock_context(ctx);
@@ -736,7 +744,7 @@ EXPORT_SYMBOL_GPL(nfs_get_lock_context);
void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
{
struct nfs_open_context *ctx = l_ctx->open_context;
- struct inode *inode = ctx->dentry->d_inode;
+ struct inode *inode = d_inode(ctx->dentry);
if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
return;
@@ -763,7 +771,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
return;
if (!is_sync)
return;
- inode = ctx->dentry->d_inode;
+ inode = d_inode(ctx->dentry);
if (!list_empty(&NFS_I(inode)->open_files))
return;
server = NFS_SERVER(inode);
@@ -810,7 +818,7 @@ EXPORT_SYMBOL_GPL(get_nfs_open_context);
static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
{
- struct inode *inode = ctx->dentry->d_inode;
+ struct inode *inode = d_inode(ctx->dentry);
struct super_block *sb = ctx->dentry->d_sb;
if (!list_empty(&ctx->list)) {
@@ -842,7 +850,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
*/
void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
{
- struct inode *inode = ctx->dentry->d_inode;
+ struct inode *inode = d_inode(ctx->dentry);
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
@@ -885,7 +893,7 @@ static void nfs_file_clear_open_context(struct file *filp)
struct nfs_open_context *ctx = nfs_file_open_context(filp);
if (ctx) {
- struct inode *inode = ctx->dentry->d_inode;
+ struct inode *inode = d_inode(ctx->dentry);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
@@ -1237,9 +1245,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
cur_size = i_size_read(inode);
new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize && nfsi->nrequests == 0)
+ if (cur_size != new_isize)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
}
+ if (nfsi->nrequests != 0)
+ invalid &= ~NFS_INO_REVAL_PAGECACHE;
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1588,6 +1598,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
}
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc);
+
+static inline bool nfs_fileid_valid(struct nfs_inode *nfsi,
+ struct nfs_fattr *fattr)
+{
+ bool ret1 = true, ret2 = true;
+
+ if (fattr->valid & NFS_ATTR_FATTR_FILEID)
+ ret1 = (nfsi->fileid == fattr->fileid);
+ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
+ ret2 = (nfsi->fileid == fattr->mounted_on_fileid);
+ return ret1 || ret2;
+}
+
/*
* Many nfs protocol calls return the new file attributes after
* an operation. Here we update the inode to reflect the state
@@ -1614,7 +1637,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_display_fhandle_hash(NFS_FH(inode)),
atomic_read(&inode->i_count), fattr->valid);
- if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) {
+ if (!nfs_fileid_valid(nfsi, fattr)) {
printk(KERN_ERR "NFS: server %s error: fileid changed\n"
"fsid %s: expected fileid 0x%Lx, got 0x%Lx\n",
NFS_SERVER(inode)->nfs_client->cl_hostname,
@@ -1664,13 +1687,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL
- | NFS_INO_REVAL_PAGECACHE;
+ | NFS_INO_INVALID_ACL;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else if (server->caps & NFS_CAP_CHANGE_ATTR)
+ } else
nfsi->cache_validity |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
@@ -1697,7 +1719,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
- invalid &= ~NFS_INO_REVAL_PAGECACHE;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
@@ -1819,7 +1840,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
struct inode *nfs_alloc_inode(struct super_block *sb)
{
struct nfs_inode *nfsi;
- nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+ nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
if (!nfsi)
return NULL;
nfsi->flags = 0UL;
@@ -1990,17 +2011,15 @@ static int __init init_nfs_fs(void)
if (err)
goto out1;
-#ifdef CONFIG_PROC_FS
rpc_proc_register(&init_net, &nfs_rpcstat);
-#endif
- if ((err = register_nfs_fs()) != 0)
+
+ err = register_nfs_fs();
+ if (err)
goto out0;
return 0;
out0:
-#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
-#endif
nfs_destroy_directcache();
out1:
nfs_destroy_writepagecache();
@@ -2031,9 +2050,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
-#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
-#endif
unregister_nfs_fs();
nfs_fs_proc_exit();
nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e6475bc5ba2..9b372b845f6a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -296,6 +296,22 @@ extern struct rpc_procinfo nfs4_procedures[];
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+ if (!dst || !src)
+ return NULL;
+
+ if (src->len > NFS4_MAXLABELLEN)
+ return NULL;
+
+ dst->lfs = src->lfs;
+ dst->pi = src->pi;
+ dst->len = src->len;
+ memcpy(dst->label, src->label, src->len);
+
+ return dst;
+}
static inline void nfs4_label_free(struct nfs4_label *label)
{
if (label) {
@@ -316,6 +332,11 @@ static inline void nfs4_label_free(void *label) {}
static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
{
}
+static inline struct nfs4_label *
+nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
+{
+ return NULL;
+}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
/* proc.c */
@@ -607,7 +628,7 @@ void nfs_mark_page_unstable(struct page *page)
struct inode *inode = page_file_mapping(page)->host;
inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index b5a0afc3ee10..c8162c660c44 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -139,7 +139,7 @@ EXPORT_SYMBOL_GPL(nfs_path);
struct vfsmount *nfs_d_automount(struct path *path)
{
struct vfsmount *mnt;
- struct nfs_server *server = NFS_SERVER(path->dentry->d_inode);
+ struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
struct nfs_fh *fh = NULL;
struct nfs_fattr *fattr = NULL;
@@ -180,16 +180,16 @@ out_nofree:
static int
nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- if (NFS_FH(dentry->d_inode)->size != 0)
+ if (NFS_FH(d_inode(dentry))->size != 0)
return nfs_getattr(mnt, dentry, stat);
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
return 0;
}
static int
nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
{
- if (NFS_FH(dentry->d_inode)->size != 0)
+ if (NFS_FH(d_inode(dentry))->size != 0)
return nfs_setattr(dentry, attr);
return -EACCES;
}
@@ -279,7 +279,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
struct dentry *parent = dget_parent(dentry);
/* Look it up again to get its attributes */
- err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
+ err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL);
dput(parent);
if (err != 0)
return ERR_PTR(err);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 658e586ca438..1ebe2fc7cda2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -279,7 +279,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
ssize_t
nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
ssize_t result = 0;
int error;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1f11d2533ee4..cb28cceefebe 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -120,7 +120,7 @@ static int
nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct nfs3_sattrargs arg = {
.fh = NFS_FH(inode),
.sattr = sattr,
@@ -386,13 +386,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
* not sure this buys us anything (and I'd have
* to revamp the NFSv3 XDR code) */
status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
- nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
+ nfs_post_op_update_inode(d_inode(dentry), data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
if (status != 0)
goto out_release_acls;
}
- status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
out_release_acls:
posix_acl_release(acl);
@@ -570,7 +570,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
if (status != 0)
goto out_release_acls;
- status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
out_release_acls:
posix_acl_release(acl);
@@ -623,7 +623,7 @@ static int
nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
u64 cookie, struct page **pages, unsigned int count, int plus)
{
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
__be32 *verf = NFS_I(dir)->cookieverf;
struct nfs3_readdirargs arg = {
.fh = NFS_FH(dir),
@@ -715,7 +715,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
if (status != 0)
goto out_release_acls;
- status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+ status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
out_release_acls:
posix_acl_release(acl);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 53852a4bd88b..9b04c2e6fffc 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
if (args->npages != 0)
xdr_write_pages(xdr, args->pages, 0, args->len);
else
- xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
+ xdr_reserve_space(xdr, args->len);
error = nfsacl_encode(xdr->buf, base, args->inode,
(args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 7afb8947dfdf..ff66ae700b89 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -5,11 +5,18 @@
#ifndef __LINUX_FS_NFS_NFS4_2_H
#define __LINUX_FS_NFS_NFS4_2_H
+/*
+ * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
/* nfs4.2proc.c */
int nfs42_proc_allocate(struct file *, loff_t, loff_t);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+ struct nfs42_layoutstat_data *);
/* nfs4.2xdr.h */
extern struct rpc_procinfo nfs4_2_procedures[];
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index cb170722769c..d731bbf974aa 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -10,6 +10,11 @@
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
#include "nfs42.h"
+#include "iostat.h"
+#include "pnfs.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
fmode_t fmode)
@@ -36,13 +41,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(filep);
+ struct nfs_server *server = NFS_SERVER(inode);
struct nfs42_falloc_args args = {
.falloc_fh = NFS_FH(inode),
.falloc_offset = offset,
.falloc_length = len,
+ .falloc_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_falloc_res res = {
+ .falloc_server = server,
};
- struct nfs42_falloc_res res;
- struct nfs_server *server = NFS_SERVER(inode);
int status;
msg->rpc_argp = &args;
@@ -52,8 +60,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
if (status)
return status;
- return nfs4_call_sync(server->client, server, msg,
- &args.seq_args, &res.seq_res, 0);
+ res.falloc_fattr = nfs_alloc_fattr();
+ if (!res.falloc_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(inode, res.falloc_fattr);
+
+ kfree(res.falloc_fattr);
+ return status;
}
static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
@@ -84,9 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
return -EOPNOTSUPP;
+ mutex_lock(&inode->i_mutex);
+
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+
+ mutex_unlock(&inode->i_mutex);
return err;
}
@@ -101,13 +122,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
return -EOPNOTSUPP;
+ nfs_wb_all(inode);
+ mutex_lock(&inode->i_mutex);
+
err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == 0)
+ truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+
+ mutex_unlock(&inode->i_mutex);
return err;
}
-loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct inode *inode = file_inode(filep);
struct nfs42_seek_args args = {
@@ -142,3 +170,102 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}
+
+loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(filep));
+ struct nfs4_exception exception = { };
+ int err;
+
+ do {
+ err = _nfs42_proc_llseek(filep, offset, whence);
+ if (err == -ENOTSUPP)
+ return -EOPNOTSUPP;
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+}
+
+
+static void
+nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+ struct nfs_server *server = NFS_SERVER(data->args.inode);
+
+ nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
+ &data->res.seq_res, task);
+}
+
+static void
+nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+
+ if (!nfs4_sequence_done(task, &data->res.seq_res))
+ return;
+
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ default:
+ dprintk("%s server returns %d\n", __func__, task->tk_status);
+ }
+}
+
+static void
+nfs42_layoutstat_release(void *calldata)
+{
+ struct nfs42_layoutstat_data *data = calldata;
+ struct nfs_server *nfss = NFS_SERVER(data->args.inode);
+
+ if (nfss->pnfs_curr_ld->cleanup_layoutstats)
+ nfss->pnfs_curr_ld->cleanup_layoutstats(data);
+
+ pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+ smp_mb__after_atomic();
+ nfs_iput_and_deactive(data->inode);
+ kfree(data->args.devinfo);
+ kfree(data);
+}
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+ .rpc_call_prepare = nfs42_layoutstat_prepare,
+ .rpc_call_done = nfs42_layoutstat_done,
+ .rpc_release = nfs42_layoutstat_release,
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+ struct nfs42_layoutstat_data *data)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ };
+ struct rpc_task_setup task_setup = {
+ .rpc_client = server->client,
+ .rpc_message = &msg,
+ .callback_ops = &nfs42_layoutstat_ops,
+ .callback_data = data,
+ .flags = RPC_TASK_ASYNC,
+ };
+ struct rpc_task *task;
+
+ data->inode = nfs_igrab_and_active(data->args.inode);
+ if (!data->inode) {
+ nfs42_layoutstat_release(data);
+ return -EAGAIN;
+ }
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+ task = rpc_run_task(&task_setup);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ return 0;
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 038a7e1521fa..a6bd27da6286 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,8 @@
#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
#define __LINUX_FS_NFS_NFS4_2XDR_H
+#include "nfs42.h"
+
#define encode_fallocate_maxsz (encode_stateid_maxsz + \
2 /* offset */ + \
2 /* length */)
@@ -22,25 +24,47 @@
1 /* whence */ + \
2 /* offset */ + \
2 /* length */)
+#define encode_io_info_maxsz 4
+#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \
+ 2 /* offset */ + \
+ 2 /* length */ + \
+ encode_stateid_maxsz + \
+ encode_io_info_maxsz + \
+ encode_io_info_maxsz + \
+ 1 /* opaque devaddr4 length */ + \
+ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- encode_allocate_maxsz)
+ encode_allocate_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- decode_allocate_maxsz)
+ decode_allocate_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
- encode_deallocate_maxsz)
+ encode_deallocate_maxsz + \
+ encode_getattr_maxsz)
#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
- decode_deallocate_maxsz)
+ decode_deallocate_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_seek_maxsz)
#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
@@ -77,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
encode_uint32(xdr, args->sa_what);
}
+static void encode_layoutstats(struct xdr_stream *xdr,
+ struct nfs42_layoutstat_args *args,
+ struct nfs42_layoutstat_devinfo *devinfo,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+ p = reserve_space(xdr, 8 + 8);
+ p = xdr_encode_hyper(p, devinfo->offset);
+ p = xdr_encode_hyper(p, devinfo->length);
+ encode_nfs4_stateid(xdr, &args->stateid);
+ p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+ p = xdr_encode_hyper(p, devinfo->read_count);
+ p = xdr_encode_hyper(p, devinfo->read_bytes);
+ p = xdr_encode_hyper(p, devinfo->write_count);
+ p = xdr_encode_hyper(p, devinfo->write_bytes);
+ p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+ NFS4_DEVICEID4_SIZE);
+ /* Encode layoutupdate4 */
+ *p++ = cpu_to_be32(devinfo->layout_type);
+ if (devinfo->layoutstats_encode != NULL)
+ devinfo->layoutstats_encode(xdr, args, devinfo);
+ else
+ encode_uint32(xdr, 0);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -92,6 +143,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->falloc_fh, &hdr);
encode_allocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
encode_nops(&hdr);
}
@@ -110,6 +162,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->falloc_fh, &hdr);
encode_deallocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
encode_nops(&hdr);
}
@@ -131,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_layoutstat_args *args)
+{
+ int i;
+
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+ for (i = 0; i < args->num_dev; i++)
+ encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -163,6 +238,12 @@ out_overflow:
return -EIO;
}
+static int decode_layoutstats(struct xdr_stream *xdr,
+ struct nfs42_layoutstat_res *res)
+{
+ return decode_op_hdr(xdr, OP_LAYOUTSTATS);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -183,6 +264,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_allocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
out:
return status;
}
@@ -207,6 +291,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
if (status)
goto out;
status = decode_deallocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
out:
return status;
}
@@ -234,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
out:
return status;
}
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_layoutstat_res *res)
+{
+ struct compound_hdr hdr;
+ int status, i;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+ for (i = 0; i < res->num_dev; i++) {
+ status = decode_layoutstats(xdr, res);
+ if (status)
+ goto out;
+ }
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fdef424b0cd3..ea3bee919a76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
struct rpc_message *, struct nfs4_sequence_args *,
struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 86d6214ea022..3aa6a9ba5113 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -4,7 +4,6 @@
*/
#include <linux/module.h>
#include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
#include <linux/nfs_mount.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/auth.h>
@@ -15,6 +14,7 @@
#include "callback.h"
#include "delegation.h"
#include "nfs4session.h"
+#include "nfs4idmap.h"
#include "pnfs.h"
#include "netns.h"
@@ -676,7 +676,6 @@ found:
break;
}
- /* No matching nfs_client found. */
spin_unlock(&nn->nfs_client_lock);
dprintk("NFS: <-- %s status = %d\n", __func__, status);
nfs_put_client(prev);
@@ -1130,7 +1129,7 @@ error:
*/
static int nfs_probe_destination(struct nfs_server *server)
{
- struct inode *inode = server->super->s_root->d_inode;
+ struct inode *inode = d_inode(server->super->s_root);
struct nfs_fattr *fattr;
int error;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 8b46389c4c5b..dcd39d4e2efe 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -10,6 +10,8 @@
#include "fscache.h"
#include "pnfs.h"
+#include "nfstrace.h"
+
#ifdef CONFIG_NFS_V4_2
#include "nfs42.h"
#endif
@@ -39,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
dprintk("NFS: open file(%pd2)\n", dentry);
+ err = nfs_check_flags(openflags);
+ if (err)
+ return err;
+
if ((openflags & O_ACCMODE) == 3)
openflags--;
@@ -46,7 +52,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
openflags &= ~(O_CREAT|O_EXCL);
parent = dget_parent(dentry);
- dir = parent->d_inode;
+ dir = d_inode(parent);
ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
err = PTR_ERR(ctx);
@@ -57,7 +63,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (openflags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
attr.ia_size = 0;
- nfs_wb_all(inode);
+ nfs_sync_inode(inode);
}
inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
@@ -74,7 +80,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
goto out_drop;
}
}
- if (inode != dentry->d_inode)
+ if (inode != d_inode(dentry))
goto out_drop;
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
@@ -100,6 +106,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int ret;
struct inode *inode = file_inode(file);
+ trace_nfs_fsync_enter(inode);
+
+ nfs_inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
@@ -107,7 +116,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
mutex_lock(&inode->i_mutex);
ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret)
- ret = pnfs_layoutcommit_inode(inode, true);
+ ret = pnfs_sync_inode(inode, !!datasync);
mutex_unlock(&inode->i_mutex);
/*
* If nfs_file_fsync_commit detected a server reboot, then
@@ -118,6 +127,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
end = LLONG_MAX;
} while (ret == -EAGAIN);
+ trace_nfs_fsync_exit(inode, ret);
return ret;
}
@@ -152,15 +162,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
if (ret < 0)
return ret;
- mutex_lock(&inode->i_mutex);
if (mode & FALLOC_FL_PUNCH_HOLE)
- ret = nfs42_proc_deallocate(filep, offset, len);
- else
- ret = nfs42_proc_allocate(filep, offset, len);
- mutex_unlock(&inode->i_mutex);
-
- nfs_zap_caches(inode);
- return ret;
+ return nfs42_proc_deallocate(filep, offset, len);
+ return nfs42_proc_allocate(filep, offset, len);
}
#endif /* CONFIG_NFS_V4_2 */
@@ -170,8 +174,6 @@ const struct file_operations nfs4_file_operations = {
#else
.llseek = nfs_file_llseek,
#endif
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index c0b3a16b4a00..039b3eb6d834 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
goto out;
}
- if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
- printk(KERN_ERR "nfs4_get_rootfh:"
- " getroot obtained referral\n");
- ret = -EREMOTE;
- goto out;
- }
-
memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
out:
nfs_free_fattr(fsinfo.fattr);
diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c
index 857e2a99acc8..535dfc69c628 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -36,7 +36,6 @@
#include <linux/types.h>
#include <linux/parser.h>
#include <linux/fs.h>
-#include <linux/nfs_idmap.h>
#include <net/net_namespace.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/nfs_fs.h>
@@ -49,6 +48,7 @@
#include "internal.h"
#include "netns.h"
+#include "nfs4idmap.h"
#include "nfs4trace.h"
#define NFS_UINT_MAXLEN 11
@@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
int nfs_idmap_init(void)
{
- int ret;
- ret = nfs_idmap_init_keyring();
- if (ret != 0)
- goto out;
-out:
- return ret;
+ return nfs_idmap_init_keyring();
}
void nfs_idmap_quit(void)
diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h
new file mode 100644
index 000000000000..de44d7330ab3
--- /dev/null
+++ b/fs/nfs/nfs4idmap.h
@@ -0,0 +1,68 @@
+/*
+ * fs/nfs/nfs4idmap.h
+ *
+ * UID and GID to name mapping for clients.
+ *
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Marius Aamodt Eriksen <marius@umich.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef NFS_IDMAP_H
+#define NFS_IDMAP_H
+
+#include <linux/uidgid.h>
+#include <uapi/linux/nfs_idmap.h>
+
+
+/* Forward declaration to make this header independent of others */
+struct nfs_client;
+struct nfs_server;
+struct nfs_fattr;
+struct nfs4_string;
+
+int nfs_idmap_init(void);
+void nfs_idmap_quit(void);
+int nfs_idmap_new(struct nfs_client *);
+void nfs_idmap_delete(struct nfs_client *);
+
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+ struct nfs4_string *owner_name,
+ struct nfs4_string *group_name);
+void nfs_fattr_free_names(struct nfs_fattr *);
+void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *);
+
+int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *);
+int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *);
+int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t);
+int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t);
+
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res);
+
+extern unsigned int nfs_idmap_cache_timeout;
+#endif /* NFS_IDMAP_H */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3d83cb1fdc70..f592672373cb 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -375,7 +375,7 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry *
dprintk("%s: getting locations for %pd2\n",
__func__, dentry);
- err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page);
+ err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page);
dput(parent);
if (err != 0 ||
fs_locations->nlocations <= 0 ||
@@ -396,7 +396,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
{
rpc_authflavor_t flavor = server->client->cl_auth->au_flavor;
struct dentry *parent = dget_parent(dentry);
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct qstr *name = &dentry->d_name;
struct rpc_clnt *client;
struct vfsmount *mnt;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 627f37c44456..3acb1eb72930 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -38,6 +38,7 @@
#include <linux/mm.h>
#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/file.h>
#include <linux/string.h>
#include <linux/ratelimit.h>
#include <linux/printk.h>
@@ -51,7 +52,6 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/module.h>
-#include <linux/nfs_idmap.h>
#include <linux/xattr.h>
#include <linux/utsname.h>
#include <linux/freezer.h>
@@ -63,6 +63,7 @@
#include "callback.h"
#include "pnfs.h"
#include "netns.h"
+#include "nfs4idmap.h"
#include "nfs4session.h"
#include "fscache.h"
@@ -185,7 +186,8 @@ const u32 nfs4_fattr_bitmap[3] = {
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
| FATTR4_WORD1_TIME_METADATA
- | FATTR4_WORD1_TIME_MODIFY,
+ | FATTR4_WORD1_TIME_MODIFY
+ | FATTR4_WORD1_MOUNTED_ON_FILEID,
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
FATTR4_WORD2_SECURITY_LABEL
#endif
@@ -293,7 +295,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
*p++ = xdr_one; /* bitmap length */
*p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */
*p++ = htonl(8); /* attribute buffer length */
- p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode));
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry)));
}
*p++ = xdr_one; /* next */
@@ -305,7 +307,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
*p++ = xdr_one; /* bitmap length */
*p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */
*p++ = htonl(8); /* attribute buffer length */
- p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode));
+ p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent)));
readdir->pgbase = (char *)p - (char *)start;
readdir->count -= readdir->pgbase;
@@ -354,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case 0:
return 0;
case -NFS4ERR_OPENMODE:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
nfs4_inode_return_delegation(inode);
exception->retry = 1;
@@ -365,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
if (ret < 0)
break;
goto wait_on_recovery;
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- ret = nfs4_schedule_stateid_recovery(server, state);
- if (ret < 0)
- break;
- goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
if (state != NULL) {
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -471,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
{
- do_renew_lease(server->nfs_client, timestamp);
+ struct nfs_client *clp = server->nfs_client;
+
+ if (!nfs4_has_session(clp))
+ do_renew_lease(clp, timestamp);
}
struct nfs4_call_sync_data {
@@ -480,8 +479,8 @@ struct nfs4_call_sync_data {
struct nfs4_sequence_res *seq_res;
};
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res, int cache_reply)
{
args->sa_slot = NULL;
args->sa_cache_this = cache_reply;
@@ -620,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
- if (res->sr_status_flags != 0)
- nfs4_schedule_lease_recovery(clp);
+ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
nfs41_update_target_slotid(slot->table, slot, res);
break;
case 1:
@@ -914,6 +912,7 @@ struct nfs4_opendata {
struct nfs_open_confirmres c_res;
struct nfs4_string owner_name;
struct nfs4_string group_name;
+ struct nfs4_label *a_label;
struct nfs_fattr f_attr;
struct nfs4_label *f_label;
struct dentry *dir;
@@ -1004,7 +1003,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
gfp_t gfp_mask)
{
struct dentry *parent = dget_parent(dentry);
- struct inode *dir = parent->d_inode;
+ struct inode *dir = d_inode(parent);
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
struct nfs4_opendata *p;
@@ -1017,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (IS_ERR(p->f_label))
goto err_free_p;
+ p->a_label = nfs4_label_alloc(server, gfp_mask);
+ if (IS_ERR(p->a_label))
+ goto err_free_f;
+
alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
if (IS_ERR(p->o_arg.seqid))
@@ -1045,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
p->o_arg.server = server;
p->o_arg.bitmask = nfs4_bitmask(server, label);
p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
- p->o_arg.label = label;
+ p->o_arg.label = nfs4_label_copy(p->a_label, label);
p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
switch (p->o_arg.claim) {
case NFS4_OPEN_CLAIM_NULL:
@@ -1057,7 +1060,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
- p->o_arg.fh = NFS_FH(dentry->d_inode);
+ p->o_arg.fh = NFS_FH(d_inode(dentry));
}
if (attrs != NULL && attrs->ia_valid != 0) {
__u32 verf[2];
@@ -1078,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
return p;
err_free_label:
+ nfs4_label_free(p->a_label);
+err_free_f:
nfs4_label_free(p->f_label);
err_free_p:
kfree(p);
@@ -1097,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
+ nfs4_label_free(p->a_label);
nfs4_label_free(p->f_label);
dput(p->dir);
@@ -1202,12 +1208,15 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
{
+ if (!(state->n_wronly || state->n_rdonly || state->n_rdwr))
+ return;
if (state->n_wronly)
set_bit(NFS_O_WRONLY_STATE, &state->flags);
if (state->n_rdonly)
set_bit(NFS_O_RDONLY_STATE, &state->flags);
if (state->n_rdwr)
set_bit(NFS_O_RDWR_STATE, &state->flags);
+ set_bit(NFS_OPEN_STATE, &state->flags);
}
static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
@@ -1551,6 +1560,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
struct nfs4_state *newstate;
int ret;
+ if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
+ opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
+ (opendata->o_arg.u.delegation_type & fmode) != fmode)
+ /* This mode can't have been delegated, so we must have
+ * a valid open_stateid to cover it - not need to reclaim.
+ */
+ return 0;
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@@ -1682,6 +1698,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
"%d.\n", __func__, err);
case 0:
case -ENOENT:
+ case -EAGAIN:
case -ESTALE:
break;
case -NFS4ERR_BADSESSION:
@@ -1794,7 +1811,7 @@ static const struct rpc_call_ops nfs4_open_confirm_ops = {
*/
static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
{
- struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
+ struct nfs_server *server = NFS_SERVER(d_inode(data->dir));
struct rpc_task *task;
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM],
@@ -1951,7 +1968,7 @@ static const struct rpc_call_ops nfs4_open_ops = {
static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
{
- struct inode *dir = data->dir->d_inode;
+ struct inode *dir = d_inode(data->dir);
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_openargs *o_arg = &data->o_arg;
struct nfs_openres *o_res = &data->o_res;
@@ -1998,7 +2015,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
{
- struct inode *dir = data->dir->d_inode;
+ struct inode *dir = d_inode(data->dir);
struct nfs_openres *o_res = &data->o_res;
int status;
@@ -2067,7 +2084,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
*/
static int _nfs4_proc_open(struct nfs4_opendata *data)
{
- struct inode *dir = data->dir->d_inode;
+ struct inode *dir = d_inode(data->dir);
struct nfs_server *server = NFS_SERVER(dir);
struct nfs_openargs *o_arg = &data->o_arg;
struct nfs_openres *o_res = &data->o_res;
@@ -2314,7 +2331,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
dentry = opendata->dentry;
- if (dentry->d_inode == NULL) {
+ if (d_really_is_negative(dentry)) {
/* FIXME: Is this d_drop() ever needed? */
d_drop(dentry);
dentry = d_add_unique(dentry, igrab(state->inode));
@@ -2325,7 +2342,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
ctx->dentry = dget(dentry);
}
nfs_set_verifier(dentry,
- nfs_save_change_attribute(opendata->dir->d_inode));
+ nfs_save_change_attribute(d_inode(opendata->dir)));
}
ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
@@ -2333,7 +2350,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
goto out;
ctx->state = state;
- if (dentry->d_inode == state->inode) {
+ if (d_inode(dentry) == state->inode) {
nfs_inode_attach_open_context(ctx);
if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
nfs4_schedule_stateid_recovery(server, state);
@@ -2374,10 +2391,10 @@ static int _nfs4_do_open(struct inode *dir,
status = nfs4_recover_expired_lease(server);
if (status != 0)
goto err_put_state_owner;
- if (dentry->d_inode != NULL)
- nfs4_return_incompatible_delegation(dentry->d_inode, fmode);
+ if (d_really_is_positive(dentry))
+ nfs4_return_incompatible_delegation(d_inode(dentry), fmode);
status = -ENOMEM;
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
claim = NFS4_OPEN_CLAIM_FH;
opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
label, claim, GFP_KERNEL);
@@ -2400,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir,
}
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
- if (dentry->d_inode != NULL)
- opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
+ if (d_really_is_positive(dentry))
+ opendata->state = nfs4_get_open_state(d_inode(dentry), sp);
status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
if (status != 0)
@@ -3095,16 +3112,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_fsinfo *info,
bool auth_probe)
{
- int status;
+ int status = 0;
- switch (auth_probe) {
- case false:
+ if (!auth_probe)
status = nfs4_lookup_root(server, fhandle, info);
- if (status != -NFS4ERR_WRONGSEC)
- break;
- default:
+
+ if (auth_probe || status == NFS4ERR_WRONGSEC)
status = nfs4_do_find_root_sec(server, fhandle, info);
- }
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
@@ -3254,7 +3268,7 @@ static int
nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
struct nfs4_label *label = NULL;
@@ -3356,6 +3370,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
goto out;
case -NFS4ERR_MOVED:
err = nfs4_get_referral(client, dir, name, fattr, fhandle);
+ if (err == -NFS4ERR_MOVED)
+ err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
goto out;
case -NFS4ERR_WRONGSEC:
err = -EPERM;
@@ -3871,13 +3887,13 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
u64 cookie, struct page **pages, unsigned int count, int plus)
{
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
struct nfs4_readdir_arg args = {
.fh = NFS_FH(dir),
.pages = pages,
.pgbase = 0,
.count = count,
- .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+ .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask,
.plus = plus,
};
struct nfs4_readdir_res res;
@@ -3914,8 +3930,8 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
do {
err = _nfs4_proc_readdir(dentry, cred, cookie,
pages, count, plus);
- trace_nfs4_readdir(dentry->d_inode, err);
- err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err,
+ trace_nfs4_readdir(d_inode(dentry), err);
+ err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err,
&exception);
} while (exception.retry);
return err;
@@ -4830,7 +4846,7 @@ nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
struct nfs4_label ilabel, *olabel = NULL;
struct nfs_fattr fattr;
struct rpc_cred *cred;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int status;
if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -4956,49 +4972,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
-static unsigned int
-nfs4_init_nonuniform_client_string(struct nfs_client *clp,
- char *buf, size_t len)
+static int
+nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
- unsigned int result;
+ int result;
+ size_t len;
+ char *str;
+ bool retried = false;
if (clp->cl_owner_id != NULL)
- return strlcpy(buf, clp->cl_owner_id, len);
+ return 0;
+retry:
+ rcu_read_lock();
+ len = 10 + strlen(clp->cl_ipaddr) + 1 +
+ strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
+ 1 +
+ strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
+ 1;
+ rcu_read_unlock();
+
+ if (len > NFS4_OPAQUE_LIMIT + 1)
+ return -EINVAL;
+
+ /*
+ * Since this string is allocated at mount time, and held until the
+ * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+ * about a memory-reclaim deadlock.
+ */
+ str = kmalloc(len, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
rcu_read_lock();
- result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
- clp->cl_ipaddr,
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR),
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_PROTO));
+ result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+ clp->cl_ipaddr,
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
+ rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
rcu_read_unlock();
- clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
- return result;
+
+ /* Did something change? */
+ if (result >= len) {
+ kfree(str);
+ if (retried)
+ return -EINVAL;
+ retried = true;
+ goto retry;
+ }
+ clp->cl_owner_id = str;
+ return 0;
}
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
- char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
- const char *nodename = clp->cl_rpcclient->cl_nodename;
- unsigned int result;
+ int result;
+ size_t len;
+ char *str;
+
+ len = 10 + 10 + 1 + 10 + 1 +
+ strlen(nfs4_client_id_uniquifier) + 1 +
+ strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+ if (len > NFS4_OPAQUE_LIMIT + 1)
+ return -EINVAL;
+
+ /*
+ * Since this string is allocated at mount time, and held until the
+ * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+ * about a memory-reclaim deadlock.
+ */
+ str = kmalloc(len, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+
+ result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ nfs4_client_id_uniquifier,
+ clp->cl_rpcclient->cl_nodename);
+ if (result >= len) {
+ kfree(str);
+ return -EINVAL;
+ }
+ clp->cl_owner_id = str;
+ return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
+{
+ int result;
+ size_t len;
+ char *str;
if (clp->cl_owner_id != NULL)
- return strlcpy(buf, clp->cl_owner_id, len);
+ return 0;
if (nfs4_client_id_uniquifier[0] != '\0')
- result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
- clp->rpc_ops->version,
- clp->cl_minorversion,
- nfs4_client_id_uniquifier,
- nodename);
- else
- result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
- clp->rpc_ops->version, clp->cl_minorversion,
- nodename);
- clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
- return result;
+ return nfs4_init_uniquifier_client_string(clp);
+
+ len = 10 + 10 + 1 + 10 + 1 +
+ strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+ if (len > NFS4_OPAQUE_LIMIT + 1)
+ return -EINVAL;
+
+ /*
+ * Since this string is allocated at mount time, and held until the
+ * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+ * about a memory-reclaim deadlock.
+ */
+ str = kmalloc(len, GFP_KERNEL);
+ if (!str)
+ return -ENOMEM;
+
+ result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+ clp->rpc_ops->version, clp->cl_minorversion,
+ clp->cl_rpcclient->cl_nodename);
+ if (result >= len) {
+ kfree(str);
+ return -EINVAL;
+ }
+ clp->cl_owner_id = str;
+ return 0;
}
/*
@@ -5045,7 +5140,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
struct nfs4_setclientid setclientid = {
.sc_verifier = &sc_verifier,
.sc_prog = program,
- .sc_cb_ident = clp->cl_cb_ident,
+ .sc_clnt = clp,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5065,16 +5160,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
/* nfs_client_id4 */
nfs4_init_boot_verifier(clp, &sc_verifier);
+
if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
- setclientid.sc_name_len =
- nfs4_init_uniform_client_string(clp,
- setclientid.sc_name,
- sizeof(setclientid.sc_name));
+ status = nfs4_init_uniform_client_string(clp);
else
- setclientid.sc_name_len =
- nfs4_init_nonuniform_client_string(clp,
- setclientid.sc_name,
- sizeof(setclientid.sc_name));
+ status = nfs4_init_nonuniform_client_string(clp);
+
+ if (status)
+ goto out;
+
/* cb_client4 */
setclientid.sc_netid_len =
nfs4_init_callback_netid(clp,
@@ -5084,9 +5178,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
sizeof(setclientid.sc_uaddr), "%s.%u.%u",
clp->cl_ipaddr, port >> 8, port & 255);
- dprintk("NFS call setclientid auth=%s, '%.*s'\n",
+ dprintk("NFS call setclientid auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
- setclientid.sc_name_len, setclientid.sc_name);
+ clp->cl_owner_id);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task)) {
status = PTR_ERR(task);
@@ -5358,15 +5452,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
return err;
}
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
+static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
{
int res = 0;
switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
case FL_POSIX:
- res = posix_lock_file_wait(file, fl);
+ res = posix_lock_inode_wait(inode, fl);
break;
case FL_FLOCK:
- res = flock_lock_file_wait(file, fl);
+ res = flock_lock_inode_wait(inode, fl);
break;
default:
BUG();
@@ -5426,7 +5520,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
renew_lease(calldata->server, calldata->timestamp);
- do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
+ do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
if (nfs4_update_lock_stateid(calldata->lsp,
&calldata->res.stateid))
break;
@@ -5534,7 +5628,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
mutex_lock(&sp->so_delegreturn_mutex);
/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
down_read(&nfsi->rwsem);
- if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+ if (do_vfs_lock(inode, request) == -ENOENT) {
up_read(&nfsi->rwsem);
mutex_unlock(&sp->so_delegreturn_mutex);
goto out;
@@ -5606,6 +5700,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->server = server;
atomic_inc(&lsp->ls_count);
p->ctx = get_nfs_open_context(ctx);
+ get_file(fl->fl_file);
memcpy(&p->fl, fl, sizeof(p->fl));
return p;
out_free_seqid:
@@ -5670,11 +5765,11 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
+ renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
data->timestamp);
if (data->arg.new_lock) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
- if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+ if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
rpc_restart_call_prepare(task);
break;
}
@@ -5718,6 +5813,7 @@ static void nfs4_lock_release(void *calldata)
nfs_free_seqid(data->arg.lock_seqid);
nfs4_put_lock_state(data->lsp);
put_nfs_open_context(data->ctx);
+ fput(data->fl.fl_file);
kfree(data);
dprintk("%s: done!\n", __func__);
}
@@ -5915,7 +6011,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
if (status != 0)
goto out;
request->fl_flags |= FL_ACCESS;
- status = do_vfs_lock(request->fl_file, request);
+ status = do_vfs_lock(state->inode, request);
if (status < 0)
goto out;
down_read(&nfsi->rwsem);
@@ -5923,7 +6019,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
/* Yes: cache locks! */
/* ...but avoid races with delegation recall... */
request->fl_flags = fl_flags & ~FL_SLEEP;
- status = do_vfs_lock(request->fl_file, request);
+ status = do_vfs_lock(state->inode, request);
up_read(&nfsi->rwsem);
goto out;
}
@@ -6112,7 +6208,7 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
if (strcmp(key, "") != 0)
return -EINVAL;
- return nfs4_proc_set_acl(dentry->d_inode, buf, buflen);
+ return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
@@ -6121,7 +6217,7 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
if (strcmp(key, "") != 0)
return -EINVAL;
- return nfs4_proc_get_acl(dentry->d_inode, buf, buflen);
+ return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
@@ -6130,7 +6226,7 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
{
size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
- if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode)))
+ if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
return 0;
if (list && len <= list_len)
@@ -6158,7 +6254,7 @@ static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
void *buf, size_t buflen, int type)
{
if (security_ismaclabel(key))
- return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+ return nfs4_get_security_label(d_inode(dentry), buf, buflen);
return -EOPNOTSUPP;
}
@@ -6168,10 +6264,10 @@ static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
{
size_t len = 0;
- if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
- len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+ if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
if (list && len <= list_len)
- security_inode_listsecurity(dentry->d_inode, list, len);
+ security_inode_listsecurity(d_inode(dentry), list, len);
}
return len;
}
@@ -6845,11 +6941,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
};
nfs4_init_boot_verifier(clp, &verifier);
- args.id_len = nfs4_init_uniform_client_string(clp, args.id,
- sizeof(args.id));
- dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
+
+ status = nfs4_init_uniform_client_string(clp);
+ if (status)
+ goto out;
+
+ dprintk("NFS call exchange_id auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
- args.id_len, args.id);
+ clp->cl_owner_id);
res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
GFP_NOFS);
@@ -6884,7 +6983,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
/* unsupported! */
WARN_ON_ONCE(1);
status = -EINVAL;
- goto out_server_scope;
+ goto out_impl_id;
}
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@@ -6912,6 +7011,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
/* use the most recent implementation id */
kfree(clp->cl_implid);
clp->cl_implid = res.impl_id;
+ res.impl_id = NULL;
if (clp->cl_serverscope != NULL &&
!nfs41_same_server_scope(clp->cl_serverscope,
@@ -6925,15 +7025,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (clp->cl_serverscope == NULL) {
clp->cl_serverscope = res.server_scope;
- goto out;
+ res.server_scope = NULL;
}
- } else
- kfree(res.impl_id);
+ }
-out_server_owner:
- kfree(res.server_owner);
+out_impl_id:
+ kfree(res.impl_id);
out_server_scope:
kfree(res.server_scope);
+out_server_owner:
+ kfree(res.server_owner);
out:
if (clp->cl_implid != NULL)
dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7483,13 +7584,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
goto out;
}
ret = rpc_wait_for_completion_task(task);
- if (!ret) {
- struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
-
- if (task->tk_status == 0)
- nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+ if (!ret)
ret = task->tk_status;
- }
rpc_put_task(task);
out:
dprintk("<-- %s status=%d\n", __func__, ret);
@@ -7877,16 +7973,17 @@ static void nfs4_layoutreturn_release(void *calldata)
{
struct nfs4_layoutreturn *lrp = calldata;
struct pnfs_layout_hdr *lo = lrp->args.layout;
+ LIST_HEAD(freeme);
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
- rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
kfree(calldata);
@@ -7944,6 +8041,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
{
struct nfs4_getdeviceinfo_args args = {
.pdev = pdev,
+ .notify_types = NOTIFY_DEVICEID4_CHANGE |
+ NOTIFY_DEVICEID4_DELETE,
};
struct nfs4_getdeviceinfo_res res = {
.pdev = pdev,
@@ -7958,6 +8057,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
dprintk("--> %s\n", __func__);
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
+ if (res.notification & ~args.notify_types)
+ dprintk("%s: unsupported notification\n", __func__);
+ if (res.notification != args.notify_types)
+ pdev->nocache = 1;
+
dprintk("<-- %s status=%d\n", __func__, status);
return status;
@@ -8053,9 +8157,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
struct rpc_task *task;
int status = 0;
- dprintk("NFS: %4d initiating layoutcommit call. sync %d "
- "lbw: %llu inode %lu\n",
- data->task.tk_pid, sync,
+ dprintk("NFS: initiating layoutcommit call. sync %d "
+ "lbw: %llu inode %lu\n", sync,
data->args.lastbytewritten,
data->args.inode->i_ino);
@@ -8494,7 +8597,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.minor_version = 0,
.init_caps = NFS_CAP_READDIRPLUS
| NFS_CAP_ATOMIC_OPEN
- | NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK,
.init_client = nfs40_init_client,
.shutdown_client = nfs40_shutdown_client,
@@ -8520,7 +8622,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.minor_version = 1,
.init_caps = NFS_CAP_READDIRPLUS
| NFS_CAP_ATOMIC_OPEN
- | NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1,
@@ -8543,13 +8644,13 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.minor_version = 2,
.init_caps = NFS_CAP_READDIRPLUS
| NFS_CAP_ATOMIC_OPEN
- | NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
- | NFS_CAP_SEEK,
+ | NFS_CAP_SEEK
+ | NFS_CAP_LAYOUTSTATS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f95e3b58bbc3..f2e2ad894461 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -42,7 +42,6 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
@@ -57,6 +56,7 @@
#include "callback.h"
#include "delegation.h"
#include "internal.h"
+#include "nfs4idmap.h"
#include "nfs4session.h"
#include "pnfs.h"
#include "netns.h"
@@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
goto do_confirm;
- nfs4_begin_drain_session(clp);
status = nfs4_proc_exchange_id(clp, cred);
if (status != 0)
goto out;
@@ -1482,6 +1481,8 @@ restart:
spin_unlock(&state->state_lock);
}
nfs4_put_open_state(state);
+ clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+ &state->flags);
spin_lock(&sp->so_lock);
goto restart;
}
@@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
clp->cl_mvops->reboot_recovery_ops;
int status;
+ nfs4_begin_drain_session(clp);
cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
return -ENOENT;
@@ -1902,7 +1904,7 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
goto out;
}
- inode = server->super->s_root->d_inode;
+ inode = d_inode(server->super->s_root);
result = nfs4_proc_get_locations(inode, locations, page, cred);
if (result) {
dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
@@ -2021,7 +2023,7 @@ restart:
rcu_read_unlock();
- inode = server->super->s_root->d_inode;
+ inode = d_inode(server->super->s_root);
status = nfs4_proc_fsid_present(inode, cred);
if (status != -NFS4ERR_MOVED)
goto restart; /* wasn't this one */
@@ -2189,25 +2191,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
}
}
-static void nfs41_handle_state_revoked(struct nfs_client *clp)
+static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
{
nfs4_reset_all_state(clp);
dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
}
+static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
+{
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+ nfs4_schedule_state_manager(clp);
+
+ dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
+}
+
static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp)
{
- /* This will need to handle layouts too */
- nfs_expire_all_delegations(clp);
+ /* FIXME: For now, we destroy all layouts. */
+ pnfs_destroy_all_layouts(clp);
+ /* FIXME: For now, we test all delegations+open state+locks. */
+ nfs41_handle_some_state_revoked(clp);
dprintk("%s: Recallable state revoked on server %s!\n", __func__,
clp->cl_hostname);
}
static void nfs41_handle_backchannel_fault(struct nfs_client *clp)
{
- nfs_expire_all_delegations(clp);
- if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
- nfs4_schedule_state_manager(clp);
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+
dprintk("%s: server %s declared a backchannel fault\n", __func__,
clp->cl_hostname);
}
@@ -2229,10 +2241,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
nfs41_handle_server_reboot(clp);
- if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
- SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
+ if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED))
+ nfs41_handle_all_state_revoked(clp);
+ if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
SEQ4_STATUS_ADMIN_STATE_REVOKED))
- nfs41_handle_state_revoked(clp);
+ nfs41_handle_some_state_revoked(clp);
if (flags & SEQ4_STATUS_LEASE_MOVED)
nfs4_schedule_lease_moved_recovery(clp);
if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 75090feeafad..6fb7cb6b3f4b 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -3,12 +3,12 @@
*/
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/nfs_idmap.h>
#include <linux/nfs4_mount.h>
#include <linux/nfs_fs.h>
#include "delegation.h"
#include "internal.h"
#include "nfs4_fs.h"
+#include "nfs4idmap.h"
#include "dns_resolve.h"
#include "pnfs.h"
#include "nfs.h"
@@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- pnfs_return_layout(inode);
- pnfs_destroy_layout(NFS_I(inode));
/* If we are holding a delegation, return it! */
nfs_inode_return_delegation_noreclaim(inode);
+ /* Note that above delegreturn would trigger pnfs return-on-close */
+ pnfs_return_layout(inode);
+ pnfs_destroy_layout(NFS_I(inode));
/* First call standard NFS clear_inode() code */
nfs_clear_inode(inode);
}
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index b6ebe7e445f6..0fbd3ab1be22 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -6,10 +6,10 @@
* Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com>
*/
#include <linux/sysctl.h>
-#include <linux/nfs_idmap.h>
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
+#include "nfs4idmap.h"
#include "callback.h"
static const int nfs_set_port_min = 0;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1c32adbe728d..470af1a78bec 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fileid = 0;
__entry->fhandle = 0;
}
- __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode);
+ __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
__assign_str(name, ctx->dentry->d_name.name);
),
@@ -1110,7 +1110,7 @@ TRACE_EVENT(nfs4_layoutget,
),
TP_fast_assign(
- const struct inode *inode = ctx->dentry->d_inode;
+ const struct inode *inode = d_inode(ctx->dentry);
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5c399ec41079..558cd65dbdb7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -52,10 +52,10 @@
#include <linux/nfs.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
-#include <linux/nfs_idmap.h>
#include "nfs4_fs.h"
#include "internal.h"
+#include "nfs4idmap.h"
#include "nfs4session.h"
#include "pnfs.h"
#include "netns.h"
@@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
#define encode_setclientid_maxsz \
(op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
- XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
+ /* client name */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
1 /* sc_prog */ + \
1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
encode_verifier_maxsz + \
1 /* co_ownerid.len */ + \
- XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
+ /* eia_clientowner */ \
+ 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
1 /* flags */ + \
1 /* spa_how */ + \
/* max is SP4_MACH_CRED (for now) */ + \
@@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
encode_nfs4_verifier(xdr, setclientid->sc_verifier);
- encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+ encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+ setclientid->sc_clnt->cl_owner_id);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(setclientid->sc_prog);
encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
p = reserve_space(xdr, 4);
- *p = cpu_to_be32(setclientid->sc_cb_ident);
+ *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
}
static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
encode_nfs4_verifier(xdr, args->verifier);
- encode_string(xdr, args->id_len, args->id);
+ encode_string(xdr, strlen(args->client->cl_owner_id),
+ args->client->cl_owner_id);
encode_uint32(xdr, args->flags);
encode_uint32(xdr, args->state_protect.how);
@@ -1920,7 +1924,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
p = reserve_space(xdr, 4 + 4);
*p++ = cpu_to_be32(1); /* bitmap length */
- *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
+ *p++ = cpu_to_be32(args->notify_types);
}
static void
@@ -5753,8 +5757,9 @@ out_overflow:
#if defined(CONFIG_NFS_V4_1)
static int decode_getdeviceinfo(struct xdr_stream *xdr,
- struct pnfs_device *pdev)
+ struct nfs4_getdeviceinfo_res *res)
{
+ struct pnfs_device *pdev = res->pdev;
__be32 *p;
uint32_t len, type;
int status;
@@ -5802,12 +5807,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
if (unlikely(!p))
goto out_overflow;
- if (be32_to_cpup(p++) &
- ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
- dprintk("%s: unsupported notification\n",
- __func__);
- }
-
+ res->notification = be32_to_cpup(p++);
for (i = 1; i < len; i++) {
if (be32_to_cpup(p++)) {
dprintk("%s: unsupported notification\n",
@@ -7061,7 +7061,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
status = decode_sequence(xdr, &res->seq_res, rqstp);
if (status != 0)
goto out;
- status = decode_getdeviceinfo(xdr, res->pdev);
+ status = decode_getdeviceinfo(xdr, res);
out:
return status;
}
@@ -7365,6 +7365,11 @@ nfs4_stat_to_errno(int stat)
.p_name = #proc, \
}
+#define STUB(proc) \
+[NFSPROC4_CLNT_##proc] = { \
+ .p_name = #proc, \
+}
+
struct rpc_procinfo nfs4_procedures[] = {
PROC(READ, enc_read, dec_read),
PROC(WRITE, enc_write, dec_write),
@@ -7417,6 +7422,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
+ STUB(GETDEVICELIST),
PROC(BIND_CONN_TO_SESSION,
enc_bind_conn_to_session, dec_bind_conn_to_session),
PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
@@ -7425,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SEEK, enc_seek, dec_seek),
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
+ PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c
index 4eb0aead69b6..c74f7af23d77 100644
--- a/fs/nfs/nfstrace.c
+++ b/fs/nfs/nfstrace.c
@@ -7,3 +7,6 @@
#define CREATE_TRACE_POINTS
#include "nfstrace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter);
+EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit);
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 24e1d7403c0b..5aaed363556a 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
dprintk("%s: free od=%p\n", __func__, de->od.od);
osduld_put_device(de->od.od);
- kfree(de);
+ kfree_rcu(d, rcu);
}
struct objio_segment {
@@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = {
.pg_read_ops = &objio_pg_read_ops,
.pg_write_ops = &objio_pg_write_ops,
+ .sync = pnfs_generic_sync,
+
.free_deviceid_node = objio_free_deviceid_node,
.encode_layoutcommit = objlayout_encode_layoutcommit,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d57190a0d533..4984bbe55ff1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
- dprintk("NFS: %5u initiated pgio call "
+ dprintk("NFS: initiated pgio call "
"(req %s/%llu, %u bytes @ offset %llu)\n",
- hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
hdr->args.count,
@@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
static void nfs_pgio_release(void *calldata)
{
struct nfs_pgio_header *hdr = calldata;
- if (hdr->rw_ops->rw_release)
- hdr->rw_ops->rw_release(hdr);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
}
@@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
* @inode: pointer to inode
- * @doio: pointer to io function
+ * @pg_ops: pointer to pageio operations
+ * @compl_ops: pointer to pageio completion operations
+ * @rw_ops: pointer to nfs read/write operations
* @bsize: io block size
* @io_flags: extra parameters for the io function
*/
@@ -938,7 +937,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
if (prev) {
if (!nfs_match_open_context(req->wb_context, prev->wb_context))
return false;
- flctx = req->wb_context->dentry->d_inode->i_flctx;
+ flctx = d_inode(req->wb_context->dentry)->i_flctx;
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
@@ -1101,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
mirror->pg_base = 0;
mirror->pg_recoalesce = 0;
- desc->pg_moreio = 0;
-
while (!list_empty(&head)) {
struct nfs_page *req;
@@ -1110,8 +1107,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
nfs_list_remove_request(req);
if (__nfs_pageio_add_request(desc, req))
continue;
- if (desc->pg_error < 0)
+ if (desc->pg_error < 0) {
+ list_splice_tail(&head, &mirror->pg_list);
+ mirror->pg_recoalesce = 1;
return 0;
+ }
break;
}
} while (mirror->pg_recoalesce);
@@ -1186,6 +1186,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
* nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
* nfs_pageio_descriptor
* @desc: pointer to io descriptor
+ * @mirror_idx: pointer to mirror index
*/
static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
u32 mirror_idx)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4f802b02fbb9..70bf706b1090 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -35,6 +35,7 @@
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
+#include "nfs42.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -351,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
{
struct pnfs_layout_segment *s;
- if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
return false;
list_for_each_entry(s, &lo->plh_segs, pls_list)
@@ -361,6 +362,18 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
return true;
}
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+ if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return false;
+ lo->plh_return_iomode = 0;
+ lo->plh_block_lgets++;
+ pnfs_get_layout_hdr(lo);
+ clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+ return true;
+}
+
static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
struct pnfs_layout_hdr *lo, struct inode *inode)
{
@@ -371,17 +384,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
if (pnfs_layout_need_return(lo, lseg)) {
nfs4_stateid stateid;
enum pnfs_iomode iomode;
+ bool send;
stateid = lo->plh_stateid;
iomode = lo->plh_return_iomode;
- /* decreased in pnfs_send_layoutreturn() */
- lo->plh_block_lgets++;
- lo->plh_return_iomode = 0;
+ send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&inode->i_lock);
- pnfs_get_layout_hdr(lo);
-
- /* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ if (send) {
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ }
} else
spin_unlock(&inode->i_lock);
}
@@ -410,6 +422,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
spin_unlock(&inode->i_lock);
@@ -450,6 +466,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
if (atomic_dec_and_test(&lseg->pls_refcount)) {
struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ return;
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
pnfs_free_lseg_async(lseg);
@@ -923,6 +941,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
smp_mb__after_atomic();
wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
static int
@@ -977,6 +996,7 @@ _pnfs_return_layout(struct inode *ino)
LIST_HEAD(tmp_list);
nfs4_stateid stateid;
int status = 0, empty;
+ bool send;
dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
@@ -1006,17 +1026,18 @@ _pnfs_return_layout(struct inode *ino)
/* Don't send a LAYOUTRETURN if list was initially empty */
if (empty) {
spin_unlock(&ino->i_lock);
- pnfs_put_layout_hdr(lo);
dprintk("NFS: %s no layout segments to return\n", __func__);
- goto out;
+ goto out_put_layout_hdr;
}
set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- lo->plh_block_lgets++;
+ send = pnfs_prepare_layoutreturn(lo);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
-
- status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ if (send)
+ status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+out_put_layout_hdr:
+ pnfs_put_layout_hdr(lo);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
@@ -1090,22 +1111,21 @@ bool pnfs_roc(struct inode *ino)
pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
+ pnfs_layoutcommit_inode(ino, true);
return true;
out_noroc:
if (lo) {
stateid = lo->plh_stateid;
- layoutreturn =
- test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags);
- if (layoutreturn) {
- lo->plh_block_lgets++;
- pnfs_get_layout_hdr(lo);
- }
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo);
}
spin_unlock(&ino->i_lock);
- if (layoutreturn)
+ if (layoutreturn) {
+ pnfs_layoutcommit_inode(ino, true);
pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ }
return false;
}
@@ -1142,15 +1162,18 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
struct pnfs_layout_segment *lseg;
nfs4_stateid stateid;
u32 current_seqid;
- bool found = false, layoutreturn = false;
+ bool layoutreturn = false;
spin_lock(&ino->i_lock);
- list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
- found = true;
- goto out;
- }
+ list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
+ continue;
+ if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ continue;
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+ spin_unlock(&ino->i_lock);
+ return true;
+ }
lo = nfsi->layout;
current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
@@ -1158,23 +1181,19 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
* a barrier, we choose the worst-case barrier.
*/
*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
-out:
- if (!found) {
- stateid = lo->plh_stateid;
- layoutreturn =
- test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags);
- if (layoutreturn) {
- lo->plh_block_lgets++;
- pnfs_get_layout_hdr(lo);
- }
- }
+ stateid = lo->plh_stateid;
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo);
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+
spin_unlock(&ino->i_lock);
if (layoutreturn) {
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
+ return true;
}
- return found;
+ return false;
}
/*
@@ -1691,7 +1710,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
spin_lock(&inode->i_lock);
/* set failure bit so that pnfs path will be retried later */
pnfs_layout_set_fail_bit(lo, iomode);
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
if (lo->plh_return_iomode == 0)
lo->plh_return_iomode = range.iomode;
else if (lo->plh_return_iomode != range.iomode)
@@ -1818,6 +1836,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
hdr->completion_ops);
+ set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
@@ -1841,7 +1860,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
if (!hdr->pnfs_error) {
- pnfs_set_layoutcommit(hdr);
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
+ hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
} else
pnfs_ld_handle_write_error(hdr);
@@ -1861,6 +1881,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
+ hdr->release(hdr);
}
static enum pnfs_try_status
@@ -1902,7 +1923,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
pnfs_put_lseg(hdr->lseg);
nfs_pgio_header_free(hdr);
}
-EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
@@ -1976,6 +1996,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
+ hdr->release(hdr);
}
/*
@@ -2032,7 +2053,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
pnfs_put_lseg(hdr->lseg);
nfs_pgio_header_free(hdr);
}
-EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
@@ -2099,64 +2119,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
void
-pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
+pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
+ loff_t end_pos)
{
- struct inode *inode = hdr->inode;
struct nfs_inode *nfsi = NFS_I(inode);
- loff_t end_pos = hdr->mds_offset + hdr->res.count;
bool mark_as_dirty = false;
spin_lock(&inode->i_lock);
if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
- mark_as_dirty = true;
- dprintk("%s: Set layoutcommit for inode %lu ",
- __func__, inode->i_ino);
- }
- if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) {
- /* references matched in nfs4_layoutcommit_release */
- pnfs_get_lseg(hdr->lseg);
- }
- if (end_pos > nfsi->layout->plh_lwb)
nfsi->layout->plh_lwb = end_pos;
- spin_unlock(&inode->i_lock);
- dprintk("%s: lseg %p end_pos %llu\n",
- __func__, hdr->lseg, nfsi->layout->plh_lwb);
-
- /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
- * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
- if (mark_as_dirty)
- mark_inode_dirty_sync(inode);
-}
-EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
-
-void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
-{
- struct inode *inode = data->inode;
- struct nfs_inode *nfsi = NFS_I(inode);
- bool mark_as_dirty = false;
-
- spin_lock(&inode->i_lock);
- if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
mark_as_dirty = true;
dprintk("%s: Set layoutcommit for inode %lu ",
__func__, inode->i_ino);
- }
- if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+ } else if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
/* references matched in nfs4_layoutcommit_release */
- pnfs_get_lseg(data->lseg);
+ pnfs_get_lseg(lseg);
}
- if (data->lwb > nfsi->layout->plh_lwb)
- nfsi->layout->plh_lwb = data->lwb;
spin_unlock(&inode->i_lock);
dprintk("%s: lseg %p end_pos %llu\n",
- __func__, data->lseg, nfsi->layout->plh_lwb);
+ __func__, lseg, nfsi->layout->plh_lwb);
/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
* will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
if (mark_as_dirty)
mark_inode_dirty_sync(inode);
}
-EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
@@ -2216,7 +2206,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
pnfs_list_write_lseg(inode, &data->lseg_list);
end_pos = nfsi->layout->plh_lwb;
- nfsi->layout->plh_lwb = 0;
nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
spin_unlock(&inode->i_lock);
@@ -2232,13 +2221,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
if (ld->prepare_layoutcommit) {
status = ld->prepare_layoutcommit(&data->args);
if (status) {
- spin_lock(&inode->i_lock);
- if (end_pos < nfsi->layout->plh_lwb)
- nfsi->layout->plh_lwb = end_pos;
- spin_unlock(&inode->i_lock);
put_rpccred(data->cred);
+ spin_lock(&inode->i_lock);
set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
- goto clear_layoutcommitting;
+ if (end_pos > nfsi->layout->plh_lwb)
+ nfsi->layout->plh_lwb = end_pos;
+ goto out_unlock;
}
}
@@ -2258,6 +2246,13 @@ clear_layoutcommitting:
}
EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
+int
+pnfs_generic_sync(struct inode *inode, bool datasync)
+{
+ return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_sync);
+
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
struct nfs4_threshold *thp;
@@ -2269,3 +2264,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
}
return thp;
}
+
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int
+pnfs_report_layoutstat(struct inode *inode)
+{
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs42_layoutstat_data *data;
+ struct pnfs_layout_hdr *hdr;
+ int status = 0;
+
+ if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
+ goto out;
+
+ if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+ goto out;
+
+ if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ if (!NFS_I(inode)->layout) {
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+ hdr = NFS_I(inode)->layout;
+ pnfs_get_layout_hdr(hdr);
+ spin_unlock(&inode->i_lock);
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data) {
+ status = -ENOMEM;
+ goto out_put;
+ }
+
+ data->args.fh = NFS_FH(inode);
+ data->args.inode = inode;
+ nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
+ status = ld->prepare_layoutstats(&data->args);
+ if (status)
+ goto out_free;
+
+ status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
+
+out:
+ dprintk("%s returns %d\n", __func__, status);
+ return status;
+
+out_free:
+ kfree(data);
+out_put:
+ pnfs_put_layout_hdr(hdr);
+ smp_mb__before_atomic();
+ clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+ smp_mb__after_atomic();
+ goto out;
+}
+EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
+#endif
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 635f0865671c..3e6ab7bfbabd 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type {
int how,
struct nfs_commit_info *cinfo);
+ int (*sync)(struct inode *inode, bool datasync);
+
/*
* Return PNFS_ATTEMPTED to indicate the layout code has attempted
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
@@ -176,6 +178,8 @@ struct pnfs_layoutdriver_type {
void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
+ int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
+ void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
};
struct pnfs_layout_hdr {
@@ -203,6 +207,7 @@ struct pnfs_device {
struct page **pages;
unsigned int pgbase;
unsigned int pglen; /* reply buffer length */
+ unsigned char nocache : 1;/* May not be cached */
};
#define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -263,10 +268,11 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_header *);
-void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
+void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int pnfs_generic_sync(struct inode *inode, bool datasync);
+int pnfs_nfs_generic_sync(struct inode *inode, bool datasync);
int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
@@ -286,11 +292,11 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
-
/* nfs4_deviceid_flags */
enum {
NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */
+ NFS_DEVICEID_NOCACHE, /* device may not be cached */
};
/* pnfs_dev.c */
@@ -302,6 +308,7 @@ struct nfs4_deviceid_node {
unsigned long flags;
unsigned long timestamp_unavailable;
struct nfs4_deviceid deviceid;
+ struct rcu_head rcu;
atomic_t ref;
};
@@ -426,7 +433,7 @@ static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- struct inode *inode = req->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(req->wb_context->dentry);
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
if (lseg == NULL || ld->mark_request_commit == NULL)
@@ -438,7 +445,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
static inline bool
pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- struct inode *inode = req->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(req->wb_context->dentry);
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
if (ld == NULL || ld->clear_request_commit == NULL)
@@ -486,6 +493,14 @@ pnfs_ld_read_whole_page(struct inode *inode)
return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
}
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+ if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+ return 0;
+ return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync);
+}
+
static inline bool
pnfs_layoutcommit_outstanding(struct inode *inode)
{
@@ -568,6 +583,12 @@ pnfs_ld_read_whole_page(struct inode *inode)
return false;
}
+static inline int
+pnfs_sync_inode(struct inode *inode, bool datasync)
+{
+ return 0;
+}
+
static inline bool
pnfs_roc(struct inode *ino)
{
@@ -669,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
#endif /* CONFIG_NFS_V4_1 */
+#if IS_ENABLED(CONFIG_NFS_V4_2)
+int pnfs_report_layoutstat(struct inode *inode);
+#else
+static inline int
+pnfs_report_layoutstat(struct inode *inode)
+{
+ return 0;
+}
+#endif
+
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index aa2ec0015183..2961fcd7a2df 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server,
*/
d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
gfp_flags);
+ if (d && pdev->nocache)
+ set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
out_free_pages:
for (i = 0; i < max_pages; i++)
@@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server,
rcu_read_lock();
d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
hash);
- if (d != NULL)
- atomic_inc(&d->ref);
+ if (d != NULL && !atomic_inc_not_zero(&d->ref))
+ d = NULL;
rcu_read_unlock();
return d;
}
@@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
return;
}
hlist_del_init_rcu(&d->node);
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
spin_unlock(&nfs4_deviceid_lock);
- synchronize_rcu();
/* balance the initial ref set in pnfs_insert_deviceid */
- if (atomic_dec_and_test(&d->ref))
- d->ld->free_deviceid_node(d);
+ nfs4_put_deviceid_node(d);
}
EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
@@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
bool
nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
{
+ if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) {
+ if (atomic_add_unless(&d->ref, -1, 2))
+ return false;
+ nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid);
+ }
if (!atomic_dec_and_test(&d->ref))
return false;
d->ld->free_deviceid_node(d);
@@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
if (d->nfs_client == clp && atomic_read(&d->ref)) {
hlist_del_init_rcu(&d->node);
hlist_add_head(&d->tmpnode, &tmp);
+ clear_bit(NFS_DEVICEID_NOCACHE, &d->flags);
}
rcu_read_unlock();
spin_unlock(&nfs4_deviceid_lock);
@@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash)
if (hlist_empty(&tmp))
return;
- synchronize_rcu();
while (!hlist_empty(&tmp)) {
d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode);
hlist_del(&d->tmpnode);
- if (atomic_dec_and_test(&d->ref))
- d->ld->free_deviceid_node(d);
+ nfs4_put_deviceid_node(d);
}
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 54e36b38fb5f..f37e25b6311c 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void)
return(get_v3_ds_connect != NULL);
}
-void __exit nfs4_pnfs_v3_ds_connect_unload(void)
+void nfs4_pnfs_v3_ds_connect_unload(void)
{
if (get_v3_ds_connect) {
symbol_put(nfs3_set_ds_client);
@@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
nfs_request_add_commit_list(req, list, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+
+int
+pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
+{
+ if (datasync)
+ return 0;
+ return pnfs_layoutcommit_inode(inode, true);
+}
+EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync);
+
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c63189acd052..b417bbcd9704 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -118,7 +118,7 @@ static int
nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct nfs_sattrargs arg = {
.fh = NFS_FH(inode),
.sattr = sattr
@@ -487,7 +487,7 @@ static int
nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
u64 cookie, struct page **pages, unsigned int count, int plus)
{
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
struct nfs_readdirargs arg = {
.fh = NFS_FH(dir),
.cookie = cookie,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 568ecf0a880f..ae0ff7a11b40 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -117,15 +117,15 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
static void nfs_readpage_release(struct nfs_page *req)
{
- struct inode *d_inode = req->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(req->wb_context->dentry);
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes,
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
(long long)req_offset(req));
if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+ nfs_readpage_to_fscache(inode, req->wb_page, 0);
unlock_page(req->wb_page);
}
@@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page)
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
page, PAGE_CACHE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
- nfs_inc_stats(inode, NFSIOS_READPAGES);
+ nfs_add_stats(inode, NFSIOS_READPAGES, 1);
/*
* Try to flush any pending writes to the file..
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 322b2de02988..aa62004f1706 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -43,7 +43,6 @@
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/nfs_idmap.h>
#include <linux/vfs.h>
#include <linux/inet.h>
#include <linux/in6.h>
@@ -433,7 +432,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct nfs_server *server = NFS_SB(dentry->d_sb);
unsigned char blockbits;
unsigned long blockres;
- struct nfs_fh *fh = NFS_FH(dentry->d_inode);
+ struct nfs_fh *fh = NFS_FH(d_inode(dentry));
struct nfs_fsstat res;
int error = -ENOMEM;
@@ -447,7 +446,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
pd_dentry = dget_parent(dentry);
if (pd_dentry != NULL) {
- nfs_zap_caches(pd_dentry->d_inode);
+ nfs_zap_caches(d_inode(pd_dentry));
dput(pd_dentry);
}
}
@@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss,
data->version != nfss->nfs_client->rpc_ops->version ||
data->minorversion != nfss->nfs_client->cl_minorversion ||
data->retrans != nfss->client->cl_timeout->to_retries ||
- data->selected_flavor != nfss->client->cl_auth->au_flavor ||
+ !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) ||
data->acregmin != nfss->acregmin / HZ ||
data->acregmax != nfss->acregmax / HZ ||
data->acdirmin != nfss->acdirmin / HZ ||
@@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->wsize = nfss->wsize;
data->retrans = nfss->client->cl_timeout->to_retries;
data->selected_flavor = nfss->client->cl_auth->au_flavor;
- data->auth_info = nfss->auth_info;
data->acregmin = nfss->acregmin / HZ;
data->acregmax = nfss->acregmax / HZ;
data->acdirmin = nfss->acdirmin / HZ;
@@ -2526,7 +2524,7 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
/* clone any lsm security options from the parent to the new sb */
- if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
+ if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops)
return -ESTALE;
return security_sb_clone_mnt_opts(mount_info->cloned->sb, s);
}
@@ -2849,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp)
*((unsigned int *)kp->arg) = num;
return 0;
}
-static struct kernel_param_ops param_ops_portnr = {
+static const struct kernel_param_ops param_ops_portnr = {
.set = param_set_portnr,
.get = param_get_uint,
};
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 05c9e02f4153..b6de433da5db 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -20,7 +20,6 @@
#include <linux/stat.h>
#include <linux/mm.h>
#include <linux/string.h>
-#include <linux/namei.h>
/* Symlink caching in the page cache is even more simplistic
* and straight-forward than readdir caching.
@@ -43,27 +42,21 @@ error:
return -EIO;
}
-static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct page *page;
void *err;
err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
- goto read_failed;
+ return err;
page = read_cache_page(&inode->i_data, 0,
(filler_t *)nfs_symlink_filler, inode);
- if (IS_ERR(page)) {
- err = page;
- goto read_failed;
- }
- nd_set_link(nd, kmap(page));
- return page;
-
-read_failed:
- nd_set_link(nd, err);
- return NULL;
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ *cookie = page;
+ return kmap(page);
}
/*
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index de54129336c6..fa538b2ba251 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -143,7 +143,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
nfs_free_dname(data);
ret = nfs_copy_dname(alias, data);
spin_lock(&alias->d_lock);
- if (ret == 0 && alias->d_inode != NULL &&
+ if (ret == 0 && d_really_is_positive(alias) &&
!(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
devname_garbage = alias->d_fsdata;
alias->d_fsdata = data;
@@ -190,7 +190,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
parent = dget_parent(dentry);
if (parent == NULL)
goto out_free;
- dir = parent->d_inode;
+ dir = d_inode(parent);
/* Non-exclusive lock protects against concurrent lookup() calls */
spin_lock(&dir->i_lock);
if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) {
@@ -210,21 +210,21 @@ out_free:
void nfs_wait_on_sillyrename(struct dentry *dentry)
{
- struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
+ struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1);
}
void nfs_block_sillyrename(struct dentry *dentry)
{
- struct nfs_inode *nfsi = NFS_I(dentry->d_inode);
+ struct nfs_inode *nfsi = NFS_I(d_inode(dentry));
wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1);
}
void nfs_unblock_sillyrename(struct dentry *dentry)
{
- struct inode *dir = dentry->d_inode;
+ struct inode *dir = d_inode(dentry);
struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_unlinkdata *data;
@@ -367,8 +367,8 @@ static void nfs_async_rename_release(void *calldata)
struct nfs_renamedata *data = calldata;
struct super_block *sb = data->old_dir->i_sb;
- if (data->old_dentry->d_inode)
- nfs_mark_for_revalidate(data->old_dentry->d_inode);
+ if (d_really_is_positive(data->old_dentry))
+ nfs_mark_for_revalidate(d_inode(data->old_dentry));
dput(data->old_dentry);
dput(data->new_dentry);
@@ -529,10 +529,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto out;
- fileid = NFS_FILEID(dentry->d_inode);
+ fileid = NFS_FILEID(d_inode(dentry));
/* Return delegation in anticipation of the rename */
- NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode);
+ NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry));
sdentry = NULL;
do {
@@ -554,7 +554,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
*/
if (IS_ERR(sdentry))
goto out;
- } while (sdentry->d_inode != NULL); /* need negative lookup */
+ } while (d_inode(sdentry) != NULL); /* need negative lookup */
/* queue unlink first. Can't do this from rpc_release as it
* has to allocate memory
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 849ed784d6ac..75a35a1afa79 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
int ret;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_inc_stats(inode, NFSIOS_WRITEPAGES);
+ nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
nfs_pageio_cond_complete(pgio, page_file_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
@@ -702,7 +702,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
- struct inode *inode = req->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(req->wb_context->dentry);
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
@@ -853,7 +853,8 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+ dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+ WB_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
@@ -861,7 +862,7 @@ static void
nfs_clear_request_commit(struct nfs_page *req)
{
if (test_bit(PG_CLEAN, &req->wb_flags)) {
- struct inode *inode = req->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(req->wb_context->dentry);
struct nfs_commit_info cinfo;
nfs_init_cinfo_from_inode(&cinfo, inode);
@@ -1289,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
+ set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
nfs_unlock_request(req);
nfs_end_page_writeback(req);
nfs_release_request(req);
@@ -1347,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
- /* do nothing! */
-}
-
/*
* Special version of should_remove_suid() that ignores capabilities.
*/
@@ -1382,24 +1379,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
{
struct nfs_pgio_args *argp = &hdr->args;
struct nfs_pgio_res *resp = &hdr->res;
+ u64 size = argp->offset + resp->count;
if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ fattr->size = size;
+ if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) {
+ fattr->valid &= ~NFS_ATTR_FATTR_SIZE;
return;
- if (argp->offset + resp->count != fattr->size)
- return;
- if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+ }
+ if (size != fattr->size)
return;
/* Set attribute barrier */
nfs_fattr_set_barrier(fattr);
+ /* ...and update size */
+ fattr->valid |= NFS_ATTR_FATTR_SIZE;
}
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
{
- struct nfs_fattr *fattr = hdr->res.fattr;
+ struct nfs_fattr *fattr = &hdr->fattr;
struct inode *inode = hdr->inode;
- if (fattr == NULL)
- return;
spin_lock(&inode->i_lock);
nfs_writeback_check_extend(hdr, fattr);
nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
@@ -1555,7 +1555,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
/* Set up the initial task struct. */
nfs_ops->commit_setup(data, &msg);
- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
+ dprintk("NFS: initiated commit call\n");
nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@@ -1591,7 +1591,7 @@ void nfs_init_commit(struct nfs_commit_data *data,
struct nfs_commit_info *cinfo)
{
struct nfs_page *first = nfs_list_entry(head->next);
- struct inode *inode = first->wb_context->dentry->d_inode;
+ struct inode *inode = d_inode(first->wb_context->dentry);
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
@@ -1690,7 +1690,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
dprintk("NFS: commit (%s/%llu %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
- (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
req->wb_bytes,
(long long)req_offset(req));
if (status < 0) {
@@ -1840,18 +1840,20 @@ EXPORT_SYMBOL_GPL(nfs_write_inode);
*/
int nfs_wb_all(struct inode *inode)
{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = 0,
- .range_end = LLONG_MAX,
- };
int ret;
trace_nfs_writeback_inode_enter(inode);
- ret = sync_inode(inode, &wbc);
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ goto out;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ goto out;
+ pnfs_sync_inode(inode, true);
+ ret = 0;
+out:
trace_nfs_writeback_inode_exit(inode, ret);
return ret;
}
@@ -1876,11 +1878,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
* request from the inode / page_private pointer and
* release it */
nfs_inode_remove_request(req);
- /*
- * In case nfs_inode_remove_request has marked the
- * page as being dirty
- */
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
nfs_unlock_and_release_request(req);
}
@@ -2015,7 +2012,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
.rw_mode = FMODE_WRITE,
.rw_alloc_header = nfs_writehdr_alloc,
.rw_free_header = nfs_writehdr_free,
- .rw_release = nfs_writeback_release_common,
.rw_done = nfs_writeback_done,
.rw_result = nfs_writeback_result,
.rw_initiate = nfs_initiate_write,
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 683bf718aead..a0b77fc1bd39 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -6,6 +6,7 @@ config NFSD
select SUNRPC
select EXPORTFS
select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ depends on MULTIUSER
help
Choose Y here if you want to allow other computers to access
files residing on this system using Sun's Network File System
@@ -107,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
- depends on NFSD_V4 && DEBUG_KERNEL
+ depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
help
This option enables support for manually injecting faults
into the NFS server. This is intended to be used for
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 03d647bf195d..cdefaa331a07 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
}
const struct nfsd4_layout_ops bl_layout_ops = {
+ /*
+ * Pretend that we send notification to the client. This is a blatant
+ * lie to force recent Linux clients to cache our device IDs.
+ * We rarely ever change the device ID, so the harm of leaking deviceids
+ * for a while isn't too bad. Unfortunately RFC5661 is a complete mess
+ * in this regard, but I filed errata 4119 for this a while ago, and
+ * hopefully the Linux client will eventually start caching deviceids
+ * without this again.
+ */
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
.proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
.encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
.proc_layoutget = nfsd4_block_proc_layoutget,
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c3e3b6e55ae2..f79521a59747 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -599,7 +599,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
goto out4;
}
- err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags,
+ err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags,
exp.ex_uuid);
if (err)
goto out4;
@@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
struct svc_export *orig = container_of(a, struct svc_export, h);
struct svc_export *new = container_of(b, struct svc_export, h);
return orig->ex_client == new->ex_client &&
- orig->ex_path.dentry == new->ex_path.dentry &&
- orig->ex_path.mnt == new->ex_path.mnt;
+ path_equal(&orig->ex_path, &new->ex_path);
}
static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
@@ -891,7 +890,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name,
printk("nfsd: exp_rootfh path not found %s", name);
return err;
}
- inode = path.dentry->d_inode;
+ inode = d_inode(path.dentry);
dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
name, path.dentry, clp->name,
@@ -1159,6 +1158,7 @@ static struct flags {
{ NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
{ NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
{ NFSEXP_V4ROOT, {"v4root", ""}},
+ { NFSEXP_PNFS, {"pnfs", ""}},
{ 0, {"", ""}}
};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index ac54ea60b3f6..d54701f6dc78 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -42,7 +42,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
if (nfserr)
RETURN_STATUS(nfserr);
- inode = fh->fh_dentry->d_inode;
+ inode = d_inode(fh->fh_dentry);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
RETURN_STATUS(nfserr_inval);
@@ -103,7 +103,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
if (nfserr)
goto out;
- inode = fh->fh_dentry->d_inode;
+ inode = d_inode(fh->fh_dentry);
if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
error = -EOPNOTSUPP;
goto out_errno;
@@ -266,9 +266,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
* nfsd_dispatch actually ensures the following cannot happen.
* However, it seems fragile to depend on that.
*/
- if (dentry == NULL || dentry->d_inode == NULL)
+ if (dentry == NULL || d_really_is_negative(dentry))
return 0;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
*p++ = htonl(resp->mask);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 34cbbab6abd7..882b1a14bc3e 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -39,7 +39,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
if (nfserr)
RETURN_STATUS(nfserr);
- inode = fh->fh_dentry->d_inode;
+ inode = d_inode(fh->fh_dentry);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
RETURN_STATUS(nfserr_inval);
@@ -94,7 +94,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
if (nfserr)
goto out;
- inode = fh->fh_dentry->d_inode;
+ inode = d_inode(fh->fh_dentry);
if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
error = -EOPNOTSUPP;
goto out_errno;
@@ -174,8 +174,8 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
struct dentry *dentry = resp->fh.fh_dentry;
p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
- if (resp->status == 0 && dentry && dentry->d_inode) {
- struct inode *inode = dentry->d_inode;
+ if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
+ struct inode *inode = d_inode(dentry);
struct kvec *head = rqstp->rq_res.head;
unsigned int base;
int n;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 12f2aab4f614..7b755b7f785c 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -166,7 +166,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
rqstp->rq_vec, argp->vlen,
&resp->count);
if (nfserr == 0) {
- struct inode *inode = resp->fh.fh_dentry->d_inode;
+ struct inode *inode = d_inode(resp->fh.fh_dentry);
resp->eof = (argp->offset + resp->count) >= inode->i_size;
}
@@ -551,7 +551,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
* different read/write sizes for file systems known to have
* problems with large blocks */
if (nfserr == 0) {
- struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
+ struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
/* Note that we don't care for remote fs's here */
if (sb->s_magic == MSDOS_SUPER_MAGIC) {
@@ -587,7 +587,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
if (nfserr == 0) {
- struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
+ struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
/* Note that we don't care for remote fs's here */
switch (sb->s_magic) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 39c5eb3ad33a..f6e7cbabac5a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
default:
case FSIDSOURCE_DEV:
p = xdr_encode_hyper(p, (u64)huge_encode_dev
- (fhp->fh_dentry->d_inode->i_sb->s_dev));
+ (d_inode(fhp->fh_dentry)->i_sb->s_dev));
break;
case FSIDSOURCE_FSID:
p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
@@ -203,14 +203,14 @@ static __be32 *
encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
- if (dentry && dentry->d_inode) {
+ if (dentry && d_really_is_positive(dentry)) {
__be32 err;
struct kstat stat;
err = fh_getattr(fhp, &stat);
if (!err) {
*p++ = xdr_one; /* attributes follow */
- lease_get_mtime(dentry->d_inode, &stat.mtime);
+ lease_get_mtime(d_inode(dentry), &stat.mtime);
return encode_fattr3(rqstp, p, fhp, &stat);
}
}
@@ -233,7 +233,7 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
- if (dentry && dentry->d_inode && fhp->fh_post_saved) {
+ if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) {
if (fhp->fh_pre_saved) {
*p++ = xdr_one;
p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size);
@@ -260,11 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp)
printk("nfsd: inode locked twice during operation.\n");
err = fh_getattr(fhp, &fhp->fh_post_attr);
- fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
+ fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
if (err) {
fhp->fh_post_saved = 0;
/* Grab the ctime anyway - set_change_info might use it */
- fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime;
+ fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
} else
fhp->fh_post_saved = 1;
}
@@ -628,7 +628,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_attrstat *resp)
{
if (resp->status == 0) {
- lease_get_mtime(resp->fh.fh_dentry->d_inode,
+ lease_get_mtime(d_inode(resp->fh.fh_dentry),
&resp->stat.mtime);
p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
}
@@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
static __be32
compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
- const char *name, int namlen)
+ const char *name, int namlen, u64 ino)
{
struct svc_export *exp;
struct dentry *dparent, *dchild;
@@ -828,7 +828,9 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
return rv;
if (d_mountpoint(dchild))
goto out;
- if (!dchild->d_inode)
+ if (d_really_is_negative(dchild))
+ goto out;
+ if (dchild->d_inode->i_ino != ino)
goto out;
rv = fh_compose(fhp, exp, dchild, &cd->fh);
out:
@@ -836,13 +838,13 @@ out:
return rv;
}
-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
{
struct svc_fh *fh = &cd->scratch;
__be32 err;
fh_init(fh, NFS3_FHSIZE);
- err = compose_entry_fh(cd, fh, name, namlen);
+ err = compose_entry_fh(cd, fh, name, namlen, ino);
if (err) {
*p++ = 0;
*p++ = 0;
@@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p = encode_entry_baggage(cd, p, name, namlen, ino);
if (plus)
- p = encode_entryplus_baggage(cd, p, name, namlen);
+ p = encode_entryplus_baggage(cd, p, name, namlen, ino);
num_entry_words = p - cd->buffer;
} else if (*(page+1) != NULL) {
/* temporarily encode entry into next page, then move back to
@@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
if (plus)
- p1 = encode_entryplus_baggage(cd, p1, name, namlen);
+ p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
/* determine entry word length and lengths to go in pages */
num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 59fd76651781..eb5accf1b37f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -52,10 +52,6 @@
#define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE)
#define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)
-/* We don't support these bits; insist they be neither allowed nor denied */
-#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \
- | NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS)
-
/* flags used to simulate posix default ACLs */
#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
| NFS4_ACE_DIRECTORY_INHERIT_ACE)
@@ -64,9 +60,6 @@
| NFS4_ACE_INHERIT_ONLY_ACE \
| NFS4_ACE_IDENTIFIER_GROUP)
-#define MASK_EQUAL(mask1, mask2) \
- ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
-
static u32
mask_from_posix(unsigned short perm, unsigned int flags)
{
@@ -126,11 +119,6 @@ low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
*mode |= ACL_EXECUTE;
}
-struct ace_container {
- struct nfs4_ace *ace;
- struct list_head ace_l;
-};
-
static short ace2type(struct nfs4_ace *);
static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
unsigned int);
@@ -139,7 +127,7 @@ int
nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
struct nfs4_acl **acl)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error = 0;
struct posix_acl *pacl = NULL, *dpacl = NULL;
unsigned int flags = 0;
@@ -384,7 +372,6 @@ pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2)
static void
sort_pacl_range(struct posix_acl *pacl, int start, int end) {
int sorted = 0, i;
- struct posix_acl_entry tmp;
/* We just do a bubble sort; easy to do in place, and we're not
* expecting acl's to be long enough to justify anything more. */
@@ -394,9 +381,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) {
if (pace_gt(&pacl->a_entries[i],
&pacl->a_entries[i+1])) {
sorted = 0;
- tmp = pacl->a_entries[i];
- pacl->a_entries[i] = pacl->a_entries[i+1];
- pacl->a_entries[i+1] = tmp;
+ swap(pacl->a_entries[i],
+ pacl->a_entries[i + 1]);
}
}
}
@@ -499,43 +485,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s
state->mask.allow |= astate->allow;
}
-/*
- * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
- * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
- * to traditional read/write/execute permissions.
- *
- * It's problematic to reject acls that use certain mode bits, because it
- * places the burden on users to learn the rules about which bits one
- * particular server sets, without giving the user a lot of help--we return an
- * error that could mean any number of different things. To make matters
- * worse, the problematic bits might be introduced by some application that's
- * automatically mapping from some other acl model.
- *
- * So wherever possible we accept anything, possibly erring on the side of
- * denying more permissions than necessary.
- *
- * However we do reject *explicit* DENY's of a few bits representing
- * permissions we could never deny:
- */
-
-static inline int check_deny(u32 mask, int isowner)
-{
- if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
- return -EINVAL;
- if (!isowner)
- return 0;
- if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
- return -EINVAL;
- return 0;
-}
-
static struct posix_acl *
posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
{
struct posix_acl_entry *pace;
struct posix_acl *pacl;
int nace;
- int i, error = 0;
+ int i;
/*
* ACLs with no ACEs are treated differently in the inheritable
@@ -560,17 +516,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
pace = pacl->a_entries;
pace->e_tag = ACL_USER_OBJ;
- error = check_deny(state->owner.deny, 1);
- if (error)
- goto out_err;
low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
for (i=0; i < state->users->n; i++) {
pace++;
pace->e_tag = ACL_USER;
- error = check_deny(state->users->aces[i].perms.deny, 0);
- if (error)
- goto out_err;
low_mode_from_nfs4(state->users->aces[i].perms.allow,
&pace->e_perm, flags);
pace->e_uid = state->users->aces[i].uid;
@@ -579,18 +529,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
pace++;
pace->e_tag = ACL_GROUP_OBJ;
- error = check_deny(state->group.deny, 0);
- if (error)
- goto out_err;
low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
add_to_mask(state, &state->group);
for (i=0; i < state->groups->n; i++) {
pace++;
pace->e_tag = ACL_GROUP;
- error = check_deny(state->groups->aces[i].perms.deny, 0);
- if (error)
- goto out_err;
low_mode_from_nfs4(state->groups->aces[i].perms.allow,
&pace->e_perm, flags);
pace->e_gid = state->groups->aces[i].gid;
@@ -605,15 +549,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
pace++;
pace->e_tag = ACL_OTHER;
- error = check_deny(state->other.deny, 0);
- if (error)
- goto out_err;
low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
return pacl;
-out_err:
- posix_acl_release(pacl);
- return ERR_PTR(error);
}
static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
@@ -828,7 +766,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
return error;
dentry = fhp->fh_dentry;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
return nfserr_attrnotsupp;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 58277859a467..a49201835a97 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status)
}
static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
- enum nfsstat4 *status)
+ int *status)
{
__be32 *p;
u32 op;
@@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
op = be32_to_cpup(p++);
if (unlikely(op != expected))
goto out_unexpected;
- *status = be32_to_cpup(p);
+ *status = nfs_cb_stat_to_errno(be32_to_cpup(p));
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -446,22 +446,17 @@ out_overflow:
static int decode_cb_sequence4res(struct xdr_stream *xdr,
struct nfsd4_callback *cb)
{
- enum nfsstat4 nfserr;
int status;
if (cb->cb_minorversion == 0)
return 0;
- status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr);
- if (unlikely(status))
- goto out;
- if (unlikely(nfserr != NFS4_OK))
- goto out_default;
- status = decode_cb_sequence4resok(xdr, cb);
-out:
- return status;
-out_default:
- return nfs_cb_stat_to_errno(nfserr);
+ status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status);
+ if (unlikely(status || cb->cb_status))
+ return status;
+
+ cb->cb_update_seq_nr = true;
+ return decode_cb_sequence4resok(xdr, cb);
}
/*
@@ -524,26 +519,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
struct nfsd4_callback *cb)
{
struct nfs4_cb_compound_hdr hdr;
- enum nfsstat4 nfserr;
int status;
status = decode_cb_compound4res(xdr, &hdr);
if (unlikely(status))
- goto out;
+ return status;
if (cb != NULL) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status))
- goto out;
+ if (unlikely(status || cb->cb_status))
+ return status;
}
- status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr);
- if (unlikely(status))
- goto out;
- if (unlikely(nfserr != NFS4_OK))
- status = nfs_cb_stat_to_errno(nfserr);
-out:
- return status;
+ return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
}
#ifdef CONFIG_NFSD_PNFS
@@ -621,24 +609,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
struct nfsd4_callback *cb)
{
struct nfs4_cb_compound_hdr hdr;
- enum nfsstat4 nfserr;
int status;
status = decode_cb_compound4res(xdr, &hdr);
if (unlikely(status))
- goto out;
+ return status;
+
if (cb) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status))
- goto out;
+ if (unlikely(status || cb->cb_status))
+ return status;
}
- status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
- if (unlikely(status))
- goto out;
- if (unlikely(nfserr != NFS4_OK))
- status = nfs_cb_stat_to_errno(nfserr);
-out:
- return status;
+ return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
}
#endif /* CONFIG_NFSD_PNFS */
@@ -894,17 +876,12 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
+ cb->cb_update_seq_nr = false;
+ cb->cb_status = 0;
if (minorversion) {
if (!nfsd41_cb_get_slot(clp, task))
return;
}
- spin_lock(&clp->cl_lock);
- if (list_empty(&cb->cb_per_client)) {
- /* This is the first call, not a restart */
- cb->cb_done = false;
- list_add(&cb->cb_per_client, &clp->cl_callbacks);
- }
- spin_unlock(&clp->cl_lock);
rpc_call_start(task);
}
@@ -917,23 +894,41 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
clp->cl_minorversion);
if (clp->cl_minorversion) {
- /* No need for lock, access serialized in nfsd4_cb_prepare */
- ++clp->cl_cb_session->se_cb_seq_nr;
+ /*
+ * No need for lock, access serialized in nfsd4_cb_prepare
+ *
+ * RFC5661 20.9.3
+ * If CB_SEQUENCE returns an error, then the state of the slot
+ * (sequence ID, cached reply) MUST NOT change.
+ */
+ if (cb->cb_update_seq_nr)
+ ++clp->cl_cb_session->se_cb_seq_nr;
+
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_wake_up_next(&clp->cl_cb_waitq);
dprintk("%s: freed slot, new seqid=%d\n", __func__,
clp->cl_cb_session->se_cb_seq_nr);
}
- if (clp->cl_cb_client != task->tk_client) {
- /* We're shutting down or changing cl_cb_client; leave
- * it to nfsd4_process_cb_update to restart the call if
- * necessary. */
+ /*
+ * If the backchannel connection was shut down while this
+ * task was queued, we need to resubmit it after setting up
+ * a new backchannel connection.
+ *
+ * Note that if we lost our callback connection permanently
+ * the submission code will error out, so we don't need to
+ * handle that case here.
+ */
+ if (task->tk_flags & RPC_TASK_KILLED) {
+ task->tk_status = 0;
+ cb->cb_need_restart = true;
return;
}
- if (cb->cb_done)
- return;
+ if (cb->cb_status) {
+ WARN_ON_ONCE(task->tk_status);
+ task->tk_status = cb->cb_status;
+ }
switch (cb->cb_ops->done(cb, task)) {
case 0:
@@ -949,21 +944,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
default:
BUG();
}
- cb->cb_done = true;
}
static void nfsd4_cb_release(void *calldata)
{
struct nfsd4_callback *cb = calldata;
- struct nfs4_client *clp = cb->cb_clp;
-
- if (cb->cb_done) {
- spin_lock(&clp->cl_lock);
- list_del(&cb->cb_per_client);
- spin_unlock(&clp->cl_lock);
+ if (cb->cb_need_restart)
+ nfsd4_run_cb(cb);
+ else
cb->cb_ops->release(cb);
- }
+
}
static const struct rpc_call_ops nfsd4_cb_ops = {
@@ -1058,9 +1049,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
nfsd4_mark_cb_down(clp, err);
return;
}
- /* Yay, the callback channel's back! Restart any callbacks: */
- list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
- queue_work(callback_wq, &cb->cb_work);
}
static void
@@ -1071,8 +1059,12 @@ nfsd4_run_cb_work(struct work_struct *work)
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
- if (cb->cb_ops && cb->cb_ops->prepare)
- cb->cb_ops->prepare(cb);
+ if (cb->cb_need_restart) {
+ cb->cb_need_restart = false;
+ } else {
+ if (cb->cb_ops && cb->cb_ops->prepare)
+ cb->cb_ops->prepare(cb);
+ }
if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
nfsd4_process_cb_update(cb);
@@ -1084,6 +1076,15 @@ nfsd4_run_cb_work(struct work_struct *work)
cb->cb_ops->release(cb);
return;
}
+
+ /*
+ * Don't send probe messages for 4.1 or later.
+ */
+ if (!cb->cb_ops && clp->cl_minorversion) {
+ clp->cl_cb_state = NFSD4_CB_UP;
+ return;
+ }
+
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
@@ -1098,8 +1099,9 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_resp = cb;
cb->cb_ops = ops;
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
- INIT_LIST_HEAD(&cb->cb_per_client);
- cb->cb_done = true;
+ cb->cb_status = 0;
+ cb->cb_update_seq_nr = false;
+ cb->cb_need_restart = false;
}
void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 6904213a4363..ebf90e487c75 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
BUG_ON(!ls->ls_file);
if (nfsd4_layout_setlease(ls)) {
+ fput(ls->ls_file);
put_nfs4_file(fp);
kmem_cache_free(nfs4_layout_stateid_cache, ls);
return NULL;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 92b9d97aff4f..90cfda75313c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -52,7 +52,7 @@
static inline void
nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
{
- struct inode *inode = resfh->fh_dentry->d_inode;
+ struct inode *inode = d_inode(resfh->fh_dentry);
int status;
mutex_lock(&inode->i_mutex);
@@ -110,7 +110,7 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* in current environment or not.
*/
if (bmval[0] & FATTR4_WORD0_ACL) {
- if (!IS_POSIXACL(dentry->d_inode))
+ if (!IS_POSIXACL(d_inode(dentry)))
return nfserr_attrnotsupp;
}
@@ -209,7 +209,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
{
- umode_t mode = fh->fh_dentry->d_inode->i_mode;
+ umode_t mode = d_inode(fh->fh_dentry)->i_mode;
if (S_ISREG(mode))
return nfs_ok;
@@ -470,7 +470,7 @@ out:
fh_put(resfh);
kfree(resfh);
}
- nfsd4_cleanup_open_state(cstate, open, status);
+ nfsd4_cleanup_open_state(cstate, open);
nfsd4_bump_seqid(cstate, status);
return status;
}
@@ -760,8 +760,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- /* no need to check permission - this will be done in nfsd_read() */
-
read->rd_filp = NULL;
if (read->rd_offset >= OFFSET_MAX)
return nfserr_inval;
@@ -778,9 +776,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
- if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
- cstate, &read->rd_stateid,
- RD_STATE, &read->rd_filp))) {
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
+ RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+ if (status) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
}
@@ -881,7 +879,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
- if (dentry->d_inode == NULL) {
+ if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
} else
@@ -924,8 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int err;
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
- status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
- &setattr->sa_stateid, WR_STATE, NULL);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ &setattr->sa_stateid, WR_STATE, NULL, NULL);
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -986,13 +984,11 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned long cnt;
int nvecs;
- /* no need to check permission - this will be done in nfsd_write() */
-
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
- cstate, stateid, WR_STATE, &filp);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
+ &filp, NULL);
if (status) {
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
@@ -1005,11 +1001,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nvecs = fill_in_write_vector(rqstp->rq_vec, write);
WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
- status = nfsd_write(rqstp, &cstate->current_fh, filp,
- write->wr_offset, rqstp->rq_vec, nvecs,
- &cnt, &write->wr_how_written);
- if (filp)
- fput(filp);
+ status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
+ &write->wr_how_written);
+ fput(filp);
write->wr_bytes_written = cnt;
@@ -1023,9 +1018,9 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status = nfserr_notsupp;
struct file *file;
- status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate,
&fallocate->falloc_stateid,
- WR_STATE, &file);
+ WR_STATE, &file, NULL);
if (status != nfs_ok) {
dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
return status;
@@ -1062,9 +1057,9 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct file *file;
- status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate,
&seek->seek_stateid,
- RD_STATE, &file);
+ RD_STATE, &file, NULL);
if (status) {
dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
return status;
@@ -1308,7 +1303,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
goto out_put_stid;
- nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+ nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry),
current_fh, lgp);
if (nfserr)
goto out_put_stid;
@@ -1342,7 +1337,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
if (!ops)
goto out;
- inode = current_fh->fh_dentry->d_inode;
+ inode = d_inode(current_fh->fh_dentry);
nfserr = nfserr_inval;
if (new_size <= seg->offset) {
@@ -1728,10 +1723,6 @@ encode_op:
be32_to_cpu(status));
nfsd4_cstate_clear_replay(cstate);
- /* XXX Ugh, we need to get rid of this kind of special case: */
- if (op->opnum == OP_READ && op->u.read.rd_filp)
- fput(op->u.read.rd_filp);
-
nfsd4_increment_op_stats(op->opnum);
}
@@ -1815,7 +1806,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
bmap0 &= ~FATTR4_WORD0_FILEHANDLE;
}
if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) {
- ret += NFSD4_MAX_SEC_LABEL_LEN + 12;
+ ret += NFS4_MAXLABELLEN + 12;
bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL;
}
/*
@@ -2282,13 +2273,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_func = (nfsd4op_func)nfsd4_allocate,
.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_ALLOCATE",
- .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_DEALLOCATE] = {
.op_func = (nfsd4op_func)nfsd4_deallocate,
.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
.op_name = "OP_DEALLOCATE",
- .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1c307f02baa8..d88ea7b9a85c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -192,14 +192,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dir = nn->rec_file->f_path.dentry;
/* lock the parent */
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock(&d_inode(dir)->i_mutex);
dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
}
- if (dentry->d_inode)
+ if (d_really_is_positive(dentry))
/*
* In the 4.1 case, where we're called from
* reclaim_complete(), records from the previous reboot
@@ -209,11 +209,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* as well be forgiving and just succeed silently.
*/
goto out_put;
- status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+ status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
out_put:
dput(dentry);
out_unlock:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
if (status == 0) {
if (nn->in_grace) {
crp = nfs4_client_to_reclaim(dname, nn);
@@ -285,7 +285,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
status = iterate_dir(nn->rec_file, &ctx.ctx);
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
while (!list_empty(&ctx.names)) {
struct name_list *entry;
entry = list_entry(ctx.names.next, struct name_list, list);
@@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
list_del(&entry->list);
kfree(entry);
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
nfs4_reset_creds(original_cred);
return status;
}
@@ -316,20 +316,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
dir = nn->rec_file->f_path.dentry;
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
}
status = -ENOENT;
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
goto out;
- status = vfs_rmdir(dir->d_inode, dentry);
+ status = vfs_rmdir(d_inode(dir), dentry);
out:
dput(dentry);
out_unlock:
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
return status;
}
@@ -385,7 +385,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
if (nfs4_has_reclaimed_state(child->d_name.name, nn))
return 0;
- status = vfs_rmdir(parent->d_inode, child);
+ status = vfs_rmdir(d_inode(parent), child);
if (status)
printk("failed to remove client recovery directory %pd\n",
child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8ba1d888f1e6..95202719a1fd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab;
static struct kmem_cache *file_slab;
static struct kmem_cache *stateid_slab;
static struct kmem_cache *deleg_slab;
+static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
@@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi)
if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
hlist_del_rcu(&fi->fi_hash);
spin_unlock(&state_lock);
+ WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
}
@@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
__nfs4_file_put_access(fp, O_RDONLY);
}
+/*
+ * Allocate a new open/delegation state counter. This is needed for
+ * pNFS for proper return on close semantics.
+ *
+ * Note that we only allocate it for pNFS-enabled exports, otherwise
+ * all pointers to struct nfs4_clnt_odstate are always NULL.
+ */
+static struct nfs4_clnt_odstate *
+alloc_clnt_odstate(struct nfs4_client *clp)
+{
+ struct nfs4_clnt_odstate *co;
+
+ co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL);
+ if (co) {
+ co->co_client = clp;
+ atomic_set(&co->co_odcount, 1);
+ }
+ return co;
+}
+
+static void
+hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co)
+{
+ struct nfs4_file *fp = co->co_file;
+
+ lockdep_assert_held(&fp->fi_lock);
+ list_add(&co->co_perfile, &fp->fi_clnt_odstate);
+}
+
+static inline void
+get_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+ if (co)
+ atomic_inc(&co->co_odcount);
+}
+
+static void
+put_clnt_odstate(struct nfs4_clnt_odstate *co)
+{
+ struct nfs4_file *fp;
+
+ if (!co)
+ return;
+
+ fp = co->co_file;
+ if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) {
+ list_del(&co->co_perfile);
+ spin_unlock(&fp->fi_lock);
+
+ nfsd4_return_all_file_layouts(co->co_client, fp);
+ kmem_cache_free(odstate_slab, co);
+ }
+}
+
+static struct nfs4_clnt_odstate *
+find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new)
+{
+ struct nfs4_clnt_odstate *co;
+ struct nfs4_client *cl;
+
+ if (!new)
+ return NULL;
+
+ cl = new->co_client;
+
+ spin_lock(&fp->fi_lock);
+ list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) {
+ if (co->co_client == cl) {
+ get_clnt_odstate(co);
+ goto out;
+ }
+ }
+ co = new;
+ co->co_file = fp;
+ hash_clnt_odstate_locked(new);
+out:
+ spin_unlock(&fp->fi_lock);
+ return co;
+}
+
struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab)
{
@@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh)
}
static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
+alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
+ struct nfs4_clnt_odstate *odstate)
{
struct nfs4_delegation *dp;
long n;
@@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
+ dp->dl_clnt_odstate = odstate;
+ get_clnt_odstate(odstate);
dp->dl_type = NFS4_OPEN_DELEGATE_READ;
dp->dl_retries = 1;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
@@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
spin_lock(&state_lock);
unhash_delegation_locked(dp);
spin_unlock(&state_lock);
+ put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
@@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_deleg_lease(dp->dl_stid.sc_file);
if (clp->cl_minorversion == 0)
@@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
{
struct nfs4_ol_stateid *stp = openlockstateid(stid);
+ put_clnt_odstate(stp->st_clnt_odstate);
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
@@ -1139,7 +1227,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid)
return sid->sequence % SESSION_HASH_SIZE;
}
-#ifdef NFSD_DEBUG
+#ifdef CONFIG_SUNRPC_DEBUG
static inline void
dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
{
@@ -1538,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
- INIT_LIST_HEAD(&clp->cl_callbacks);
INIT_LIST_HEAD(&clp->cl_revoked);
#ifdef CONFIG_NFSD_PNFS
INIT_LIST_HEAD(&clp->cl_lo_states);
@@ -1634,6 +1721,7 @@ __destroy_client(struct nfs4_client *clp)
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
+ put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
@@ -3057,6 +3145,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
spin_lock_init(&fp->fi_lock);
INIT_LIST_HEAD(&fp->fi_stateids);
INIT_LIST_HEAD(&fp->fi_delegations);
+ INIT_LIST_HEAD(&fp->fi_clnt_odstate);
fh_copy_shallow(&fp->fi_fhandle, fh);
fp->fi_deleg_file = NULL;
fp->fi_had_conflict = false;
@@ -3073,6 +3162,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
void
nfsd4_free_slabs(void)
{
+ kmem_cache_destroy(odstate_slab);
kmem_cache_destroy(openowner_slab);
kmem_cache_destroy(lockowner_slab);
kmem_cache_destroy(file_slab);
@@ -3103,8 +3193,14 @@ nfsd4_init_slabs(void)
sizeof(struct nfs4_delegation), 0, 0, NULL);
if (deleg_slab == NULL)
goto out_free_stateid_slab;
+ odstate_slab = kmem_cache_create("nfsd4_odstate",
+ sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+ if (odstate_slab == NULL)
+ goto out_free_deleg_slab;
return 0;
+out_free_deleg_slab:
+ kmem_cache_destroy(deleg_slab);
out_free_stateid_slab:
kmem_cache_destroy(stateid_slab);
out_free_file_slab:
@@ -3581,6 +3677,14 @@ alloc_stateid:
open->op_stp = nfs4_alloc_open_stateid(clp);
if (!open->op_stp)
return nfserr_jukebox;
+
+ if (nfsd4_has_session(cstate) &&
+ (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) {
+ open->op_odstate = alloc_clnt_odstate(clp);
+ if (!open->op_odstate)
+ return nfserr_jukebox;
+ }
+
return nfs_ok;
}
@@ -3757,7 +3861,7 @@ static __be32
nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
{
__be32 status;
- unsigned char old_deny_bmap;
+ unsigned char old_deny_bmap = stp->st_deny_bmap;
if (!test_access(open->op_share_access, stp))
return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
@@ -3766,7 +3870,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
spin_lock(&fp->fi_lock);
status = nfs4_file_check_deny(fp, open->op_share_deny);
if (status == nfs_ok) {
- old_deny_bmap = stp->st_deny_bmap;
set_deny(open->op_share_deny, stp);
fp->fi_share_deny |=
(open->op_share_deny & NFS4_SHARE_DENY_BOTH);
@@ -3869,7 +3972,7 @@ out_fput:
static struct nfs4_delegation *
nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
- struct nfs4_file *fp)
+ struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
{
int status;
struct nfs4_delegation *dp;
@@ -3877,7 +3980,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
- dp = alloc_init_deleg(clp, fh);
+ dp = alloc_init_deleg(clp, fh, odstate);
if (!dp)
return ERR_PTR(-ENOMEM);
@@ -3903,6 +4006,7 @@ out_unlock:
spin_unlock(&state_lock);
out:
if (status) {
+ put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_stid(&dp->dl_stid);
return ERR_PTR(status);
}
@@ -3980,7 +4084,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
default:
goto out_no_deleg;
}
- dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file);
+ dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
if (IS_ERR(dp))
goto out_no_deleg;
@@ -4049,7 +4153,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfserr_bad_stateid;
if (nfsd4_is_deleg_cur(open))
goto out;
- status = nfserr_jukebox;
}
/*
@@ -4070,6 +4173,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
release_open_stateid(stp);
goto out;
}
+
+ stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp,
+ open->op_odstate);
+ if (stp->st_clnt_odstate == open->op_odstate)
+ open->op_odstate = NULL;
}
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4118,7 +4226,7 @@ out:
}
void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
- struct nfsd4_open *open, __be32 status)
+ struct nfsd4_open *open)
{
if (open->op_openowner) {
struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
@@ -4130,6 +4238,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
kmem_cache_free(file_slab, open->op_file);
if (open->op_stp)
nfs4_put_stid(&open->op_stp->st_stid);
+ if (open->op_odstate)
+ kmem_cache_free(odstate_slab, open->op_odstate);
}
__be32
@@ -4286,9 +4396,9 @@ laundromat_main(struct work_struct *laundry)
queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
-static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
+static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
{
- if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+ if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
return nfserr_bad_stateid;
return nfs_ok;
}
@@ -4386,10 +4496,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s
return nfserr_old_stateid;
}
+static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols)
+{
+ if (ols->st_stateowner->so_is_open_owner &&
+ !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
+ return nfserr_bad_stateid;
+ return nfs_ok;
+}
+
static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
{
struct nfs4_stid *s;
- struct nfs4_ol_stateid *ols;
__be32 status = nfserr_bad_stateid;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
@@ -4419,13 +4536,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
break;
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
- ols = openlockstateid(s);
- if (ols->st_stateowner->so_is_open_owner
- && !(openowner(ols->st_stateowner)->oo_flags
- & NFS4_OO_CONFIRMED))
- status = nfserr_bad_stateid;
- else
- status = nfs_ok;
+ status = nfsd4_check_openowner_confirmed(openlockstateid(s));
break;
default:
printk("unknown stateid type %x\n", s->sc_type);
@@ -4462,85 +4573,130 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
return nfs_ok;
}
+static struct file *
+nfs4_find_file(struct nfs4_stid *s, int flags)
+{
+ if (!s)
+ return NULL;
+
+ switch (s->sc_type) {
+ case NFS4_DELEG_STID:
+ if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file))
+ return NULL;
+ return get_file(s->sc_file->fi_deleg_file);
+ case NFS4_OPEN_STID:
+ case NFS4_LOCK_STID:
+ if (flags & RD_STATE)
+ return find_readable_file(s->sc_file);
+ else
+ return find_writeable_file(s->sc_file);
+ break;
+ }
+
+ return NULL;
+}
+
+static __be32
+nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags)
+{
+ __be32 status;
+
+ status = nfsd4_check_openowner_confirmed(ols);
+ if (status)
+ return status;
+ return nfs4_check_openmode(ols, flags);
+}
+
+static __be32
+nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
+ struct file **filpp, bool *tmp_file, int flags)
+{
+ int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE;
+ struct file *file;
+ __be32 status;
+
+ file = nfs4_find_file(s, flags);
+ if (file) {
+ status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+ acc | NFSD_MAY_OWNER_OVERRIDE);
+ if (status) {
+ fput(file);
+ return status;
+ }
+
+ *filpp = file;
+ } else {
+ status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp);
+ if (status)
+ return status;
+
+ if (tmp_file)
+ *tmp_file = true;
+ }
+
+ return 0;
+}
+
/*
-* Checks for stateid operations
-*/
+ * Checks for stateid operations
+ */
__be32
-nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
- stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ int flags, struct file **filpp, bool *tmp_file)
{
- struct nfs4_stid *s;
- struct nfs4_ol_stateid *stp = NULL;
- struct nfs4_delegation *dp = NULL;
- struct svc_fh *current_fh = &cstate->current_fh;
- struct inode *ino = current_fh->fh_dentry->d_inode;
+ struct svc_fh *fhp = &cstate->current_fh;
+ struct inode *ino = d_inode(fhp->fh_dentry);
+ struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct file *file = NULL;
+ struct nfs4_stid *s = NULL;
__be32 status;
if (filpp)
*filpp = NULL;
+ if (tmp_file)
+ *tmp_file = false;
if (grace_disallows_io(net, ino))
return nfserr_grace;
- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
- return check_special_stateids(net, current_fh, stateid, flags);
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+ status = check_special_stateids(net, fhp, stateid, flags);
+ goto done;
+ }
status = nfsd4_lookup_stateid(cstate, stateid,
NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
&s, nn);
if (status)
return status;
- status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
+ status = check_stateid_generation(stateid, &s->sc_stateid,
+ nfsd4_has_session(cstate));
if (status)
goto out;
+
switch (s->sc_type) {
case NFS4_DELEG_STID:
- dp = delegstateid(s);
- status = nfs4_check_delegmode(dp, flags);
- if (status)
- goto out;
- if (filpp) {
- file = dp->dl_stid.sc_file->fi_deleg_file;
- if (!file) {
- WARN_ON_ONCE(1);
- status = nfserr_serverfault;
- goto out;
- }
- get_file(file);
- }
+ status = nfs4_check_delegmode(delegstateid(s), flags);
break;
case NFS4_OPEN_STID:
case NFS4_LOCK_STID:
- stp = openlockstateid(s);
- status = nfs4_check_fh(current_fh, stp);
- if (status)
- goto out;
- if (stp->st_stateowner->so_is_open_owner
- && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED))
- goto out;
- status = nfs4_check_openmode(stp, flags);
- if (status)
- goto out;
- if (filpp) {
- struct nfs4_file *fp = stp->st_stid.sc_file;
-
- if (flags & RD_STATE)
- file = find_readable_file(fp);
- else
- file = find_writeable_file(fp);
- }
+ status = nfs4_check_olstateid(fhp, openlockstateid(s), flags);
break;
default:
status = nfserr_bad_stateid;
- goto out;
+ break;
}
- status = nfs_ok;
- if (file)
- *filpp = file;
+ if (status)
+ goto out;
+ status = nfs4_check_fh(fhp, s);
+
+done:
+ if (!status && filpp)
+ status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags);
out:
- nfs4_put_stid(s);
+ if (s)
+ nfs4_put_stid(s);
return status;
}
@@ -4642,7 +4798,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
if (status)
return status;
- return nfs4_check_fh(current_fh, stp);
+ return nfs4_check_fh(current_fh, &stp->st_stid);
}
/*
@@ -4853,9 +5009,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
- nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
- stp->st_stid.sc_file);
-
nfsd4_close_open_stateid(stp);
/* put reference from nfs4_preprocess_seqid_op */
@@ -4932,20 +5085,22 @@ nfs4_transform_lock_offset(struct file_lock *lock)
lock->fl_end = OFFSET_MAX;
}
-static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src)
+static fl_owner_t
+nfsd4_fl_get_owner(fl_owner_t owner)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner;
- dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner));
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+ nfs4_get_stateowner(&lo->lo_owner);
+ return owner;
}
-static void nfsd4_fl_put_owner(struct file_lock *fl)
+static void
+nfsd4_fl_put_owner(fl_owner_t owner)
{
- struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
- if (lo) {
+ if (lo)
nfs4_put_stateowner(&lo->lo_owner);
- fl->fl_owner = NULL;
- }
}
static const struct lock_manager_operations nfsd_posix_mng_ops = {
@@ -5169,7 +5324,7 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_file *fi = ost->st_stid.sc_file;
struct nfs4_openowner *oo = openowner(ost->st_stateowner);
struct nfs4_client *cl = oo->oo_owner.so_client;
- struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
+ struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
struct nfs4_lockowner *lo;
unsigned int strhashval;
@@ -5394,7 +5549,7 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (!err) {
err = nfserrno(vfs_test_lock(file, lock));
- nfsd_close(file);
+ fput(file);
}
return err;
}
@@ -6487,6 +6642,7 @@ nfs4_state_shutdown_net(struct net *net)
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
+ put_clnt_odstate(dp->dl_clnt_odstate);
nfs4_put_deleg_lease(dp->dl_stid.sc_file);
nfs4_put_stid(&dp->dl_stid);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5fb7e78169a6..75e0563c09d1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -33,6 +33,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/statfs.h>
@@ -424,7 +425,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
len += 4;
dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
- if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
+ if (dummy32 > NFS4_MAXLABELLEN)
return nfserr_badlabel;
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
@@ -2020,7 +2021,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
* dentries/path components in an array.
*/
for (;;) {
- if (cur.dentry == root->dentry && cur.mnt == root->mnt)
+ if (path_equal(&cur, root))
break;
if (cur.dentry == cur.mnt->mnt_root) {
if (follow_up(&cur))
@@ -2142,6 +2143,7 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+#define WORD2_ABSENT_FS_ATTRS 0
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static inline __be32
@@ -2170,7 +2172,7 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
{ return 0; }
#endif
-static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
+static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err)
{
/* As per referral draft: */
if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
@@ -2183,6 +2185,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
}
*bmval0 &= WORD0_ABSENT_FS_ATTRS;
*bmval1 &= WORD1_ABSENT_FS_ATTRS;
+ *bmval2 &= WORD2_ABSENT_FS_ATTRS;
return 0;
}
@@ -2227,7 +2230,6 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
u32 rdattr_err = 0;
__be32 status;
int err;
- int aclsupport = 0;
struct nfs4_acl *acl = NULL;
void *context = NULL;
int contextlen;
@@ -2246,8 +2248,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
if (exp->ex_fslocs.migrated) {
- BUG_ON(bmval[2]);
- status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
+ status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err);
if (status)
goto out;
}
@@ -2274,25 +2275,21 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out;
fhp = tempfh;
}
- if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
- | FATTR4_WORD0_SUPPORTED_ATTRS)) {
+ if (bmval0 & FATTR4_WORD0_ACL) {
err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
- aclsupport = (err == 0);
- if (bmval0 & FATTR4_WORD0_ACL) {
- if (err == -EOPNOTSUPP)
- bmval0 &= ~FATTR4_WORD0_ACL;
- else if (err == -EINVAL) {
- status = nfserr_attrnotsupp;
- goto out;
- } else if (err != 0)
- goto out_nfserr;
- }
+ if (err == -EOPNOTSUPP)
+ bmval0 &= ~FATTR4_WORD0_ACL;
+ else if (err == -EINVAL) {
+ status = nfserr_attrnotsupp;
+ goto out;
+ } else if (err != 0)
+ goto out_nfserr;
}
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
- if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
- bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
- err = security_inode_getsecctx(dentry->d_inode,
+ if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) ||
+ bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ err = security_inode_getsecctx(d_inode(dentry),
&context, &contextlen);
contextsupport = (err == 0);
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
@@ -2338,7 +2335,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
u32 word1 = nfsd_suppattrs1(minorversion);
u32 word2 = nfsd_suppattrs2(minorversion);
- if (!aclsupport)
+ if (!IS_POSIXACL(dentry->d_inode))
word0 &= ~FATTR4_WORD0_ACL;
if (!contextsupport)
word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
@@ -2384,7 +2381,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
- p = encode_change(p, &stat, dentry->d_inode);
+ p = encode_change(p, &stat, d_inode(dentry));
}
if (bmval0 & FATTR4_WORD0_SIZE) {
p = xdr_reserve_space(xdr, 8);
@@ -2486,7 +2483,7 @@ out_acl:
p = xdr_reserve_space(xdr, 4);
if (!p)
goto out_resource;
- *p++ = cpu_to_be32(aclsupport ?
+ *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ?
ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
}
if (bmval0 & FATTR4_WORD0_CANSETTIME) {
@@ -2807,7 +2804,7 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
- if (!dentry->d_inode) {
+ if (d_really_is_negative(dentry)) {
/*
* nfsd_buffered_readdir drops the i_mutex between
* readdir and calling this callback, leaving a window
@@ -3324,7 +3321,7 @@ static __be32 nfsd4_encode_splice_read(
}
eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
*(p++) = htonl(eof);
*(p++) = htonl(maxcount);
@@ -3401,7 +3398,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
tmp = htonl(eof);
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
@@ -3423,47 +3420,50 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct xdr_stream *xdr = &resp->xdr;
struct file *file = read->rd_filp;
int starting_len = xdr->buf->len;
- struct raparms *ra;
+ struct raparms *ra = NULL;
__be32 *p;
- __be32 err;
if (nfserr)
- return nfserr;
+ goto out;
p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
if (!p) {
WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
- return nfserr_resource;
+ nfserr = nfserr_resource;
+ goto out;
}
- if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
+ if (resp->xdr.buf->page_len &&
+ test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
WARN_ON_ONCE(1);
- return nfserr_resource;
+ nfserr = nfserr_resource;
+ goto out;
}
xdr_commit_encode(xdr);
maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount,
+ (xdr->buf->buflen - xdr->buf->len));
maxcount = min_t(unsigned long, maxcount, read->rd_length);
- if (!read->rd_filp) {
- err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
- &file, &ra);
- if (err)
- goto err_truncate;
- }
+ if (read->rd_tmp_file)
+ ra = nfsd_init_raparms(file);
- if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
- err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ if (file->f_op->splice_read &&
+ test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
else
- err = nfsd4_encode_readv(resp, read, file, maxcount);
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
- if (!read->rd_filp)
- nfsd_put_tmp_read_open(file, ra);
+ if (ra)
+ nfsd_put_raparams(file, ra);
-err_truncate:
- if (err)
+ if (nfserr)
xdr_truncate_encode(xdr, starting_len);
- return err;
+
+out:
+ if (file)
+ fput(file);
+ return nfserr;
}
static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index aa47d75ddb26..9690cb4dd588 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1250,15 +1250,15 @@ static int __init init_nfsd(void)
int retval;
printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
- retval = register_cld_notifier();
- if (retval)
- return retval;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_unregister_notifier;
- retval = nfsd4_init_slabs();
+ return retval;
+ retval = register_cld_notifier();
if (retval)
goto out_unregister_pernet;
+ retval = nfsd4_init_slabs();
+ if (retval)
+ goto out_unregister_notifier;
retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
@@ -1290,10 +1290,10 @@ out_exit_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
-out_unregister_pernet:
- unregister_pernet_subsys(&nfsd_net_ops);
out_unregister_notifier:
unregister_cld_notifier();
+out_unregister_pernet:
+ unregister_pernet_subsys(&nfsd_net_ops);
return retval;
}
@@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void)
nfsd4_exit_pnfs();
nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
- unregister_pernet_subsys(&nfsd_net_ops);
unregister_cld_notifier();
+ unregister_pernet_subsys(&nfsd_net_ops);
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 565c4da1a9eb..cf980523898b 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
#include "export.h"
#undef ifdebug
-#ifdef NFSD_DEBUG
+#ifdef CONFIG_SUNRPC_DEBUG
# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
#else
# define ifdebug(flag) if (0)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e9fa966fc37f..350041a40fe5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -38,7 +38,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
/* make sure parents give x permission to user */
int err;
parent = dget_parent(tdentry);
- err = inode_permission(parent->d_inode, MAY_EXEC);
+ err = inode_permission(d_inode(parent), MAY_EXEC);
if (err < 0) {
dput(parent);
break;
@@ -340,7 +340,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
if (error)
goto out;
- error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
+ error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type);
if (error)
goto out;
@@ -412,8 +412,8 @@ static inline void _fh_update_old(struct dentry *dentry,
struct svc_export *exp,
struct knfsd_fh *fh)
{
- fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino);
- fh->ofh_generation = dentry->d_inode->i_generation;
+ fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
+ fh->ofh_generation = d_inode(dentry)->i_generation;
if (d_is_dir(dentry) ||
(exp->ex_flags & NFSEXP_NOSUBTREECHECK))
fh->ofh_dirino = 0;
@@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp)
static struct super_block *exp_sb(struct svc_export *exp)
{
- return exp->ex_path.dentry->d_inode->i_sb;
+ return d_inode(exp->ex_path.dentry)->i_sb;
}
static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
@@ -520,12 +520,12 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
*
*/
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
dev_t ex_dev = exp_sb(exp)->s_dev;
dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
MAJOR(ex_dev), MINOR(ex_dev),
- (long) exp->ex_path.dentry->d_inode->i_ino,
+ (long) d_inode(exp->ex_path.dentry)->i_ino,
dentry,
(inode ? inode->i_ino : 0));
@@ -558,7 +558,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
fhp->fh_handle.ofh_xino =
- ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino);
+ ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
if (inode)
_fh_update_old(dentry, exp, &fhp->fh_handle);
@@ -570,7 +570,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
mk_fsid(fhp->fh_handle.fh_fsid_type,
fhp->fh_handle.fh_fsid,
ex_dev,
- exp->ex_path.dentry->d_inode->i_ino,
+ d_inode(exp->ex_path.dentry)->i_ino,
exp->ex_fsid, exp->ex_uuid);
if (inode)
@@ -597,7 +597,7 @@ fh_update(struct svc_fh *fhp)
goto out_bad;
dentry = fhp->fh_dentry;
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
goto out_negative;
if (fhp->fh_handle.fh_version != 1) {
_fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index f22920442172..1e90dad4926b 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -225,7 +225,7 @@ fill_pre_wcc(struct svc_fh *fhp)
{
struct inode *inode;
- inode = fhp->fh_dentry->d_inode;
+ inode = d_inode(fhp->fh_dentry);
if (!fhp->fh_pre_saved) {
fhp->fh_pre_mtime = inode->i_mtime;
fhp->fh_pre_ctime = inode->i_ctime;
@@ -264,7 +264,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
return;
}
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
mutex_lock_nested(&inode->i_mutex, subclass);
fill_pre_wcc(fhp);
fhp->fh_locked = 1;
@@ -284,7 +284,7 @@ fh_unlock(struct svc_fh *fhp)
{
if (fhp->fh_locked) {
fill_post_wcc(fhp);
- mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
fhp->fh_locked = 0;
}
}
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index b8680738f588..4cd78ef4c95c 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -59,13 +59,61 @@ static __be32
nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
struct nfsd_attrstat *resp)
{
+ struct iattr *iap = &argp->attrs;
+ struct svc_fh *fhp;
__be32 nfserr;
+
dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
SVCFH_fmt(&argp->fh),
argp->attrs.ia_valid, (long) argp->attrs.ia_size);
- fh_copy(&resp->fh, &argp->fh);
- nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0);
+ fhp = fh_copy(&resp->fh, &argp->fh);
+
+ /*
+ * NFSv2 does not differentiate between "set-[ac]time-to-now"
+ * which only requires access, and "set-[ac]time-to-X" which
+ * requires ownership.
+ * So if it looks like it might be "set both to the same time which
+ * is close to now", and if inode_change_ok fails, then we
+ * convert to "set to now" instead of "set to explicit time"
+ *
+ * We only call inode_change_ok as the last test as technically
+ * it is not an interface that we should be using.
+ */
+#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
+#define MAX_TOUCH_TIME_ERROR (30*60)
+ if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
+ iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
+ /*
+ * Looks probable.
+ *
+ * Now just make sure time is in the right ballpark.
+ * Solaris, at least, doesn't seem to care what the time
+ * request is. We require it be within 30 minutes of now.
+ */
+ time_t delta = iap->ia_atime.tv_sec - get_seconds();
+ struct inode *inode;
+
+ nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+ if (nfserr)
+ goto done;
+ inode = d_inode(fhp->fh_dentry);
+
+ if (delta < 0)
+ delta = -delta;
+ if (delta < MAX_TOUCH_TIME_ERROR &&
+ inode_change_ok(inode, iap) != 0) {
+ /*
+ * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
+ * This will cause notify_change to set these times
+ * to "now"
+ */
+ iap->ia_valid &= ~BOTH_TIME_SET;
+ }
+ }
+
+ nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time_t)0);
+done:
return nfsd_return_attrs(nfserr, resp);
}
@@ -223,7 +271,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
}
fh_init(newfhp, NFS_FHSIZE);
nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp);
- if (!nfserr && !dchild->d_inode)
+ if (!nfserr && d_really_is_negative(dchild))
nfserr = nfserr_noent;
dput(dchild);
if (nfserr) {
@@ -241,7 +289,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
}
}
- inode = newfhp->fh_dentry->d_inode;
+ inode = d_inode(newfhp->fh_dentry);
/* Unfudge the mode bits */
if (attr->ia_valid & ATTR_MODE) {
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 412d7061f9e5..79d964aa8079 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -187,7 +187,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
*p++ = htonl((u32) stat->ino);
*p++ = htonl((u32) stat->atime.tv_sec);
*p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
- lease_get_mtime(dentry->d_inode, &time);
+ lease_get_mtime(d_inode(dentry), &time);
*p++ = htonl((u32) time.tv_sec);
*p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
*p++ = htonl((u32) stat->ctime.tv_sec);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4f3bfeb11766..4874ce515fc1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -63,12 +63,13 @@ typedef struct {
struct nfsd4_callback {
struct nfs4_client *cb_clp;
- struct list_head cb_per_client;
u32 cb_minorversion;
struct rpc_message cb_msg;
struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
- bool cb_done;
+ int cb_status;
+ bool cb_update_seq_nr;
+ bool cb_need_restart;
};
struct nfsd4_callback_ops {
@@ -126,6 +127,7 @@ struct nfs4_delegation {
struct list_head dl_perfile;
struct list_head dl_perclnt;
struct list_head dl_recall_lru; /* delegation recalled */
+ struct nfs4_clnt_odstate *dl_clnt_odstate;
u32 dl_type;
time_t dl_time;
/* For recall: */
@@ -332,7 +334,6 @@ struct nfs4_client {
int cl_cb_state;
struct nfsd4_callback cl_cb_null;
struct nfsd4_session *cl_cb_session;
- struct list_head cl_callbacks; /* list of in-progress callbacks */
/* for all client information that callback code might need: */
spinlock_t cl_lock;
@@ -465,6 +466,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
}
/*
+ * Per-client state indicating no. of opens and outstanding delegations
+ * on a file from a particular client.'od' stands for 'open & delegation'
+ */
+struct nfs4_clnt_odstate {
+ struct nfs4_client *co_client;
+ struct nfs4_file *co_file;
+ struct list_head co_perfile;
+ atomic_t co_odcount;
+};
+
+/*
* nfs4_file: a file opened by some number of (open) nfs4_stateowners.
*
* These objects are global. nfsd keeps one instance of a nfs4_file per
@@ -485,6 +497,7 @@ struct nfs4_file {
struct list_head fi_delegations;
struct rcu_head fi_rcu;
};
+ struct list_head fi_clnt_odstate;
/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
struct file * fi_fds[3];
/*
@@ -526,6 +539,7 @@ struct nfs4_ol_stateid {
struct list_head st_perstateowner;
struct list_head st_locks;
struct nfs4_stateowner * st_stateowner;
+ struct nfs4_clnt_odstate * st_clnt_odstate;
unsigned char st_access_bmap;
unsigned char st_deny_bmap;
struct nfs4_ol_stateid * st_openstp;
@@ -569,9 +583,9 @@ enum nfsd4_cb_op {
struct nfsd4_compound_state;
struct nfsd_net;
-extern __be32 nfs4_preprocess_stateid_op(struct net *net,
- struct nfsd4_compound_state *cstate,
- stateid_t *stateid, int flags, struct file **filp);
+extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ int flags, struct file **filp, bool *tmp_file);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 368526582429..b5e077a6e7d4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -174,7 +174,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
return 1;
if (!(exp->ex_flags & NFSEXP_V4ROOT))
return 0;
- return dentry->d_inode != NULL;
+ return d_inode(dentry) != NULL;
}
__be32
@@ -270,7 +270,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
* dentry may be negative, it may need to be updated.
*/
err = fh_compose(resfh, exp, dentry, fhp);
- if (!err && !dentry->d_inode)
+ if (!err && d_really_is_negative(dentry))
err = nfserr_noent;
out:
dput(dentry);
@@ -284,7 +284,7 @@ out:
static int
commit_metadata(struct svc_fh *fhp)
{
- struct inode *inode = fhp->fh_dentry->d_inode;
+ struct inode *inode = d_inode(fhp->fh_dentry);
const struct export_operations *export_ops = inode->i_sb->s_export_op;
if (!EX_ISSYNC(fhp->fh_export))
@@ -302,42 +302,6 @@ commit_metadata(struct svc_fh *fhp)
static void
nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{
- /*
- * NFSv2 does not differentiate between "set-[ac]time-to-now"
- * which only requires access, and "set-[ac]time-to-X" which
- * requires ownership.
- * So if it looks like it might be "set both to the same time which
- * is close to now", and if inode_change_ok fails, then we
- * convert to "set to now" instead of "set to explicit time"
- *
- * We only call inode_change_ok as the last test as technically
- * it is not an interface that we should be using.
- */
-#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
-#define MAX_TOUCH_TIME_ERROR (30*60)
- if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET &&
- iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) {
- /*
- * Looks probable.
- *
- * Now just make sure time is in the right ballpark.
- * Solaris, at least, doesn't seem to care what the time
- * request is. We require it be within 30 minutes of now.
- */
- time_t delta = iap->ia_atime.tv_sec - get_seconds();
- if (delta < 0)
- delta = -delta;
- if (delta < MAX_TOUCH_TIME_ERROR &&
- inode_change_ok(inode, iap) != 0) {
- /*
- * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME.
- * This will cause notify_change to set these times
- * to "now"
- */
- iap->ia_valid &= ~BOTH_TIME_SET;
- }
- }
-
/* sanitize the mode change */
if (iap->ia_valid & ATTR_MODE) {
iap->ia_mode &= S_IALLUGO;
@@ -364,7 +328,7 @@ static __be32
nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct iattr *iap)
{
- struct inode *inode = fhp->fh_dentry->d_inode;
+ struct inode *inode = d_inode(fhp->fh_dentry);
int host_err;
if (iap->ia_size < inode->i_size) {
@@ -426,7 +390,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
}
dentry = fhp->fh_dentry;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
/* Ignore any mode updates on symlinks */
if (S_ISLNK(inode->i_mode))
@@ -495,7 +459,7 @@ out:
*/
int nfsd4_is_junction(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (inode == NULL)
return 0;
@@ -521,9 +485,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&d_inode(dentry)->i_mutex);
host_error = security_inode_setsecctx(dentry, label->data, label->len);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
return nfserrno(host_error);
}
#else
@@ -538,16 +502,11 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
{
- __be32 err;
int error;
if (!S_ISREG(file_inode(file)->i_mode))
return nfserr_inval;
- err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE);
- if (err)
- return err;
-
error = vfs_fallocate(file, flags, offset, len);
if (!error)
error = commit_metadata(fhp);
@@ -706,7 +665,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
- inode = path.dentry->d_inode;
+ inode = d_inode(path.dentry);
/* Disallow write access to files with the append-only bit set
* or any access when mandatory locking enabled
@@ -744,7 +703,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
host_err = ima_file_check(file, may_flags, 0);
if (host_err) {
- nfsd_close(file);
+ fput(file);
goto out_nfserr;
}
@@ -761,23 +720,12 @@ out:
return err;
}
-/*
- * Close a file.
- */
-void
-nfsd_close(struct file *filp)
-{
- fput(filp);
-}
-
-/*
- * Obtain the readahead parameters for the file
- * specified by (dev, ino).
- */
-
-static inline struct raparms *
-nfsd_get_raparms(dev_t dev, ino_t ino)
+struct raparms *
+nfsd_init_raparms(struct file *file)
{
+ struct inode *inode = file_inode(file);
+ dev_t dev = inode->i_sb->s_dev;
+ ino_t ino = inode->i_ino;
struct raparms *ra, **rap, **frap = NULL;
int depth = 0;
unsigned int hash;
@@ -814,9 +762,23 @@ found:
ra->p_count++;
nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
spin_unlock(&rab->pb_lock);
+
+ if (ra->p_set)
+ file->f_ra = ra->p_ra;
return ra;
}
+void nfsd_put_raparams(struct file *file, struct raparms *ra)
+{
+ struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+
+ spin_lock(&rab->pb_lock);
+ ra->p_ra = file->f_ra;
+ ra->p_set = 1;
+ ra->p_count--;
+ spin_unlock(&rab->pb_lock);
+}
+
/*
* Grab and keep cached pages associated with a file in the svc_rqst
* so that they can be passed to the network sendmsg/sendpage routines
@@ -945,7 +907,7 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
-static __be32
+__be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
unsigned long *cnt, int *stablep)
@@ -1009,40 +971,6 @@ out_nfserr:
return err;
}
-__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct file **file, struct raparms **ra)
-{
- struct inode *inode;
- __be32 err;
-
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file);
- if (err)
- return err;
-
- inode = file_inode(*file);
-
- /* Get readahead parameters */
- *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
- if (*ra && (*ra)->p_set)
- (*file)->f_ra = (*ra)->p_ra;
- return nfs_ok;
-}
-
-void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra)
-{
- /* Write back readahead params */
- if (ra) {
- struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
- spin_lock(&rab->pb_lock);
- ra->p_ra = file->f_ra;
- ra->p_set = 1;
- ra->p_count--;
- spin_unlock(&rab->pb_lock);
- }
- nfsd_close(file);
-}
-
/*
* Read data from a file. count must contain the requested read count
* on entry. On return, *count contains the number of bytes actually read.
@@ -1055,13 +983,15 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
- err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra);
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
+ ra = nfsd_init_raparms(file);
err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
-
- nfsd_put_tmp_read_open(file, ra);
+ if (ra)
+ nfsd_put_raparams(file, ra);
+ fput(file);
return err;
}
@@ -1093,7 +1023,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (cnt)
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
cnt, stablep);
- nfsd_close(file);
+ fput(file);
}
out:
return err;
@@ -1138,7 +1068,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserr_notsupp;
}
- nfsd_close(file);
+ fput(file);
out:
return err;
}
@@ -1211,7 +1141,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
dentry = fhp->fh_dentry;
- dirp = dentry->d_inode;
+ dirp = d_inode(dentry);
err = nfserr_notdir;
if (!dirp->i_op->lookup)
@@ -1250,7 +1180,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
* Make sure the child dentry is still negative ...
*/
err = nfserr_exist;
- if (dchild->d_inode) {
+ if (d_really_is_positive(dchild)) {
dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
dentry, dchild);
goto out;
@@ -1353,7 +1283,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
dentry = fhp->fh_dentry;
- dirp = dentry->d_inode;
+ dirp = d_inode(dentry);
/* Get all the sanity checks out of the way before
* we lock the parent. */
@@ -1376,7 +1306,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
/* If file doesn't exist, check for permissions to create one */
- if (!dchild->d_inode) {
+ if (d_really_is_negative(dchild)) {
err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1397,7 +1327,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
v_atime = verifier[1]&0x7fffffff;
}
- if (dchild->d_inode) {
+ if (d_really_is_positive(dchild)) {
err = 0;
switch (createmode) {
@@ -1420,17 +1350,17 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
break;
case NFS3_CREATE_EXCLUSIVE:
- if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
- && dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 ) {
+ if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+ && d_inode(dchild)->i_atime.tv_sec == v_atime
+ && d_inode(dchild)->i_size == 0 ) {
if (created)
*created = 1;
break;
}
case NFS4_CREATE_EXCLUSIVE4_1:
- if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
- && dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 ) {
+ if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+ && d_inode(dchild)->i_atime.tv_sec == v_atime
+ && d_inode(dchild)->i_size == 0 ) {
if (created)
*created = 1;
goto set_attr;
@@ -1513,7 +1443,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
path.mnt = fhp->fh_export->ex_path.mnt;
path.dentry = fhp->fh_dentry;
- inode = path.dentry->d_inode;
+ inode = d_inode(path.dentry);
err = nfserr_inval;
if (!inode->i_op->readlink)
@@ -1576,7 +1506,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dnew))
goto out_nfserr;
- host_err = vfs_symlink(dentry->d_inode, dnew, path);
+ host_err = vfs_symlink(d_inode(dentry), dnew, path);
err = nfserrno(host_err);
if (!err)
err = nfserrno(commit_metadata(fhp));
@@ -1632,7 +1562,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
fh_lock_nested(ffhp, I_MUTEX_PARENT);
ddir = ffhp->fh_dentry;
- dirp = ddir->d_inode;
+ dirp = d_inode(ddir);
dnew = lookup_one_len(name, ddir, len);
host_err = PTR_ERR(dnew);
@@ -1642,7 +1572,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dold = tfhp->fh_dentry;
err = nfserr_noent;
- if (!dold->d_inode)
+ if (d_really_is_negative(dold))
goto out_dput;
host_err = vfs_link(dold, dirp, dnew, NULL);
if (!host_err) {
@@ -1689,10 +1619,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out;
fdentry = ffhp->fh_dentry;
- fdir = fdentry->d_inode;
+ fdir = d_inode(fdentry);
tdentry = tfhp->fh_dentry;
- tdir = tdentry->d_inode;
+ tdir = d_inode(tdentry);
err = nfserr_perm;
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
@@ -1717,7 +1647,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out_nfserr;
host_err = -ENOENT;
- if (!odentry->d_inode)
+ if (d_really_is_negative(odentry))
goto out_dput_old;
host_err = -EINVAL;
if (odentry == trap)
@@ -1790,21 +1720,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
fh_lock_nested(fhp, I_MUTEX_PARENT);
dentry = fhp->fh_dentry;
- dirp = dentry->d_inode;
+ dirp = d_inode(dentry);
rdentry = lookup_one_len(fname, dentry, flen);
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
goto out_nfserr;
- if (!rdentry->d_inode) {
+ if (d_really_is_negative(rdentry)) {
dput(rdentry);
err = nfserr_noent;
goto out;
}
if (!type)
- type = rdentry->d_inode->i_mode & S_IFMT;
+ type = d_inode(rdentry)->i_mode & S_IFMT;
if (type != S_IFDIR)
host_err = vfs_unlink(dirp, rdentry, NULL);
@@ -1977,7 +1907,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
if (err == nfserr_eof || err == nfserr_toosmall)
err = nfs_ok; /* can still be found in ->err */
out_close:
- nfsd_close(file);
+ fput(file);
out:
return err;
}
@@ -2015,7 +1945,7 @@ __be32
nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
struct dentry *dentry, int acc)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err;
if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2050cb016998..5be875e3e638 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -71,11 +71,7 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
#endif /* CONFIG_NFSD_V3 */
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
-void nfsd_close(struct file *);
struct raparms;
-__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
- struct file **, struct raparms **);
-void nfsd_put_tmp_read_open(struct file *, struct raparms *);
__be32 nfsd_splice_read(struct svc_rqst *,
struct file *, loff_t, unsigned long *);
__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int,
@@ -84,6 +80,10 @@ __be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
loff_t, struct kvec *,int, unsigned long *, int *);
+__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+ struct kvec *vec, int vlen, unsigned long *cnt,
+ int *stablep);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
@@ -104,6 +104,9 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
+struct raparms *nfsd_init_raparms(struct file *file);
+void nfsd_put_raparams(struct file *file, struct raparms *ra);
+
static inline int fh_want_write(struct svc_fh *fh)
{
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0bda93e58e1b..9f991007a578 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,7 +40,6 @@
#include "state.h"
#include "nfsd.h"
-#define NFSD4_MAX_SEC_LABEL_LEN 2048
#define NFSD4_MAX_TAGLEN 128
#define XDR_LEN(n) (((n) + 3) & ~3)
@@ -248,6 +247,7 @@ struct nfsd4_open {
struct nfs4_openowner *op_openowner; /* used during processing */
struct nfs4_file *op_file; /* used during processing */
struct nfs4_ol_stateid *op_stp; /* used during processing */
+ struct nfs4_clnt_odstate *op_odstate; /* used during processing */
struct nfs4_acl *op_acl;
struct xdr_netobj op_label;
};
@@ -273,6 +273,7 @@ struct nfsd4_read {
u32 rd_length; /* request */
int rd_vlen;
struct file *rd_filp;
+ bool rd_tmp_file;
struct svc_rqst *rd_rqstp; /* response */
struct svc_fh * rd_fhp; /* response */
@@ -632,7 +633,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
cinfo->atomic = fhp->fh_post_saved;
- cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
+ cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
cinfo->before_change = fhp->fh_pre_change;
cinfo->after_change = fhp->fh_post_change;
@@ -683,7 +684,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
- struct nfsd4_open *open, __be32 status);
+ struct nfsd4_open *open);
extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02e0444..8df0f3b7839b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
{
- unsigned long blknum;
+ __u64 blknum;
int ret;
ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
- blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+ (unsigned long)blknum,
+ NILFS_MDT(inode)->mi_blocks_per_desc_block);
return ret;
}
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b5e3e8..27f75bcbeb30 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
*
* %-EEXIST - A record associated with @key already exist.
*/
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
{
int ret;
@@ -191,19 +189,47 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
return bmap->b_ops->bop_delete(bmap, key);
}
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+/**
+ * nilfs_bmap_seek_key - seek a valid entry and return its key
+ * @bmap: bmap struct
+ * @start: start key number
+ * @keyp: place to store valid key
+ *
+ * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap
+ * starting from @start, and stores it to @keyp if found.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No valid entry was found
+ */
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp)
{
- __u64 lastkey;
int ret;
down_read(&bmap->b_sem);
- ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+ ret = bmap->b_ops->bop_seek_key(bmap, start, keyp);
+ up_read(&bmap->b_sem);
+
+ if (ret < 0)
+ ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+ return ret;
+}
+
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
+{
+ int ret;
+
+ down_read(&bmap->b_sem);
+ ret = bmap->b_ops->bop_last_key(bmap, keyp);
up_read(&bmap->b_sem);
if (ret < 0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
- else
- *key = lastkey;
return ret;
}
@@ -224,7 +250,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
*
* %-ENOENT - A record associated with @key does not exist.
*/
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
{
int ret;
@@ -235,7 +261,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
}
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
{
__u64 lastkey;
int ret;
@@ -276,7 +302,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
*
* %-ENOMEM - Insufficient amount of memory available.
*/
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
{
int ret;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e68076adc..bfa817ce40b3 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -76,8 +76,10 @@ struct nilfs_bmap_operations {
union nilfs_binfo *);
int (*bop_mark)(struct nilfs_bmap *, __u64, int);
- /* The following functions are internal use only. */
+ int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *);
int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+
+ /* The following functions are internal use only. */
int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
int (*bop_check_delete)(struct nilfs_bmap *, __u64);
int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
@@ -153,10 +155,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
void nilfs_bmap_clear(struct nilfs_bmap *);
int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index ecdbae19a766..919fd5bb14a8 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
nchildren = nilfs_btree_node_get_nchildren(node);
if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
- level > NILFS_BTREE_LEVEL_MAX ||
+ level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
@@ -633,6 +633,44 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
return 0;
}
+/**
+ * nilfs_btree_get_next_key - get next valid key from btree path array
+ * @btree: bmap struct of btree
+ * @path: array of nilfs_btree_path struct
+ * @minlevel: start level
+ * @nextkey: place to store the next valid key
+ *
+ * Return Value: If a next key was found, 0 is returned. Otherwise,
+ * -ENOENT is returned.
+ */
+static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree,
+ const struct nilfs_btree_path *path,
+ int minlevel, __u64 *nextkey)
+{
+ struct nilfs_btree_node *node;
+ int maxlevel = nilfs_btree_height(btree) - 1;
+ int index, next_adj, level;
+
+ /* Next index is already set to bp_index for leaf nodes. */
+ next_adj = 0;
+ for (level = minlevel; level <= maxlevel; level++) {
+ if (level == maxlevel)
+ node = nilfs_btree_get_root(btree);
+ else
+ node = nilfs_btree_get_nonroot_node(path, level);
+
+ index = path[level].bp_index + next_adj;
+ if (index < nilfs_btree_node_get_nchildren(node)) {
+ /* Next key is in this node */
+ *nextkey = nilfs_btree_node_get_key(node, index);
+ return 0;
+ }
+ /* For non-leaf nodes, next index is stored at bp_index + 1. */
+ next_adj = 1;
+ }
+ return -ENOENT;
+}
+
static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
__u64 key, int level, __u64 *ptrp)
{
@@ -1563,6 +1601,27 @@ out:
return ret;
}
+static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start,
+ __u64 *keyp)
+{
+ struct nilfs_btree_path *path;
+ const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN;
+ int ret;
+
+ path = nilfs_btree_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0);
+ if (!ret)
+ *keyp = start;
+ else if (ret == -ENOENT)
+ ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp);
+
+ nilfs_btree_free_path(path);
+ return ret;
+}
+
static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
{
struct nilfs_btree_path *path;
@@ -2298,7 +2357,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
.bop_assign = nilfs_btree_assign,
.bop_mark = nilfs_btree_mark,
+ .bop_seek_key = nilfs_btree_seek_key,
.bop_last_key = nilfs_btree_last_key,
+
.bop_check_insert = NULL,
.bop_check_delete = nilfs_btree_check_delete,
.bop_gather_data = nilfs_btree_gather_data,
@@ -2318,7 +2379,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_assign = nilfs_btree_assign_gc,
.bop_mark = NULL,
+ .bop_seek_key = NULL,
.bop_last_key = NULL,
+
.bop_check_insert = NULL,
.bop_check_delete = NULL,
.bop_gather_data = NULL,
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 0d58075f34e2..b6596cab9e99 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
}
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+ unsigned long blkoff)
+{
+ return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+ + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset;
+}
+
static unsigned long
nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
__u64 curr,
@@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
create, nilfs_cpfile_block_init, bhp);
}
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+ __u64 start_cno, __u64 end_cno,
+ __u64 *cnop,
+ struct buffer_head **bhp)
+{
+ unsigned long start, end, blkoff;
+ int ret;
+
+ if (unlikely(start_cno > end_cno))
+ return -ENOENT;
+
+ start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+ end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+ ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp);
+ if (!ret)
+ *cnop = (blkoff == start) ? start_cno :
+ nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+ return ret;
+}
+
static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
__u64 cno)
{
@@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
return -ENOENT; /* checkpoint number 0 is invalid */
down_read(&NILFS_MDT(cpfile)->mi_sem);
- for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
- ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
- ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+ for (n = 0; n < nci; cno += ncps) {
+ ret = nilfs_cpfile_find_checkpoint_block(
+ cpfile, cno, cur_cno - 1, &cno, &bh);
if (ret < 0) {
- if (ret != -ENOENT)
- goto out;
- continue; /* skip hole */
+ if (likely(ret == -ENOENT))
+ break;
+ goto out;
}
+ ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
kaddr = kmap_atomic(bh->b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 197a63e9d102..6b8b92b19cec 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
page_cache_release(page);
}
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -435,7 +430,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
*/
int nilfs_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned chunk_size = nilfs_chunk_size(dir);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 82f4865e86dd..ebf89fd8ac1a 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -173,6 +173,21 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
return ret;
}
+static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start,
+ __u64 *keyp)
+{
+ __u64 key;
+
+ for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) {
+ if (nilfs_direct_get_ptr(direct, key) !=
+ NILFS_BMAP_INVALID_PTR) {
+ *keyp = key;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
{
__u64 key, lastkey;
@@ -355,7 +370,9 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
.bop_assign = nilfs_direct_assign,
.bop_mark = NULL,
+ .bop_seek_key = nilfs_direct_seek_key,
.bop_last_key = nilfs_direct_last_key,
+
.bop_check_insert = nilfs_direct_check_insert,
.bop_check_delete = NULL,
.bop_gather_data = nilfs_direct_gather_data,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index a8c728acb7a8..54575e3cc1a2 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -143,8 +143,6 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
const struct file_operations nilfs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl = nilfs_ioctl,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b5969538f39..4a73d6dffabf 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -26,7 +26,7 @@
#include <linux/mpage.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
if (unlikely(err))
goto out;
- err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
+ err = nilfs_bmap_insert(ii->i_bmap, blkoff,
(unsigned long)bh_result);
if (unlikely(err != 0)) {
if (err == -EEXIST) {
@@ -305,35 +305,15 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t
-nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
+nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t size;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (rw == WRITE)
+ if (iov_iter_rw(iter) == WRITE)
return 0;
/* Needs synchronization with the cleaner */
- size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
- nilfs_get_block);
-
- /*
- * In case of error extending write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- */
- if (unlikely((rw & WRITE) && size < 0)) {
- loff_t isize = i_size_read(inode);
- loff_t end = offset + count;
-
- if (end > isize)
- nilfs_write_failed(mapping, end);
- }
-
- return size;
+ return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
}
const struct address_space_operations nilfs_aops = {
@@ -443,21 +423,20 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
void nilfs_set_inode_flags(struct inode *inode)
{
unsigned int flags = NILFS_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
- S_DIRSYNC);
if (flags & FS_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & FS_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & FS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & FS_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
- mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE |
+ S_NOATIME | S_DIRSYNC);
}
int nilfs_read_inode_common(struct inode *inode,
@@ -542,6 +521,8 @@ static int __nilfs_read_inode(struct super_block *sb,
brelse(bh);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return 0;
failed_unmap:
@@ -714,7 +695,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
unsigned long from)
{
- unsigned long b;
+ __u64 b;
int ret;
if (!test_bit(NILFS_I_BMAP, &ii->i_state))
@@ -729,7 +710,7 @@ repeat:
if (b < from)
return;
- b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
+ b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
ret = nilfs_bmap_truncate(ii->i_bmap, b);
nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
if (!ret || (ret == -ENOMEM &&
@@ -836,7 +817,7 @@ void nilfs_evict_inode(struct inode *inode)
int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct nilfs_transaction_info ti;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
int err;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 9a20e513d7eb..aba43811d6ef 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
- case FITRIM:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 892cf5ffdb8e..dee34d990281 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -261,6 +261,60 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
}
/**
+ * nilfs_mdt_find_block - find and get a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @start: start block offset (inclusive)
+ * @end: end block offset (inclusive)
+ * @blkoff: block offset
+ * @out_bh: place to store a pointer to buffer_head struct
+ *
+ * nilfs_mdt_find_block() looks up an existing block in range of
+ * [@start, @end] and stores pointer to a buffer head of the block to
+ * @out_bh, and block offset to @blkoff, respectively. @out_bh and
+ * @blkoff are substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block was found in the range
+ */
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+ unsigned long end, unsigned long *blkoff,
+ struct buffer_head **out_bh)
+{
+ __u64 next;
+ int ret;
+
+ if (unlikely(start > end))
+ return -ENOENT;
+
+ ret = nilfs_mdt_read_block(inode, start, true, out_bh);
+ if (!ret) {
+ *blkoff = start;
+ goto out;
+ }
+ if (unlikely(ret != -ENOENT || start == ULONG_MAX))
+ goto out;
+
+ ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next);
+ if (!ret) {
+ if (next <= end) {
+ ret = nilfs_mdt_read_block(inode, next, true, out_bh);
+ if (!ret)
+ *blkoff = next;
+ } else {
+ ret = -ENOENT;
+ }
+ }
+out:
+ return ret;
+}
+
+/**
* nilfs_mdt_delete_block - make a hole on the meta data file.
* @inode: inode of the meta data file
* @block: block offset
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ab172e8549c5..fe529a87a208 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -78,6 +78,9 @@ int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
struct buffer_head *, void *),
struct buffer_head **);
+int nilfs_mdt_find_block(struct inode *inode, unsigned long start,
+ unsigned long end, unsigned long *blkoff,
+ struct buffer_head **out_bh);
int nilfs_mdt_delete_block(struct inode *, unsigned long);
int nilfs_mdt_forget_block(struct inode *, unsigned long);
int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
@@ -111,7 +114,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
}
-#define nilfs_mdt_bgl_lock(inode, bg) \
- (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+ return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group);
+}
#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 0f84b257932c..37dd6b05b1b5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -192,7 +192,7 @@ out_fail:
static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct nilfs_transaction_info ti;
int err;
@@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
if (!de)
goto out;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
err = -EIO;
if (le64_to_cpu(de->inode) != inode->i_ino)
goto out;
@@ -318,7 +318,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
if (!err) {
nilfs_mark_inode_dirty(dir);
- nilfs_mark_inode_dirty(dentry->d_inode);
+ nilfs_mark_inode_dirty(d_inode(dentry));
err = nilfs_transaction_commit(dir->i_sb);
} else
nilfs_transaction_abort(dir->i_sb);
@@ -328,7 +328,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct nilfs_transaction_info ti;
int err;
@@ -358,8 +358,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct page *dir_page = NULL;
struct nilfs_dir_entry *dir_de = NULL;
struct page *old_page;
@@ -453,13 +453,13 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
struct qstr dotdot = QSTR_INIT("..", 2);
struct nilfs_root *root;
- ino = nilfs_inode_by_name(child->d_inode, &dotdot);
+ ino = nilfs_inode_by_name(d_inode(child), &dotdot);
if (!ino)
return ERR_PTR(-ENOENT);
- root = NILFS_I(child->d_inode)->i_root;
+ root = NILFS_I(d_inode(child))->i_root;
- inode = nilfs_iget(child->d_inode->i_sb, root, ino);
+ inode = nilfs_iget(d_inode(child)->i_sb, root, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
{
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
- if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE &&
- fh_len != NILFS_FID_SIZE_CONNECTABLE) ||
+ if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE ||
(fh_type != FILEID_NILFS_WITH_PARENT &&
fh_type != FILEID_NILFS_WITHOUT_PARENT))
return NULL;
@@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
{
struct nilfs_fid *fid = (struct nilfs_fid *)fh;
- if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
+ if (fh_len < NILFS_FID_SIZE_CONNECTABLE ||
fh_type != FILEID_NILFS_WITH_PARENT)
return NULL;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 700ecbcca55d..45d650addd56 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -89,18 +89,16 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
void nilfs_forget_buffer(struct buffer_head *bh)
{
struct page *page = bh->b_page;
+ const unsigned long clear_bits =
+ (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+ 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
lock_buffer(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_checked(bh);
- clear_buffer_nilfs_redirected(bh);
- clear_buffer_async_write(bh);
- clear_buffer_dirty(bh);
+ set_mask_bits(&bh->b_state, clear_bits, 0);
if (nilfs_page_buffers_clean(page))
__nilfs_clear_page_dirty(page);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
bh->b_blocknr = -1;
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -421,6 +419,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
+ const unsigned long clear_bits =
+ (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
+ 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
bh = head = page_buffers(page);
do {
@@ -430,13 +432,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
"discard block %llu, size %zu",
(u64)bh->b_blocknr, bh->b_size);
}
- clear_buffer_async_write(bh);
- clear_buffer_dirty(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_checked(bh);
- clear_buffer_nilfs_redirected(bh);
- clear_buffer_uptodate(bh);
- clear_buffer_mapped(bh);
+ set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index dc3a9efdaab8..42468e5ab3e7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err)
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nilfs_segment_buffer *segbuf = bio->bi_private;
- if (err == -EOPNOTSUPP) {
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- /* to be detected by nilfs_segbuf_submit_bio() */
- }
-
if (!uptodate)
atomic_inc(&segbuf->sb_err);
@@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
- bio_get(bio);
submit_bio(mode, bio);
segbuf->sb_nbio++;
- if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- bio_put(bio);
- err = -EOPNOTSUPP;
- goto failed;
- }
- bio_put(bio);
wi->bio = NULL;
wi->rest_blocks -= wi->end - wi->start;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0c3f303baf32..c6abbad9b8e3 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/bitops.h>
#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/blkdev.h>
@@ -1588,7 +1589,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1688,7 +1688,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1768,7 +1767,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- clear_buffer_async_write(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1788,12 +1786,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- clear_buffer_async_write(bh);
- clear_buffer_delay(bh);
- clear_buffer_nilfs_volatile(bh);
- clear_buffer_nilfs_redirected(bh);
+ const unsigned long set_bits = (1 << BH_Uptodate);
+ const unsigned long clear_bits =
+ (1 << BH_Dirty | 1 << BH_Async_Write |
+ 1 << BH_Delay | 1 << BH_NILFS_Volatile |
+ 1 << BH_NILFS_Redirected);
+
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5bc2a1cf73c3..f47585bfeb01 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -610,7 +610,7 @@ static int nilfs_unfreeze(struct super_block *sb)
static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
- struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
+ struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root;
struct the_nilfs *nilfs = root->nilfs;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
unsigned long long blocks;
@@ -681,7 +681,7 @@ static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
- struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root;
+ struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root;
if (!nilfs_test_opt(nilfs, BARRIER))
seq_puts(seq, ",nobarrier");
@@ -1020,7 +1020,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
struct dentry *dentry;
int ret;
- if (cno < 0 || cno > nilfs->ns_cno)
+ if (cno > nilfs->ns_cno)
return false;
if (cno >= nilfs_last_cno(nilfs))
@@ -1190,7 +1190,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
- root = NILFS_I(sb->s_root->d_inode)->i_root;
+ root = NILFS_I(d_inode(sb->s_root))->i_root;
err = nilfs_attach_log_writer(sb, root);
if (err)
goto restore_opts;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 450648697433..5b1e2a497e51 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -26,7 +26,7 @@
#include <linux/fs.h> /* struct inode */
#include <linux/fsnotify_backend.h>
#include <linux/idr.h>
-#include <linux/init.h> /* module_init */
+#include <linux/init.h> /* fs_initcall */
#include <linux/inotify.h>
#include <linux/kernel.h> /* roundup() */
#include <linux/namei.h> /* LOOKUP_FOLLOW */
@@ -812,4 +812,4 @@ static int __init inotify_user_setup(void)
return 0;
}
-module_init(inotify_user_setup);
+fs_initcall(inotify_user_setup);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 92e48c70f0f0..39ddcaf0918f 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -412,16 +412,36 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
unsigned int flags)
{
struct fsnotify_mark *lmark, *mark;
+ LIST_HEAD(to_free);
+ /*
+ * We have to be really careful here. Anytime we drop mark_mutex, e.g.
+ * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
+ * to_free list so we have to use mark_mutex even when accessing that
+ * list. And freeing mark requires us to drop mark_mutex. So we can
+ * reliably free only the first mark in the list. That's why we first
+ * move marks to free to to_free list in one go and then free marks in
+ * to_free list one by one.
+ */
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
- if (mark->flags & flags) {
- fsnotify_get_mark(mark);
- fsnotify_destroy_mark_locked(mark, group);
- fsnotify_put_mark(mark);
- }
+ if (mark->flags & flags)
+ list_move(&mark->g_list, &to_free);
}
mutex_unlock(&group->mark_mutex);
+
+ while (1) {
+ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ if (list_empty(&to_free)) {
+ mutex_unlock(&group->mark_mutex);
+ break;
+ }
+ mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_destroy_mark_locked(mark, group);
+ mutex_unlock(&group->mark_mutex);
+ fsnotify_put_mark(mark);
+ }
}
/*
diff --git a/fs/nsfs.c b/fs/nsfs.c
index af1b24fa899d..99521e7c492b 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -13,7 +13,7 @@ static const struct file_operations ns_file_operations = {
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
@@ -22,7 +22,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
static void ns_prune_dentry(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (inode) {
struct ns_common *ns = inode->i_private;
atomic_long_set(&ns->stashed, 0);
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 36ae529511c4..2ff263e6d363 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-ccflags-y := -DNTFS_VERSION=\"2.1.31\"
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da9b2d184dc..262561fea923 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -28,7 +28,6 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -329,62 +328,166 @@ err_out:
return err;
}
-/**
- * ntfs_fault_in_pages_readable -
- *
- * Fault a number of userspace pages into pagetables.
- *
- * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
- * with more than two userspace pages as well as handling the single page case
- * elegantly.
- *
- * If you find this difficult to understand, then think of the while loop being
- * the following code, except that we do without the integer variable ret:
- *
- * do {
- * ret = __get_user(c, uaddr);
- * uaddr += PAGE_SIZE;
- * } while (!ret && uaddr < end);
- *
- * Note, the final __get_user() may well run out-of-bounds of the user buffer,
- * but _not_ out-of-bounds of the page the user buffer belongs to, and since
- * this is only a read and not a write, and since it is still in the same page,
- * it should not matter and this makes the code much simpler.
- */
-static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
- int bytes)
+static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
+ struct iov_iter *from)
{
- const char __user *end;
- volatile char c;
-
- /* Set @end to the first byte outside the last page we care about. */
- end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
-
- while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
- ;
-}
-
-/**
- * ntfs_fault_in_pages_readable_iovec -
- *
- * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
- */
-static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
- size_t iov_ofs, int bytes)
-{
- do {
- const char __user *buf;
- unsigned len;
+ loff_t pos;
+ s64 end, ll;
+ ssize_t err;
+ unsigned long flags;
+ struct file *file = iocb->ki_filp;
+ struct inode *vi = file_inode(file);
+ ntfs_inode *base_ni, *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
- buf = iov->iov_base + iov_ofs;
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- ntfs_fault_in_pages_readable(buf, len);
- bytes -= len;
- iov++;
- iov_ofs = 0;
- } while (bytes);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%zx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)iocb->ki_pos,
+ iov_iter_count(from));
+ err = generic_write_checks(iocb, from);
+ if (unlikely(err <= 0))
+ goto out;
+ /*
+ * All checks have passed. Before we start doing any writing we want
+ * to abort any totally illegal writes.
+ */
+ BUG_ON(NInoMstProtected(ni));
+ BUG_ON(ni->type != AT_DATA);
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ /* Only $DATA attributes can be encrypted. */
+ /*
+ * Reminder for later: Encrypted files are _always_
+ * non-resident so that the content can always be encrypted.
+ */
+ ntfs_debug("Denying write access to encrypted file.");
+ err = -EACCES;
+ goto out;
+ }
+ if (NInoCompressed(ni)) {
+ /* Only unnamed $DATA attribute can be compressed. */
+ BUG_ON(ni->name_len);
+ /*
+ * Reminder for later: If resident, the data is not actually
+ * compressed. Only on the switch to non-resident does
+ * compression kick in. This is in contrast to encrypted files
+ * (see above).
+ */
+ ntfs_error(vi->i_sb, "Writing to compressed files is not "
+ "implemented yet. Sorry.");
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ base_ni = ni;
+ if (NInoAttr(ni))
+ base_ni = ni->ext.base_ntfs_ino;
+ err = file_remove_privs(file);
+ if (unlikely(err))
+ goto out;
+ /*
+ * Our ->update_time method always succeeds thus file_update_time()
+ * cannot fail either so there is no need to check the return code.
+ */
+ file_update_time(file);
+ pos = iocb->ki_pos;
+ /* The first byte after the last cluster being written to. */
+ end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
+ ~(u64)vol->cluster_size_mask;
+ /*
+ * If the write goes beyond the allocated size, extend the allocation
+ * to cover the whole of the write, rounded up to the nearest cluster.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end > ll) {
+ /*
+ * Extend the allocation without changing the data size.
+ *
+ * Note we ensure the allocation is big enough to at least
+ * write some data but we do not require the allocation to be
+ * complete, i.e. it may be partial.
+ */
+ ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+ if (likely(ll >= 0)) {
+ BUG_ON(pos >= ll);
+ /* If the extension was partial truncate the write. */
+ if (end > ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "the allocation was only "
+ "partially extended.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ iov_iter_truncate(from, ll - pos);
+ }
+ } else {
+ err = ll;
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Perform a partial write if possible or fail. */
+ if (pos < ll) {
+ ntfs_debug("Truncating write to inode 0x%lx "
+ "attribute type 0x%x, because "
+ "extending the allocation "
+ "failed (error %d).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (int)-err);
+ iov_iter_truncate(from, ll - pos);
+ } else {
+ if (err != -ENOSPC)
+ ntfs_error(vi->i_sb, "Cannot perform "
+ "write to inode "
+ "0x%lx, attribute "
+ "type 0x%x, because "
+ "extending the "
+ "allocation failed "
+ "(error %ld).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (long)-err);
+ else
+ ntfs_debug("Cannot perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because there is not "
+ "space left.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ goto out;
+ }
+ }
+ }
+ /*
+ * If the write starts beyond the initialized size, extend it up to the
+ * beginning of the write and initialize all non-sparse space between
+ * the old initialized size and the new one. This automatically also
+ * increments the vfs inode->i_size to keep it above or equal to the
+ * initialized_size.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (pos > ll) {
+ /*
+ * Wait for ongoing direct i/o to complete before proceeding.
+ * New direct i/o cannot start as we hold i_mutex.
+ */
+ inode_dio_wait(vi);
+ err = ntfs_attr_extend_initialized(ni, pos);
+ if (unlikely(err < 0))
+ ntfs_error(vi->i_sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "extending the initialized size "
+ "failed (error %d).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (int)-err);
+ }
+out:
+ return err;
}
/**
@@ -421,8 +524,9 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
goto err_out;
}
}
- err = add_to_page_cache_lru(*cached_page, mapping, index,
- GFP_KERNEL);
+ err = add_to_page_cache_lru(*cached_page, mapping,
+ index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
@@ -1268,180 +1372,6 @@ rl_not_mapped_enoent:
return err;
}
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied. If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static inline size_t ntfs_copy_from_user(struct page **pages,
- unsigned nr_pages, unsigned ofs, const char __user *buf,
- size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t total = 0;
- unsigned len;
- int left;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- left = __copy_from_user_inatomic(addr + ofs, buf, len);
- kunmap_atomic(addr);
- if (unlikely(left)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- left = __copy_from_user(addr + ofs, buf, len);
- kunmap(*pages);
- if (unlikely(left))
- goto err_out;
- }
- total += len;
- bytes -= len;
- if (!bytes)
- break;
- buf += len;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- total += len - left;
- /* Zero the rest of the target like __copy_from_user(). */
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
-static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
- const struct iovec *iov, size_t iov_ofs, size_t bytes)
-{
- size_t total = 0;
-
- while (1) {
- const char __user *buf = iov->iov_base + iov_ofs;
- unsigned len;
- size_t left;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- left = __copy_from_user_inatomic(vaddr, buf, len);
- total += len;
- bytes -= len;
- vaddr += len;
- if (unlikely(left)) {
- total -= left;
- break;
- }
- if (!bytes)
- break;
- iov++;
- iov_ofs = 0;
- }
- return total;
-}
-
-static inline void ntfs_set_next_iovec(const struct iovec **iovp,
- size_t *iov_ofsp, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t iov_ofs = *iov_ofsp;
-
- while (bytes) {
- unsigned len;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- bytes -= len;
- iov_ofs += len;
- if (iov->iov_len == iov_ofs) {
- iov++;
- iov_ofs = 0;
- }
- }
- *iovp = iov;
- *iov_ofsp = iov_ofs;
-}
-
-/*
- * This has the same side-effects and return value as ntfs_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
- * single-segment behaviour.
- *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * atomic and when not atomic. This is ok because it calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error. And on many architectures
- * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * makes no difference at all on those architectures.
- */
-static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
- unsigned nr_pages, unsigned ofs, const struct iovec **iov,
- size_t *iov_ofs, size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t copied, len, total = 0;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
- *iov, *iov_ofs, len);
- kunmap_atomic(addr);
- if (unlikely(copied != len)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr +
- ofs, *iov, *iov_ofs, len);
- if (unlikely(copied != len))
- goto err_out;
- kunmap(*pages);
- }
- total += len;
- ntfs_set_next_iovec(iov, iov_ofs, len);
- bytes -= len;
- if (!bytes)
- break;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- BUG_ON(copied > len);
- /* Zero the rest of the target like __copy_from_user(). */
- memset(addr + ofs + copied, 0, len - copied);
- kunmap(*pages);
- total += copied;
- ntfs_set_next_iovec(iov, iov_ofs, copied);
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
static inline void ntfs_flush_dcache_pages(struct page **pages,
unsigned nr_pages)
{
@@ -1762,86 +1692,83 @@ err_out:
return err;
}
-static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied. If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+ unsigned ofs, struct iov_iter *i, size_t bytes)
{
- struct inode *inode = mapping->host;
+ struct page **last_page = pages + nr_pages;
+ size_t total = 0;
+ struct iov_iter data = *i;
+ unsigned len, copied;
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- ntfs_truncate_vfs(inode);
- }
+ do {
+ len = PAGE_CACHE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
+ len);
+ total += copied;
+ bytes -= copied;
+ if (!bytes)
+ break;
+ iov_iter_advance(&data, copied);
+ if (copied < len)
+ goto err;
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err:
+ /* Zero the rest of the target like __copy_from_user(). */
+ len = PAGE_CACHE_SIZE - copied;
+ do {
+ if (len > bytes)
+ len = bytes;
+ zero_user(*pages, copied, len);
+ bytes -= len;
+ copied = 0;
+ len = PAGE_CACHE_SIZE;
+ } while (++pages < last_page);
+ goto out;
}
/**
- * ntfs_file_buffered_write -
- *
- * Locking: The vfs is holding ->i_mutex on the inode.
+ * ntfs_perform_write - perform buffered write to a file
+ * @file: file to write to
+ * @i: iov_iter with data to write
+ * @pos: byte offset in file at which to begin writing to
*/
-static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs,
- loff_t pos, loff_t *ppos, size_t count)
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
+ loff_t pos)
{
- struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *vi = mapping->host;
ntfs_inode *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
struct page *cached_page = NULL;
- char __user *buf = NULL;
- s64 end, ll;
VCN last_vcn;
LCN lcn;
- unsigned long flags;
- size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
- ssize_t status, written;
+ size_t bytes;
+ ssize_t status, written = 0;
unsigned nr_pages;
- int err;
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "pos 0x%llx, count 0x%lx.",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)pos, (unsigned long)count);
- if (unlikely(!count))
- return 0;
- BUG_ON(NInoMstProtected(ni));
- /*
- * If the attribute is not an index root and it is encrypted or
- * compressed, we cannot write to it yet. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- /*
- * Reminder for later: Encrypted files are _always_
- * non-resident so that the content can always be
- * encrypted.
- */
- ntfs_debug("Denying write access to encrypted file.");
- return -EACCES;
- }
- if (NInoCompressed(ni)) {
- /* Only unnamed $DATA attribute can be compressed. */
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- /*
- * Reminder for later: If resident, the data is not
- * actually compressed. Only on the switch to non-
- * resident does compression kick in. This is in
- * contrast to encrypted files (see above).
- */
- ntfs_error(vi->i_sb, "Writing to compressed files is "
- "not implemented yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)pos,
+ (unsigned long)iov_iter_count(i));
/*
* If a previous ntfs_truncate() failed, repeat it and abort if it
* fails again.
*/
if (unlikely(NInoTruncateFailed(ni))) {
+ int err;
+
inode_dio_wait(vi);
err = ntfs_truncate(vi);
if (err || NInoTruncateFailed(ni)) {
@@ -1855,81 +1782,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
return err;
}
}
- /* The first byte after the write. */
- end = pos + count;
- /*
- * If the write goes beyond the allocated size, extend the allocation
- * to cover the whole of the write, rounded up to the nearest cluster.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end > ll) {
- /* Extend the allocation without changing the data size. */
- ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
- if (likely(ll >= 0)) {
- BUG_ON(pos >= ll);
- /* If the extension was partial truncate the write. */
- if (end > ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "the allocation was only "
- "partially extended.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- end = ll;
- count = ll - pos;
- }
- } else {
- err = ll;
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Perform a partial write if possible or fail. */
- if (pos < ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "extending the allocation "
- "failed (error code %i).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type), err);
- end = ll;
- count = ll - pos;
- } else {
- ntfs_error(vol->sb, "Cannot perform write to "
- "inode 0x%lx, attribute type "
- "0x%x, because extending the "
- "allocation failed (error "
- "code %i).", vi->i_ino,
- (unsigned)
- le32_to_cpu(ni->type), err);
- return err;
- }
- }
- }
- written = 0;
- /*
- * If the write starts beyond the initialized size, extend it up to the
- * beginning of the write and initialize all non-sparse space between
- * the old initialized size and the new one. This automatically also
- * increments the vfs inode->i_size to keep it above or equal to the
- * initialized_size.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (pos > ll) {
- err = ntfs_attr_extend_initialized(ni, pos);
- if (err < 0) {
- ntfs_error(vol->sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "extending the initialized size "
- "failed (error code %i).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- status = err;
- goto err_out;
- }
- }
/*
* Determine the number of pages per cluster for non-resident
* attributes.
@@ -1937,10 +1789,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
nr_pages = 1;
if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
- /* Finally, perform the actual write. */
last_vcn = -1;
- if (likely(nr_segs == 1))
- buf = iov->iov_base;
do {
VCN vcn;
pgoff_t idx, start_idx;
@@ -1965,10 +1814,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
vol->cluster_size_bits, false);
up_read(&ni->runlist.lock);
if (unlikely(lcn < LCN_HOLE)) {
- status = -EIO;
if (lcn == LCN_ENOMEM)
status = -ENOMEM;
- else
+ else {
+ status = -EIO;
ntfs_error(vol->sb, "Cannot "
"perform write to "
"inode 0x%lx, "
@@ -1977,6 +1826,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
"is corrupt.",
vi->i_ino, (unsigned)
le32_to_cpu(ni->type));
+ }
break;
}
if (lcn == LCN_HOLE) {
@@ -1989,8 +1839,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
}
}
- if (bytes > count)
- bytes = count;
+ if (bytes > iov_iter_count(i))
+ bytes = iov_iter_count(i);
+again:
/*
* Bring in the user page(s) that we will copy from _first_.
* Otherwise there is a nasty deadlock on copying from the same
@@ -1999,10 +1850,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (likely(nr_segs == 1))
- ntfs_fault_in_pages_readable(buf, bytes);
- else
- ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+ if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
/* Get and lock @do_pages starting at index @start_idx. */
status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
pages, &cached_page);
@@ -2018,56 +1869,57 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
status = ntfs_prepare_pages_for_non_resident_write(
pages, do_pages, pos, bytes);
if (unlikely(status)) {
- loff_t i_size;
-
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- /*
- * The write preparation may have instantiated
- * allocated space outside i_size. Trim this
- * off again. We can ignore any errors in this
- * case as we will just be waisting a bit of
- * allocated space, which is not a disaster.
- */
- i_size = i_size_read(vi);
- if (pos + bytes > i_size) {
- ntfs_write_failed(mapping, pos + bytes);
- }
break;
}
}
u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
- if (likely(nr_segs == 1)) {
- copied = ntfs_copy_from_user(pages + u, do_pages - u,
- ofs, buf, bytes);
- buf += copied;
- } else
- copied = ntfs_copy_from_user_iovec(pages + u,
- do_pages - u, ofs, &iov, &iov_ofs,
- bytes);
+ copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+ i, bytes);
ntfs_flush_dcache_pages(pages + u, do_pages - u);
- status = ntfs_commit_pages_after_write(pages, do_pages, pos,
- bytes);
- if (likely(!status)) {
- written += copied;
- count -= copied;
- pos += copied;
- if (unlikely(copied != bytes))
- status = -EFAULT;
+ status = 0;
+ if (likely(copied == bytes)) {
+ status = ntfs_commit_pages_after_write(pages, do_pages,
+ pos, bytes);
+ if (!status)
+ status = bytes;
}
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
- balance_dirty_pages_ratelimited(mapping);
+ copied = status;
cond_resched();
- } while (count);
-err_out:
- *ppos = pos;
+ if (unlikely(!copied)) {
+ size_t sc;
+
+ /*
+ * We failed to copy anything. Fall back to single
+ * segment length write.
+ *
+ * This is needed to avoid possible livelock in the
+ * case that all segments in the iov cannot be copied
+ * at once without a pagefault.
+ */
+ sc = iov_iter_single_seg_count(i);
+ if (bytes > sc)
+ bytes = sc;
+ goto again;
+ }
+ iov_iter_advance(i, copied);
+ pos += copied;
+ written += copied;
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+ } while (iov_iter_count(i));
if (cached_page)
page_cache_release(cached_page);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
@@ -2077,63 +1929,36 @@ err_out:
}
/**
- * ntfs_file_aio_write_nolock -
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
*/
-static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos;
- size_t count; /* after file limit checks */
- ssize_t written, err;
+ struct inode *vi = file_inode(file);
+ ssize_t written = 0;
+ ssize_t err;
- count = iov_length(iov, nr_segs);
- pos = *ppos;
+ mutex_lock(&vi->i_mutex);
/* We can write back this queue in page reclaim. */
- current->backing_dev_info = inode_to_bdi(inode);
- written = 0;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
- if (!count)
- goto out;
- err = file_remove_suid(file);
- if (err)
- goto out;
- err = file_update_time(file);
- if (err)
- goto out;
- written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
- count);
-out:
+ current->backing_dev_info = inode_to_bdi(vi);
+ err = ntfs_prepare_file_for_write(iocb, from);
+ if (iov_iter_count(from) && !err)
+ written = ntfs_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
- return written ? written : err;
-}
-
-/**
- * ntfs_file_aio_write -
- */
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret;
-
- BUG_ON(iocb->ki_pos != pos);
-
- mutex_lock(&inode->i_mutex);
- ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
- if (ret > 0) {
- int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ mutex_unlock(&vi->i_mutex);
+ if (likely(written > 0)) {
+ err = generic_write_sync(file, iocb->ki_pos, written);
if (err < 0)
- ret = err;
+ written = 0;
}
- return ret;
+ iocb->ki_pos += written;
+ return written ? written : err;
}
/**
@@ -2197,37 +2022,15 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
#endif /* NTFS_RW */
const struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek, /* Seek inside file. */
- .read = new_sync_read, /* Read from file. */
- .read_iter = generic_file_read_iter, /* Async read from file. */
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
#ifdef NTFS_RW
- .write = do_sync_write, /* Write to file. */
- .aio_write = ntfs_file_aio_write, /* Async write to file. */
- /*.release = ,*/ /* Last file is closed. See
- fs/ext2/file.c::
- ext2_release_file() for
- how to use this to discard
- preallocated space for
- write opened files. */
- .fsync = ntfs_file_fsync, /* Sync a file to disk. */
- /*.aio_fsync = ,*/ /* Sync all outstanding async
- i/o operations on a
- kiocb. */
+ .write_iter = ntfs_file_write_iter,
+ .fsync = ntfs_file_fsync,
#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .mmap = generic_file_mmap, /* Mmap file. */
- .open = ntfs_file_open, /* Open file. */
- .splice_read = generic_file_splice_read /* Zero-copy data send with
- the data source being on
- the ntfs partition. We do
- not need to care about the
- data destination. */
- /*.sendpage = ,*/ /* Zero-copy data send with
- the data destination being
- on the ntfs partition. We
- do not need to care about
- the data source. */
+ .mmap = generic_file_mmap,
+ .open = ntfs_file_open,
+ .splice_read = generic_file_splice_read,
};
const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 898b9949d363..d284f07eda77 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,7 +28,6 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
-#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
@@ -2890,7 +2889,7 @@ void ntfs_truncate_vfs(struct inode *vi) {
*/
int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *vi = dentry->d_inode;
+ struct inode *vi = d_inode(dentry);
int err;
unsigned int ia_valid = attr->ia_valid;
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 76b6cfb579d7..b3c3469de6cb 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -239,7 +239,7 @@ typedef struct {
*/
static inline ntfs_inode *NTFS_I(struct inode *inode)
{
- return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode);
+ return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
}
static inline struct inode *VFS_I(ntfs_inode *ni)
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index a44b14cbceeb..ab172e5f51d9 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
static inline void ntfs_free(void *addr)
{
- if (!is_vmalloc_addr(addr)) {
- kfree(addr);
- /* free_page((unsigned long)addr); */
- return;
- }
- vfree(addr);
+ kvfree(addr);
}
#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index b3973c2fd190..443abecf01b7 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -35,7 +35,7 @@
* ntfs_lookup - find the inode represented by a dentry in a directory inode
* @dir_ino: directory inode in which to look for the inode
* @dent: dentry representing the inode to look for
- * @nd: lookup nameidata
+ * @flags: lookup flags
*
* In short, ntfs_lookup() looks for the inode represented by the dentry @dent
* in the directory inode @dir_ino and if found attaches the inode to the
@@ -292,14 +292,14 @@ const struct inode_operations ntfs_dir_inode_ops = {
* The code is based on the ext3 ->get_parent() implementation found in
* fs/ext3/namei.c::ext3_get_parent().
*
- * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down.
+ * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
*
* Return the dentry of the parent directory on success or the error code on
* error (IS_ERR() is true).
*/
static struct dentry *ntfs_get_parent(struct dentry *child_dent)
{
- struct inode *vi = child_dent->d_inode;
+ struct inode *vi = d_inode(child_dent);
ntfs_inode *ni = NTFS_I(vi);
MFT_RECORD *mrec;
ntfs_attr_search_ctx *ctx;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 044158bd22be..5997c00a1515 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *right_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
- BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+ if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
+ return 0;
*empty_extent_path = NULL;
@@ -3370,7 +3371,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
ret = ocfs2_get_right_path(et, left_path, &right_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
right_el = path_leaf_el(right_path);
@@ -3453,8 +3454,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
subtree_index);
}
out:
- if (right_path)
- ocfs2_free_path(right_path);
+ ocfs2_free_path(right_path);
return ret;
}
@@ -3536,7 +3536,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
ret = ocfs2_get_left_path(et, right_path, &left_path);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
left_el = path_leaf_el(left_path);
@@ -3647,8 +3647,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path, subtree_index);
}
out:
- if (left_path)
- ocfs2_free_path(left_path);
+ ocfs2_free_path(left_path);
return ret;
}
@@ -4313,13 +4312,13 @@ out:
return ret;
}
-static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
struct ocfs2_path *path,
struct ocfs2_extent_list *el, int index,
- struct ocfs2_extent_rec *split_rec)
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_merge_ctxt *ctxt)
{
- int status;
+ int status = 0;
enum ocfs2_contig_type ret = CONTIG_NONE;
u32 left_cpos, right_cpos;
struct ocfs2_extent_rec *rec = NULL;
@@ -4334,17 +4333,20 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
} else if (path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
if (status)
- goto out;
+ goto exit;
if (left_cpos != 0) {
left_path = ocfs2_new_path_from_path(path);
- if (!left_path)
- goto out;
+ if (!left_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto exit;
+ }
status = ocfs2_find_path(et->et_ci, left_path,
left_cpos);
if (status)
- goto out;
+ goto free_left_path;
new_el = path_leaf_el(left_path);
@@ -4361,7 +4363,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
status = -EINVAL;
- goto out;
+ goto free_left_path;
}
rec = &new_el->l_recs[
le16_to_cpu(new_el->l_next_free_rec) - 1];
@@ -4388,18 +4390,21 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
path->p_tree_depth > 0) {
status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
if (status)
- goto out;
+ goto free_left_path;
if (right_cpos == 0)
- goto out;
+ goto free_left_path;
right_path = ocfs2_new_path_from_path(path);
- if (!right_path)
- goto out;
+ if (!right_path) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto free_left_path;
+ }
status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
if (status)
- goto out;
+ goto free_right_path;
new_el = path_leaf_el(right_path);
rec = &new_el->l_recs[0];
@@ -4413,7 +4418,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
- goto out;
+ goto free_right_path;
}
rec = &new_el->l_recs[1];
}
@@ -4430,13 +4435,15 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
ret = contig_type;
}
-out:
- if (left_path)
- ocfs2_free_path(left_path);
- if (right_path)
- ocfs2_free_path(right_path);
+free_right_path:
+ ocfs2_free_path(right_path);
+free_left_path:
+ ocfs2_free_path(left_path);
+exit:
+ if (status == 0)
+ ctxt->c_contig_type = ret;
- return ret;
+ return status;
}
static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
@@ -5042,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle,
goto out;
}
- ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
- split_index,
- split_rec);
+ ret = ocfs2_figure_merge_contig_type(et, path, el,
+ split_index,
+ split_rec,
+ &ctxt);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The core merge / split code wants to know how much room is
@@ -6858,13 +6870,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (pages == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
- goto out;
+ return ret;
}
ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto free_pages;
}
}
@@ -6996,9 +7008,8 @@ out_commit:
out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- if (pages)
- kfree(pages);
-
+free_pages:
+ kfree(pages);
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 44db1808cdb5..0f5fd9db8194 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,6 +29,7 @@
#include <linux/mpage.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <cluster/masklog.h>
@@ -522,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0;
+ unsigned int clusters_to_alloc = 0, contig_clusters = 0;
cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
@@ -559,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- if (clusters_to_alloc > contig_blocks)
- clusters_to_alloc = contig_blocks;
+ contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
+ contig_blocks);
+ if (clusters_to_alloc > contig_clusters)
+ clusters_to_alloc = contig_clusters;
/* allocate extent and insert them into the extent tree */
ret = ocfs2_extend_allocation(inode, cpos,
@@ -618,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- if (ocfs2_iocb_is_sem_locked(iocb))
- ocfs2_iocb_clear_sem_locked(iocb);
-
if (ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
@@ -663,6 +663,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb,
return 0;
}
+static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
+ struct inode *inode, loff_t offset,
+ u64 zero_len, int cluster_align)
+{
+ u32 p_cpos = 0;
+ u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ int ret = 0;
+
+ if (offset <= i_size_read(inode) || cluster_align)
+ return 0;
+
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ u64 s = i_size_read(inode);
+ sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
+ (do_div(s, osb->s_clustersize) >> 9);
+
+ ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
+ zero_len >> 9, GFP_NOFS, false);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+ return ret;
+}
+
+static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
+ struct inode *inode, loff_t offset)
+{
+ u64 zero_start, zero_len, total_zero_len;
+ u32 p_cpos = 0, clusters_to_add;
+ u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ u32 size_div, offset_div;
+ int ret = 0;
+
+ {
+ u64 o = offset;
+ u64 s = i_size_read(inode);
+
+ offset_div = do_div(o, osb->s_clustersize);
+ size_div = do_div(s, osb->s_clustersize);
+ }
+
+ if (offset <= i_size_read(inode))
+ return 0;
+
+ clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
+ ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
+ total_zero_len = offset - i_size_read(inode);
+ if (clusters_to_add)
+ total_zero_len -= offset_div;
+
+ /* Allocate clusters to fill out holes, and this is only needed
+ * when we add more than one clusters. Otherwise the cluster will
+ * be allocated during direct IO */
+ if (clusters_to_add > 1) {
+ ret = ocfs2_extend_allocation(inode,
+ OCFS2_I(inode)->ip_clusters,
+ clusters_to_add - 1, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ while (total_zero_len) {
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
+ &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
+ size_div;
+ zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
+ size_div;
+ zero_len = min(total_zero_len, zero_len);
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+ zero_start >> 9, zero_len >> 9,
+ GFP_NOFS, false);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ total_zero_len -= zero_len;
+ v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
+
+ /* Only at first iteration can be cluster not aligned.
+ * So set size_div to 0 for the rest */
+ size_div = 0;
+ }
+
+out:
+ return ret;
+}
+
static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
@@ -677,8 +788,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
struct buffer_head *di_bh = NULL;
size_t count = iter->count;
journal_t *journal = osb->journal->j_journal;
- u32 zero_len;
- int cluster_align;
+ u64 zero_len_head, zero_len_tail;
+ int cluster_align_head, cluster_align_tail;
loff_t final_size = offset + count;
int append_write = offset >= i_size_read(inode) ? 1 : 0;
unsigned int num_clusters = 0;
@@ -686,9 +797,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
{
u64 o = offset;
+ u64 s = i_size_read(inode);
- zero_len = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align = !zero_len;
+ zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
+ cluster_align_head = !zero_len_head;
+
+ zero_len_tail = osb->s_clustersize -
+ do_div(s, osb->s_clustersize);
+ if ((offset - i_size_read(inode)) < zero_len_tail)
+ zero_len_tail = offset - i_size_read(inode);
+ cluster_align_tail = !zero_len_tail;
}
/*
@@ -706,21 +824,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
}
if (append_write) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, NULL, 1);
if (ret < 0) {
mlog_errno(ret);
goto clean_orphan;
}
+ /* zeroing out the previously allocated cluster tail
+ * that but not zeroed */
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
- ret = ocfs2_zero_extend(inode, di_bh, offset);
+ ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
+ zero_len_tail, cluster_align_tail);
else
- ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+ ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
goto clean_orphan;
}
@@ -728,19 +848,15 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
if (is_overwrite < 0) {
mlog_errno(is_overwrite);
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
goto clean_orphan;
}
ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
}
- written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
+ written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
+ offset, ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
if (unlikely(written < 0)) {
loff_t i_size = i_size_read(inode);
@@ -771,25 +887,35 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
if (ret < 0)
mlog_errno(ret);
}
- } else if (written < 0 && append_write && !is_overwrite &&
- !cluster_align) {
+ } else if (written > 0 && append_write && !is_overwrite &&
+ !cluster_align_head) {
+ /* zeroing out the allocated cluster head */
u32 p_cpos = 0;
u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto clean_orphan;
+ }
+
ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
&num_clusters, &ext_flags);
if (ret < 0) {
mlog_errno(ret);
+ ocfs2_inode_unlock(inode, 0);
goto clean_orphan;
}
BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- p_cpos << (osb->s_clustersize_bits - 9),
- zero_len >> 9, GFP_KERNEL, false);
+ (u64)p_cpos << (osb->s_clustersize_bits - 9),
+ zero_len_head >> 9, GFP_NOFS, false);
if (ret < 0)
mlog_errno(ret);
+
+ ocfs2_inode_unlock(inode, 0);
}
clean_orphan:
@@ -798,13 +924,23 @@ clean_orphan:
int update_isize = written > 0 ? 1 : 0;
loff_t end = update_isize ? offset + written : 0;
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+ tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (tmp_ret < 0) {
+ ret = tmp_ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
update_isize, end);
if (tmp_ret < 0) {
ret = tmp_ret;
+ mlog_errno(ret);
goto out;
}
+ ocfs2_inode_unlock(inode, 1);
+
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
ret = tmp_ret;
@@ -818,9 +954,7 @@ out:
return ret;
}
-static ssize_t ocfs2_direct_IO(int rw,
- struct kiocb *iocb,
- struct iov_iter *iter,
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
struct file *file = iocb->ki_filp;
@@ -842,12 +976,11 @@ static ssize_t ocfs2_direct_IO(int rw,
if (i_size_read(inode) <= offset && !full_coherency)
return 0;
- if (rw == READ)
- return __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
+ if (iov_iter_rw(iter) == READ)
+ return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
else
return ocfs2_direct_IO_write(iocb, iter, offset);
}
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 6cae155d54df..24e496d6bdcd 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,7 +22,7 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-#include <linux/aio.h>
+#include <linux/fs.h>
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
@@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_SEM,
OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits {
clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private)
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_sem_locked(iocb) \
- set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_sem_locked(iocb) \
- clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_sem_locked(iocb) \
- test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
#define ocfs2_iocb_set_unaligned_aio(iocb) \
set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index af7598bff1b5..dfe162f5fd4c 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
return count;
}
+void __mlog_printk(const u64 *mask, const char *func, int line,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ const char *level;
+ const char *prefix = "";
+
+ if (!__mlog_test_u64(*mask, mlog_and_bits) ||
+ __mlog_test_u64(*mask, mlog_not_bits))
+ return;
+
+ if (*mask & ML_ERROR) {
+ level = KERN_ERR;
+ prefix = "ERROR: ";
+ } else if (*mask & ML_NOTICE) {
+ level = KERN_NOTICE;
+ } else {
+ level = KERN_INFO;
+ }
+
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s(%s,%u,%u):%s:%d %s%pV",
+ level, current->comm, task_pid_nr(current),
+ raw_smp_processor_id(), func, line, prefix, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(__mlog_printk);
+
struct mlog_attribute {
struct attribute attr;
u64 mask;
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 2260fb9e6508..308ea0eb35fd 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -162,47 +162,30 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#endif
-/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels. sles doesn't have the variants that don't
- * scream. just do this instead of trying to guess which we're building
- * against.. *sigh*.
- */
-#define __mlog_cpu_guess ({ \
- unsigned long _cpu = get_cpu(); \
- put_cpu(); \
- _cpu; \
-})
+__printf(4, 5)
+void __mlog_printk(const u64 *m, const char *func, int line,
+ const char *fmt, ...);
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
+/*
+ * Testing before the __mlog_printk call lets the compiler eliminate the
+ * call completely when (m & ML_ALLOWED_BITS) is 0.
*/
-#define __mlog_printk(level, fmt, args...) \
- printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \
- task_pid_nr(current), __mlog_cpu_guess, \
- __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do { \
- u64 __m = MLOG_MASK_PREFIX | (mask); \
- if ((__m & ML_ALLOWED_BITS) && \
- __mlog_test_u64(__m, mlog_and_bits) && \
- !__mlog_test_u64(__m, mlog_not_bits)) { \
- if (__m & ML_ERROR) \
- __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
- else if (__m & ML_NOTICE) \
- __mlog_printk(KERN_NOTICE, fmt , ##args); \
- else __mlog_printk(KERN_INFO, fmt , ##args); \
- } \
+#define mlog(mask, fmt, ...) \
+do { \
+ u64 _m = MLOG_MASK_PREFIX | (mask); \
+ if (_m & ML_ALLOWED_BITS) \
+ __mlog_printk(&_m, __func__, __LINE__, fmt, \
+ ##__VA_ARGS__); \
} while (0)
-#define mlog_errno(st) do { \
+#define mlog_errno(st) ({ \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
_st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
_st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
-} while (0)
+ _st; \
+})
#define mlog_bug_on_msg(cond, fmt, args...) do { \
if (cond) { \
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 56c403a563bc..2d0acd6678fe 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -2204,7 +2204,7 @@ out:
kfree(o2net_hand);
kfree(o2net_keep_req);
kfree(o2net_keep_resp);
-
+ o2net_debugfs_exit();
o2quo_exit();
return -ENOMEM;
}
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 4fda7a5f3088..290373024d9d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -42,8 +42,8 @@
void ocfs2_dentry_attach_gen(struct dentry *dentry)
{
unsigned long gen =
- OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
- BUG_ON(dentry->d_inode);
+ OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
+ BUG_ON(d_inode(dentry));
dentry->d_fsdata = (void *)gen;
}
@@ -57,7 +57,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
osb = OCFS2_SB(dentry->d_sb);
trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
@@ -71,7 +71,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags)
unsigned long gen = (unsigned long) dentry->d_fsdata;
unsigned long pgen;
spin_lock(&dentry->d_lock);
- pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen;
spin_unlock(&dentry->d_lock);
trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
dentry->d_name.name,
@@ -146,7 +146,7 @@ static int ocfs2_match_dentry(struct dentry *dentry,
if (skip_unhashed && d_unhashed(dentry))
return 0;
- parent = dentry->d_parent->d_inode;
+ parent = d_inode(dentry->d_parent);
/* Negative parent dentry? */
if (!parent)
return 0;
@@ -243,7 +243,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
- if (!dentry->d_inode && dentry->d_fsdata) {
+ if (d_really_is_negative(dentry) && dentry->d_fsdata) {
/* Converting a negative dentry to positive
Clear dentry->d_fsdata */
dentry->d_fsdata = dl = NULL;
@@ -446,7 +446,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb);
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
/*
* Move within the same directory, so the actual lock info won't
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b08050bd3f2e..02878a83f0b4 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -18,7 +18,7 @@
*
* linux/fs/minix/dir.c
*
- * Copyright (C) 1991, 1992 Linux Torvalds
+ * Copyright (C) 1991, 1992 Linus Torvalds
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle,
struct ocfs2_dir_entry *de, *de1;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
struct super_block *sb = dir->i_sb;
- int retval, status;
+ int retval;
unsigned int size = sb->s_blocksize;
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
@@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle,
}
if (insert_bh == parent_fe_bh)
- status = ocfs2_journal_access_di(handle,
+ retval = ocfs2_journal_access_di(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
else {
- status = ocfs2_journal_access_db(handle,
+ retval = ocfs2_journal_access_db(handle,
INODE_CACHE(dir),
insert_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
- if (ocfs2_dir_indexed(dir)) {
- status = ocfs2_dx_dir_insert(dir,
+ if (!retval && ocfs2_dir_indexed(dir))
+ retval = ocfs2_dx_dir_insert(dir,
handle,
lookup);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
- }
+ }
+
+ if (retval) {
+ mlog_errno(retval);
+ goto bail;
}
/* By now the buffer is marked for journaling */
@@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
int namelen)
{
- int ret;
+ int ret = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
trace_ocfs2_check_dir_for_entry(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
- ret = -EEXIST;
- if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
- goto bail;
+ if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) {
+ ret = -EEXIST;
+ mlog_errno(ret);
+ }
- ret = 0;
-bail:
ocfs2_free_dir_lookup_result(&lookup);
- if (ret)
- mlog_errno(ret);
return ret;
}
@@ -3546,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size)
{
struct ocfs2_dx_entry *entry1 = a;
struct ocfs2_dx_entry *entry2 = b;
- struct ocfs2_dx_entry tmp;
BUG_ON(size != sizeof(*entry1));
- tmp = *entry1;
- *entry1 = *entry2;
- *entry2 = tmp;
+ swap(*entry1, *entry2);
}
static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index f0344b75b14d..3d8639f38973 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -72,7 +72,7 @@ static inline int ocfs2_add_entry(handle_t *handle,
struct buffer_head *parent_fe_bh,
struct ocfs2_dir_lookup_result *lookup)
{
- return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+ return __ocfs2_add_entry(handle, d_inode(dentry->d_parent),
dentry->d_name.name, dentry->d_name.len,
inode, blkno, parent_fe_bh, lookup);
}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index fae17c640df3..e88ccf8c83ff 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
/* will exit holding res->spinlock, but may drop in function */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
-void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
/* will exit holding res->spinlock, but may drop in function */
static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a6944b25fd5b..fdf4b41d0609 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -757,6 +757,19 @@ lookup:
if (tmpres) {
spin_unlock(&dlm->spinlock);
spin_lock(&tmpres->spinlock);
+
+ /*
+ * Right after dlm spinlock was released, dlm_thread could have
+ * purged the lockres. Check if lockres got unhashed. If so
+ * start over.
+ */
+ if (hlist_unhashed(&tmpres->hash_node)) {
+ spin_unlock(&tmpres->spinlock);
+ dlm_lockres_put(tmpres);
+ tmpres = NULL;
+ goto lookup;
+ }
+
/* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 061ba6a91bf2..b5cf27dcb18a 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -208,7 +208,7 @@ static int dlmfs_file_release(struct inode *inode,
static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
attr->ia_valid &= ~ATTR_SIZE;
error = inode_change_ok(inode, attr);
@@ -549,7 +549,7 @@ static int dlmfs_unlink(struct inode *dir,
struct dentry *dentry)
{
int status;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
mlog(0, "unlink inode %lu\n", inode->i_ino);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 11849a44dc5a..23157e40dd74 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
int noqueue_attempted = 0;
int dlm_locked = 0;
+ if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
+ mlog_errno(-EINVAL);
+ return -EINVAL;
+ }
+
ocfs2_init_mask_waiter(&mw);
if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -4020,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
osb->dc_work_sequence = osb->dc_wake_sequence;
processed = osb->blocked_lock_count;
- while (processed) {
- BUG_ON(list_empty(&osb->blocked_lock_list));
-
+ /*
+ * blocked lock processing in this loop might call iput which can
+ * remove items off osb->blocked_lock_list. Downconvert up to
+ * 'processed' number of locks, but stop short if we had some
+ * removed in ocfs2_mark_lockres_freeing when downconverting.
+ */
+ while (processed && !list_empty(&osb->blocked_lock_list)) {
lockres = list_entry(osb->blocked_lock_list.next,
struct ocfs2_lock_res, l_blocked_list);
list_del_init(&lockres->l_blocked_list);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 29651167190d..827fc9809bc2 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
}
status = ocfs2_test_inode_bit(osb, blkno, &set);
- trace_ocfs2_get_dentry_test_bit(status, set);
if (status < 0) {
if (status == -EINVAL) {
/*
@@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
goto unlock_nfs_sync;
}
+ trace_ocfs2_get_dentry_test_bit(status, set);
/* If the inode allocator bit is clear, this inode must be stale */
if (!set) {
status = -ESTALE;
@@ -147,7 +147,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
int status;
u64 blkno;
struct dentry *parent;
- struct inode *dir = child->d_inode;
+ struct inode *dir = d_inode(child);
trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 46e0d4e857c7..719f7f4c7a37 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -37,6 +37,7 @@
#include <linux/falloc.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <cluster/masklog.h>
@@ -1126,7 +1127,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
@@ -1275,8 +1276,8 @@ int ocfs2_getattr(struct vfsmount *mnt,
struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
- struct super_block *sb = dentry->d_inode->i_sb;
+ struct inode *inode = d_inode(dentry);
+ struct super_block *sb = d_inode(dentry)->i_sb;
struct ocfs2_super *osb = sb->s_fs_info;
int err;
@@ -2106,7 +2107,7 @@ out:
}
static int ocfs2_prepare_inode_for_write(struct file *file,
- loff_t *ppos,
+ loff_t pos,
size_t count,
int appending,
int *direct_io,
@@ -2114,8 +2115,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- loff_t saved_pos = 0, end;
+ struct inode *inode = d_inode(dentry);
+ loff_t end;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2155,23 +2156,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
}
}
- /* work on a copy of ppos until we're sure that we won't have
- * to recalculate it due to relocking. */
- if (appending)
- saved_pos = i_size_read(inode);
- else
- saved_pos = *ppos;
-
- end = saved_pos + count;
+ end = pos + count;
- ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+ ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
ocfs2_inode_unlock(inode, meta_level);
meta_level = -1;
ret = ocfs2_prepare_inode_for_refcount(inode,
file,
- saved_pos,
+ pos,
count,
&meta_level);
if (has_refcount)
@@ -2227,7 +2221,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* caller will have to retake some cluster
* locks and initiate the io as buffered.
*/
- ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
+ ret = ocfs2_check_range_for_holes(inode, pos, count);
if (ret == 1) {
/*
* Fallback to old way if the feature bit is not set.
@@ -2242,12 +2236,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
break;
}
- if (appending)
- *ppos = saved_pos;
-
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- saved_pos, appending, count,
+ pos, appending, count,
direct_io, has_refcount);
if (meta_level >= 0)
@@ -2260,19 +2251,20 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
+ int direct_io, appending, rw_level;
int can_do_direct, has_refcount = 0;
ssize_t written = 0;
- size_t count = iov_iter_count(from);
- loff_t old_size, *ppos = &iocb->ki_pos;
+ ssize_t ret;
+ size_t count = iov_iter_count(from), orig_count;
+ loff_t old_size;
u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
+ int dropped_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2280,24 +2272,15 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
file->f_path.dentry->d_name.name,
(unsigned int)from->nr_segs); /* GRRRRR */
- if (iocb->ki_nbytes == 0)
+ if (count == 0)
return 0;
- appending = file->f_flags & O_APPEND ? 1 : 0;
- direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+ appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
+ direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
- ocfs2_iocb_clear_sem_locked(iocb);
-
relock:
- /* to match setattr's i_mutex -> rw_lock ordering */
- if (direct_io) {
- have_alloc_sem = 1;
- /* communicate with ocfs2_dio_end_io */
- ocfs2_iocb_set_sem_locked(iocb);
- }
-
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2307,7 +2290,7 @@ relock:
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
- goto out_sems;
+ goto out_mutex;
}
/*
@@ -2329,9 +2312,17 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
+ orig_count = iov_iter_count(from);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0) {
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+ count = ret;
+
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, ppos,
- iocb->ki_nbytes, appending,
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
&can_do_direct, &has_refcount);
if (ret < 0) {
mlog_errno(ret);
@@ -2339,8 +2330,7 @@ relock:
}
if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
- *ppos);
+ unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
/*
* We can't complete the direct I/O as requested, fall back to
@@ -2349,10 +2339,12 @@ relock:
if (direct_io && !can_do_direct) {
ocfs2_rw_unlock(inode, rw_level);
- have_alloc_sem = 0;
rw_level = -1;
direct_io = 0;
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ iov_iter_reexpand(from, orig_count);
+ dropped_dio = 1;
goto relock;
}
@@ -2376,74 +2368,18 @@ relock:
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- ret = generic_write_checks(file, ppos, &count,
- S_ISBLK(inode->i_mode));
- if (ret)
- goto out_dio;
-
- iov_iter_truncate(from, count);
- if (direct_io) {
- loff_t endbyte;
- ssize_t written_buffered;
- written = generic_file_direct_write(iocb, from, *ppos);
- if (written < 0 || written == count) {
- ret = written;
- goto out_dio;
- }
-
- /*
- * for completing the rest of the request.
- */
- *ppos += written;
- count -= written;
- written_buffered = generic_perform_write(file, from, *ppos);
- /*
- * If generic_file_buffered_write() returned a synchronous error
- * then we want to return the number of bytes which were
- * direct-written, or the error code if that was zero. Note
- * that this differs from normal direct-io semantics, which
- * will return -EFOO even if some bytes were written.
- */
- if (written_buffered < 0) {
- ret = written_buffered;
- goto out_dio;
- }
-
- iocb->ki_pos = *ppos + written_buffered;
- /* We need to ensure that the page cache pages are written to
- * disk and invalidated to preserve the expected O_DIRECT
- * semantics.
- */
- endbyte = *ppos + written_buffered - 1;
- ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
- endbyte);
- if (ret == 0) {
- written += written_buffered;
- invalidate_mapping_pages(mapping,
- *ppos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
- } else {
- current->backing_dev_info = inode_to_bdi(inode);
- written = generic_perform_write(file, from, *ppos);
- if (likely(written >= 0))
- iocb->ki_pos = *ppos + written;
- current->backing_dev_info = NULL;
- }
-
-out_dio:
+ written = __generic_file_write_iter(iocb, from);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+ BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+
+ if (unlikely(written <= 0))
+ goto no_sync;
- if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
- ((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ if (((file->f_flags & O_DSYNC) && !direct_io) ||
+ IS_SYNC(inode) || dropped_dio) {
+ ret = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
if (ret < 0)
written = ret;
@@ -2454,10 +2390,12 @@ out_dio:
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
+no_sync:
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
@@ -2469,7 +2407,6 @@ out_dio:
*/
if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- have_alloc_sem = 0;
unaligned_dio = 0;
}
@@ -2482,10 +2419,7 @@ out:
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
-out_sems:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
+out_mutex:
mutex_unlock(&inode->i_mutex);
if (written)
@@ -2526,7 +2460,7 @@ bail:
static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
struct iov_iter *to)
{
- int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
+ int ret = 0, rw_level = -1, lock_level = 0;
struct file *filp = iocb->ki_filp;
struct inode *inode = file_inode(filp);
@@ -2543,16 +2477,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
goto bail;
}
- ocfs2_iocb_clear_sem_locked(iocb);
-
/*
* buffered reads protect themselves in ->readpage(). O_DIRECT reads
* need locks to protect pending reads from racing with truncate.
*/
- if (filp->f_flags & O_DIRECT) {
- have_alloc_sem = 1;
- ocfs2_iocb_set_sem_locked(iocb);
-
+ if (iocb->ki_flags & IOCB_DIRECT) {
ret = ocfs2_rw_lock(inode, 0);
if (ret < 0) {
mlog_errno(ret);
@@ -2583,18 +2512,14 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
- BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+ BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
/* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
- have_alloc_sem = 0;
}
bail:
- if (have_alloc_sem)
- ocfs2_iocb_clear_sem_locked(iocb);
-
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
@@ -2678,8 +2603,6 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
@@ -2726,8 +2649,6 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3025c0da6b8a..b254416dc8d9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode,
ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
le16_to_cpu(di->i_suballoc_slot));
if (!inode_alloc_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
@@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode,
ORPHAN_DIR_SYSTEM_INODE,
orphaned_slot);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto bail;
}
@@ -1209,7 +1209,7 @@ int ocfs2_drop_inode(struct inode *inode)
*/
int ocfs2_inode_revalidate(struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int status = 0;
trace_ocfs2_inode_revalidate(inode,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 53e6c40ed4c6..3cb097ccce60 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC_GROUP_EXTEND:
case OCFS2_IOC_GROUP_ADD:
case OCFS2_IOC_GROUP_ADD64:
- case FITRIM:
break;
case OCFS2_IOC_REFLINK:
if (copy_from_user(&args, argp, sizeof(args)))
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff531928269e..7c099f7032fd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -108,7 +108,7 @@ struct ocfs2_replay_map {
unsigned char rm_replay_slots[0];
};
-void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
+static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state)
{
if (!osb->replay_map)
return;
@@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
return 0;
}
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
replay_map->rm_state = REPLAY_DONE;
}
-void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
@@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- /* We aren't guaranteed to have the superblock here - but if we
- * don't, it'll just crash. */
- ocfs2_error(bh->b_assoc_map->host->i_sb,
+ ocfs2_error(bh->b_bdev->bd_super,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
@@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
status = jbd2_journal_dirty_metadata(handle, bh);
- BUG_ON(status);
+ if (status) {
+ mlog_errno(status);
+ if (!is_handle_aborted(handle)) {
+ journal_t *journal = handle->h_transaction->t_journal;
+ struct super_block *sb = bh->b_bdev->bd_super;
+
+ mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
+ "Aborting transaction and journal.\n");
+ handle->h_err = status;
+ jbd2_journal_abort_handle(handle);
+ jbd2_journal_abort(journal, status);
+ ocfs2_abort(sb, "Journal already aborted.\n");
+ }
+ }
}
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
* hasn't happened. The node queues a scan and increments the
* sequence number in the LVB.
*/
-void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
{
struct ocfs2_orphan_scan *os;
int status, i;
@@ -1933,7 +1944,7 @@ out:
}
/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */
-void ocfs2_orphan_scan_work(struct work_struct *work)
+static void ocfs2_orphan_scan_work(struct work_struct *work)
{
struct ocfs2_orphan_scan *os;
struct ocfs2_super *osb;
@@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
struct inode *inode = NULL;
struct inode *iter;
struct ocfs2_inode_info *oi;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_recover_orphans(slot);
@@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto next;
+ }
/*
* We need to take and drop the inode lock to
* force read inode from disk.
*/
- ret = ocfs2_inode_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
- goto next;
+ goto unlock_rw;
}
- ocfs2_inode_unlock(inode, 0);
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
if (inode->i_nlink == 0) {
spin_lock(&oi->ip_lock);
@@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
- struct buffer_head *di_bh = NULL;
-
- ret = ocfs2_rw_lock(inode, 1);
- if (ret) {
- mlog_errno(ret);
- goto next;
- }
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- ocfs2_rw_unlock(inode, 1);
- mlog_errno(ret);
- goto next;
- }
-
+ } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+ (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
ret = ocfs2_truncate_file(inode, di_bh,
i_size_read(inode));
- ocfs2_inode_unlock(inode, 1);
- ocfs2_rw_unlock(inode, 1);
- brelse(di_bh);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
- goto next;
+ goto unlock_inode;
}
- ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+ ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
if (ret)
mlog_errno(ret);
wake_up(&OCFS2_I(inode)->append_dio_wq);
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
next:
iput(inode);
-
+ brelse(di_bh);
+ di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 044013455621..857bbbcd39f3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u free bits, but a count shows %u",
+ "%u used bits, but a count shows %u",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
@@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
u32 *numbits,
struct ocfs2_alloc_reservation *resv)
{
- int numfound, bitoff, left, startoff, lastzero;
+ int numfound = 0, bitoff, left, startoff, lastzero;
int local_resv = 0;
struct ocfs2_alloc_reservation r;
void *bitmap = NULL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b5c3a5ea3ee6..6e6abb93fda5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -689,8 +689,8 @@ static int ocfs2_link(struct dentry *old_dentry,
struct dentry *dentry)
{
handle_t *handle;
- struct inode *inode = old_dentry->d_inode;
- struct inode *old_dir = old_dentry->d_parent->d_inode;
+ struct inode *inode = d_inode(old_dentry);
+ struct inode *old_dir = d_inode(old_dentry->d_parent);
int err;
struct buffer_head *fe_bh = NULL;
struct buffer_head *old_dir_bh = NULL;
@@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
int status;
int child_locked = 0;
bool is_unlinkable = false;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct inode *orphan_dir = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
u64 blkno;
@@ -898,7 +898,7 @@ static int ocfs2_unlink(struct inode *dir,
dquot_initialize(dir);
- BUG_ON(dentry->d_parent->d_inode != dir);
+ BUG_ON(d_inode(dentry->d_parent) != dir);
if (inode == osb->root_inode)
return -EPERM;
@@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
- struct buffer_head **tmpbh;
- struct inode *tmpinode;
trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
@@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
(oi1->ip_blkno < oi2->ip_blkno &&
inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
- tmpbh = bh2;
- bh2 = bh1;
- bh1 = tmpbh;
-
- tmpinode = inode2;
- inode2 = inode1;
- inode1 = tmpinode;
+ swap(bh2, bh1);
+ swap(inode2, inode1);
}
/* lock id2 */
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
@@ -1209,8 +1202,8 @@ static int ocfs2_rename(struct inode *old_dir,
{
int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0;
int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0;
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct inode *orphan_dir = NULL;
struct ocfs2_dinode *newfe = NULL;
char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
@@ -1454,7 +1447,7 @@ static int ocfs2_rename(struct inode *old_dir,
should_add_orphan = true;
}
} else {
- BUG_ON(new_dentry->d_parent->d_inode != new_dir);
+ BUG_ON(d_inode(new_dentry->d_parent) != new_dir);
status = ocfs2_check_dir_for_entry(new_dir,
new_dentry->d_name.name,
@@ -2322,10 +2315,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
trace_ocfs2_orphan_del(
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- name, namelen);
+ name, strlen(name));
/* find it's spot in the orphan directory */
- status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
+ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
if (status) {
mlog_errno(status);
@@ -2670,30 +2663,22 @@ bail:
}
int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
- struct inode *inode, int update_isize,
- loff_t end)
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end)
{
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle = NULL;
int status = 0;
- status = ocfs2_inode_lock(inode, &di_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- di = (struct ocfs2_dinode *) di_bh->b_data;
-
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
ORPHAN_DIR_SYSTEM_INODE,
le16_to_cpu(di->i_dio_orphaned_slot));
if (!orphan_dir_inode) {
status = -ENOENT;
mlog_errno(status);
- goto bail_unlock_inode;
+ goto bail;
}
mutex_lock(&orphan_dir_inode->i_mutex);
@@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
mutex_unlock(&orphan_dir_inode->i_mutex);
iput(orphan_dir_inode);
mlog_errno(status);
- goto bail_unlock_inode;
+ goto bail;
}
handle = ocfs2_start_trans(osb,
@@ -2749,10 +2734,6 @@ bail_unlock_orphan:
brelse(orphan_dir_bh);
iput(orphan_dir_inode);
-bail_unlock_inode:
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
bail:
return status;
}
@@ -2808,7 +2789,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
ORPHAN_DIR_SYSTEM_INODE,
osb->slot_num);
if (!orphan_dir_inode) {
- status = -EEXIST;
+ status = -ENOENT;
mlog_errno(status);
goto leave;
}
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 5ddecce172fa..e173329eb830 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode);
int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
- struct inode *inode, int update_isize,
- loff_t end);
+ struct inode *inode, struct buffer_head *di_bh,
+ int update_isize, loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 460c6c37e683..690ddc60189b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
return (u64)clusters << c_to_b_bits;
}
+static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb,
+ u64 blocks)
+{
+ int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
+ sb->s_blocksize_bits;
+
+ blocks += (1 << b_to_c_bits) - 1;
+ return (u32)(blocks >> b_to_c_bits);
+}
+
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
u64 blocks)
{
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee541f92dab4..b69dd14c0b9b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
static void swap_refcount_rec(void *a, void *b, int size)
{
- struct ocfs2_refcount_rec *l = a, *r = b, tmp;
+ struct ocfs2_refcount_rec *l = a, *r = b;
- tmp = *l;
- *l = *r;
- *r = tmp;
+ swap(*l, *r);
}
/*
@@ -4194,7 +4192,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
bool preserve)
{
int ret;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct buffer_head *new_bh = NULL;
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
@@ -4263,7 +4261,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
int error;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
struct posix_acl *default_acl, *acl;
@@ -4276,7 +4274,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error) {
mlog_errno(error);
- goto out;
+ return error;
}
error = ocfs2_create_inode_in_orphan(dir, mode,
@@ -4357,7 +4355,7 @@ out:
/* copied from may_create in VFS. */
static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
{
- if (child->d_inode)
+ if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
@@ -4375,7 +4373,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int error;
if (!inode)
@@ -4463,7 +4461,7 @@ int ocfs2_reflink_ioctl(struct inode *inode,
}
error = ocfs2_vfs_reflink(old_path.dentry,
- new_path.dentry->d_inode,
+ d_inode(new_path.dentry),
new_dentry, preserve);
out_dput:
done_path_create(&new_path, new_dentry);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d5493e361a38..e78a203d44c8 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
if (!si) {
status = -ENOMEM;
mlog_errno(status);
- goto bail;
+ return status;
}
si->si_extended = ocfs2_uses_extended_slot_map(osb);
@@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
osb->slot_info = (struct ocfs2_slot_info *)si;
bail:
- if (status < 0 && si)
+ if (status < 0)
__ocfs2_free_slot_info(si);
return status;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 1724d43d3da1..220cae7bbdbc 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -295,7 +295,7 @@ static int o2cb_cluster_check(void)
set_bit(node_num, netmap);
if (!memcmp(hbmap, netmap, sizeof(hbmap)))
return 0;
- if (i < O2CB_MAP_STABILIZE_COUNT)
+ if (i < O2CB_MAP_STABILIZE_COUNT - 1)
msleep(1000);
}
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 720aa389e0ea..2768eb1da2b8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
BUG_ON(conn == NULL);
lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!lc) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!lc)
+ return -ENOMEM;
init_waitqueue_head(&lc->oc_wait);
init_completion(&lc->oc_sync_wait);
@@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
}
out:
- if (rc && lc)
+ if (rc)
kfree(lc);
return rc;
}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0cb889a17ae1..4479029630bb 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
+ ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
+ start_bit, count);
goto bail;
}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 26675185b886..403c5660b306 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2069,6 +2069,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+ memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+ sizeof(di->id2.i_super.s_uuid));
osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
@@ -2333,7 +2335,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
- cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
+ cleancache_init_shared_fs(sb);
bail:
return status;
@@ -2563,22 +2565,22 @@ static void ocfs2_handle_error(struct super_block *sb)
ocfs2_set_ro_flag(osb, 0);
}
-static char error_buf[1024];
-
-void __ocfs2_error(struct super_block *sb,
- const char *function,
- const char *fmt, ...)
+void __ocfs2_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+
+ va_end(args);
ocfs2_handle_error(sb);
}
@@ -2586,18 +2588,21 @@ void __ocfs2_error(struct super_block *sb,
/* Handle critical errors. This is intentionally more drastic than
* ocfs2_handle_error, so we only use for things like journal errors,
* etc. */
-void __ocfs2_abort(struct super_block* sb,
- const char *function,
+void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
- vsnprintf(error_buf, sizeof(error_buf), fmt, args);
- va_end(args);
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
- sb->s_id, function, error_buf);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ sb->s_id, function, &vaf);
+
+ va_end(args);
/* We don't have the cluster support yet to go straight to
* hard readonly in here. Until then, we want to keep
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 85b190dc132f..889f3796a0d7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1020,7 +1020,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
int ret = 0, i_ret = 0, b_ret = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+ struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry));
if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
return -EOPNOTSUPP;
@@ -1028,7 +1028,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
return ret;
- ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+ ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1037,7 +1037,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
di = (struct ocfs2_dinode *)di_bh->b_data;
down_read(&oi->ip_xattr_sem);
- i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+ i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size);
if (i_ret < 0)
b_ret = 0;
else {
@@ -1045,13 +1045,13 @@ ssize_t ocfs2_listxattr(struct dentry *dentry,
buffer += i_ret;
size -= i_ret;
}
- b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+ b_ret = ocfs2_xattr_block_list(d_inode(dentry), di,
buffer, size);
if (b_ret < 0)
i_ret = 0;
}
up_read(&oi->ip_xattr_sem);
- ocfs2_inode_unlock(dentry->d_inode, 0);
+ ocfs2_inode_unlock(d_inode(dentry), 0);
brelse(di_bh);
@@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode,
i,
&block_off,
&name_offset);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
xs->base = bucket_block(xs->bucket, block_off);
}
if (ocfs2_xattr_is_local(xs->here)) {
@@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
i, &xv, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
args->ref_ci,
@@ -7249,7 +7257,7 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
@@ -7259,11 +7267,11 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
+ return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
-int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
const struct xattr *xattr;
@@ -7339,7 +7347,7 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
@@ -7349,7 +7357,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
+ return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
@@ -7391,7 +7399,7 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
+ return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
@@ -7405,7 +7413,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
- return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
+ return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
name, value, size, flags);
}
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index 082234581d05..83f4e76511c2 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -159,7 +159,7 @@ int omfs_allocate_range(struct super_block *sb,
goto out;
found:
- *return_block = i * bits_per_entry + bit;
+ *return_block = (u64) i * bits_per_entry + bit;
*return_size = run;
ret = set_run(sb, i, bits_per_entry, bit, run, 1);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 1b8e9e8405b2..f833bf8d5792 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -110,7 +110,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb)
static int omfs_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct omfs_inode *oi;
@@ -155,7 +155,7 @@ out:
static int omfs_delete_entry(struct dentry *dentry)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct inode *dirty;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
@@ -237,7 +237,7 @@ static int omfs_dir_is_empty(struct inode *inode)
static int omfs_remove(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int ret;
@@ -373,8 +373,8 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *new_inode = new_dentry->d_inode;
- struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = d_inode(new_dentry);
+ struct inode *old_inode = d_inode(old_dentry);
int err;
if (new_inode) {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 902e88527fce..d9e26cfbb793 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -337,8 +337,6 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
@@ -348,7 +346,7 @@ const struct file_operations omfs_file_operations = {
static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, attr);
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 138321b0c6c2..3d935c81789a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -306,7 +306,8 @@ static const struct super_operations omfs_sops = {
*/
static int omfs_get_imap(struct super_block *sb)
{
- unsigned int bitmap_size, count, array_size;
+ unsigned int bitmap_size, array_size;
+ int count;
struct omfs_sb_info *sbi = OMFS_SB(sb);
struct buffer_head *bh;
unsigned long **ptr;
@@ -359,7 +360,7 @@ nomem:
}
enum {
- Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
+ Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err
};
static const match_table_t tokens = {
@@ -368,6 +369,7 @@ static const match_table_t tokens = {
{Opt_umask, "umask=%o"},
{Opt_dmask, "dmask=%o"},
{Opt_fmask, "fmask=%o"},
+ {Opt_err, NULL},
};
static int parse_options(char *options, struct omfs_sb_info *sbi)
@@ -548,8 +550,10 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_root = d_make_root(root);
- if (!sb->s_root)
+ if (!sb->s_root) {
+ ret = -ENOMEM;
goto out_brelse_bh2;
+ }
printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);
ret = 0;
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf2610b..e33dab287fa0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
newattrs.ia_valid |= ATTR_FILE;
}
- /* Remove suid/sgid on truncate too */
- ret = should_remove_suid(dentry);
+ /* Remove suid, sgid, and file capabilities on truncate too */
+ ret = dentry_needs_remove_privs(dentry);
+ if (ret < 0)
+ return ret;
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
@@ -231,8 +233,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EINVAL;
/* Return error if mode is not supported */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ if (mode & ~FALLOC_FL_SUPPORTED_MASK)
return -EOPNOTSUPP;
/* Punch hole and zero range are mutually exclusive */
@@ -250,6 +251,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
(mode & ~FALLOC_FL_COLLAPSE_RANGE))
return -EINVAL;
+ /* Insert range should only be used exclusively. */
+ if ((mode & FALLOC_FL_INSERT_RANGE) &&
+ (mode & ~FALLOC_FL_INSERT_RANGE))
+ return -EINVAL;
+
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
@@ -363,7 +369,7 @@ retry:
if (res)
goto out;
- inode = path.dentry->d_inode;
+ inode = d_backing_inode(path.dentry);
if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
/*
@@ -570,6 +576,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
uid = make_kuid(current_user_ns(), user);
gid = make_kgid(current_user_ns(), group);
+retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
if (!uid_valid(uid))
@@ -586,7 +593,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chown(path, uid, gid);
if (!error)
@@ -674,18 +680,18 @@ int open_check_o_direct(struct file *f)
}
static int do_dentry_open(struct file *f,
+ struct inode *inode,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
static const struct file_operations empty_fops = {};
- struct inode *inode;
int error;
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
path_get(&f->f_path);
- inode = f->f_inode = f->f_path.dentry->d_inode;
+ f->f_inode = inode;
f->f_mapping = inode->i_mapping;
if (unlikely(f->f_flags & O_PATH)) {
@@ -734,10 +740,10 @@ static int do_dentry_open(struct file *f,
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
- likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
+ likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
- likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
+ likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -789,7 +795,8 @@ int finish_open(struct file *file, struct dentry *dentry,
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry;
- error = do_dentry_open(file, open, current_cred());
+ error = do_dentry_open(file, d_backing_inode(dentry), open,
+ current_cred());
if (!error)
*opened |= FILE_OPENED;
@@ -818,6 +825,34 @@ int finish_no_open(struct file *file, struct dentry *dentry)
}
EXPORT_SYMBOL(finish_no_open);
+char *file_path(struct file *filp, char *buf, int buflen)
+{
+ return d_path(&filp->f_path, buf, buflen);
+}
+EXPORT_SYMBOL(file_path);
+
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @file: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *file,
+ const struct cred *cred)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = dentry->d_inode;
+
+ file->f_path = *path;
+ if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
+ inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ }
+
+ return do_dentry_open(file, inode, NULL, cred);
+}
+
struct file *dentry_open(const struct path *path, int flags,
const struct cred *cred)
{
@@ -849,26 +884,6 @@ struct file *dentry_open(const struct path *path, int flags,
}
EXPORT_SYMBOL(dentry_open);
-/**
- * vfs_open - open the file at the given path
- * @path: path to open
- * @filp: newly allocated file with f_flag initialized
- * @cred: credentials to use
- */
-int vfs_open(const struct path *path, struct file *filp,
- const struct cred *cred)
-{
- struct inode *inode = path->dentry->d_inode;
-
- if (inode->i_op->dentry_open)
- return inode->i_op->dentry_open(path->dentry, filp, cred);
- else {
- filp->f_path = *path;
- return do_dentry_open(filp, NULL, cred);
- }
-}
-EXPORT_SYMBOL(vfs_open);
-
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
@@ -988,9 +1003,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(err);
if (flags & O_CREAT)
return ERR_PTR(-EINVAL);
- if (!filename && (flags & O_DIRECTORY))
- if (!dentry->d_inode->i_op->lookup)
- return ERR_PTR(-ENOTDIR);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 24f640441bd9..84d693d37428 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -299,6 +299,9 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct cred *override_cred;
char *link = NULL;
+ if (WARN_ON(!workdir))
+ return -EROFS;
+
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index d139405d2bfa..692ceda3bc21 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -222,6 +222,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct kstat stat;
int err;
+ if (WARN_ON(!workdir))
+ return ERR_PTR(-EROFS);
+
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
@@ -322,6 +325,9 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
+ if (WARN_ON(!workdir))
+ return -EROFS;
+
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
@@ -506,11 +512,28 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
struct dentry *opaquedir = NULL;
int err;
- if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
- opaquedir = ovl_check_empty_and_clear(dentry);
- err = PTR_ERR(opaquedir);
- if (IS_ERR(opaquedir))
- goto out;
+ if (WARN_ON(!workdir))
+ return -EROFS;
+
+ if (is_dir) {
+ if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
+ } else {
+ LIST_HEAD(list);
+
+ /*
+ * When removing an empty opaque directory, then it
+ * makes no sense to replace it with an exact replica of
+ * itself. But emptiness still needs to be checked.
+ */
+ err = ovl_check_empty_dir(dentry, &list);
+ ovl_cache_free(&list);
+ if (err)
+ goto out;
+ }
}
err = ovl_lock_rename_workdir(workdir, upperdir);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 04f124884687..d9da5a4e9382 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -140,11 +140,12 @@ struct ovl_link_data {
void *cookie;
};
-static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
{
- void *ret;
struct dentry *realdentry;
struct inode *realinode;
+ struct ovl_link_data *data = NULL;
+ const char *ret;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
@@ -152,28 +153,28 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
- ret = realinode->i_op->follow_link(realdentry, nd);
- if (IS_ERR(ret))
- return ret;
-
if (realinode->i_op->put_link) {
- struct ovl_link_data *data;
-
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
- if (!data) {
- realinode->i_op->put_link(realdentry, nd, ret);
+ if (!data)
return ERR_PTR(-ENOMEM);
- }
data->realdentry = realdentry;
- data->cookie = ret;
+ }
- return data;
- } else {
- return NULL;
+ ret = realinode->i_op->follow_link(realdentry, cookie);
+ if (IS_ERR_OR_NULL(ret)) {
+ kfree(data);
+ return ret;
}
+
+ if (data)
+ data->cookie = *cookie;
+
+ *cookie = data;
+
+ return ret;
}
-static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+static void ovl_put_link(struct inode *unused, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
@@ -182,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
return;
realinode = data->realdentry->d_inode;
- realinode->i_op->put_link(data->realdentry, nd, data->cookie);
+ realinode->i_op->put_link(realinode, data->cookie);
kfree(data);
}
@@ -336,37 +337,33 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
return true;
}
-static int ovl_dentry_open(struct dentry *dentry, struct file *file,
- const struct cred *cred)
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
int err;
struct path realpath;
enum ovl_path_type type;
- bool want_write = false;
+
+ if (d_is_dir(dentry))
+ return d_backing_inode(dentry);
type = ovl_path_real(dentry, &realpath);
- if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
- want_write = true;
+ if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
err = ovl_want_write(dentry);
if (err)
- goto out;
+ return ERR_PTR(err);
- if (file->f_flags & O_TRUNC)
+ if (file_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
+ ovl_drop_write(dentry);
if (err)
- goto out_drop_write;
+ return ERR_PTR(err);
ovl_path_upper(dentry, &realpath);
}
- err = vfs_open(&realpath, file, cred);
-out_drop_write:
- if (want_write)
- ovl_drop_write(dentry);
-out:
- return err;
+ return d_backing_inode(realpath.dentry);
}
static const struct inode_operations ovl_file_inode_operations = {
@@ -377,7 +374,6 @@ static const struct inode_operations ovl_file_inode_operations = {
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
- .dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 17ac5afc9ffb..ea5a40b06e3a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 907870e81a72..70e9af551600 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -23,6 +23,7 @@ struct ovl_cache_entry {
u64 ino;
struct list_head l_node;
struct rb_node node;
+ struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
@@ -39,7 +40,7 @@ struct ovl_readdir_data {
struct rb_root root;
struct list_head *list;
struct list_head middle;
- struct dentry *dir;
+ struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
@@ -79,7 +80,7 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
return NULL;
}
-static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
+static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
@@ -98,29 +99,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
p->is_whiteout = false;
if (d_type == DT_CHR) {
- struct dentry *dentry;
- const struct cred *old_cred;
- struct cred *override_cred;
-
- override_cred = prepare_creds();
- if (!override_cred) {
- kfree(p);
- return NULL;
- }
-
- /*
- * CAP_DAC_OVERRIDE for lookup
- */
- cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
- old_cred = override_creds(override_cred);
-
- dentry = lookup_one_len(name, dir, len);
- if (!IS_ERR(dentry)) {
- p->is_whiteout = ovl_is_whiteout(dentry);
- dput(dentry);
- }
- revert_creds(old_cred);
- put_cred(override_cred);
+ p->next_maybe_whiteout = rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p;
}
return p;
}
@@ -148,7 +128,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
return 0;
}
- p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
@@ -169,7 +149,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd,
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
- p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
+ p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
@@ -219,6 +199,43 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
+static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
+{
+ int err;
+ struct ovl_cache_entry *p;
+ struct dentry *dentry;
+ const struct cred *old_cred;
+ struct cred *override_cred;
+
+ override_cred = prepare_creds();
+ if (!override_cred)
+ return -ENOMEM;
+
+ /*
+ * CAP_DAC_OVERRIDE for lookup
+ */
+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
+ old_cred = override_creds(override_cred);
+
+ err = mutex_lock_killable(&dir->d_inode->i_mutex);
+ if (!err) {
+ while (rdd->first_maybe_whiteout) {
+ p = rdd->first_maybe_whiteout;
+ rdd->first_maybe_whiteout = p->next_maybe_whiteout;
+ dentry = lookup_one_len(p->name, dir, p->len);
+ if (!IS_ERR(dentry)) {
+ p->is_whiteout = ovl_is_whiteout(dentry);
+ dput(dentry);
+ }
+ }
+ mutex_unlock(&dir->d_inode->i_mutex);
+ }
+ revert_creds(old_cred);
+ put_cred(override_cred);
+
+ return err;
+}
+
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
@@ -229,7 +246,7 @@ static inline int ovl_dir_read(struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- rdd->dir = realpath->dentry;
+ rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
@@ -238,6 +255,10 @@ static inline int ovl_dir_read(struct path *realpath,
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
+
+ if (!err && rdd->first_maybe_whiteout)
+ err = ovl_check_whiteouts(realpath->dentry, rdd);
+
fput(realfile);
return err;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5f0d1993e6e3..7466ff339c66 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ unsigned int i;
+ int ret = 1;
+
+ for (i = 0; i < oe->numlower; i++) {
+ struct dentry *d = oe->lowerstack[i].dentry;
+
+ if (d->d_flags & DCACHE_OP_REVALIDATE) {
+ ret = d->d_op->d_revalidate(d, flags);
+ if (ret < 0)
+ return ret;
+ if (!ret) {
+ if (!(flags & LOOKUP_RCU))
+ d_invalidate(d);
+ return -ESTALE;
+ }
+ }
+ }
+ return 1;
+}
+
+static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct ovl_entry *oe = dentry->d_fsdata;
+ unsigned int i;
+ int ret = 1;
+
+ for (i = 0; i < oe->numlower; i++) {
+ struct dentry *d = oe->lowerstack[i].dentry;
+
+ if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) {
+ ret = d->d_op->d_weak_revalidate(d, flags);
+ if (ret <= 0)
+ break;
+ }
+ }
+ return ret;
+}
+
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
+};
+
+static const struct dentry_operations ovl_reval_dentry_operations = {
+ .d_release = ovl_dentry_release,
+ .d_revalidate = ovl_dentry_revalidate,
+ .d_weak_revalidate = ovl_dentry_weak_revalidate,
};
static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
@@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
return oe;
}
+static bool ovl_dentry_remote(struct dentry *dentry)
+{
+ return dentry->d_flags &
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+}
+
+static bool ovl_dentry_weird(struct dentry *dentry)
+{
+ return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+ DCACHE_MANAGE_TRANSIT |
+ DCACHE_OP_HASH |
+ DCACHE_OP_COMPARE);
+}
+
static inline struct dentry *ovl_lookup_real(struct dentry *dir,
struct qstr *name)
{
@@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
} else if (!dentry->d_inode) {
dput(dentry);
dentry = NULL;
+ } else if (ovl_dentry_weird(dentry)) {
+ dput(dentry);
+ /* Don't support traversing automounts and other weirdness */
+ dentry = ERR_PTR(-EREMOTE);
}
return dentry;
}
@@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out;
if (this) {
+ if (unlikely(ovl_dentry_remote(this))) {
+ dput(this);
+ err = -EREMOTE;
+ goto out;
+ }
if (ovl_is_whiteout(this)) {
dput(this);
this = NULL;
@@ -529,7 +600,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
{
struct ovl_fs *ufs = sb->s_fs_info;
- if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
+ if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir))
return -EROFS;
return 0;
@@ -694,25 +765,6 @@ static void ovl_unescape(char *s)
}
}
-static bool ovl_is_allowed_fs_type(struct dentry *root)
-{
- const struct dentry_operations *dop = root->d_op;
-
- /*
- * We don't support:
- * - automount filesystems
- * - filesystems with revalidate (FIXME for lower layer)
- * - filesystems with case insensitive names
- */
- if (dop &&
- (dop->d_manage || dop->d_automount ||
- dop->d_revalidate || dop->d_weak_revalidate ||
- dop->d_compare || dop->d_hash)) {
- return false;
- }
- return true;
-}
-
static int ovl_mount_dir_noesc(const char *name, struct path *path)
{
int err = -EINVAL;
@@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path)
goto out;
}
err = -EINVAL;
- if (!ovl_is_allowed_fs_type(path->dentry)) {
+ if (ovl_dentry_weird(path->dentry)) {
pr_err("overlayfs: filesystem on '%s' not supported\n", name);
goto out_put;
}
@@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path)
if (tmp) {
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
+
+ if (!err)
+ if (ovl_dentry_remote(path->dentry)) {
+ pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n",
+ tmp);
+ path_put(path);
+ err = -EINVAL;
+ }
kfree(tmp);
}
return err;
}
static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
- int *stack_depth)
+ int *stack_depth, bool *remote)
{
int err;
struct kstatfs statfs;
@@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen,
*namelen = max(*namelen, statfs.f_namelen);
*stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth);
+ if (ovl_dentry_remote(path->dentry))
+ *remote = true;
+
return 0;
out_put:
@@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
unsigned int numlower;
unsigned int stacklen = 0;
unsigned int i;
+ bool remote = false;
int err;
err = -ENOMEM;
@@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
lower = lowertmp;
for (numlower = 0; numlower < stacklen; numlower++) {
err = ovl_lower_dir(lower, &stack[numlower],
- &ufs->lower_namelen, &sb->s_stack_depth);
+ &ufs->lower_namelen, &sb->s_stack_depth,
+ &remote);
if (err)
goto out_put_lowerpath;
@@ -925,9 +990,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
err = PTR_ERR(ufs->workdir);
if (IS_ERR(ufs->workdir)) {
- pr_err("overlayfs: failed to create directory %s/%s\n",
- ufs->config.workdir, OVL_WORKDIR_NAME);
- goto out_put_upper_mnt;
+ pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n",
+ ufs->config.workdir, OVL_WORKDIR_NAME, -err);
+ sb->s_flags |= MS_RDONLY;
+ ufs->workdir = NULL;
}
}
@@ -957,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
- sb->s_d_op = &ovl_dentry_operations;
+ if (remote)
+ sb->s_d_op = &ovl_reval_dentry_operations;
+ else
+ sb->s_d_op = &ovl_dentry_operations;
err = -ENOMEM;
oe = ovl_alloc_entry(numlower);
@@ -997,7 +1066,6 @@ out_put_lower_mnt:
kfree(ufs->lower_mnt);
out_put_workdir:
dput(ufs->workdir);
-out_put_upper_mnt:
mntput(ufs->upper_mnt);
out_put_lowerpath:
for (i = 0; i < numlower; i++)
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e58e2a6..8865f7963700 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,7 +21,6 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
-#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -628,7 +627,7 @@ static struct vfsmount *pipe_mnt __read_mostly;
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
- dentry->d_inode->i_ino);
+ d_inode(dentry)->i_ino);
}
static const struct dentry_operations pipefs_dentry_operations = {
@@ -947,9 +946,7 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
- .read = new_sync_read,
.read_iter = pipe_read,
- .write = new_sync_write,
.write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
diff --git a/fs/pnode.c b/fs/pnode.c
index 260ac8f898a4..6367e1e435c6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
}
/*
+ * Clear MNT_LOCKED when it can be shown to be safe.
+ *
+ * mount_lock lock must be held for write
+ */
+void propagate_mount_unlock(struct mount *mnt)
+{
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m, *child;
+
+ BUG_ON(parent == mnt);
+
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+ if (child)
+ child->mnt.mnt_flags &= ~MNT_LOCKED;
+ }
+}
+
+/*
+ * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
+ */
+static void mark_umount_candidates(struct mount *mnt)
+{
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
+
+ BUG_ON(parent == mnt);
+
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ struct mount *child = __lookup_mnt_last(&m->mnt,
+ mnt->mnt_mountpoint);
+ if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+ SET_MNT_MARK(child);
+ }
+ }
+}
+
+/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
struct mount *child = __lookup_mnt_last(&m->mnt,
mnt->mnt_mountpoint);
/*
- * umount the child only if the child has no
- * other children
+ * umount the child only if the child has no children
+ * and the child is marked safe to unmount.
*/
- if (child && list_empty(&child->mnt_mounts)) {
+ if (!child || !IS_MNT_MARKED(child))
+ continue;
+ CLEAR_MNT_MARK(child);
+ if (list_empty(&child->mnt_mounts)) {
list_del_init(&child->mnt_child);
- hlist_del_init_rcu(&child->mnt_hash);
- hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
+ child->mnt.mnt_flags |= MNT_UMOUNT;
+ list_move_tail(&child->mnt_list, &mnt->mnt_list);
}
}
}
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
*
* vfsmount lock must be held for write
*/
-int propagate_umount(struct hlist_head *list)
+int propagate_umount(struct list_head *list)
{
struct mount *mnt;
- hlist_for_each_entry(mnt, list, mnt_hash)
+ list_for_each_entry_reverse(mnt, list, mnt_list)
+ mark_umount_candidates(mnt);
+
+ list_for_each_entry(mnt, list, mnt_list)
__propagate_umount(mnt);
return 0;
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 4a246358b031..0fcdbe7ca648 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,6 +19,7 @@
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
@@ -40,14 +41,14 @@ static inline void set_mnt_shared(struct mount *mnt)
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
-int propagate_umount(struct hlist_head *);
+int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
+void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
void mnt_set_mountpoint(struct mount *, struct mountpoint *,
struct mount *);
-void umount_tree(struct mount *, int);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 3a48bb789c9f..4fb17ded7d47 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode,
struct posix_acl **default_acl, struct posix_acl **acl)
{
struct posix_acl *p;
+ struct posix_acl *clone;
int ret;
+ *acl = NULL;
+ *default_acl = NULL;
+
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
- goto no_acl;
+ return 0;
p = get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(p)) {
- if (p == ERR_PTR(-EOPNOTSUPP))
- goto apply_umask;
- return PTR_ERR(p);
+ if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
+ *mode &= ~current_umask();
+ return 0;
}
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- if (!p)
- goto apply_umask;
-
- *acl = posix_acl_clone(p, GFP_NOFS);
- if (!*acl)
+ clone = posix_acl_clone(p, GFP_NOFS);
+ if (!clone)
goto no_mem;
- ret = posix_acl_create_masq(*acl, mode);
+ ret = posix_acl_create_masq(clone, mode);
if (ret < 0)
goto no_mem_clone;
- if (ret == 0) {
- posix_acl_release(*acl);
- *acl = NULL;
- }
+ if (ret == 0)
+ posix_acl_release(clone);
+ else
+ *acl = clone;
- if (!S_ISDIR(*mode)) {
+ if (!S_ISDIR(*mode))
posix_acl_release(p);
- *default_acl = NULL;
- } else {
+ else
*default_acl = p;
- }
- return 0;
-apply_umask:
- *mode &= ~current_umask();
-no_acl:
- *default_acl = NULL;
- *acl = NULL;
return 0;
no_mem_clone:
- posix_acl_release(*acl);
+ posix_acl_release(clone);
no_mem:
posix_acl_release(p);
return -ENOMEM;
@@ -774,12 +768,12 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
struct posix_acl *acl;
int error;
- if (!IS_POSIXACL(dentry->d_inode))
+ if (!IS_POSIXACL(d_backing_inode(dentry)))
return -EOPNOTSUPP;
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
- acl = get_acl(dentry->d_inode, type);
+ acl = get_acl(d_backing_inode(dentry), type);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -795,7 +789,7 @@ static int
posix_acl_xattr_set(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags, int type)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
int ret;
@@ -834,7 +828,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
const char *xname;
size_t size;
- if (!IS_POSIXACL(dentry->d_inode))
+ if (!IS_POSIXACL(d_backing_inode(dentry)))
return -EOPNOTSUPP;
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf41d59..1ade1206bb89 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,13 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_CHILDREN
+ bool "Include /proc/<pid>/task/<tid>/children file"
+ default n
+ help
+ Provides a fast way to retrieve first level children pids of a task. See
+ <file:Documentation/filesystems/proc.txt> for more information.
+
+ Say Y if you are running any user-space software which takes benefit from
+ this interface. For example, rkt is such a piece of software.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1295a00ca316..ce065cf3104f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
buf = m->buf + m->count;
/* Ignore error for now */
- string_escape_str(tcomm, &buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ buf += string_escape_str(tcomm, buf, m->size - m->count,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
m->count = buf - m->buf;
seq_putc(m, '\n');
@@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk)
{
unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+ /*
+ * Parked tasks do not run; they sit in __kthread_parkme().
+ * Without this check, we would report them as running, which is
+ * clearly wrong, so we report them as sleeping instead.
+ */
+ if (tsk->state == TASK_PARKED)
+ state = TASK_INTERRUPTIBLE;
+
BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
return task_state_array[fls(state)];
@@ -188,6 +196,24 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
put_cred(cred);
+#ifdef CONFIG_PID_NS
+ seq_puts(m, "\nNStgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSsid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d",
+ task_session_nr_ns(p, pid->numbers[g].ns));
+#endif
seq_putc(m, '\n');
}
@@ -551,7 +577,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
static struct pid *
get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
{
@@ -614,7 +640,9 @@ static int children_seq_show(struct seq_file *seq, void *v)
pid_t pid;
pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
- return seq_printf(seq, "%d ", pid);
+ seq_printf(seq, "%d ", pid);
+
+ return 0;
}
static void *children_seq_start(struct seq_file *seq, loff_t *pos)
@@ -672,4 +700,4 @@ const struct file_operations proc_tid_children_operations = {
.llseek = seq_lseek,
.release = children_seq_release,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
+#endif /* CONFIG_PROC_CHILDREN */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3f3d7aeb0712..aa50d1ac28fc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -169,7 +169,7 @@ static int get_task_root(struct task_struct *task, struct path *root)
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(dentry->d_inode);
+ struct task_struct *task = get_proc_task(d_inode(dentry));
int result = -ENOENT;
if (task) {
@@ -186,7 +186,7 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path)
static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(dentry->d_inode);
+ struct task_struct *task = get_proc_task(d_inode(dentry));
int result = -ENOENT;
if (task) {
@@ -196,18 +196,210 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task)
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t _count, loff_t *pos)
{
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ char *page;
+ unsigned long count = _count;
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long len1, len2, len;
+ unsigned long p;
+ char c;
+ ssize_t rv;
+
+ BUG_ON(*pos < 0);
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH;
+ mm = get_task_mm(tsk);
+ put_task_struct(tsk);
+ if (!mm)
+ return 0;
+ /* Check if process spawned far enough to have cmdline. */
+ if (!mm->env_end) {
+ rv = 0;
+ goto out_mmput;
+ }
+
+ page = (char *)__get_free_page(GFP_TEMPORARY);
+ if (!page) {
+ rv = -ENOMEM;
+ goto out_mmput;
+ }
+
+ down_read(&mm->mmap_sem);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
+ BUG_ON(arg_start > arg_end);
+ BUG_ON(env_start > env_end);
+
+ len1 = arg_end - arg_start;
+ len2 = env_end - env_start;
+
+ /* Empty ARGV. */
+ if (len1 == 0) {
+ rv = 0;
+ goto out_free_page;
+ }
/*
- * Rely on struct seq_operations::show() being called once
- * per internal buffer allocation. See single_open(), traverse().
+ * Inherently racy -- command line shares address space
+ * with code and data.
*/
- BUG_ON(m->size < PAGE_SIZE);
- m->count += get_cmdline(task, m->buf, PAGE_SIZE);
- return 0;
+ rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
+ if (rv <= 0)
+ goto out_free_page;
+
+ rv = 0;
+
+ if (c == '\0') {
+ /* Command line (set of strings) occupies whole ARGV. */
+ if (len1 <= *pos)
+ goto out_free_page;
+
+ p = arg_start + *pos;
+ len = len1 - *pos;
+ while (count > 0 && len > 0) {
+ unsigned int _count;
+ int nr_read;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+ }
+ } else {
+ /*
+ * Command line (1 string) occupies ARGV and maybe
+ * extends into ENVP.
+ */
+ if (len1 + len2 <= *pos)
+ goto skip_argv_envp;
+ if (len1 <= *pos)
+ goto skip_argv;
+
+ p = arg_start + *pos;
+ len = len1 - *pos;
+ while (count > 0 && len > 0) {
+ unsigned int _count, l;
+ int nr_read;
+ bool final;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ /*
+ * Command line can be shorter than whole ARGV
+ * even if last "marker" byte says it is not.
+ */
+ final = false;
+ l = strnlen(page, nr_read);
+ if (l < nr_read) {
+ nr_read = l;
+ final = true;
+ }
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+
+ if (final)
+ goto out_free_page;
+ }
+skip_argv:
+ /*
+ * Command line (1 string) occupies ARGV and
+ * extends into ENVP.
+ */
+ if (len1 <= *pos) {
+ p = env_start + *pos - len1;
+ len = len1 + len2 - *pos;
+ } else {
+ p = env_start;
+ len = len2;
+ }
+ while (count > 0 && len > 0) {
+ unsigned int _count, l;
+ int nr_read;
+ bool final;
+
+ _count = min3(count, len, PAGE_SIZE);
+ nr_read = access_remote_vm(mm, p, page, _count, 0);
+ if (nr_read < 0)
+ rv = nr_read;
+ if (nr_read <= 0)
+ goto out_free_page;
+
+ /* Find EOS. */
+ final = false;
+ l = strnlen(page, nr_read);
+ if (l < nr_read) {
+ nr_read = l;
+ final = true;
+ }
+
+ if (copy_to_user(buf, page, nr_read)) {
+ rv = -EFAULT;
+ goto out_free_page;
+ }
+
+ p += nr_read;
+ len -= nr_read;
+ buf += nr_read;
+ count -= nr_read;
+ rv += nr_read;
+
+ if (final)
+ goto out_free_page;
+ }
+skip_argv_envp:
+ ;
+ }
+
+out_free_page:
+ free_page((unsigned long)page);
+out_mmput:
+ mmput(mm);
+ if (rv > 0)
+ *pos += rv;
+ return rv;
}
+static const struct file_operations proc_pid_cmdline_ops = {
+ .read = proc_pid_cmdline_read,
+ .llseek = generic_file_llseek,
+};
+
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -238,13 +430,15 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (lookup_symbol_name(wchan, symname) < 0)
+ if (lookup_symbol_name(wchan, symname) < 0) {
if (!ptrace_may_access(task, PTRACE_MODE_READ))
return 0;
- else
- return seq_printf(m, "%lu", wchan);
- else
- return seq_printf(m, "%s", symname);
+ seq_printf(m, "%lu", wchan);
+ } else {
+ seq_printf(m, "%s", symname);
+ }
+
+ return 0;
}
#endif /* CONFIG_KALLSYMS */
@@ -302,17 +496,22 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
}
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
/*
* Provides /proc/PID/schedstat
*/
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- return seq_printf(m, "%llu %llu %lu\n",
- (unsigned long long)task->se.sum_exec_runtime,
- (unsigned long long)task->sched_info.run_delay,
- task->sched_info.pcount);
+ if (unlikely(!sched_info_on()))
+ seq_printf(m, "0 0 0\n");
+ else
+ seq_printf(m, "%llu %llu %lu\n",
+ (unsigned long long)task->se.sum_exec_runtime,
+ (unsigned long long)task->sched_info.run_delay,
+ task->sched_info.pcount);
+
+ return 0;
}
#endif
@@ -387,7 +586,9 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
points = oom_badness(task, NULL, NULL, totalpages) *
1000 / totalpages;
read_unlock(&tasklist_lock);
- return seq_printf(m, "%lu\n", points);
+ seq_printf(m, "%lu\n", points);
+
+ return 0;
}
struct limit_names {
@@ -432,15 +633,15 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
* print the file header
*/
seq_printf(m, "%-25s %-20s %-20s %-10s\n",
- "Limit", "Soft Limit", "Hard Limit", "Units");
+ "Limit", "Soft Limit", "Hard Limit", "Units");
for (i = 0; i < RLIM_NLIMITS; i++) {
if (rlim[i].rlim_cur == RLIM_INFINITY)
seq_printf(m, "%-25s %-20s ",
- lnames[i].name, "unlimited");
+ lnames[i].name, "unlimited");
else
seq_printf(m, "%-25s %-20lu ",
- lnames[i].name, rlim[i].rlim_cur);
+ lnames[i].name, rlim[i].rlim_cur);
if (rlim[i].rlim_max == RLIM_INFINITY)
seq_printf(m, "%-20s ", "unlimited");
@@ -462,7 +663,9 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
{
long nr;
unsigned long args[6], sp, pc;
- int res = lock_trace(task);
+ int res;
+
+ res = lock_trace(task);
if (res)
return res;
@@ -477,7 +680,8 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
args[0], args[1], args[2], args[3], args[4], args[5],
sp, pc);
unlock_trace(task);
- return res;
+
+ return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
@@ -505,7 +709,7 @@ static int proc_fd_access_allowed(struct inode *inode)
int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (attr->ia_valid & ATTR_MODE)
return -EPERM;
@@ -1353,7 +1557,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(dentry->d_inode);
+ task = get_proc_task(d_inode(dentry));
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1371,9 +1575,9 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
return -ENOENT;
}
-static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct path path;
int error = -EACCES;
@@ -1385,7 +1589,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
if (error)
goto out;
- nd_jump_link(nd, &path);
+ nd_jump_link(&path);
return NULL;
out:
return ERR_PTR(error);
@@ -1418,7 +1622,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
int error = -EACCES;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct path path;
/* Are we allowed to snoop on the tasks file descriptors? */
@@ -1488,7 +1692,7 @@ out_unlock:
int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct task_struct *task;
const struct cred *cred;
struct pid_namespace *pid = dentry->d_sb->s_fs_info;
@@ -1545,7 +1749,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
task = get_proc_task(inode);
if (task) {
@@ -1579,7 +1783,7 @@ int pid_delete_dentry(const struct dentry *dentry)
* If so, then don't put the dentry on the lru list,
* kill it immediately.
*/
- return proc_inode_is_dead(dentry->d_inode);
+ return proc_inode_is_dead(d_inode(dentry));
}
const struct dentry_operations pid_dentry_operations =
@@ -1617,12 +1821,12 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
child = d_alloc(dir, &qname);
if (!child)
goto end_instantiate;
- if (instantiate(dir->d_inode, child, task, ptr) < 0) {
+ if (instantiate(d_inode(dir), child, task, ptr) < 0) {
dput(child);
goto end_instantiate;
}
}
- inode = child->d_inode;
+ inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
@@ -1665,7 +1869,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
goto out_notask;
}
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
goto out_notask;
@@ -1718,7 +1922,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
int rc;
rc = -ENOENT;
- task = get_proc_task(dentry->d_inode);
+ task = get_proc_task(d_inode(dentry));
if (!task)
goto out;
@@ -2002,12 +2206,13 @@ static int show_timer(struct seq_file *m, void *v)
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
- timer->sigq->info.si_value.sival_ptr);
+ seq_printf(m, "signal: %d/%p\n",
+ timer->sigq->info.si_signo,
+ timer->sigq->info.si_value.sival_ptr);
seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
- (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
- pid_nr_ns(timer->it_pid, tp->ns));
+ nstr[notify & ~SIGEV_THREAD_ID],
+ (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
+ pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
return 0;
@@ -2352,21 +2557,23 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unlock_task_sighand(task, &flags);
}
- result = seq_printf(m,
- "rchar: %llu\n"
- "wchar: %llu\n"
- "syscr: %llu\n"
- "syscw: %llu\n"
- "read_bytes: %llu\n"
- "write_bytes: %llu\n"
- "cancelled_write_bytes: %llu\n",
- (unsigned long long)acct.rchar,
- (unsigned long long)acct.wchar,
- (unsigned long long)acct.syscr,
- (unsigned long long)acct.syscw,
- (unsigned long long)acct.read_bytes,
- (unsigned long long)acct.write_bytes,
- (unsigned long long)acct.cancelled_write_bytes);
+ seq_printf(m,
+ "rchar: %llu\n"
+ "wchar: %llu\n"
+ "syscr: %llu\n"
+ "syscw: %llu\n"
+ "read_bytes: %llu\n"
+ "write_bytes: %llu\n"
+ "cancelled_write_bytes: %llu\n",
+ (unsigned long long)acct.rchar,
+ (unsigned long long)acct.wchar,
+ (unsigned long long)acct.syscr,
+ (unsigned long long)acct.syscw,
+ (unsigned long long)acct.read_bytes,
+ (unsigned long long)acct.write_bytes,
+ (unsigned long long)acct.cancelled_write_bytes);
+ result = 0;
+
out_unlock:
mutex_unlock(&task->signal->cred_guard_mutex);
return result;
@@ -2560,7 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- ONE("cmdline", S_IRUGO, proc_pid_cmdline),
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tgid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_pid_maps_operations),
@@ -2588,7 +2795,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
@@ -2851,13 +3058,13 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
return 0;
if (pos == TGID_OFFSET - 2) {
- struct inode *inode = ns->proc_self->d_inode;
+ struct inode *inode = d_inode(ns->proc_self);
if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
}
if (pos == TGID_OFFSET - 1) {
- struct inode *inode = ns->proc_thread_self->d_inode;
+ struct inode *inode = d_inode(ns->proc_thread_self);
if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
return 0;
ctx->pos = pos = pos + 1;
@@ -2906,11 +3113,11 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
- ONE("cmdline", S_IRUGO, proc_pid_cmdline),
+ REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
ONE("stat", S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_tid_maps_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#ifdef CONFIG_PROC_CHILDREN
REG("children", S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
@@ -2936,7 +3143,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_STACKTRACE
ONE("stack", S_IRUSR, proc_pid_stack),
#endif
-#ifdef CONFIG_SCHEDSTATS
+#ifdef CONFIG_SCHED_INFO
ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
@@ -3176,7 +3383,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct task_struct *p = get_proc_task(inode);
generic_fillattr(inode, stat);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 8e5ad83b629a..6e5fcd00733e 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -8,6 +8,7 @@
#include <linux/security.h>
#include <linux/file.h>
#include <linux/seq_file.h>
+#include <linux/fs.h>
#include <linux/proc_fs.h>
@@ -48,17 +49,23 @@ static int seq_show(struct seq_file *m, void *v)
put_files_struct(files);
}
- if (!ret) {
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
- (long long)file->f_pos, f_flags,
- real_mount(file->f_path.mnt)->mnt_id);
- if (file->f_op->show_fdinfo)
- file->f_op->show_fdinfo(m, file);
- ret = seq_has_overflowed(m);
- fput(file);
- }
+ if (ret)
+ return ret;
- return ret;
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
+
+ show_fd_locks(m, file, files);
+ if (seq_has_overflowed(m))
+ goto out;
+
+ if (file->f_op->show_fdinfo)
+ file->f_op->show_fdinfo(m, file);
+
+out:
+ fput(file);
+ return 0;
}
static int seq_fdinfo_open(struct inode *inode, struct file *file)
@@ -84,7 +91,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
task = get_proc_task(inode);
fd = proc_fd(inode);
@@ -144,14 +151,14 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
struct task_struct *task;
int ret = -ENOENT;
- task = get_proc_task(dentry->d_inode);
+ task = get_proc_task(d_inode(dentry));
if (task) {
files = get_files_struct(task);
put_task_struct(task);
}
if (files) {
- int fd = proc_fd(dentry->d_inode);
+ int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
spin_lock(&files->file_lock);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index be65b2082135..e5dee5c3188e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -101,7 +101,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct proc_dir_entry *de = PDE(inode);
int error;
@@ -120,7 +120,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct proc_dir_entry *de = PDE(inode);
if (de && de->nlink)
set_nlink(inode, de->nlink);
@@ -373,6 +373,10 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
WARN(1, "create '/proc/%s' by hand\n", qstr.name);
return NULL;
}
+ if (is_empty_pde(*parent)) {
+ WARN(1, "attempt to add to permanently empty directory");
+ return NULL;
+ }
ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
if (!ent)
@@ -455,6 +459,25 @@ struct proc_dir_entry *proc_mkdir(const char *name,
}
EXPORT_SYMBOL(proc_mkdir);
+struct proc_dir_entry *proc_create_mount_point(const char *name)
+{
+ umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ struct proc_dir_entry *ent, *parent = NULL;
+
+ ent = __proc_create(&parent, name, mode, 2);
+ if (ent) {
+ ent->data = NULL;
+ ent->proc_fops = NULL;
+ ent->proc_iops = NULL;
+ if (proc_register(parent, ent) < 0) {
+ kfree(ent);
+ parent->nlink--;
+ ent = NULL;
+ }
+ }
+ return ent;
+}
+
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
struct proc_dir_entry *parent,
const struct file_operations *proc_fops,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 7697b6621cfd..bd95b9fdebb0 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -23,7 +23,6 @@
#include <linux/slab.h>
#include <linux/mount.h>
#include <linux/magic.h>
-#include <linux/namei.h>
#include <asm/uaccess.h>
@@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
-static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_follow_link(struct dentry *dentry, void **cookie)
{
- struct proc_dir_entry *pde = PDE(dentry->d_inode);
+ struct proc_dir_entry *pde = PDE(d_inode(dentry));
if (unlikely(!use_pde(pde)))
return ERR_PTR(-EINVAL);
- nd_set_link(nd, pde->data);
- return pde;
+ *cookie = pde;
+ return pde->data;
}
-static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+static void proc_put_link(struct inode *unused, void *p)
{
unuse_pde(p);
}
@@ -423,6 +422,10 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
PROC_I(inode)->pde = de;
+ if (is_empty_pde(de)) {
+ make_empty_dir_inode(inode);
+ return inode;
+ }
if (de->mode) {
inode->i_mode = de->mode;
inode->i_uid = de->uid;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c835b94c0cd3..aa2781095bd1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -191,6 +191,12 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
}
extern void pde_put(struct proc_dir_entry *);
+static inline bool is_empty_pde(const struct proc_dir_entry *pde)
+{
+ return S_ISDIR(pde->mode) && !pde->proc_iops;
+}
+struct proc_dir_entry *proc_create_mount_point(const char *name);
+
/*
* inode.c
*/
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 91a4e6426321..92e6726f6e37 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
roundup(sizeof(CORE_STR), 4)) +
roundup(sizeof(struct elf_prstatus), 4) +
roundup(sizeof(struct elf_prpsinfo), 4) +
- roundup(sizeof(struct task_struct), 4);
+ roundup(arch_task_struct_size, 4);
*elf_buflen = PAGE_ALIGN(*elf_buflen);
return size + *elf_buflen;
}
@@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
/* set up the task structure */
notes[2].name = CORE_STR;
notes[2].type = NT_TASKSTRUCT;
- notes[2].datasz = sizeof(struct task_struct);
+ notes[2].datasz = arch_task_struct_size;
notes[2].data = current;
nhdr->p_filesz += notesize(&notes[2]);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index c9eac4563fa8..f6e8354b8cea 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,9 +30,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&mntns_operations,
};
-static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
@@ -45,7 +45,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
if (ptrace_may_access(task, PTRACE_MODE_READ)) {
error = ns_get_path(&ns_path, task, ns_ops);
if (!error)
- nd_jump_link(nd, &ns_path);
+ nd_jump_link(&ns_path);
}
put_task_struct(task);
return error;
@@ -53,7 +53,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
char name[50];
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index d4a35746cab9..f8595e8b5cd0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
if (file) {
seq_pad(m, ' ');
- seq_path(m, &file->f_path, "");
+ seq_file_path(m, file, "");
}
seq_putc(m, '\n');
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 1bde894bc624..350984a19c83 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -142,7 +142,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct net *net;
net = get_proc_task_net(inode);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f92d5dd578a4..fdda62e6115e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
+/* Support for permanently empty directories */
+
+struct ctl_table sysctl_mount_point[] = {
+ { }
+};
+
+static bool is_empty_dir(struct ctl_table_header *head)
+{
+ return head->ctl_table[0].child == sysctl_mount_point;
+}
+
+static void set_empty_dir(struct ctl_dir *dir)
+{
+ dir->header.ctl_table[0].child = sysctl_mount_point;
+}
+
+static void clear_empty_dir(struct ctl_dir *dir)
+
+{
+ dir->header.ctl_table[0].child = NULL;
+}
+
void proc_sys_poll_notify(struct ctl_table_poll *poll)
{
if (!poll)
@@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
struct ctl_table *entry;
int err;
+ /* Is this a permanently empty directory? */
+ if (is_empty_dir(&dir->header))
+ return -EROFS;
+
+ /* Am I creating a permanently empty directory? */
+ if (header->ctl_table == sysctl_mount_point) {
+ if (!RB_EMPTY_ROOT(&dir->root))
+ return -EINVAL;
+ set_empty_dir(dir);
+ }
+
dir->header.nreg++;
header->parent = dir;
err = insert_links(header);
@@ -202,6 +235,8 @@ fail:
erase_header(header);
put_links(header);
fail_links:
+ if (header->ctl_table == sysctl_mount_point)
+ clear_empty_dir(dir);
header->parent = NULL;
drop_sysctl_table(&dir->header);
return err;
@@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mode |= S_IFDIR;
inode->i_op = &proc_sys_dir_operations;
inode->i_fop = &proc_sys_dir_file_operations;
+ if (is_empty_dir(head))
+ make_empty_dir_inode(inode);
}
out:
return inode;
@@ -604,7 +641,7 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
}
}
- inode = child->d_inode;
+ inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
dput(child);
@@ -710,7 +747,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
@@ -727,7 +764,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ctl_table_header *head = grab_header(inode);
struct ctl_table *table = PROC_I(inode)->sysctl_entry;
@@ -773,12 +810,12 @@ static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags)
{
if (flags & LOOKUP_RCU)
return -ECHILD;
- return !PROC_I(dentry->d_inode)->sysctl->unregistering;
+ return !PROC_I(d_inode(dentry))->sysctl->unregistering;
}
static int proc_sys_delete(const struct dentry *dentry)
{
- return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
+ return !!PROC_I(d_inode(dentry))->sysctl->unregistering;
}
static int sysctl_is_seen(struct ctl_table_header *p)
@@ -805,7 +842,7 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *de
/* Although proc doesn't have negative dentries, rcu-walk means
* that inode here can be NULL */
/* AV: can it, indeed? */
- inode = ACCESS_ONCE(dentry->d_inode);
+ inode = d_inode_rcu(dentry);
if (!inode)
return 1;
if (name->len != len)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index e74ac9f1a2c0..68feb0f70e63 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
ns = task_active_pid_ns(current);
options = data;
- if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
- return ERR_PTR(-EPERM);
-
/* Does the mounter have privilege over the pid namespace? */
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = {
.name = "proc",
.mount = proc_mount,
.kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
@@ -182,10 +179,10 @@ void __init proc_root_init(void)
#endif
proc_mkdir("fs", NULL);
proc_mkdir("driver", NULL);
- proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
+ proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
/* just give it a mountpoint */
- proc_mkdir("openprom", NULL);
+ proc_create_mount_point("openprom");
#endif
proc_tty_init();
proc_mkdir("bus", NULL);
@@ -195,7 +192,7 @@ void __init proc_root_init(void)
static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
)
{
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 4348bb8907c2..113b8d061fc0 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,5 +1,4 @@
#include <linux/sched.h>
-#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include "internal.h"
@@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
{
struct pid_namespace *ns = dentry->d_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
- char *name = ERR_PTR(-ENOENT);
- if (tgid) {
- /* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- name = ERR_PTR(-ENOMEM);
- else
- sprintf(name, "%d", tgid);
- }
- nd_set_link(nd, name);
- return NULL;
+ char *name;
+
+ if (!tgid)
+ return ERR_PTR(-ENOENT);
+ /* 11 for max length of signed int in decimal + NULL term */
+ name = kmalloc(12, GFP_KERNEL);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+ sprintf(name, "%d", tgid);
+ return *cookie = name;
}
static const struct inode_operations proc_self_inode_operations = {
@@ -46,7 +44,7 @@ static unsigned self_inum;
int proc_setup_self(struct super_block *s)
{
- struct inode *root_inode = s->s_root->d_inode;
+ struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = s->s_fs_info;
struct dentry *self;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6dee68d013ff..ca1e091881d4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
*/
if (file) {
seq_pad(m, ' ');
- seq_path(m, &file->f_path, "\n");
+ seq_file_path(m, file, "\n");
goto done;
}
@@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (file) {
seq_puts(m, " file=");
- seq_path(m, &file->f_path, "\n\t= ");
+ seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
} else {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 599ec2e20104..e0d64c92e4f6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
- seq_path(m, &file->f_path, "");
+ seq_file_path(m, file, "");
} else if (mm) {
pid_t tid = pid_of_stack(priv, vma, is_pid);
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 59075b509df3..947b0f4fd0a1 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,5 +1,4 @@
#include <linux/sched.h>
-#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/pid_namespace.h>
#include "internal.h"
@@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
{
struct pid_namespace *ns = dentry->d_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
- char *name = ERR_PTR(-ENOENT);
- if (pid) {
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
- if (!name)
- name = ERR_PTR(-ENOMEM);
- else
- sprintf(name, "%d/task/%d", tgid, pid);
- }
- nd_set_link(nd, name);
- return NULL;
+ char *name;
+
+ if (!pid)
+ return ERR_PTR(-ENOENT);
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+ sprintf(name, "%d/task/%d", tgid, pid);
+ return *cookie = name;
}
static const struct inode_operations proc_thread_self_inode_operations = {
@@ -47,7 +45,7 @@ static unsigned thread_self_inum;
int proc_setup_thread_self(struct super_block *s)
{
- struct inode *root_inode = s->s_root->d_inode;
+ struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = s->s_fs_info;
struct dentry *thread_self;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8db932da4009..8ebd9a334085 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -17,7 +17,8 @@
static unsigned mounts_poll(struct file *file, poll_table *wait)
{
- struct proc_mounts *p = proc_mounts(file->private_data);
+ struct seq_file *m = file->private_data;
+ struct proc_mounts *p = m->private;
struct mnt_namespace *ns = p->ns;
unsigned res = POLLIN | POLLRDNORM;
int event;
@@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
poll_wait(file, &p->ns->poll, wait);
event = ACCESS_ONCE(ns->event);
- if (p->m.poll_event != event) {
- p->m.poll_event = event;
+ if (m->poll_event != event) {
+ m->poll_event = event;
res |= POLLERR | POLLPRI;
}
@@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb)
static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -126,7 +127,7 @@ out:
static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
struct super_block *sb = mnt->mnt_sb;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
@@ -186,7 +187,7 @@ out:
static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
{
- struct proc_mounts *p = proc_mounts(m);
+ struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
@@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
struct mnt_namespace *ns = NULL;
struct path root;
struct proc_mounts *p;
+ struct seq_file *m;
int ret = -EINVAL;
if (!task)
@@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file,
task_unlock(task);
put_task_struct(task);
- ret = -ENOMEM;
- p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
- if (!p)
+ ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts));
+ if (ret)
goto err_put_path;
- file->private_data = &p->m;
- ret = seq_open(file, &mounts_op);
- if (ret)
- goto err_free;
+ m = file->private_data;
+ m->poll_event = ns->event;
+ p = m->private;
p->ns = ns;
p->root = root;
- p->m.poll_event = ns->event;
p->show = show;
p->cached_event = ~0ULL;
return 0;
- err_free:
- kfree(p);
err_put_path:
path_put(&root);
err_put_ns:
@@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file,
static int mounts_release(struct inode *inode, struct file *file)
{
- struct proc_mounts *p = proc_mounts(file->private_data);
+ struct seq_file *m = file->private_data;
+ struct proc_mounts *p = m->private;
path_put(&p->root);
put_mnt_ns(p->ns);
- return seq_release(inode, file);
+ return seq_release_private(inode, file);
}
static int mounts_open(struct inode *inode, struct file *file)
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index b32ce53d24ee..3adcc4669fac 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -190,7 +190,7 @@ static const struct file_operations pstore_file_operations = {
*/
static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
- struct pstore_private *p = dentry->d_inode->i_private;
+ struct pstore_private *p = d_inode(dentry)->i_private;
int err;
err = pstore_check_syslog_permissions(p);
@@ -199,7 +199,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
if (p->psi->erase)
p->psi->erase(p->type, p->id, p->count,
- dentry->d_inode->i_ctime, p->psi);
+ d_inode(dentry)->i_ctime, p->psi);
else
return -EPERM;
@@ -364,6 +364,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
case PSTORE_TYPE_PMSG:
scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
break;
+ case PSTORE_TYPE_PPC_OPAL:
+ sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+ break;
case PSTORE_TYPE_UNKNOWN:
scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
break;
@@ -373,7 +376,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
break;
}
- mutex_lock(&root->d_inode->i_mutex);
+ mutex_lock(&d_inode(root)->i_mutex);
dentry = d_alloc_name(root, name);
if (!dentry)
@@ -393,12 +396,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
list_add(&private->list, &allpstore);
spin_unlock_irqrestore(&allpstore_lock, flags);
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
return 0;
fail_lockedalloc:
- mutex_unlock(&root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(root)->i_mutex);
kfree(private);
fail_alloc:
iput(inode);
@@ -458,22 +461,18 @@ static struct file_system_type pstore_fs_type = {
.kill_sb = pstore_kill_sb,
};
-static struct kobject *pstore_kobj;
-
static int __init init_pstore_fs(void)
{
- int err = 0;
+ int err;
/* Create a convenient mount point for people to access pstore */
- pstore_kobj = kobject_create_and_add("pstore", fs_kobj);
- if (!pstore_kobj) {
- err = -ENOMEM;
+ err = sysfs_create_mount_point(fs_kobj, "pstore");
+ if (err)
goto out;
- }
err = register_filesystem(&pstore_fs_type);
if (err < 0)
- kobject_put(pstore_kobj);
+ sysfs_remove_mount_point(fs_kobj, "pstore");
out:
return err;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index c4c9a10c5760..791743deedf1 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -299,7 +299,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
bool compressed;
size_t total_len;
- if (big_oops_buf) {
+ if (big_oops_buf && is_locked) {
dst = big_oops_buf;
hsize = sprintf(dst, "%s#%d Part%u\n", why,
oopscount, part);
@@ -456,6 +456,12 @@ int pstore_register(struct pstore_info *psi)
add_timer(&pstore_timer);
}
+ /*
+ * Update the module parameter backend, so it is visible
+ * through /sys/module/pstore/parameters/backend
+ */
+ backend = psi->name;
+
pr_info("Registered %s as persistent store backend\n", psi->name);
return 0;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 39d1373128e9..6c26c4daaec9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -186,12 +186,34 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
ssize_t size;
ssize_t ecc_notice_size;
struct ramoops_context *cxt = psi->data;
- struct persistent_ram_zone *prz;
- int header_length;
+ struct persistent_ram_zone *prz = NULL;
+ int header_length = 0;
+
+ /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
+ * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
+ * valid time stamps, so it is initialized to zero.
+ */
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ *compressed = false;
+
+ /* Find the next valid persistent_ram_zone for DMESG */
+ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
+ prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
+ cxt->max_dump_cnt, id, type,
+ PSTORE_TYPE_DMESG, 1);
+ if (!prz_ok(prz))
+ continue;
+ header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
+ time, compressed);
+ /* Clear and skip this DMESG record if it has no valid header */
+ if (!header_length) {
+ persistent_ram_free_old(prz);
+ persistent_ram_zap(prz);
+ prz = NULL;
+ }
+ }
- prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
- cxt->max_dump_cnt, id, type,
- PSTORE_TYPE_DMESG, 1);
if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
@@ -204,13 +226,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
if (!prz_ok(prz))
return 0;
- if (!persistent_ram_old(prz))
- return 0;
-
- size = persistent_ram_old_size(prz);
- header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time,
- compressed);
- size -= header_length;
+ size = persistent_ram_old_size(prz) - header_length;
/* ECC correction notice */
ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
@@ -394,18 +410,16 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
}
for (i = 0; i < cxt->max_dump_cnt; i++) {
- size_t sz = cxt->record_size;
-
- cxt->przs[i] = persistent_ram_new(*paddr, sz, 0,
+ cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0,
&cxt->ecc_info,
cxt->memtype);
if (IS_ERR(cxt->przs[i])) {
err = PTR_ERR(cxt->przs[i]);
dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n",
- sz, (unsigned long long)*paddr, err);
+ cxt->record_size, (unsigned long long)*paddr, err);
goto fail_prz;
}
- *paddr += sz;
+ *paddr += cxt->record_size;
}
return 0;
@@ -539,6 +553,9 @@ static int ramoops_probe(struct platform_device *pdev)
mem_address = pdata->mem_address;
record_size = pdata->record_size;
dump_oops = pdata->dump_oops;
+ ramoops_console_size = pdata->console_size;
+ ramoops_pmsg_size = pdata->pmsg_size;
+ ramoops_ftrace_size = pdata->ftrace_size;
pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
cxt->size, (unsigned long long)cxt->phys_addr,
@@ -605,7 +622,7 @@ static void ramoops_register_dummy(void)
dummy_data->mem_size = mem_size;
dummy_data->mem_address = mem_address;
- dummy_data->mem_type = 0;
+ dummy_data->mem_type = mem_type;
dummy_data->record_size = record_size;
dummy_data->console_size = ramoops_console_size;
dummy_data->ftrace_size = ramoops_ftrace_size;
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8d64bb5366bf..e1f37278cf97 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
return page;
}
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
static unsigned last_entry(struct inode *inode, unsigned long page_nr)
{
unsigned long last_byte = inode->i_size;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 44e73923670d..32d2e1a9774c 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -182,7 +182,7 @@ static const char *qnx6_checkroot(struct super_block *s)
static char match_root[2][3] = {".\0\0", "..\0"};
int i, error = 0;
struct qnx6_dir_entry *dir_entry;
- struct inode *root = s->s_root->d_inode;
+ struct inode *root = d_inode(s->s_root);
struct address_space *mapping = root->i_mapping;
struct page *page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 0ccd4ba3a246..20d1f74561cf 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -900,14 +900,17 @@ static inline struct dquot **i_dquot(struct inode *inode)
static int dqinit_needed(struct inode *inode, int type)
{
+ struct dquot * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
return 0;
+
+ dquots = i_dquot(inode);
if (type != -1)
- return !i_dquot(inode)[type];
+ return !dquots[type];
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (!i_dquot(inode)[cnt])
+ if (!dquots[cnt])
return 1;
return 0;
}
@@ -970,12 +973,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
static void remove_inode_dquot_ref(struct inode *inode, int type,
struct list_head *tofree_head)
{
- struct dquot *dquot = i_dquot(inode)[type];
+ struct dquot **dquots = i_dquot(inode);
+ struct dquot *dquot = dquots[type];
- i_dquot(inode)[type] = NULL;
if (!dquot)
return;
+ dquots[type] = NULL;
if (list_empty(&dquot->dq_free)) {
/*
* The inode still has reference to dquot so it can't be in the
@@ -1159,8 +1163,8 @@ static int need_print_warning(struct dquot_warn *warn)
return uid_eq(current_fsuid(), warn->w_dq_id.uid);
case GRPQUOTA:
return in_group_p(warn->w_dq_id.gid);
- case PRJQUOTA: /* Never taken... Just make gcc happy */
- return 0;
+ case PRJQUOTA:
+ return 1;
}
return 0;
}
@@ -1389,16 +1393,21 @@ static int dquot_active(const struct inode *inode)
static void __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot *got[MAXQUOTAS];
+ struct dquot **dquots, *got[MAXQUOTAS];
struct super_block *sb = inode->i_sb;
qsize_t rsv;
if (!dquot_active(inode))
return;
+ dquots = i_dquot(inode);
+
/* First get references to structures we might need. */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
struct kqid qid;
+ kprojid_t projid;
+ int rc;
+
got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
@@ -1407,8 +1416,12 @@ static void __dquot_initialize(struct inode *inode, int type)
* we check it without locking here to avoid unnecessary
* dqget()/dqput() calls.
*/
- if (i_dquot(inode)[cnt])
+ if (dquots[cnt])
+ continue;
+
+ if (!sb_has_quota_active(sb, cnt))
continue;
+
init_needed = 1;
switch (cnt) {
@@ -1418,6 +1431,12 @@ static void __dquot_initialize(struct inode *inode, int type)
case GRPQUOTA:
qid = make_kqid_gid(inode->i_gid);
break;
+ case PRJQUOTA:
+ rc = inode->i_sb->dq_op->get_projid(inode, &projid);
+ if (rc)
+ continue;
+ qid = make_kqid_projid(projid);
+ break;
}
got[cnt] = dqget(sb, qid);
}
@@ -1438,8 +1457,8 @@ static void __dquot_initialize(struct inode *inode, int type)
/* We could race with quotaon or dqget() could have failed */
if (!got[cnt])
continue;
- if (!i_dquot(inode)[cnt]) {
- i_dquot(inode)[cnt] = got[cnt];
+ if (!dquots[cnt]) {
+ dquots[cnt] = got[cnt];
got[cnt] = NULL;
/*
* Make quota reservation system happy if someone
@@ -1447,7 +1466,7 @@ static void __dquot_initialize(struct inode *inode, int type)
*/
rsv = inode_get_rsv_space(inode);
if (unlikely(rsv))
- dquot_resv_space(i_dquot(inode)[cnt], rsv);
+ dquot_resv_space(dquots[cnt], rsv);
}
}
out_err:
@@ -1473,12 +1492,13 @@ EXPORT_SYMBOL(dquot_initialize);
static void __dquot_drop(struct inode *inode)
{
int cnt;
+ struct dquot **dquots = i_dquot(inode);
struct dquot *put[MAXQUOTAS];
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- put[cnt] = i_dquot(inode)[cnt];
- i_dquot(inode)[cnt] = NULL;
+ put[cnt] = dquots[cnt];
+ dquots[cnt] = NULL;
}
spin_unlock(&dq_data_lock);
dqput_all(put);
@@ -1486,6 +1506,7 @@ static void __dquot_drop(struct inode *inode)
void dquot_drop(struct inode *inode)
{
+ struct dquot * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1498,8 +1519,9 @@ void dquot_drop(struct inode *inode)
* must assure that nobody can come after the DQUOT_DROP and
* add quota pointers back anyway.
*/
+ dquots = i_dquot(inode);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (i_dquot(inode)[cnt])
+ if (dquots[cnt])
break;
}
@@ -1600,8 +1622,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots = i_dquot(inode);
int reserve = flags & DQUOT_SPACE_RESERVE;
+ struct dquot **dquots;
if (!dquot_active(inode)) {
inode_incr_space(inode, number, reserve);
@@ -1611,6 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1652,13 +1675,14 @@ int dquot_alloc_inode(struct inode *inode)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots = i_dquot(inode);
+ struct dquot * const *dquots;
if (!dquot_active(inode))
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1690,6 +1714,7 @@ EXPORT_SYMBOL(dquot_alloc_inode);
*/
int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
+ struct dquot **dquots;
int cnt, index;
if (!dquot_active(inode)) {
@@ -1697,18 +1722,18 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
return 0;
}
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (i_dquot(inode)[cnt])
- dquot_claim_reserved_space(i_dquot(inode)[cnt],
- number);
+ if (dquots[cnt])
+ dquot_claim_reserved_space(dquots[cnt], number);
}
/* Update inode bytes */
inode_claim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(i_dquot(inode));
+ mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
return 0;
}
@@ -1719,6 +1744,7 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
*/
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
+ struct dquot **dquots;
int cnt, index;
if (!dquot_active(inode)) {
@@ -1726,18 +1752,18 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
return;
}
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (i_dquot(inode)[cnt])
- dquot_reclaim_reserved_space(i_dquot(inode)[cnt],
- number);
+ if (dquots[cnt])
+ dquot_reclaim_reserved_space(dquots[cnt], number);
}
/* Update inode bytes */
inode_reclaim_rsv_space(inode, number);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(i_dquot(inode));
+ mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
return;
}
@@ -1750,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots = i_dquot(inode);
+ struct dquot **dquots;
int reserve = flags & DQUOT_SPACE_RESERVE, index;
if (!dquot_active(inode)) {
@@ -1758,6 +1784,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
return;
}
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1793,12 +1820,13 @@ void dquot_free_inode(struct inode *inode)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots = i_dquot(inode);
+ struct dquot * const *dquots;
int index;
if (!dquot_active(inode))
return;
+ dquots = i_dquot(inode);
index = srcu_read_lock(&dquot_srcu);
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -2161,7 +2189,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = -EROFS;
goto out_fmt;
}
- if (!sb->s_op->quota_write || !sb->s_op->quota_read) {
+ if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
+ (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
goto out_fmt;
}
@@ -2299,7 +2328,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
if (path->dentry->d_sb != sb)
error = -EXDEV;
else
- error = vfs_load_quota_inode(path->dentry->d_inode, type,
+ error = vfs_load_quota_inode(d_inode(path->dentry), type,
format_id, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
return error;
@@ -2363,20 +2392,20 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- mutex_lock(&sb->s_root->d_inode->i_mutex);
+ mutex_lock(&d_inode(sb->s_root)->i_mutex);
dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
- mutex_unlock(&sb->s_root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(sb->s_root)->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!dentry->d_inode) {
+ if (d_really_is_negative(dentry)) {
error = -ENOENT;
goto out;
}
error = security_quota_on(dentry);
if (!error)
- error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+ error = vfs_load_quota_inode(d_inode(dentry), type, format_id,
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
@@ -2614,55 +2643,73 @@ out:
EXPORT_SYMBOL(dquot_set_dqblk);
/* Generic routine for getting common part of quota file information */
-int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_state(struct super_block *sb, struct qc_state *state)
{
struct mem_dqinfo *mi;
+ struct qc_type_state *tstate;
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int type;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_active(sb, type)) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
+ memset(state, 0, sizeof(*state));
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!sb_has_quota_active(sb, type))
+ continue;
+ tstate = state->s_state + type;
+ mi = sb_dqopt(sb)->info + type;
+ tstate->flags = QCI_ACCT_ENABLED;
+ spin_lock(&dq_data_lock);
+ if (mi->dqi_flags & DQF_SYS_FILE)
+ tstate->flags |= QCI_SYSFILE;
+ if (mi->dqi_flags & DQF_ROOT_SQUASH)
+ tstate->flags |= QCI_ROOT_SQUASH;
+ if (sb_has_quota_limits_enabled(sb, type))
+ tstate->flags |= QCI_LIMITS_ENFORCED;
+ tstate->spc_timelimit = mi->dqi_bgrace;
+ tstate->ino_timelimit = mi->dqi_igrace;
+ tstate->ino = dqopt->files[type]->i_ino;
+ tstate->blocks = dqopt->files[type]->i_blocks;
+ tstate->nextents = 1; /* We don't know... */
+ spin_unlock(&dq_data_lock);
}
- mi = sb_dqopt(sb)->info + type;
- spin_lock(&dq_data_lock);
- ii->dqi_bgrace = mi->dqi_bgrace;
- ii->dqi_igrace = mi->dqi_igrace;
- ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
- ii->dqi_valid = IIF_ALL;
- spin_unlock(&dq_data_lock);
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return 0;
}
-EXPORT_SYMBOL(dquot_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_state);
/* Generic routine for setting common part of quota file information */
-int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
{
struct mem_dqinfo *mi;
int err = 0;
+ if ((ii->i_fieldmask & QC_WARNS_MASK) ||
+ (ii->i_fieldmask & QC_RT_SPC_TIMER))
+ return -EINVAL;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
if (!sb_has_quota_active(sb, type)) {
err = -ESRCH;
goto out;
}
mi = sb_dqopt(sb)->info + type;
- if (ii->dqi_valid & IIF_FLAGS) {
- if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
- (ii->dqi_flags & DQF_ROOT_SQUASH &&
+ if (ii->i_fieldmask & QC_FLAGS) {
+ if ((ii->i_flags & QCI_ROOT_SQUASH &&
mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
err = -EINVAL;
goto out;
}
}
spin_lock(&dq_data_lock);
- if (ii->dqi_valid & IIF_BGRACE)
- mi->dqi_bgrace = ii->dqi_bgrace;
- if (ii->dqi_valid & IIF_IGRACE)
- mi->dqi_igrace = ii->dqi_igrace;
- if (ii->dqi_valid & IIF_FLAGS)
- mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
- (ii->dqi_flags & DQF_SETINFO_MASK);
+ if (ii->i_fieldmask & QC_SPC_TIMER)
+ mi->dqi_bgrace = ii->i_spc_timelimit;
+ if (ii->i_fieldmask & QC_INO_TIMER)
+ mi->dqi_igrace = ii->i_ino_timelimit;
+ if (ii->i_fieldmask & QC_FLAGS) {
+ if (ii->i_flags & QCI_ROOT_SQUASH)
+ mi->dqi_flags |= DQF_ROOT_SQUASH;
+ else
+ mi->dqi_flags &= ~DQF_ROOT_SQUASH;
+ }
spin_unlock(&dq_data_lock);
mark_info_dirty(sb, type);
/* Force write to disk */
@@ -2677,7 +2724,7 @@ const struct quotactl_ops dquot_quotactl_ops = {
.quota_on = dquot_quota_on,
.quota_off = dquot_quota_off,
.quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
+ .get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
@@ -2688,7 +2735,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
.quota_enable = dquot_quota_enable,
.quota_disable = dquot_quota_disable,
.quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
+ .get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index d14a799c7785..86ded7375c21 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -118,13 +118,30 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
{
- struct if_dqinfo info;
+ struct qc_state state;
+ struct qc_type_state *tstate;
+ struct if_dqinfo uinfo;
int ret;
- if (!sb->s_qcop->get_info)
+ /* This checks whether qc_state has enough entries... */
+ BUILD_BUG_ON(MAXQUOTAS > XQM_MAXQUOTAS);
+ if (!sb->s_qcop->get_state)
return -ENOSYS;
- ret = sb->s_qcop->get_info(sb, type, &info);
- if (!ret && copy_to_user(addr, &info, sizeof(info)))
+ ret = sb->s_qcop->get_state(sb, &state);
+ if (ret)
+ return ret;
+ tstate = state.s_state + type;
+ if (!(tstate->flags & QCI_ACCT_ENABLED))
+ return -ESRCH;
+ memset(&uinfo, 0, sizeof(uinfo));
+ uinfo.dqi_bgrace = tstate->spc_timelimit;
+ uinfo.dqi_igrace = tstate->ino_timelimit;
+ if (tstate->flags & QCI_SYSFILE)
+ uinfo.dqi_flags |= DQF_SYS_FILE;
+ if (tstate->flags & QCI_ROOT_SQUASH)
+ uinfo.dqi_flags |= DQF_ROOT_SQUASH;
+ uinfo.dqi_valid = IIF_ALL;
+ if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo)))
return -EFAULT;
return ret;
}
@@ -132,12 +149,31 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr)
static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
{
struct if_dqinfo info;
+ struct qc_info qinfo;
if (copy_from_user(&info, addr, sizeof(info)))
return -EFAULT;
if (!sb->s_qcop->set_info)
return -ENOSYS;
- return sb->s_qcop->set_info(sb, type, &info);
+ if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE))
+ return -EINVAL;
+ memset(&qinfo, 0, sizeof(qinfo));
+ if (info.dqi_valid & IIF_FLAGS) {
+ if (info.dqi_flags & ~DQF_SETINFO_MASK)
+ return -EINVAL;
+ if (info.dqi_flags & DQF_ROOT_SQUASH)
+ qinfo.i_flags |= QCI_ROOT_SQUASH;
+ qinfo.i_fieldmask |= QC_FLAGS;
+ }
+ if (info.dqi_valid & IIF_BGRACE) {
+ qinfo.i_spc_timelimit = info.dqi_bgrace;
+ qinfo.i_fieldmask |= QC_SPC_TIMER;
+ }
+ if (info.dqi_valid & IIF_IGRACE) {
+ qinfo.i_ino_timelimit = info.dqi_igrace;
+ qinfo.i_fieldmask |= QC_INO_TIMER;
+ }
+ return sb->s_qcop->set_info(sb, type, &qinfo);
}
static inline qsize_t qbtos(qsize_t blocks)
@@ -252,25 +288,149 @@ static int quota_disable(struct super_block *sb, void __user *addr)
return sb->s_qcop->quota_disable(sb, flags);
}
+static int quota_state_to_flags(struct qc_state *state)
+{
+ int flags = 0;
+
+ if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED)
+ flags |= FS_QUOTA_UDQ_ACCT;
+ if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED)
+ flags |= FS_QUOTA_UDQ_ENFD;
+ if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)
+ flags |= FS_QUOTA_GDQ_ACCT;
+ if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED)
+ flags |= FS_QUOTA_GDQ_ENFD;
+ if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED)
+ flags |= FS_QUOTA_PDQ_ACCT;
+ if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED)
+ flags |= FS_QUOTA_PDQ_ENFD;
+ return flags;
+}
+
+static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs)
+{
+ int type;
+ struct qc_state state;
+ int ret;
+
+ ret = sb->s_qcop->get_state(sb, &state);
+ if (ret < 0)
+ return ret;
+
+ memset(fqs, 0, sizeof(*fqs));
+ fqs->qs_version = FS_QSTAT_VERSION;
+ fqs->qs_flags = quota_state_to_flags(&state);
+ /* No quota enabled? */
+ if (!fqs->qs_flags)
+ return -ENOSYS;
+ fqs->qs_incoredqs = state.s_incoredqs;
+ /*
+ * GETXSTATE quotactl has space for just one set of time limits so
+ * report them for the first enabled quota type
+ */
+ for (type = 0; type < XQM_MAXQUOTAS; type++)
+ if (state.s_state[type].flags & QCI_ACCT_ENABLED)
+ break;
+ BUG_ON(type == XQM_MAXQUOTAS);
+ fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
+ fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
+ fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
+ fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
+ fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
+ if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+ fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
+ fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
+ fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
+ }
+ if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+ fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
+ fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
+ fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
+ }
+ if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+ /*
+ * Q_XGETQSTAT doesn't have room for both group and project
+ * quotas. So, allow the project quota values to be copied out
+ * only if there is no group quota information available.
+ */
+ if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) {
+ fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino;
+ fqs->qs_gquota.qfs_nblks =
+ state.s_state[PRJQUOTA].blocks;
+ fqs->qs_gquota.qfs_nextents =
+ state.s_state[PRJQUOTA].nextents;
+ }
+ }
+ return 0;
+}
+
static int quota_getxstate(struct super_block *sb, void __user *addr)
{
struct fs_quota_stat fqs;
int ret;
- if (!sb->s_qcop->get_xstate)
+ if (!sb->s_qcop->get_state)
return -ENOSYS;
- ret = sb->s_qcop->get_xstate(sb, &fqs);
+ ret = quota_getstate(sb, &fqs);
if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
return -EFAULT;
return ret;
}
+static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs)
+{
+ int type;
+ struct qc_state state;
+ int ret;
+
+ ret = sb->s_qcop->get_state(sb, &state);
+ if (ret < 0)
+ return ret;
+
+ memset(fqs, 0, sizeof(*fqs));
+ fqs->qs_version = FS_QSTAT_VERSION;
+ fqs->qs_flags = quota_state_to_flags(&state);
+ /* No quota enabled? */
+ if (!fqs->qs_flags)
+ return -ENOSYS;
+ fqs->qs_incoredqs = state.s_incoredqs;
+ /*
+ * GETXSTATV quotactl has space for just one set of time limits so
+ * report them for the first enabled quota type
+ */
+ for (type = 0; type < XQM_MAXQUOTAS; type++)
+ if (state.s_state[type].flags & QCI_ACCT_ENABLED)
+ break;
+ BUG_ON(type == XQM_MAXQUOTAS);
+ fqs->qs_btimelimit = state.s_state[type].spc_timelimit;
+ fqs->qs_itimelimit = state.s_state[type].ino_timelimit;
+ fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit;
+ fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit;
+ fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit;
+ if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) {
+ fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino;
+ fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks;
+ fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents;
+ }
+ if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) {
+ fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino;
+ fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks;
+ fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents;
+ }
+ if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) {
+ fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino;
+ fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks;
+ fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents;
+ }
+ return 0;
+}
+
static int quota_getxstatev(struct super_block *sb, void __user *addr)
{
struct fs_quota_statv fqs;
int ret;
- if (!sb->s_qcop->get_xstatev)
+ if (!sb->s_qcop->get_state)
return -ENOSYS;
memset(&fqs, 0, sizeof(fqs));
@@ -284,7 +444,7 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr)
default:
return -EINVAL;
}
- ret = sb->s_qcop->get_xstatev(sb, &fqs);
+ ret = quota_getstatev(sb, &fqs);
if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
return -EFAULT;
return ret;
@@ -357,6 +517,30 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
dst->d_fieldmask |= QC_RT_SPACE;
}
+static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst,
+ struct fs_disk_quota *src)
+{
+ memset(dst, 0, sizeof(*dst));
+ dst->i_spc_timelimit = src->d_btimer;
+ dst->i_ino_timelimit = src->d_itimer;
+ dst->i_rt_spc_timelimit = src->d_rtbtimer;
+ dst->i_ino_warnlimit = src->d_iwarns;
+ dst->i_spc_warnlimit = src->d_bwarns;
+ dst->i_rt_spc_warnlimit = src->d_rtbwarns;
+ if (src->d_fieldmask & FS_DQ_BWARNS)
+ dst->i_fieldmask |= QC_SPC_WARNS;
+ if (src->d_fieldmask & FS_DQ_IWARNS)
+ dst->i_fieldmask |= QC_INO_WARNS;
+ if (src->d_fieldmask & FS_DQ_RTBWARNS)
+ dst->i_fieldmask |= QC_RT_SPC_WARNS;
+ if (src->d_fieldmask & FS_DQ_BTIMER)
+ dst->i_fieldmask |= QC_SPC_TIMER;
+ if (src->d_fieldmask & FS_DQ_ITIMER)
+ dst->i_fieldmask |= QC_INO_TIMER;
+ if (src->d_fieldmask & FS_DQ_RTBTIMER)
+ dst->i_fieldmask |= QC_RT_SPC_TIMER;
+}
+
static int quota_setxquota(struct super_block *sb, int type, qid_t id,
void __user *addr)
{
@@ -371,6 +555,21 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
qid = make_kqid(current_user_ns(), type, id);
if (!qid_valid(qid))
return -EINVAL;
+ /* Are we actually setting timer / warning limits for all users? */
+ if (from_kqid(&init_user_ns, qid) == 0 &&
+ fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
+ struct qc_info qinfo;
+ int ret;
+
+ if (!sb->s_qcop->set_info)
+ return -EINVAL;
+ copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq);
+ ret = sb->s_qcop->set_info(sb, type, &qinfo);
+ if (ret)
+ return ret;
+ /* These are already done */
+ fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK);
+ }
copy_from_xfs_dqblk(&qdq, &fdq);
return sb->s_qcop->set_dqblk(sb, qid, &qdq);
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index d65877fbe8f4..58efb83dec1c 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -349,6 +349,13 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
struct dquot *dquot)
{
int tmp = QT_TREEOFF;
+
+#ifdef __QUOTA_QT_PARANOIA
+ if (info->dqi_blocks <= QT_TREEOFF) {
+ quota_error(dquot->dq_sb, "Quota tree root isn't allocated!");
+ return -EIO;
+ }
+#endif
return do_insert_tree(info, dquot, &tmp, 0);
}
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 9cb10d7197f7..2aa012a68e90 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,12 +117,16 @@ static int v2_read_file_info(struct super_block *sb, int type)
qinfo = info->dqi_priv;
if (version == 0) {
/* limits are stored as unsigned 32-bit data */
- info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+ info->dqi_max_spc_limit = 0xffffffffLL << QUOTABLOCK_BITS;
info->dqi_max_ino_limit = 0xffffffff;
} else {
- /* used space is stored as unsigned 64-bit value in bytes */
- info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */
- info->dqi_max_ino_limit = 0xffffffffffffffffULL;
+ /*
+ * Used space is stored as unsigned 64-bit value in bytes but
+ * quota core supports only signed 64-bit values so use that
+ * as a limit
+ */
+ info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */
+ info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
}
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h
index f1966b42c2fd..4e95430093d9 100644
--- a/fs/quota/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
@@ -13,12 +13,14 @@
*/
#define V2_INITQMAGICS {\
0xd9c01f11, /* USRQUOTA */\
- 0xd9c01927 /* GRPQUOTA */\
+ 0xd9c01927, /* GRPQUOTA */\
+ 0xd9c03f14, /* PRJQUOTA */\
}
#define V2_INITQVERSIONS {\
1, /* USRQUOTA */\
- 1 /* GRPQUOTA */\
+ 1, /* GRPQUOTA */\
+ 1, /* PRJQUOTA */\
}
/* First generic header */
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 4f56de822d2f..183a212694bf 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -31,9 +31,7 @@
#include "internal.h"
const struct file_operations ramfs_file_operations = {
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = noop_fsync,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f6ab41b39612..ba1323a94924 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -44,9 +44,7 @@ const struct file_operations ramfs_file_operations = {
.mmap_capabilities = ramfs_mmap_capabilities,
.mmap = ramfs_nommu_mmap,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
@@ -165,7 +163,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
*/
static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
unsigned int old_ia_valid = ia->ia_valid;
int ret = 0;
diff --git a/fs/read_write.c b/fs/read_write.c
index 8e1b68786d66..819ef3faf1bb 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
-#include <linux/aio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
@@ -23,13 +22,10 @@
#include <asm/unistd.h>
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
@@ -343,13 +339,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = iov_iter_count(iter);
iter->type |= READ;
ret = file->f_op->read_iter(&kiocb, iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
-
+ BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0)
*ppos = kiocb.ki_pos;
return ret;
@@ -366,13 +359,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = iov_iter_count(iter);
iter->type |= WRITE;
ret = file->f_op->write_iter(&kiocb, iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
-
+ BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0)
*ppos = kiocb.ki_pos;
return ret;
@@ -418,26 +408,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
-ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
- struct iovec iov = { .iov_base = buf, .iov_len = len };
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
-
- ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- *ppos = kiocb.ki_pos;
- return ret;
-}
-
-EXPORT_SYMBOL(do_sync_read);
-
-ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
@@ -446,34 +417,25 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
iov_iter_init(&iter, READ, &iov, 1, len);
ret = filp->f_op->read_iter(&kiocb, &iter);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
-EXPORT_SYMBOL(new_sync_read);
-
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
loff_t *pos)
{
- ssize_t ret;
-
if (file->f_op->read)
- ret = file->f_op->read(file, buf, count, pos);
- else if (file->f_op->aio_read)
- ret = do_sync_read(file, buf, count, pos);
+ return file->f_op->read(file, buf, count, pos);
else if (file->f_op->read_iter)
- ret = new_sync_read(file, buf, count, pos);
+ return new_sync_read(file, buf, count, pos);
else
- ret = -EINVAL;
-
- return ret;
+ return -EINVAL;
}
+EXPORT_SYMBOL(__vfs_read);
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
@@ -502,26 +464,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
EXPORT_SYMBOL(vfs_read);
-ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
-{
- struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
- struct kiocb kiocb;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
-
- ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- *ppos = kiocb.ki_pos;
- return ret;
-}
-
-EXPORT_SYMBOL(do_sync_write);
-
-ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
+static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
@@ -530,17 +473,26 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
iov_iter_init(&iter, WRITE, &iov, 1, len);
ret = filp->f_op->write_iter(&kiocb, &iter);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
- *ppos = kiocb.ki_pos;
+ BUG_ON(ret == -EIOCBQUEUED);
+ if (ret > 0)
+ *ppos = kiocb.ki_pos;
return ret;
}
-EXPORT_SYMBOL(new_sync_write);
+ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
+ loff_t *pos)
+{
+ if (file->f_op->write)
+ return file->f_op->write(file, p, count, pos);
+ else if (file->f_op->write_iter)
+ return new_sync_write(file, p, count, pos);
+ else
+ return -EINVAL;
+}
+EXPORT_SYMBOL(__vfs_write);
ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
{
@@ -556,12 +508,7 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
p = (__force const char __user *)buf;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
- if (file->f_op->write)
- ret = file->f_op->write(file, p, count, pos);
- else if (file->f_op->aio_write)
- ret = do_sync_write(file, p, count, pos);
- else
- ret = new_sync_write(file, p, count, pos);
+ ret = __vfs_write(file, p, count, pos);
set_fs(old_fs);
if (ret > 0) {
fsnotify_modify(file);
@@ -588,12 +535,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
if (ret >= 0) {
count = ret;
file_start_write(file);
- if (file->f_op->write)
- ret = file->f_op->write(file, buf, count, pos);
- else if (file->f_op->aio_write)
- ret = do_sync_write(file, buf, count, pos);
- else
- ret = new_sync_write(file, buf, count, pos);
+ ret = __vfs_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
@@ -710,60 +652,32 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
}
EXPORT_SYMBOL(iov_shorten);
-static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
-{
- struct kiocb kiocb;
- struct iov_iter iter;
- ssize_t ret;
-
- init_sync_kiocb(&kiocb, filp);
- kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
-
- iov_iter_init(&iter, rw, iov, nr_segs, len);
- ret = fn(&kiocb, &iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
- *ppos = kiocb.ki_pos;
- return ret;
-}
-
-static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
+static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
+ loff_t *ppos, iter_fn_t fn)
{
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
+ ret = fn(&kiocb, iter);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
/* Do it by hand, with file-ops */
-static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
+static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
+ loff_t *ppos, io_fn_t fn)
{
- struct iovec *vector = iov;
ssize_t ret = 0;
- while (nr_segs > 0) {
- void __user *base;
- size_t len;
+ while (iov_iter_count(iter)) {
+ struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
- base = vector->iov_base;
- len = vector->iov_len;
- vector++;
- nr_segs--;
-
- nr = fn(filp, base, len, ppos);
+ nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
if (nr < 0) {
if (!ret)
@@ -771,8 +685,9 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
break;
}
ret += nr;
- if (nr != len)
+ if (nr != iovec.iov_len)
break;
+ iov_iter_advance(iter, nr);
}
return ret;
@@ -863,48 +778,42 @@ static ssize_t do_readv_writev(int type, struct file *file,
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
+ struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
- iov_fn_t fnv;
iter_fn_t iter_fn;
- ret = rw_copy_check_uvector(type, uvector, nr_segs,
- ARRAY_SIZE(iovstack), iovstack, &iov);
- if (ret <= 0)
- goto out;
+ ret = import_iovec(type, uvector, nr_segs,
+ ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ return ret;
- tot_len = ret;
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
- fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
- fnv = file->f_op->aio_read;
iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
- fnv = file->f_op->aio_write;
iter_fn = file->f_op->write_iter;
file_start_write(file);
}
if (iter_fn)
- ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
- else if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn);
if (type != READ)
file_end_write(file);
out:
- if (iov != iovstack)
- kfree(iov);
+ kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
@@ -1043,48 +952,42 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
+ struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
- iov_fn_t fnv;
iter_fn_t iter_fn;
- ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
- if (ret <= 0)
- goto out;
+ ret = compat_import_iovec(type, uvector, nr_segs,
+ UIO_FASTIOV, &iov, &iter);
+ if (ret < 0)
+ return ret;
- tot_len = ret;
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
- fnv = NULL;
if (type == READ) {
fn = file->f_op->read;
- fnv = file->f_op->aio_read;
iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
- fnv = file->f_op->aio_write;
iter_fn = file->f_op->write_iter;
file_start_write(file);
}
if (iter_fn)
- ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
- else if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn);
if (type != READ)
file_end_write(file);
out:
- if (iov != iovstack)
- kfree(iov);
+ kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 0a7dc941aaf4..4a024e2ceb9f 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -53,8 +53,8 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
{
struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
- return (privroot->d_inode &&
- deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
+ return (d_really_is_positive(privroot) &&
+ deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid);
}
int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 751dd3f4346b..96a1bcf33db4 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -243,8 +243,6 @@ drop_write_lock:
}
const struct file_operations reiserfs_file_operations = {
- .read = new_sync_read,
- .write = new_sync_write,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = reiserfs_compat_ioctl,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e72401e1f995..f6f2fbad9777 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,7 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
@@ -3278,22 +3278,22 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
* We thank Mingming Cao for helping us understand in great detail what
* to do in this section of the code.
*/
-static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter, loff_t offset)
+static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ ret = blockdev_direct_IO(iocb, inode, iter, offset,
reiserfs_get_blocks_direct_io);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
- if (unlikely((rw & WRITE) && ret < 0)) {
+ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
loff_t end = offset + count;
@@ -3308,7 +3308,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
unsigned int ia_valid;
int error;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index cd11358b10c7..b55a074653d7 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -400,7 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
struct inode *inode = NULL;
struct reiserfs_dir_entry de;
INITIALIZE_PATH(path_to_entry);
- struct inode *dir = child->d_inode;
+ struct inode *dir = d_inode(child);
if (dir->i_nlink == 0) {
return ERR_PTR(-ENOENT);
@@ -917,7 +917,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
}
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
reiserfs_update_inode_transaction(inode);
reiserfs_update_inode_transaction(dir);
@@ -987,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
dquot_initialize(dir);
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
/*
* in this transaction we can be doing at max two balancings and
@@ -1174,7 +1174,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int retval;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct reiserfs_transaction_handle th;
/*
* We need blocks for transaction + update of quotas for
@@ -1311,8 +1311,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
dquot_initialize(old_dir);
dquot_initialize(new_dir);
- old_inode = old_dentry->d_inode;
- new_dentry_inode = new_dentry->d_inode;
+ old_inode = d_inode(old_dentry);
+ new_dentry_inode = d_inode(new_dentry);
/*
* make sure that oldname still exists and points to an object we
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index bb79cddf0a1f..2adcde137c3f 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -910,7 +910,6 @@ do { \
if (!(cond)) \
reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \
__FILE__ ":%i:%s: " format "\n", \
- in_interrupt() ? -1 : task_pid_nr(current), \
__LINE__, __func__ , ##args); \
} while (0)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 71fbbe3e2dab..0e4cf728126f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -21,6 +21,7 @@
#include "xattr.h"
#include <linux/init.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/quotaops.h>
@@ -588,8 +589,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
static struct inode *reiserfs_alloc_inode(struct super_block *sb)
{
struct reiserfs_inode_info *ei;
- ei = (struct reiserfs_inode_info *)
- kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
atomic_set(&ei->openers, 0);
@@ -805,7 +805,7 @@ static const struct quotactl_ops reiserfs_qctl_operations = {
.quota_on = reiserfs_quota_on,
.quota_off = dquot_quota_off,
.quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
+ .get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk,
@@ -1687,7 +1687,7 @@ static __u32 find_hash_out(struct super_block *s)
__u32 hash = DEFAULT_HASH;
__u32 deh_hashval, teahash, r5hash, yurahash;
- inode = s->s_root->d_inode;
+ inode = d_inode(s->s_root);
make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
retval = search_by_entry_key(s, &key, &path, &de);
@@ -2347,7 +2347,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
err = -EXDEV;
goto out;
}
- inode = path->dentry->d_inode;
+ inode = d_inode(path->dentry);
/*
* We must not pack tails for quota files on reiserfs for quota
* IO to work
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4e781e697c90..e87f9b52bf06 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -87,9 +87,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
error = dir->i_op->unlink(dir, dentry);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
if (!error)
d_delete(dentry);
@@ -102,11 +102,11 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
BUG_ON(!mutex_is_locked(&dir->i_mutex));
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
- dentry->d_inode->i_flags |= S_DEAD;
- mutex_unlock(&dentry->d_inode->i_mutex);
+ d_inode(dentry)->i_flags |= S_DEAD;
+ mutex_unlock(&d_inode(dentry)->i_mutex);
if (!error)
d_delete(dentry);
@@ -120,26 +120,26 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
struct dentry *privroot = REISERFS_SB(sb)->priv_root;
struct dentry *xaroot;
- if (!privroot->d_inode)
+ if (d_really_is_negative(privroot))
return ERR_PTR(-ENODATA);
- mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR);
+ mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
xaroot = ERR_PTR(-ENODATA);
- else if (!xaroot->d_inode) {
+ else if (d_really_is_negative(xaroot)) {
int err = -ENODATA;
if (xattr_may_create(flags))
- err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
+ err = xattr_mkdir(d_inode(privroot), xaroot, 0700);
if (err) {
dput(xaroot);
xaroot = ERR_PTR(err);
}
}
- mutex_unlock(&privroot->d_inode->i_mutex);
+ mutex_unlock(&d_inode(privroot)->i_mutex);
return xaroot;
}
@@ -156,21 +156,21 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
le32_to_cpu(INODE_PKEY(inode)->k_objectid),
inode->i_generation);
- mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR);
+ mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
- if (!IS_ERR(xadir) && !xadir->d_inode) {
+ if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
int err = -ENODATA;
if (xattr_may_create(flags))
- err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
+ err = xattr_mkdir(d_inode(xaroot), xadir, 0700);
if (err) {
dput(xadir);
xadir = ERR_PTR(err);
}
}
- mutex_unlock(&xaroot->d_inode->i_mutex);
+ mutex_unlock(&d_inode(xaroot)->i_mutex);
dput(xaroot);
return xadir;
}
@@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
container_of(ctx, struct reiserfs_dentry_buf, ctx);
struct dentry *dentry;
- WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
+ WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
return -ENOSPC;
@@ -207,7 +207,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
dentry = lookup_one_len(name, dbuf->xadir, namelen);
if (IS_ERR(dentry)) {
return PTR_ERR(dentry);
- } else if (!dentry->d_inode) {
+ } else if (d_really_is_negative(dentry)) {
/* A directory entry exists, but no file? */
reiserfs_error(dentry->d_sb, "xattr-20003",
"Corrupted directory: xattr %pd listed but "
@@ -249,16 +249,16 @@ static int reiserfs_for_each_xattr(struct inode *inode,
if (IS_ERR(dir)) {
err = PTR_ERR(dir);
goto out;
- } else if (!dir->d_inode) {
+ } else if (d_really_is_negative(dir)) {
err = 0;
goto out_dir;
}
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
buf.xadir = dir;
while (1) {
- err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+ err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
if (err)
break;
if (!buf.count)
@@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
break;
buf.count = 0;
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
cleanup_dentry_buf(&buf);
@@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
if (!err) {
int jerror;
- mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
+ mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
I_MUTEX_XATTR);
err = action(dir, data);
reiserfs_write_lock(inode->i_sb);
jerror = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&dir->d_parent->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
err = jerror ?: err;
}
}
@@ -319,7 +319,7 @@ out:
static int delete_one_xattr(struct dentry *dentry, void *data)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
/* This is the xattr dir, handle specially. */
if (d_is_dir(dentry))
@@ -384,27 +384,27 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
if (IS_ERR(xadir))
return ERR_CAST(xadir);
- mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
+ mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
xafile = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(xafile)) {
err = PTR_ERR(xafile);
goto out;
}
- if (xafile->d_inode && (flags & XATTR_CREATE))
+ if (d_really_is_positive(xafile) && (flags & XATTR_CREATE))
err = -EEXIST;
- if (!xafile->d_inode) {
+ if (d_really_is_negative(xafile)) {
err = -ENODATA;
if (xattr_may_create(flags))
- err = xattr_create(xadir->d_inode, xafile,
+ err = xattr_create(d_inode(xadir), xafile,
0700|S_IFREG);
}
if (err)
dput(xafile);
out:
- mutex_unlock(&xadir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(xadir)->i_mutex);
dput(xadir);
if (err)
return ERR_PTR(err);
@@ -469,21 +469,21 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
if (IS_ERR(xadir))
return PTR_ERR(xadir);
- mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR);
+ mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
dentry = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_dput;
}
- if (dentry->d_inode) {
- err = xattr_unlink(xadir->d_inode, dentry);
+ if (d_really_is_positive(dentry)) {
+ err = xattr_unlink(d_inode(xadir), dentry);
update_ctime(inode);
}
dput(dentry);
out_dput:
- mutex_unlock(&xadir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(xadir)->i_mutex);
dput(xadir);
return err;
}
@@ -533,7 +533,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
else
chunk = buffer_size - buffer_pos;
- page = reiserfs_get_page(dentry->d_inode, file_pos);
+ page = reiserfs_get_page(d_inode(dentry), file_pos);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto out_unlock;
@@ -573,18 +573,18 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
}
new_size = buffer_size + sizeof(struct reiserfs_xattr_header);
- if (!err && new_size < i_size_read(dentry->d_inode)) {
+ if (!err && new_size < i_size_read(d_inode(dentry))) {
struct iattr newattrs = {
.ia_ctime = current_fs_time(inode->i_sb),
.ia_size = new_size,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR);
- inode_dio_wait(dentry->d_inode);
+ mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+ inode_dio_wait(d_inode(dentry));
err = reiserfs_setattr(dentry, &newattrs);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dentry)->i_mutex);
} else
update_ctime(inode);
out_unlock:
@@ -657,7 +657,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
down_read(&REISERFS_I(inode)->i_xattr_sem);
- isize = i_size_read(dentry->d_inode);
+ isize = i_size_read(d_inode(dentry));
/* Just return the size needed */
if (buffer == NULL) {
@@ -680,7 +680,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
else
chunk = isize - file_pos;
- page = reiserfs_get_page(dentry->d_inode, file_pos);
+ page = reiserfs_get_page(d_inode(dentry), file_pos);
if (IS_ERR(page)) {
err = PTR_ERR(page);
goto out_unlock;
@@ -775,7 +775,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
- if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+ if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
return handler->get(dentry, name, buffer, size, handler->flags);
@@ -784,7 +784,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
/*
* Inode operation setxattr()
*
- * dentry->d_inode->i_mutex down
+ * d_inode(dentry)->i_mutex down
*/
int
reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
@@ -794,7 +794,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
- if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+ if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
return handler->set(dentry, name, value, size, flags, handler->flags);
@@ -803,7 +803,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
/*
* Inode operation removexattr()
*
- * dentry->d_inode->i_mutex down
+ * d_inode(dentry)->i_mutex down
*/
int reiserfs_removexattr(struct dentry *dentry, const char *name)
{
@@ -811,7 +811,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
- if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+ if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
@@ -875,14 +875,14 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
.size = buffer ? size : 0,
};
- if (!dentry->d_inode)
+ if (d_really_is_negative(dentry))
return -EINVAL;
if (!dentry->d_sb->s_xattr ||
- get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+ get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE);
+ dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
if (IS_ERR(dir)) {
err = PTR_ERR(dir);
if (err == -ENODATA)
@@ -890,9 +890,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
goto out;
}
- mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
- err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+ err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
+ mutex_unlock(&d_inode(dir)->i_mutex);
if (!err)
err = buf.pos;
@@ -905,12 +905,12 @@ out:
static int create_privroot(struct dentry *dentry)
{
int err;
- struct inode *inode = dentry->d_parent->d_inode;
+ struct inode *inode = d_inode(dentry->d_parent);
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
err = xattr_mkdir(inode, dentry, 0700);
- if (err || !dentry->d_inode) {
+ if (err || d_really_is_negative(dentry)) {
reiserfs_warning(dentry->d_sb, "jdm-20006",
"xattrs/ACLs enabled and couldn't "
"find/create .reiserfs_priv. "
@@ -918,7 +918,7 @@ static int create_privroot(struct dentry *dentry)
return -EOPNOTSUPP;
}
- dentry->d_inode->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_flags |= S_PRIVATE;
reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
"storage.\n", PRIVROOT_NAME);
@@ -997,17 +997,17 @@ int reiserfs_lookup_privroot(struct super_block *s)
int err = 0;
/* If we don't have the privroot located yet - go find it */
- mutex_lock(&s->s_root->d_inode->i_mutex);
+ mutex_lock(&d_inode(s->s_root)->i_mutex);
dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
strlen(PRIVROOT_NAME));
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (dentry->d_inode)
- dentry->d_inode->i_flags |= S_PRIVATE;
+ if (d_really_is_positive(dentry))
+ d_inode(dentry)->i_flags |= S_PRIVATE;
} else
err = PTR_ERR(dentry);
- mutex_unlock(&s->s_root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(s->s_root)->i_mutex);
return err;
}
@@ -1026,15 +1026,15 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
if (err)
goto error;
- if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
- mutex_lock(&s->s_root->d_inode->i_mutex);
+ if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
+ mutex_lock(&d_inode(s->s_root)->i_mutex);
err = create_privroot(REISERFS_SB(s)->priv_root);
- mutex_unlock(&s->s_root->d_inode->i_mutex);
+ mutex_unlock(&d_inode(s->s_root)->i_mutex);
}
- if (privroot->d_inode) {
+ if (d_really_is_positive(privroot)) {
s->s_xattr = reiserfs_xattr_handlers;
- mutex_lock(&privroot->d_inode->i_mutex);
+ mutex_lock(&d_inode(privroot)->i_mutex);
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
@@ -1045,7 +1045,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
else
err = PTR_ERR(dentry);
}
- mutex_unlock(&privroot->d_inode->i_mutex);
+ mutex_unlock(&d_inode(privroot)->i_mutex);
}
error:
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index f620e9678dd5..15dde6262c00 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -78,7 +78,7 @@ static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode)
if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) {
nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
- if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode)
+ if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root))
nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb);
}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index e7f8939a4cb5..9a3b0616f283 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -15,10 +15,10 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
- if (IS_PRIVATE(dentry->d_inode))
+ if (IS_PRIVATE(d_inode(dentry)))
return -EPERM;
- return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
+ return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
}
static int
@@ -28,10 +28,10 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
- if (IS_PRIVATE(dentry->d_inode))
+ if (IS_PRIVATE(d_inode(dentry)))
return -EPERM;
- return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
+ return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
@@ -39,7 +39,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
{
const size_t len = namelen + 1;
- if (IS_PRIVATE(dentry->d_inode))
+ if (IS_PRIVATE(d_inode(dentry)))
return 0;
if (list && len <= list_len) {
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5eeb0c48ba46..e4f1343714e0 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -14,10 +14,10 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
return -EPERM;
- return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
+ return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
}
static int
@@ -27,10 +27,10 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
return -EPERM;
- return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
+ return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
@@ -38,7 +38,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
{
const size_t len = name_len + 1;
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
+ if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
return 0;
if (list && len <= list_size) {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index e50eab046471..d0b08d3e5689 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -15,7 +15,7 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
return -EINVAL;
if (!reiserfs_xattrs_user(dentry->d_sb))
return -EOPNOTSUPP;
- return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
+ return reiserfs_xattr_get(d_inode(dentry), name, buffer, size);
}
static int
@@ -27,7 +27,7 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
if (!reiserfs_xattrs_user(dentry->d_sb))
return -EOPNOTSUPP;
- return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
+ return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 7da9e2153953..1118a0dc6b45 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -81,7 +81,6 @@ static unsigned romfs_mmap_capabilities(struct file *file)
const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
diff --git a/fs/select.c b/fs/select.c
index f684c750e08a..015547330e88 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
* doesn't imply write barrier and the users expect write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
- * and is paired with set_mb() in poll_schedule_timeout.
+ * and is paired with smp_store_mb() in poll_schedule_timeout.
*/
smp_wmb();
pwq->triggered = 1;
@@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
/*
* Prepare for the next iteration.
*
- * The following set_mb() serves two purposes. First, it's
+ * The following smp_store_mb() serves two purposes. First, it's
* the counterpart rmb of the wmb in pollwake() such that data
* written before wake up is always visible after wake up.
* Second, the full barrier guarantees that triggered clearing
@@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
* this problem doesn't exist for the first iteration as
* add_wait_queue() has full barrier semantics.
*/
- set_mb(pwq->triggered, 0);
+ smp_store_mb(pwq->triggered, 0);
return rc;
}
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be8..ce9e39fd5daf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size)
* ERR_PTR(error). In the end of sequence they return %NULL. ->show()
* returns 0 in case of success and negative number in case of error.
* Returning SEQ_SKIP means "discard this element and move on".
+ * Note: seq_open() will allocate a struct seq_file and store its
+ * pointer in @file->private_data. This pointer should not be modified.
*/
int seq_open(struct file *file, const struct seq_operations *op)
{
- struct seq_file *p = file->private_data;
+ struct seq_file *p;
+
+ WARN_ON(file->private_data);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ file->private_data = p;
- if (!p) {
- p = kmalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
- return -ENOMEM;
- file->private_data = p;
- }
- memset(p, 0, sizeof(*p));
mutex_init(&p->lock);
p->op = op;
#ifdef CONFIG_USER_NS
@@ -487,6 +490,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc)
}
EXPORT_SYMBOL(seq_path);
+/**
+ * seq_file_path - seq_file interface to print a pathname of a file
+ * @m: the seq_file handle
+ * @file: the struct file to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path to the file.
+ */
+int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+{
+ return seq_path(m, &file->f_path, esc);
+}
+EXPORT_SYMBOL(seq_file_path);
+
/*
* Same as seq_path, but relative to supplied root.
*/
@@ -538,6 +555,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
return res;
}
+EXPORT_SYMBOL(seq_dentry);
static void *single_start(struct seq_file *p, loff_t *pos)
{
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7e412ad74836..270221fcef42 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (kinfo->si_code == BUS_MCEERR_AR ||
- kinfo->si_code == BUS_MCEERR_AO)
+ if (kinfo->si_signo == SIGBUS &&
+ (kinfo->si_code == BUS_MCEERR_AR ||
+ kinfo->si_code == BUS_MCEERR_AO))
err |= __put_user((short) kinfo->si_addr_lsb,
&uinfo->ssi_addr_lsb);
#endif
diff --git a/fs/splice.c b/fs/splice.c
index 7968da96bebb..5fc1e50a7f30 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,7 +32,6 @@
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
-#include <linux/aio.h>
#include "internal.h"
/*
@@ -262,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
return ret;
}
+EXPORT_SYMBOL_GPL(splice_to_pipe);
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL);
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
@@ -524,6 +524,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
loff_t isize, left;
int ret;
+ if (IS_DAX(in->f_mapping->host))
+ return default_file_splice_read(in, ppos, pipe, len, flags);
+
isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
return 0;
@@ -1159,7 +1162,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
long ret, bytes;
umode_t i_mode;
size_t len;
- int i, flags;
+ int i, flags, more;
/*
* We require the input being a regular file, as we don't want to
@@ -1202,6 +1205,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* Don't block on output, we have to drain the direct pipe.
*/
sd->flags &= ~SPLICE_F_NONBLOCK;
+ more = sd->flags & SPLICE_F_MORE;
while (len) {
size_t read_len;
@@ -1215,6 +1219,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->total_len = read_len;
/*
+ * If more data is pending, set SPLICE_F_MORE
+ * If this is the last data and SPLICE_F_MORE was not set
+ * initially, clears it.
+ */
+ if (read_len < len)
+ sd->flags |= SPLICE_F_MORE;
+ else if (!more)
+ sd->flags &= ~SPLICE_F_MORE;
+ /*
* NOTE: nonblocking mode only applies to the input. We
* must not do the output in nonblocking mode as then we
* could get stuck data in the internal pipe:
@@ -1534,34 +1547,29 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t count;
pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- ret = rw_copy_check_uvector(READ, uiov, nr_segs,
- ARRAY_SIZE(iovstack), iovstack, &iov);
- if (ret <= 0)
- goto out;
-
- count = ret;
- iov_iter_init(&iter, READ, iov, nr_segs, count);
+ ret = import_iovec(READ, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ return ret;
+ sd.total_len = iov_iter_count(&iter);
sd.len = 0;
- sd.total_len = count;
sd.flags = flags;
sd.u.data = &iter;
sd.pos = 0;
- pipe_lock(pipe);
- ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
- pipe_unlock(pipe);
-
-out:
- if (iov != iovstack)
- kfree(iov);
+ if (sd.total_len) {
+ pipe_lock(pipe);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+ pipe_unlock(pipe);
+ }
+ kfree(iov);
return ret;
}
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 5e1101ff276f..8073b6532cf0 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -110,7 +110,7 @@ static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
static struct dentry *squashfs_get_parent(struct dentry *child)
{
- struct inode *inode = child->d_inode;
+ struct inode *inode = d_inode(child);
unsigned int parent_ino = squashfs_i(inode)->parent;
return squashfs_export_iget(inode->i_sb, parent_ino);
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index 73588e7700ed..d09fcd6fb85d 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -49,6 +49,6 @@ struct squashfs_inode_info {
static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
{
- return list_entry(inode, struct squashfs_inode_info, vfs_inode);
+ return container_of(inode, struct squashfs_inode_info, vfs_inode);
}
#endif
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 92fcde7b4d61..e5e0ddf5b143 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -39,7 +39,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int);
ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
size_t buffer_size)
{
- struct inode *inode = d->d_inode;
+ struct inode *inode = d_inode(d);
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
@@ -229,7 +229,7 @@ static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
if (name[0] == '\0')
return -EINVAL;
- return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
+ return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
buffer, size);
}
@@ -259,7 +259,7 @@ static int squashfs_trusted_get(struct dentry *d, const char *name,
if (name[0] == '\0')
return -EINVAL;
- return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
+ return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
buffer, size);
}
@@ -286,7 +286,7 @@ static int squashfs_security_get(struct dentry *d, const char *name,
if (name[0] == '\0')
return -EINVAL;
- return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
+ return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
buffer, size);
}
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..cccc1aab9a8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -51,7 +51,7 @@ EXPORT_SYMBOL(generic_fillattr);
*/
int vfs_getattr_nosec(struct path *path, struct kstat *stat)
{
- struct inode *inode = path->dentry->d_inode;
+ struct inode *inode = d_backing_inode(path->dentry);
if (inode->i_op->getattr)
return inode->i_op->getattr(path->mnt, path->dentry, stat);
@@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat)
{
int retval;
- retval = security_inode_getattr(path->mnt, path->dentry);
+ retval = security_inode_getattr(path);
if (retval)
return retval;
return vfs_getattr_nosec(path, stat);
@@ -326,7 +326,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
retry:
error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
if (!error) {
- struct inode *inode = path.dentry->d_inode;
+ struct inode *inode = d_backing_inode(path.dentry);
error = empty ? -ENOENT : -EINVAL;
if (inode->i_op->readlink) {
diff --git a/fs/super.c b/fs/super.c
index 2b7dc90ccdbb..b61372354f2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
- s->cleancache_poolid = -1;
+ s->cleancache_poolid = CLEANCACHE_NO_POOL;
s->s_shrink.seeks = DEFAULT_SEEKS;
s->s_shrink.scan_objects = super_cache_scan;
@@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p)
else if (error)
return -EAGAIN;
- if (dev == (1 << MINORBITS)) {
+ if (dev >= (1 << MINORBITS)) {
spin_lock(&unnamed_dev_lock);
ida_remove(&unnamed_dev_ida, dev);
if (unnamed_dev_start > dev)
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0b45ff42f374..94374e435025 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
}
+
+/**
+ * sysfs_create_mount_point - create an always empty directory
+ * @parent_kobj: kobject that will contain this always empty directory
+ * @name: The name of the always empty directory to add
+ */
+int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
+{
+ struct kernfs_node *kn, *parent = parent_kobj->sd;
+
+ kn = kernfs_create_empty_dir(parent, name);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, name);
+ return PTR_ERR(kn);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_mount_point);
+
+/**
+ * sysfs_remove_mount_point - remove an always empty directory.
+ * @parent_kobj: kobject that will contain this always empty directory
+ * @name: The name of the always empty directory to remove
+ *
+ */
+void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
+{
+ struct kernfs_node *parent = parent_kobj->sd;
+
+ kernfs_remove_by_name_ns(parent, name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 7c2867b44141..6c95628ea377 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -90,7 +90,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
return 0;
if (size) {
- if (pos > size)
+ if (pos >= size)
return 0;
if (pos + count > size)
count = size - pos;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2554d8835b48..39a019936768 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -41,7 +41,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
if (grp->attrs) {
for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- umode_t mode = 0;
+ umode_t mode = (*attr)->mode;
/*
* In update mode, we're changing the permissions or
@@ -55,9 +55,14 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
if (!mode)
continue;
}
+
+ WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ "Attribute %s: Invalid permissions 0%o\n",
+ (*attr)->name, mode);
+
+ mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent, *attr, false,
- (*attr)->mode | mode,
- NULL);
+ mode, NULL);
if (unlikely(error))
break;
}
@@ -130,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update,
* This function creates a group for the first time. It will explicitly
* warn and error if any of the attribute files being created already exist.
*
- * Returns 0 on success or error.
+ * Returns 0 on success or error code on failure.
*/
int sysfs_create_group(struct kobject *kobj,
const struct attribute_group *grp)
@@ -150,7 +155,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_group);
* It will explicitly warn and error if any of the attribute files being
* created already exist.
*
- * Returns 0 on success or error code from sysfs_create_group on error.
+ * Returns 0 on success or error code from sysfs_create_group on failure.
*/
int sysfs_create_groups(struct kobject *kobj,
const struct attribute_group **groups)
@@ -188,7 +193,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups);
* The primary use for this function is to call it after making a change
* that affects group visibility.
*
- * Returns 0 on success or error.
+ * Returns 0 on success or error code on failure.
*/
int sysfs_update_group(struct kobject *kobj,
const struct attribute_group *grp)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8a49486bf30c..1c6ac6fcee9f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
bool new_sb;
if (!(flags & MS_KERNMOUNT)) {
- if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
- return ERR_PTR(-EPERM);
-
if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
return ERR_PTR(-EPERM);
}
@@ -58,7 +55,7 @@ static struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.mount = sysfs_mount,
.kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
index 3591f9d7a48a..7a75e70a4b61 100644
--- a/fs/sysv/Makefile
+++ b/fs/sysv/Makefile
@@ -5,4 +5,4 @@
obj-$(CONFIG_SYSV_FS) += sysv.o
sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
- namei.o super.o symlink.o
+ namei.o super.o
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d42291d08215..63c1bcb224ee 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page)
page_cache_release(page);
}
-static inline unsigned long dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
{
struct address_space *mapping = page->mapping;
@@ -132,7 +127,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
{
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- struct inode * dir = dentry->d_parent->d_inode;
+ struct inode * dir = d_inode(dentry->d_parent);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
struct page *page = NULL;
@@ -176,7 +171,7 @@ found:
int sysv_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const char * name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct page *page = NULL;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index b00811c75b24..82ddc09061e2 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,9 +21,7 @@
*/
const struct file_operations sysv_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
@@ -32,7 +30,7 @@ const struct file_operations sysv_file_operations = {
static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, attr);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 88956309cc86..590ad9206e3f 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -166,8 +166,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
inode->i_op = &sysv_symlink_inode_operations;
inode->i_mapping->a_ops = &sysv_aops;
} else {
- inode->i_op = &sysv_fast_symlink_inode_operations;
- nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+ inode->i_op = &simple_symlink_inode_operations;
+ inode->i_link = (char *)SYSV_I(inode)->i_data;
+ nd_terminate_link(inode->i_link, inode->i_size,
sizeof(SYSV_I(inode)->i_data) - 1);
}
} else
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 66bc316927e8..2fde40acf024 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -443,7 +443,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct super_block *s = dentry->d_sb;
- generic_fillattr(dentry->d_inode, stat);
+ generic_fillattr(d_inode(dentry), stat);
stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
stat->blksize = s->s_blocksize;
return 0;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 731b2bbcaab3..11e83ed0b4bf 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -118,7 +118,7 @@ out_fail:
static int sysv_link(struct dentry * old_dentry, struct inode * dir,
struct dentry * dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
@@ -166,7 +166,7 @@ out_dir:
static int sysv_unlink(struct inode * dir, struct dentry * dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
struct page * page;
struct sysv_dir_entry * de;
int err = -ENOENT;
@@ -187,7 +187,7 @@ out:
static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int err = -ENOTEMPTY;
if (sysv_empty_dir(inode)) {
@@ -208,8 +208,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct inode * new_dir, struct dentry * new_dentry)
{
- struct inode * old_inode = old_dentry->d_inode;
- struct inode * new_inode = new_dentry->d_inode;
+ struct inode * old_inode = d_inode(old_dentry);
+ struct inode * new_inode = d_inode(new_dentry);
struct page * dir_page = NULL;
struct sysv_dir_entry * dir_de = NULL;
struct page * old_page;
diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c
deleted file mode 100644
index 00d2f8a43e4e..000000000000
--- a/fs/sysv/symlink.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * linux/fs/sysv/symlink.c
- *
- * Handling of System V filesystem fast symlinks extensions.
- * Aug 2001, Christoph Hellwig (hch@infradead.org)
- */
-
-#include "sysv.h"
-#include <linux/namei.h>
-
-static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data);
- return NULL;
-}
-
-const struct inode_operations sysv_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = sysv_follow_link,
-};
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 69d488986cce..6c212288adcb 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -73,7 +73,7 @@ struct sysv_inode_info {
static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
{
- return list_entry(inode, struct sysv_inode_info, vfs_inode);
+ return container_of(inode, struct sysv_inode_info, vfs_inode);
}
static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
@@ -161,7 +161,6 @@ extern ino_t sysv_inode_by_name(struct dentry *);
extern const struct inode_operations sysv_file_inode_operations;
extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct inode_operations sysv_fast_symlink_inode_operations;
extern const struct file_operations sysv_file_operations;
extern const struct file_operations sysv_dir_operations;
extern const struct address_space_operations sysv_aops;
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
new file mode 100644
index 000000000000..82fa35b656c4
--- /dev/null
+++ b/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
+tracefs-objs := inode.o
+
+obj-$(CONFIG_TRACING) += tracefs.o
+
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
new file mode 100644
index 000000000000..cbc8d5d2755a
--- /dev/null
+++ b/fs/tracefs/inode.c
@@ -0,0 +1,643 @@
+/*
+ * inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/tracefs.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#define TRACEFS_DEFAULT_MODE 0700
+
+static struct vfsmount *tracefs_mount;
+static int tracefs_mount_count;
+static bool tracefs_registered;
+
+static ssize_t default_read_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return 0;
+}
+
+static ssize_t default_write_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static const struct file_operations tracefs_file_operations = {
+ .read = default_read_file,
+ .write = default_write_file,
+ .open = simple_open,
+ .llseek = noop_llseek,
+};
+
+static struct tracefs_dir_ops {
+ int (*mkdir)(const char *name);
+ int (*rmdir)(const char *name);
+} tracefs_ops;
+
+static char *get_dname(struct dentry *dentry)
+{
+ const char *dname;
+ char *name;
+ int len = dentry->d_name.len;
+
+ dname = dentry->d_name.name;
+ name = kmalloc(len + 1, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ memcpy(name, dname, len);
+ name[len] = 0;
+ return name;
+}
+
+static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The mkdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * mkdir routine to handle races.
+ */
+ mutex_unlock(&inode->i_mutex);
+ ret = tracefs_ops.mkdir(name);
+ mutex_lock(&inode->i_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The rmdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * rmdir routine to handle races.
+ * This time we need to unlock not only the parent (inode) but
+ * also the directory that is being deleted.
+ */
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ ret = tracefs_ops.rmdir(name);
+
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock(&dentry->d_inode->i_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = tracefs_syscall_mkdir,
+ .rmdir = tracefs_syscall_rmdir,
+};
+
+static struct inode *tracefs_get_inode(struct super_block *sb)
+{
+ struct inode *inode = new_inode(sb);
+ if (inode) {
+ inode->i_ino = get_next_ino();
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ }
+ return inode;
+}
+
+struct tracefs_mount_opts {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+};
+
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_mode,
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+struct tracefs_fs_info {
+ struct tracefs_mount_opts mount_opts;
+};
+
+static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+{
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+ int token;
+ kuid_t uid;
+ kgid_t gid;
+ char *p;
+
+ opts->mode = TRACEFS_DEFAULT_MODE;
+
+ while ((p = strsep(&data, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_uid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ uid = make_kuid(current_user_ns(), option);
+ if (!uid_valid(uid))
+ return -EINVAL;
+ opts->uid = uid;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return -EINVAL;
+ gid = make_kgid(current_user_ns(), option);
+ if (!gid_valid(gid))
+ return -EINVAL;
+ opts->gid = gid;
+ break;
+ case Opt_mode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->mode = option & S_IALLUGO;
+ break;
+ /*
+ * We might like to report bad mount options here;
+ * but traditionally tracefs has ignored all mount options
+ */
+ }
+ }
+
+ return 0;
+}
+
+static int tracefs_apply_options(struct super_block *sb)
+{
+ struct tracefs_fs_info *fsi = sb->s_fs_info;
+ struct inode *inode = sb->s_root->d_inode;
+ struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+
+ inode->i_uid = opts->uid;
+ inode->i_gid = opts->gid;
+
+ return 0;
+}
+
+static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct tracefs_fs_info *fsi = sb->s_fs_info;
+
+ sync_filesystem(sb);
+ err = tracefs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ tracefs_apply_options(sb);
+
+fail:
+ return err;
+}
+
+static int tracefs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
+ struct tracefs_mount_opts *opts = &fsi->mount_opts;
+
+ if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
+ if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
+ if (opts->mode != TRACEFS_DEFAULT_MODE)
+ seq_printf(m, ",mode=%o", opts->mode);
+
+ return 0;
+}
+
+static const struct super_operations tracefs_super_operations = {
+ .statfs = simple_statfs,
+ .remount_fs = tracefs_remount,
+ .show_options = tracefs_show_options,
+};
+
+static int trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr trace_files[] = {{""}};
+ struct tracefs_fs_info *fsi;
+ int err;
+
+ save_mount_options(sb, data);
+
+ fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
+ sb->s_fs_info = fsi;
+ if (!fsi) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = tracefs_parse_options(data, &fsi->mount_opts);
+ if (err)
+ goto fail;
+
+ err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
+ if (err)
+ goto fail;
+
+ sb->s_op = &tracefs_super_operations;
+
+ tracefs_apply_options(sb);
+
+ return 0;
+
+fail:
+ kfree(fsi);
+ sb->s_fs_info = NULL;
+ return err;
+}
+
+static struct dentry *trace_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_single(fs_type, flags, data, trace_fill_super);
+}
+
+static struct file_system_type trace_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "tracefs",
+ .mount = trace_mount,
+ .kill_sb = kill_litter_super,
+};
+MODULE_ALIAS_FS("tracefs");
+
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+ int error;
+
+ pr_debug("tracefs: creating file '%s'\n",name);
+
+ error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+ &tracefs_mount_count);
+ if (error)
+ return ERR_PTR(error);
+
+ /* If the parent is not specified, we create it in the root.
+ * We need the root dentry to do this, which is in the super
+ * block. A pointer to that is in the struct vfsmount that we
+ * have around.
+ */
+ if (!parent)
+ parent = tracefs_mount->mnt_root;
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(dentry) && dentry->d_inode) {
+ dput(dentry);
+ dentry = ERR_PTR(-EEXIST);
+ }
+ if (IS_ERR(dentry))
+ mutex_unlock(&parent->d_inode->i_mutex);
+ return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ dput(dentry);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ return dentry;
+}
+
+/**
+ * tracefs_create_file - create a file in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the tracefs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ * this file.
+ *
+ * This is the basic "create a file" function for tracefs. It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the tracefs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_fop = fops ? fops : &tracefs_file_operations;
+ inode->i_private = data;
+ d_instantiate(dentry, inode);
+ fsnotify_create(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
+}
+
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+ const struct inode_operations *ops)
+{
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = ops;
+ inode->i_fop = &simple_dir_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
+}
+
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ * create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * directory will be created in the root of the tracefs filesystem.
+ *
+ * This function creates a directory in tracefs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed. If an error occurs, %NULL will be returned.
+ *
+ * If tracing is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+ return __create_dir(name, parent, &simple_dir_inode_operations);
+}
+
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed.
+ *
+ * The instances directory is special as it allows for mkdir and rmdir to
+ * to be done by userspace. When a mkdir or rmdir is performed, the inode
+ * locks are released and the methhods passed in (@mkdir and @rmdir) are
+ * called without locks and with the name of the directory being created
+ * within the instances directory.
+ *
+ * Returns the dentry of the instances directory.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+ int (*mkdir)(const char *name),
+ int (*rmdir)(const char *name))
+{
+ struct dentry *dentry;
+
+ /* Only allow one instance of the instances directory. */
+ if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+ return NULL;
+
+ dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+ if (!dentry)
+ return NULL;
+
+ tracefs_ops.mkdir = mkdir;
+ tracefs_ops.rmdir = rmdir;
+
+ return dentry;
+}
+
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+ int ret = 0;
+
+ if (simple_positive(dentry)) {
+ if (dentry->d_inode) {
+ dget(dentry);
+ switch (dentry->d_inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ ret = simple_rmdir(parent->d_inode, dentry);
+ break;
+ default:
+ simple_unlink(parent->d_inode, dentry);
+ break;
+ }
+ if (!ret)
+ d_delete(dentry);
+ dput(dentry);
+ }
+ }
+ return ret;
+}
+
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to a the dentry of the file or directory to be
+ * removed.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+ struct dentry *parent;
+ int ret;
+
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ mutex_lock(&parent->d_inode->i_mutex);
+ ret = __tracefs_remove(dentry, parent);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (!ret)
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+ struct dentry *child, *parent;
+
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ parent = dentry;
+ down:
+ mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+ /*
+ * The parent->d_subdirs is protected by the d_lock. Outside that
+ * lock, the child can be unlinked and set to be freed which can
+ * use the d_u.d_child as the rcu head and corrupt this list.
+ */
+ spin_lock(&parent->d_lock);
+ list_for_each_entry(child, &parent->d_subdirs, d_child) {
+ if (!simple_positive(child))
+ continue;
+
+ /* perhaps simple_empty(child) makes more sense */
+ if (!list_empty(&child->d_subdirs)) {
+ spin_unlock(&parent->d_lock);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ parent = child;
+ goto down;
+ }
+
+ spin_unlock(&parent->d_lock);
+
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+ /*
+ * The parent->d_lock protects agaist child from unlinking
+ * from d_subdirs. When releasing the parent->d_lock we can
+ * no longer trust that the next pointer is valid.
+ * Restart the loop. We'll skip this one with the
+ * simple_positive() check.
+ */
+ goto loop;
+ }
+ spin_unlock(&parent->d_lock);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ if (child != dentry)
+ /* go up */
+ goto loop;
+
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ mutex_unlock(&parent->d_inode->i_mutex);
+}
+
+/**
+ * tracefs_initialized - Tells whether tracefs has been registered
+ */
+bool tracefs_initialized(void)
+{
+ return tracefs_registered;
+}
+
+static int __init tracefs_init(void)
+{
+ int retval;
+
+ retval = sysfs_create_mount_point(kernel_kobj, "tracing");
+ if (retval)
+ return -EINVAL;
+
+ retval = register_filesystem(&trace_fs_type);
+ if (!retval)
+ tracefs_registered = true;
+
+ return retval;
+}
+core_initcall(tracefs_init);
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eb997e9c4ab0..11a11b32a2a9 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -509,7 +509,7 @@ again:
c->bi.nospace_rp = 1;
smp_wmb();
} else
- ubifs_err("cannot budget space, error %d", err);
+ ubifs_err(c, "cannot budget space, error %d", err);
return err;
}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 26b69b2d4a45..63f56619991d 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -225,7 +225,7 @@ out_cancel:
out_up:
up_write(&c->commit_sem);
out:
- ubifs_err("commit failed, error %d", err);
+ ubifs_err(c, "commit failed, error %d", err);
spin_lock(&c->cs_lock);
c->cmt_state = COMMIT_BROKEN;
wake_up(&c->cmt_wq);
@@ -289,7 +289,7 @@ int ubifs_bg_thread(void *info)
int err;
struct ubifs_info *c = info;
- ubifs_msg("background thread \"%s\" started, PID %d",
+ ubifs_msg(c, "background thread \"%s\" started, PID %d",
c->bgt_name, current->pid);
set_freezable();
@@ -324,7 +324,7 @@ int ubifs_bg_thread(void *info)
cond_resched();
}
- ubifs_msg("background thread \"%s\" stops", c->bgt_name);
+ ubifs_msg(c, "background thread \"%s\" stops", c->bgt_name);
return 0;
}
@@ -712,13 +712,13 @@ out:
return 0;
out_dump:
- ubifs_err("dumping index node (iip=%d)", i->iip);
+ ubifs_err(c, "dumping index node (iip=%d)", i->iip);
ubifs_dump_node(c, idx);
list_del(&i->list);
kfree(i);
if (!list_empty(&list)) {
i = list_entry(list.prev, struct idx_node, list);
- ubifs_err("dumping parent index node");
+ ubifs_err(c, "dumping parent index node");
ubifs_dump_node(c, &i->idx);
}
out_free:
@@ -727,7 +727,7 @@ out_free:
list_del(&i->list);
kfree(i);
}
- ubifs_err("failed, error %d", err);
+ ubifs_err(c, "failed, error %d", err);
if (err > 0)
err = -EINVAL;
return err;
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 2bfa0953335d..565cb56d7225 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -92,8 +92,8 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
* Note, if the input buffer was not compressed, it is copied to the output
* buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
*/
-void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
- int *compr_type)
+void ubifs_compress(const struct ubifs_info *c, const void *in_buf,
+ int in_len, void *out_buf, int *out_len, int *compr_type)
{
int err;
struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
@@ -112,9 +112,9 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
if (compr->comp_mutex)
mutex_unlock(compr->comp_mutex);
if (unlikely(err)) {
- ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
+ ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
in_len, compr->name, err);
- goto no_compr;
+ goto no_compr;
}
/*
@@ -144,21 +144,21 @@ no_compr:
* The length of the uncompressed data is returned in @out_len. This functions
* returns %0 on success or a negative error code on failure.
*/
-int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
- int *out_len, int compr_type)
+int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
+ int in_len, void *out_buf, int *out_len, int compr_type)
{
int err;
struct ubifs_compressor *compr;
if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
- ubifs_err("invalid compression type %d", compr_type);
+ ubifs_err(c, "invalid compression type %d", compr_type);
return -EINVAL;
}
compr = ubifs_compressors[compr_type];
if (unlikely(!compr->capi_name)) {
- ubifs_err("%s compression is not compiled in", compr->name);
+ ubifs_err(c, "%s compression is not compiled in", compr->name);
return -EINVAL;
}
@@ -175,7 +175,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
if (compr->decomp_mutex)
mutex_unlock(compr->decomp_mutex);
if (err)
- ubifs_err("cannot decompress %d bytes, compressor %s, error %d",
+ ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d",
in_len, compr->name, err);
return err;
@@ -193,8 +193,8 @@ static int __init compr_init(struct ubifs_compressor *compr)
if (compr->capi_name) {
compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
if (IS_ERR(compr->cc)) {
- ubifs_err("cannot initialize compressor %s, error %ld",
- compr->name, PTR_ERR(compr->cc));
+ pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld",
+ current->pid, compr->name, PTR_ERR(compr->cc));
return PTR_ERR(compr->cc);
}
}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4cfb3e82c56f..4c46a9865fa7 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -746,7 +746,7 @@ void ubifs_dump_lprops(struct ubifs_info *c)
for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
err = ubifs_read_one_lp(c, lnum, &lp);
if (err) {
- ubifs_err("cannot read lprops for LEB %d", lnum);
+ ubifs_err(c, "cannot read lprops for LEB %d", lnum);
continue;
}
@@ -819,13 +819,13 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
if (!buf) {
- ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
+ ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum);
return;
}
sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
- ubifs_err("scan error %d", (int)PTR_ERR(sleb));
+ ubifs_err(c, "scan error %d", (int)PTR_ERR(sleb));
goto out;
}
@@ -1032,7 +1032,7 @@ int dbg_check_space_info(struct ubifs_info *c)
spin_unlock(&c->space_lock);
if (free != d->saved_free) {
- ubifs_err("free space changed from %lld to %lld",
+ ubifs_err(c, "free space changed from %lld to %lld",
d->saved_free, free);
goto out;
}
@@ -1040,15 +1040,15 @@ int dbg_check_space_info(struct ubifs_info *c)
return 0;
out:
- ubifs_msg("saved lprops statistics dump");
+ ubifs_msg(c, "saved lprops statistics dump");
ubifs_dump_lstats(&d->saved_lst);
- ubifs_msg("saved budgeting info dump");
+ ubifs_msg(c, "saved budgeting info dump");
ubifs_dump_budg(c, &d->saved_bi);
- ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
- ubifs_msg("current lprops statistics dump");
+ ubifs_msg(c, "saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
+ ubifs_msg(c, "current lprops statistics dump");
ubifs_get_lp_stats(c, &lst);
ubifs_dump_lstats(&lst);
- ubifs_msg("current budgeting info dump");
+ ubifs_msg(c, "current budgeting info dump");
ubifs_dump_budg(c, &c->bi);
dump_stack();
return -EINVAL;
@@ -1077,9 +1077,9 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode)
mutex_lock(&ui->ui_mutex);
spin_lock(&ui->ui_lock);
if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
- ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean",
+ ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean",
ui->ui_size, ui->synced_i_size);
- ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+ ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
inode->i_mode, i_size_read(inode));
dump_stack();
err = -EINVAL;
@@ -1140,7 +1140,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
kfree(pdent);
if (i_size_read(dir) != size) {
- ubifs_err("directory inode %lu has size %llu, but calculated size is %llu",
+ ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu",
dir->i_ino, (unsigned long long)i_size_read(dir),
(unsigned long long)size);
ubifs_dump_inode(c, dir);
@@ -1148,7 +1148,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir)
return -EINVAL;
}
if (dir->i_nlink != nlink) {
- ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u",
+ ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u",
dir->i_ino, dir->i_nlink, nlink);
ubifs_dump_inode(c, dir);
dump_stack();
@@ -1207,10 +1207,10 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
err = 1;
key_read(c, &dent1->key, &key);
if (keys_cmp(c, &zbr1->key, &key)) {
- ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
+ ubifs_err(c, "1st entry at %d:%d has key %s", zbr1->lnum,
zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
DBG_KEY_BUF_LEN));
- ubifs_err("but it should have key %s according to tnc",
+ ubifs_err(c, "but it should have key %s according to tnc",
dbg_snprintf_key(c, &zbr1->key, key_buf,
DBG_KEY_BUF_LEN));
ubifs_dump_node(c, dent1);
@@ -1219,10 +1219,10 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
key_read(c, &dent2->key, &key);
if (keys_cmp(c, &zbr2->key, &key)) {
- ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+ ubifs_err(c, "2nd entry at %d:%d has key %s", zbr1->lnum,
zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
DBG_KEY_BUF_LEN));
- ubifs_err("but it should have key %s according to tnc",
+ ubifs_err(c, "but it should have key %s according to tnc",
dbg_snprintf_key(c, &zbr2->key, key_buf,
DBG_KEY_BUF_LEN));
ubifs_dump_node(c, dent2);
@@ -1238,14 +1238,14 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
goto out_free;
}
if (cmp == 0 && nlen1 == nlen2)
- ubifs_err("2 xent/dent nodes with the same name");
+ ubifs_err(c, "2 xent/dent nodes with the same name");
else
- ubifs_err("bad order of colliding key %s",
+ ubifs_err(c, "bad order of colliding key %s",
dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
- ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+ ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs);
ubifs_dump_node(c, dent1);
- ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+ ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs);
ubifs_dump_node(c, dent2);
out_free:
@@ -1447,11 +1447,11 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
return 0;
out:
- ubifs_err("failed, error %d", err);
- ubifs_msg("dump of the znode");
+ ubifs_err(c, "failed, error %d", err);
+ ubifs_msg(c, "dump of the znode");
ubifs_dump_znode(c, znode);
if (zp) {
- ubifs_msg("dump of the parent znode");
+ ubifs_msg(c, "dump of the parent znode");
ubifs_dump_znode(c, zp);
}
dump_stack();
@@ -1518,9 +1518,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
if (err < 0)
return err;
if (err) {
- ubifs_msg("first znode");
+ ubifs_msg(c, "first znode");
ubifs_dump_znode(c, prev);
- ubifs_msg("second znode");
+ ubifs_msg(c, "second znode");
ubifs_dump_znode(c, znode);
return -EINVAL;
}
@@ -1529,13 +1529,13 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
if (extra) {
if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
- ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
+ ubifs_err(c, "incorrect clean_zn_cnt %ld, calculated %ld",
atomic_long_read(&c->clean_zn_cnt),
clean_cnt);
return -EINVAL;
}
if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
- ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
+ ubifs_err(c, "incorrect dirty_zn_cnt %ld, calculated %ld",
atomic_long_read(&c->dirty_zn_cnt),
dirty_cnt);
return -EINVAL;
@@ -1608,7 +1608,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
if (znode_cb) {
err = znode_cb(c, znode, priv);
if (err) {
- ubifs_err("znode checking function returned error %d",
+ ubifs_err(c, "znode checking function returned error %d",
err);
ubifs_dump_znode(c, znode);
goto out_dump;
@@ -1619,7 +1619,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
zbr = &znode->zbranch[idx];
err = leaf_cb(c, zbr, priv);
if (err) {
- ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d",
+ ubifs_err(c, "leaf checking function returned error %d, for leaf at LEB %d:%d",
err, zbr->lnum, zbr->offs);
goto out_dump;
}
@@ -1675,7 +1675,7 @@ out_dump:
zbr = &znode->parent->zbranch[znode->iip];
else
zbr = &c->zroot;
- ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
+ ubifs_msg(c, "dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
ubifs_dump_znode(c, znode);
out_unlock:
mutex_unlock(&c->tnc_mutex);
@@ -1722,12 +1722,12 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
err = dbg_walk_index(c, NULL, add_size, &calc);
if (err) {
- ubifs_err("error %d while walking the index", err);
+ ubifs_err(c, "error %d while walking the index", err);
return err;
}
if (calc != idx_size) {
- ubifs_err("index size check failed: calculated size is %lld, should be %lld",
+ ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld",
calc, idx_size);
dump_stack();
return -EINVAL;
@@ -1814,7 +1814,7 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
}
if (inum > c->highest_inum) {
- ubifs_err("too high inode number, max. is %lu",
+ ubifs_err(c, "too high inode number, max. is %lu",
(unsigned long)c->highest_inum);
return ERR_PTR(-EINVAL);
}
@@ -1921,17 +1921,17 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
ino_key_init(c, &key, inum);
err = ubifs_lookup_level0(c, &key, &znode, &n);
if (!err) {
- ubifs_err("inode %lu not found in index", (unsigned long)inum);
+ ubifs_err(c, "inode %lu not found in index", (unsigned long)inum);
return ERR_PTR(-ENOENT);
} else if (err < 0) {
- ubifs_err("error %d while looking up inode %lu",
+ ubifs_err(c, "error %d while looking up inode %lu",
err, (unsigned long)inum);
return ERR_PTR(err);
}
zbr = &znode->zbranch[n];
if (zbr->len < UBIFS_INO_NODE_SZ) {
- ubifs_err("bad node %lu node length %d",
+ ubifs_err(c, "bad node %lu node length %d",
(unsigned long)inum, zbr->len);
return ERR_PTR(-EINVAL);
}
@@ -1942,7 +1942,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
err = ubifs_tnc_read_node(c, zbr, ino);
if (err) {
- ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+ ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d",
zbr->lnum, zbr->offs, err);
kfree(ino);
return ERR_PTR(err);
@@ -1951,7 +1951,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c,
fscki = add_inode(c, fsckd, ino);
kfree(ino);
if (IS_ERR(fscki)) {
- ubifs_err("error %ld while adding inode %lu node",
+ ubifs_err(c, "error %ld while adding inode %lu node",
PTR_ERR(fscki), (unsigned long)inum);
return fscki;
}
@@ -1985,7 +1985,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
struct fsck_inode *fscki;
if (zbr->len < UBIFS_CH_SZ) {
- ubifs_err("bad leaf length %d (LEB %d:%d)",
+ ubifs_err(c, "bad leaf length %d (LEB %d:%d)",
zbr->len, zbr->lnum, zbr->offs);
return -EINVAL;
}
@@ -1996,7 +1996,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
err = ubifs_tnc_read_node(c, zbr, node);
if (err) {
- ubifs_err("cannot read leaf node at LEB %d:%d, error %d",
+ ubifs_err(c, "cannot read leaf node at LEB %d:%d, error %d",
zbr->lnum, zbr->offs, err);
goto out_free;
}
@@ -2006,7 +2006,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
fscki = add_inode(c, priv, node);
if (IS_ERR(fscki)) {
err = PTR_ERR(fscki);
- ubifs_err("error %d while adding inode node", err);
+ ubifs_err(c, "error %d while adding inode node", err);
goto out_dump;
}
goto out;
@@ -2014,7 +2014,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
type != UBIFS_DATA_KEY) {
- ubifs_err("unexpected node type %d at LEB %d:%d",
+ ubifs_err(c, "unexpected node type %d at LEB %d:%d",
type, zbr->lnum, zbr->offs);
err = -EINVAL;
goto out_free;
@@ -2022,7 +2022,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
ch = node;
if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
- ubifs_err("too high sequence number, max. is %llu",
+ ubifs_err(c, "too high sequence number, max. is %llu",
c->max_sqnum);
err = -EINVAL;
goto out_dump;
@@ -2042,7 +2042,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
fscki = read_add_inode(c, priv, inum);
if (IS_ERR(fscki)) {
err = PTR_ERR(fscki);
- ubifs_err("error %d while processing data node and trying to find inode node %lu",
+ ubifs_err(c, "error %d while processing data node and trying to find inode node %lu",
err, (unsigned long)inum);
goto out_dump;
}
@@ -2052,7 +2052,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
blk_offs <<= UBIFS_BLOCK_SHIFT;
blk_offs += le32_to_cpu(dn->size);
if (blk_offs > fscki->size) {
- ubifs_err("data node at LEB %d:%d is not within inode size %lld",
+ ubifs_err(c, "data node at LEB %d:%d is not within inode size %lld",
zbr->lnum, zbr->offs, fscki->size);
err = -EINVAL;
goto out_dump;
@@ -2076,7 +2076,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
fscki = read_add_inode(c, priv, inum);
if (IS_ERR(fscki)) {
err = PTR_ERR(fscki);
- ubifs_err("error %d while processing entry node and trying to find inode node %lu",
+ ubifs_err(c, "error %d while processing entry node and trying to find inode node %lu",
err, (unsigned long)inum);
goto out_dump;
}
@@ -2088,7 +2088,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
fscki1 = read_add_inode(c, priv, inum);
if (IS_ERR(fscki1)) {
err = PTR_ERR(fscki1);
- ubifs_err("error %d while processing entry node and trying to find parent inode node %lu",
+ ubifs_err(c, "error %d while processing entry node and trying to find parent inode node %lu",
err, (unsigned long)inum);
goto out_dump;
}
@@ -2111,7 +2111,7 @@ out:
return 0;
out_dump:
- ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
+ ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
ubifs_dump_node(c, node);
out_free:
kfree(node);
@@ -2162,52 +2162,52 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
*/
if (fscki->inum != UBIFS_ROOT_INO &&
fscki->references != 1) {
- ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1",
+ ubifs_err(c, "directory inode %lu has %d direntries which refer it, but should be 1",
(unsigned long)fscki->inum,
fscki->references);
goto out_dump;
}
if (fscki->inum == UBIFS_ROOT_INO &&
fscki->references != 0) {
- ubifs_err("root inode %lu has non-zero (%d) direntries which refer it",
+ ubifs_err(c, "root inode %lu has non-zero (%d) direntries which refer it",
(unsigned long)fscki->inum,
fscki->references);
goto out_dump;
}
if (fscki->calc_sz != fscki->size) {
- ubifs_err("directory inode %lu size is %lld, but calculated size is %lld",
+ ubifs_err(c, "directory inode %lu size is %lld, but calculated size is %lld",
(unsigned long)fscki->inum,
fscki->size, fscki->calc_sz);
goto out_dump;
}
if (fscki->calc_cnt != fscki->nlink) {
- ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d",
+ ubifs_err(c, "directory inode %lu nlink is %d, but calculated nlink is %d",
(unsigned long)fscki->inum,
fscki->nlink, fscki->calc_cnt);
goto out_dump;
}
} else {
if (fscki->references != fscki->nlink) {
- ubifs_err("inode %lu nlink is %d, but calculated nlink is %d",
+ ubifs_err(c, "inode %lu nlink is %d, but calculated nlink is %d",
(unsigned long)fscki->inum,
fscki->nlink, fscki->references);
goto out_dump;
}
}
if (fscki->xattr_sz != fscki->calc_xsz) {
- ubifs_err("inode %lu has xattr size %u, but calculated size is %lld",
+ ubifs_err(c, "inode %lu has xattr size %u, but calculated size is %lld",
(unsigned long)fscki->inum, fscki->xattr_sz,
fscki->calc_xsz);
goto out_dump;
}
if (fscki->xattr_cnt != fscki->calc_xcnt) {
- ubifs_err("inode %lu has %u xattrs, but calculated count is %lld",
+ ubifs_err(c, "inode %lu has %u xattrs, but calculated count is %lld",
(unsigned long)fscki->inum,
fscki->xattr_cnt, fscki->calc_xcnt);
goto out_dump;
}
if (fscki->xattr_nms != fscki->calc_xnms) {
- ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld",
+ ubifs_err(c, "inode %lu has xattr names' size %u, but calculated names' size is %lld",
(unsigned long)fscki->inum, fscki->xattr_nms,
fscki->calc_xnms);
goto out_dump;
@@ -2221,11 +2221,11 @@ out_dump:
ino_key_init(c, &key, fscki->inum);
err = ubifs_lookup_level0(c, &key, &znode, &n);
if (!err) {
- ubifs_err("inode %lu not found in index",
+ ubifs_err(c, "inode %lu not found in index",
(unsigned long)fscki->inum);
return -ENOENT;
} else if (err < 0) {
- ubifs_err("error %d while looking up inode %lu",
+ ubifs_err(c, "error %d while looking up inode %lu",
err, (unsigned long)fscki->inum);
return err;
}
@@ -2237,13 +2237,13 @@ out_dump:
err = ubifs_tnc_read_node(c, zbr, ino);
if (err) {
- ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+ ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d",
zbr->lnum, zbr->offs, err);
kfree(ino);
return err;
}
- ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
+ ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d",
(unsigned long)fscki->inum, zbr->lnum, zbr->offs);
ubifs_dump_node(c, ino);
kfree(ino);
@@ -2284,7 +2284,7 @@ int dbg_check_filesystem(struct ubifs_info *c)
return 0;
out_free:
- ubifs_err("file-system check failed with error %d", err);
+ ubifs_err(c, "file-system check failed with error %d", err);
dump_stack();
free_inodes(&fsckd);
return err;
@@ -2315,12 +2315,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
sb = container_of(cur->next, struct ubifs_scan_node, list);
if (sa->type != UBIFS_DATA_NODE) {
- ubifs_err("bad node type %d", sa->type);
+ ubifs_err(c, "bad node type %d", sa->type);
ubifs_dump_node(c, sa->node);
return -EINVAL;
}
if (sb->type != UBIFS_DATA_NODE) {
- ubifs_err("bad node type %d", sb->type);
+ ubifs_err(c, "bad node type %d", sb->type);
ubifs_dump_node(c, sb->node);
return -EINVAL;
}
@@ -2331,7 +2331,7 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
if (inuma < inumb)
continue;
if (inuma > inumb) {
- ubifs_err("larger inum %lu goes before inum %lu",
+ ubifs_err(c, "larger inum %lu goes before inum %lu",
(unsigned long)inuma, (unsigned long)inumb);
goto error_dump;
}
@@ -2340,11 +2340,11 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head)
blkb = key_block(c, &sb->key);
if (blka > blkb) {
- ubifs_err("larger block %u goes before %u", blka, blkb);
+ ubifs_err(c, "larger block %u goes before %u", blka, blkb);
goto error_dump;
}
if (blka == blkb) {
- ubifs_err("two data nodes for the same block");
+ ubifs_err(c, "two data nodes for the same block");
goto error_dump;
}
}
@@ -2383,19 +2383,19 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
sa->type != UBIFS_XENT_NODE) {
- ubifs_err("bad node type %d", sa->type);
+ ubifs_err(c, "bad node type %d", sa->type);
ubifs_dump_node(c, sa->node);
return -EINVAL;
}
if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
sa->type != UBIFS_XENT_NODE) {
- ubifs_err("bad node type %d", sb->type);
+ ubifs_err(c, "bad node type %d", sb->type);
ubifs_dump_node(c, sb->node);
return -EINVAL;
}
if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
- ubifs_err("non-inode node goes before inode node");
+ ubifs_err(c, "non-inode node goes before inode node");
goto error_dump;
}
@@ -2405,7 +2405,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) {
/* Inode nodes are sorted in descending size order */
if (sa->len < sb->len) {
- ubifs_err("smaller inode node goes first");
+ ubifs_err(c, "smaller inode node goes first");
goto error_dump;
}
continue;
@@ -2421,7 +2421,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
if (inuma < inumb)
continue;
if (inuma > inumb) {
- ubifs_err("larger inum %lu goes before inum %lu",
+ ubifs_err(c, "larger inum %lu goes before inum %lu",
(unsigned long)inuma, (unsigned long)inumb);
goto error_dump;
}
@@ -2430,7 +2430,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
hashb = key_block(c, &sb->key);
if (hasha > hashb) {
- ubifs_err("larger hash %u goes before %u",
+ ubifs_err(c, "larger hash %u goes before %u",
hasha, hashb);
goto error_dump;
}
@@ -2439,9 +2439,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
return 0;
error_dump:
- ubifs_msg("dumping first node");
+ ubifs_msg(c, "dumping first node");
ubifs_dump_node(c, sa->node);
- ubifs_msg("dumping second node");
+ ubifs_msg(c, "dumping second node");
ubifs_dump_node(c, sb->node);
return -EINVAL;
return 0;
@@ -2470,13 +2470,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
delay = prandom_u32() % 60000;
d->pc_timeout = jiffies;
d->pc_timeout += msecs_to_jiffies(delay);
- ubifs_warn("failing after %lums", delay);
+ ubifs_warn(c, "failing after %lums", delay);
} else {
d->pc_delay = 2;
delay = prandom_u32() % 10000;
/* Fail within 10000 operations */
d->pc_cnt_max = delay;
- ubifs_warn("failing after %lu calls", delay);
+ ubifs_warn(c, "failing after %lu calls", delay);
}
}
@@ -2494,55 +2494,55 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
return 0;
if (chance(19, 20))
return 0;
- ubifs_warn("failing in super block LEB %d", lnum);
+ ubifs_warn(c, "failing in super block LEB %d", lnum);
} else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
if (chance(19, 20))
return 0;
- ubifs_warn("failing in master LEB %d", lnum);
+ ubifs_warn(c, "failing in master LEB %d", lnum);
} else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
if (write && chance(99, 100))
return 0;
if (chance(399, 400))
return 0;
- ubifs_warn("failing in log LEB %d", lnum);
+ ubifs_warn(c, "failing in log LEB %d", lnum);
} else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
if (write && chance(7, 8))
return 0;
if (chance(19, 20))
return 0;
- ubifs_warn("failing in LPT LEB %d", lnum);
+ ubifs_warn(c, "failing in LPT LEB %d", lnum);
} else if (lnum >= c->orph_first && lnum <= c->orph_last) {
if (write && chance(1, 2))
return 0;
if (chance(9, 10))
return 0;
- ubifs_warn("failing in orphan LEB %d", lnum);
+ ubifs_warn(c, "failing in orphan LEB %d", lnum);
} else if (lnum == c->ihead_lnum) {
if (chance(99, 100))
return 0;
- ubifs_warn("failing in index head LEB %d", lnum);
+ ubifs_warn(c, "failing in index head LEB %d", lnum);
} else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
if (chance(9, 10))
return 0;
- ubifs_warn("failing in GC head LEB %d", lnum);
+ ubifs_warn(c, "failing in GC head LEB %d", lnum);
} else if (write && !RB_EMPTY_ROOT(&c->buds) &&
!ubifs_search_bud(c, lnum)) {
if (chance(19, 20))
return 0;
- ubifs_warn("failing in non-bud LEB %d", lnum);
+ ubifs_warn(c, "failing in non-bud LEB %d", lnum);
} else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
c->cmt_state == COMMIT_RUNNING_REQUIRED) {
if (chance(999, 1000))
return 0;
- ubifs_warn("failing in bud LEB %d commit running", lnum);
+ ubifs_warn(c, "failing in bud LEB %d commit running", lnum);
} else {
if (chance(9999, 10000))
return 0;
- ubifs_warn("failing in bud LEB %d commit not running", lnum);
+ ubifs_warn(c, "failing in bud LEB %d commit not running", lnum);
}
d->pc_happened = 1;
- ubifs_warn("========== Power cut emulated ==========");
+ ubifs_warn(c, "========== Power cut emulated ==========");
dump_stack();
return 1;
}
@@ -2557,7 +2557,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
/* Corruption span max to end of write unit */
to = min(len, ALIGN(from + 1, c->max_write_size));
- ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
+ ubifs_warn(c, "filled bytes %u-%u with %s", from, to - 1,
ffs ? "0xFFs" : "random data");
if (ffs)
@@ -2579,7 +2579,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
failing = power_cut_emulated(c, lnum, 1);
if (failing) {
len = corrupt_data(c, buf, len);
- ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
+ ubifs_warn(c, "actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
len, lnum, offs);
}
err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
@@ -2909,7 +2909,7 @@ out_remove:
debugfs_remove_recursive(d->dfs_dir);
out:
err = dent ? PTR_ERR(dent) : -ENODEV;
- ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n",
+ ubifs_err(c, "cannot create \"%s\" debugfs file or directory, error %d\n",
fname, err);
return err;
}
@@ -3063,8 +3063,8 @@ out_remove:
debugfs_remove_recursive(dfs_rootdir);
out:
err = dent ? PTR_ERR(dent) : -ENODEV;
- ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n",
- fname, err);
+ pr_err("UBIFS error (pid %d): cannot create \"%s\" debugfs file or directory, error %d\n",
+ current->pid, fname, err);
return err;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 0fa6c803992e..5c27c66c224a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -146,12 +146,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
if (c->highest_inum >= INUM_WARN_WATERMARK) {
if (c->highest_inum >= INUM_WATERMARK) {
spin_unlock(&c->cnt_lock);
- ubifs_err("out of inode numbers");
+ ubifs_err(c, "out of inode numbers");
make_bad_inode(inode);
iput(inode);
return ERR_PTR(-EINVAL);
}
- ubifs_warn("running out of inode numbers (current %lu, max %d)",
+ ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
(unsigned long)c->highest_inum, INUM_WATERMARK);
}
@@ -222,7 +222,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
* checking.
*/
err = PTR_ERR(inode);
- ubifs_err("dead directory entry '%pd', error %d",
+ ubifs_err(c, "dead directory entry '%pd', error %d",
dentry, err);
ubifs_ro_mode(c, err);
goto out;
@@ -272,7 +272,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
- goto out_cancel;
+ goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
@@ -292,11 +292,12 @@ out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
make_bad_inode(inode);
iput(inode);
out_budg:
ubifs_release_budget(c, &req);
- ubifs_err("cannot create regular file, error %d", err);
+ ubifs_err(c, "cannot create regular file, error %d", err);
return err;
}
@@ -449,7 +450,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
out:
if (err != -ENOENT) {
- ubifs_err("cannot find next direntry, error %d", err);
+ ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
@@ -498,7 +499,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
@@ -553,7 +554,7 @@ out_cancel:
static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
int err, budgeted = 1;
@@ -645,7 +646,7 @@ static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
int err, budgeted = 1;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -661,7 +662,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino, dir->i_ino);
ubifs_assert(mutex_is_locked(&dir->i_mutex));
ubifs_assert(mutex_is_locked(&inode->i_mutex));
- err = check_dir_empty(c, dentry->d_inode);
+ err = check_dir_empty(c, d_inode(dentry));
if (err)
return err;
@@ -732,7 +733,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
- goto out_cancel;
+ goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
insert_inode_hash(inode);
@@ -743,7 +744,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dir->i_mtime = dir->i_ctime = inode->i_ctime;
err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
if (err) {
- ubifs_err("cannot create directory, error %d", err);
+ ubifs_err(c, "cannot create directory, error %d", err);
goto out_cancel;
}
mutex_unlock(&dir_ui->ui_mutex);
@@ -757,6 +758,7 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
drop_nlink(dir);
mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
make_bad_inode(inode);
iput(inode);
out_budg:
@@ -816,7 +818,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
- goto out_cancel;
+ goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
@@ -836,6 +838,7 @@ out_cancel:
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
make_bad_inode(inode);
iput(inode);
out_budg:
@@ -886,6 +889,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
memcpy(ui->data, symname, len);
((char *)ui->data)[len] = '\0';
+ inode->i_link = ui->data;
/*
* The terminating zero byte is not written to the flash media and it
* is put just to make later in-memory string processing simpler. Thus,
@@ -896,7 +900,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
err = ubifs_init_security(dir, inode, &dentry->d_name);
if (err)
- goto out_cancel;
+ goto out_inode;
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
@@ -967,8 +971,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
struct ubifs_info *c = old_dir->i_sb->s_fs_info;
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
int err, release, sync = 0, move = (new_dir != old_dir);
int is_dir = S_ISDIR(old_inode->i_mode);
@@ -1133,7 +1137,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
loff_t size;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e627c0acf626..a3dfe2ae79f2 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,9 +50,7 @@
*/
#include "ubifs.h"
-#include <linux/aio.h>
#include <linux/mount.h>
-#include <linux/namei.h>
#include <linux/slab.h>
static int read_block(struct inode *inode, void *addr, unsigned int block,
@@ -80,7 +78,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
out_len = UBIFS_BLOCK_SIZE;
- err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+ err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
if (err || len != out_len)
goto dump;
@@ -96,7 +94,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
return 0;
dump:
- ubifs_err("bad data node (block %u, inode %lu)",
+ ubifs_err(c, "bad data node (block %u, inode %lu)",
block, inode->i_ino);
ubifs_dump_node(c, dn);
return -EINVAL;
@@ -161,13 +159,14 @@ static int do_readpage(struct page *page)
addr += UBIFS_BLOCK_SIZE;
}
if (err) {
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
if (err == -ENOENT) {
/* Not found, so it must be a hole */
SetPageChecked(page);
dbg_gen("hole");
goto out_free;
}
- ubifs_err("cannot read page %lu of inode %lu, error %d",
+ ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
page->index, inode->i_ino, err);
goto error;
}
@@ -650,7 +649,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
out_len = UBIFS_BLOCK_SIZE;
- err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+ err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
le16_to_cpu(dn->compr_type));
if (err || len != out_len)
goto out_err;
@@ -698,7 +697,7 @@ out_err:
SetPageError(page);
flush_dcache_page(page);
kunmap(page);
- ubifs_err("bad data node (block %u, inode %lu)",
+ ubifs_err(c, "bad data node (block %u, inode %lu)",
page_block, inode->i_ino);
return -EINVAL;
}
@@ -802,7 +801,7 @@ out_free:
return ret;
out_warn:
- ubifs_warn("ignoring error %d and skipping bulk-read", err);
+ ubifs_warn(c, "ignoring error %d and skipping bulk-read", err);
goto out_free;
out_bu_off:
@@ -930,7 +929,7 @@ static int do_writepage(struct page *page, int len)
}
if (err) {
SetPageError(page);
- ubifs_err("cannot write page %lu of inode %lu, error %d",
+ ubifs_err(c, "cannot write page %lu of inode %lu, error %d",
page->index, inode->i_ino, err);
ubifs_ro_mode(c, err);
}
@@ -1257,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct ubifs_info *c = inode->i_sb->s_fs_info;
dbg_gen("ino %lu, mode %#x, ia_valid %#x",
@@ -1300,14 +1299,6 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
ClearPageChecked(page);
}
-static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ubifs_inode *ui = ubifs_inode(dentry->d_inode);
-
- nd_set_link(nd, ui->data);
- return NULL;
-}
-
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
@@ -1485,7 +1476,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
err = ubifs_budget_space(c, &req);
if (unlikely(err)) {
if (err == -ENOSPC)
- ubifs_warn("out of space for mmapped file (inode number %lu)",
+ ubifs_warn(c, "out of space for mmapped file (inode number %lu)",
inode->i_ino);
return VM_FAULT_SIGBUS;
}
@@ -1570,7 +1561,7 @@ const struct inode_operations ubifs_file_inode_operations = {
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ubifs_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.setxattr = ubifs_setxattr,
@@ -1581,8 +1572,6 @@ const struct inode_operations ubifs_symlink_inode_operations = {
const struct file_operations ubifs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = ubifs_write_iter,
.mmap = ubifs_file_mmap,
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index fb08b0c514b6..97be41215332 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -85,7 +85,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
c->ro_error = 1;
c->no_chk_data_crc = 0;
c->vfs_sb->s_flags |= MS_RDONLY;
- ubifs_warn("switched to read-only mode, error %d", err);
+ ubifs_warn(c, "switched to read-only mode, error %d", err);
dump_stack();
}
}
@@ -107,7 +107,7 @@ int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
* @even_ebadmsg is true.
*/
if (err && (err != -EBADMSG || even_ebadmsg)) {
- ubifs_err("reading %d bytes from LEB %d:%d failed, error %d",
+ ubifs_err(c, "reading %d bytes from LEB %d:%d failed, error %d",
len, lnum, offs, err);
dump_stack();
}
@@ -127,7 +127,7 @@ int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
else
err = dbg_leb_write(c, lnum, buf, offs, len);
if (err) {
- ubifs_err("writing %d bytes to LEB %d:%d failed, error %d",
+ ubifs_err(c, "writing %d bytes to LEB %d:%d failed, error %d",
len, lnum, offs, err);
ubifs_ro_mode(c, err);
dump_stack();
@@ -147,7 +147,7 @@ int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len)
else
err = dbg_leb_change(c, lnum, buf, len);
if (err) {
- ubifs_err("changing %d bytes in LEB %d failed, error %d",
+ ubifs_err(c, "changing %d bytes in LEB %d failed, error %d",
len, lnum, err);
ubifs_ro_mode(c, err);
dump_stack();
@@ -167,7 +167,7 @@ int ubifs_leb_unmap(struct ubifs_info *c, int lnum)
else
err = dbg_leb_unmap(c, lnum);
if (err) {
- ubifs_err("unmap LEB %d failed, error %d", lnum, err);
+ ubifs_err(c, "unmap LEB %d failed, error %d", lnum, err);
ubifs_ro_mode(c, err);
dump_stack();
}
@@ -186,7 +186,7 @@ int ubifs_leb_map(struct ubifs_info *c, int lnum)
else
err = dbg_leb_map(c, lnum);
if (err) {
- ubifs_err("mapping LEB %d failed, error %d", lnum, err);
+ ubifs_err(c, "mapping LEB %d failed, error %d", lnum, err);
ubifs_ro_mode(c, err);
dump_stack();
}
@@ -199,7 +199,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
err = ubi_is_mapped(c->ubi, lnum);
if (err < 0) {
- ubifs_err("ubi_is_mapped failed for LEB %d, error %d",
+ ubifs_err(c, "ubi_is_mapped failed for LEB %d, error %d",
lnum, err);
dump_stack();
}
@@ -247,7 +247,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
magic = le32_to_cpu(ch->magic);
if (magic != UBIFS_NODE_MAGIC) {
if (!quiet)
- ubifs_err("bad magic %#08x, expected %#08x",
+ ubifs_err(c, "bad magic %#08x, expected %#08x",
magic, UBIFS_NODE_MAGIC);
err = -EUCLEAN;
goto out;
@@ -256,7 +256,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
type = ch->node_type;
if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
if (!quiet)
- ubifs_err("bad node type %d", type);
+ ubifs_err(c, "bad node type %d", type);
goto out;
}
@@ -279,7 +279,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
node_crc = le32_to_cpu(ch->crc);
if (crc != node_crc) {
if (!quiet)
- ubifs_err("bad CRC: calculated %#08x, read %#08x",
+ ubifs_err(c, "bad CRC: calculated %#08x, read %#08x",
crc, node_crc);
err = -EUCLEAN;
goto out;
@@ -289,10 +289,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
out_len:
if (!quiet)
- ubifs_err("bad node length %d", node_len);
+ ubifs_err(c, "bad node length %d", node_len);
out:
if (!quiet) {
- ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
ubifs_dump_node(c, buf);
dump_stack();
}
@@ -355,11 +355,11 @@ static unsigned long long next_sqnum(struct ubifs_info *c)
if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) {
if (sqnum >= SQNUM_WATERMARK) {
- ubifs_err("sequence number overflow %llu, end of life",
+ ubifs_err(c, "sequence number overflow %llu, end of life",
sqnum);
ubifs_ro_mode(c, -EINVAL);
}
- ubifs_warn("running out of sequence numbers, end of life soon");
+ ubifs_warn(c, "running out of sequence numbers, end of life soon");
}
return sqnum;
@@ -636,7 +636,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c)
err = ubifs_wbuf_sync_nolock(wbuf);
mutex_unlock(&wbuf->io_mutex);
if (err) {
- ubifs_err("cannot sync write-buffer, error %d", err);
+ ubifs_err(c, "cannot sync write-buffer, error %d", err);
ubifs_ro_mode(c, err);
goto out_timers;
}
@@ -833,7 +833,7 @@ exit:
return 0;
out:
- ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+ ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d",
len, wbuf->lnum, wbuf->offs, err);
ubifs_dump_node(c, buf);
dump_stack();
@@ -932,27 +932,27 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
}
if (type != ch->node_type) {
- ubifs_err("bad node type (%d but expected %d)",
+ ubifs_err(c, "bad node type (%d but expected %d)",
ch->node_type, type);
goto out;
}
err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
if (err) {
- ubifs_err("expected node type %d", type);
+ ubifs_err(c, "expected node type %d", type);
return err;
}
rlen = le32_to_cpu(ch->len);
if (rlen != len) {
- ubifs_err("bad node length %d, expected %d", rlen, len);
+ ubifs_err(c, "bad node length %d, expected %d", rlen, len);
goto out;
}
return 0;
out:
- ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
ubifs_dump_node(c, buf);
dump_stack();
return -EINVAL;
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 648b143606cc..3c7b29de0ca7 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -138,7 +138,7 @@ static int setflags(struct inode *inode, int flags)
return err;
out_unlock:
- ubifs_err("can't modify inode %lu attributes", inode->i_ino);
+ ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino);
mutex_unlock(&ui->ui_mutex);
ubifs_release_budget(c, &req);
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f6ac3f29323c..0b9da5b6e0f9 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -363,11 +363,11 @@ again:
* This should not happen unless the journal size limitations
* are too tough.
*/
- ubifs_err("stuck in space allocation");
+ ubifs_err(c, "stuck in space allocation");
err = -ENOSPC;
goto out;
} else if (cmt_retries > 32)
- ubifs_warn("too many space allocation re-tries (%d)",
+ ubifs_warn(c, "too many space allocation re-tries (%d)",
cmt_retries);
dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
@@ -380,7 +380,7 @@ again:
goto again;
out:
- ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
+ ubifs_err(c, "cannot reserve %d bytes in jhead %d, error %d",
len, jhead, err);
if (err == -ENOSPC) {
/* This are some budgeting problems, print useful information */
@@ -731,7 +731,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
compr_type = ui->compr_type;
out_len = dlen - UBIFS_DATA_NODE_SZ;
- ubifs_compress(buf, len, &data->data, &out_len, &compr_type);
+ ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type);
ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
dlen = UBIFS_DATA_NODE_SZ + out_len;
@@ -930,8 +930,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
int err, dlen1, dlen2, ilen, lnum, offs, len;
- const struct inode *old_inode = old_dentry->d_inode;
- const struct inode *new_inode = new_dentry->d_inode;
+ const struct inode *old_inode = d_inode(old_dentry);
+ const struct inode *new_inode = d_inode(new_dentry);
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
@@ -1100,7 +1100,8 @@ out_free:
* This function is used when an inode is truncated and the last data node of
* the inode has to be re-compressed and re-written.
*/
-static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
+static int recomp_data_node(const struct ubifs_info *c,
+ struct ubifs_data_node *dn, int *new_len)
{
void *buf;
int err, len, compr_type, out_len;
@@ -1112,11 +1113,11 @@ static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
compr_type = le16_to_cpu(dn->compr_type);
- err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type);
+ err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type);
if (err)
goto out;
- ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type);
+ ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type);
ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
dn->compr_type = cpu_to_le16(compr_type);
dn->size = cpu_to_le32(*new_len);
@@ -1191,7 +1192,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
int compr_type = le16_to_cpu(dn->compr_type);
if (compr_type != UBIFS_COMPR_NONE) {
- err = recomp_data_node(dn, &dlen);
+ err = recomp_data_node(c, dn, &dlen);
if (err)
goto out_free;
} else {
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index c14628fbeee2..8c795e6392b1 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -696,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
destroy_done_tree(&done_tree);
vfree(buf);
if (write_lnum == c->lhead_lnum) {
- ubifs_err("log is too full");
+ ubifs_err(c, "log is too full");
return -EINVAL;
}
/* Unmap remaining LEBs */
@@ -743,7 +743,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c)
bud_bytes += c->leb_size - bud->start;
if (c->bud_bytes != bud_bytes) {
- ubifs_err("bad bud_bytes %lld, calculated %lld",
+ ubifs_err(c, "bad bud_bytes %lld, calculated %lld",
c->bud_bytes, bud_bytes);
err = -EINVAL;
}
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 46190a7c42a6..a0011aa3a779 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -682,7 +682,7 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
out:
ubifs_release_lprops(c);
if (err)
- ubifs_err("cannot change properties of LEB %d, error %d",
+ ubifs_err(c, "cannot change properties of LEB %d, error %d",
lnum, err);
return err;
}
@@ -721,7 +721,7 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
out:
ubifs_release_lprops(c);
if (err)
- ubifs_err("cannot update properties of LEB %d, error %d",
+ ubifs_err(c, "cannot update properties of LEB %d, error %d",
lnum, err);
return err;
}
@@ -746,7 +746,7 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
lpp = ubifs_lpt_lookup(c, lnum);
if (IS_ERR(lpp)) {
err = PTR_ERR(lpp);
- ubifs_err("cannot read properties of LEB %d, error %d",
+ ubifs_err(c, "cannot read properties of LEB %d, error %d",
lnum, err);
goto out;
}
@@ -873,13 +873,13 @@ int dbg_check_cats(struct ubifs_info *c)
list_for_each_entry(lprops, &c->empty_list, list) {
if (lprops->free != c->leb_size) {
- ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)",
+ ubifs_err(c, "non-empty LEB %d on empty list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
- ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)",
+ ubifs_err(c, "taken LEB %d on empty list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
@@ -889,13 +889,13 @@ int dbg_check_cats(struct ubifs_info *c)
i = 0;
list_for_each_entry(lprops, &c->freeable_list, list) {
if (lprops->free + lprops->dirty != c->leb_size) {
- ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)",
+ ubifs_err(c, "non-freeable LEB %d on freeable list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
- ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)",
+ ubifs_err(c, "taken LEB %d on freeable list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
@@ -903,7 +903,7 @@ int dbg_check_cats(struct ubifs_info *c)
i += 1;
}
if (i != c->freeable_cnt) {
- ubifs_err("freeable list count %d expected %d", i,
+ ubifs_err(c, "freeable list count %d expected %d", i,
c->freeable_cnt);
return -EINVAL;
}
@@ -912,26 +912,26 @@ int dbg_check_cats(struct ubifs_info *c)
list_for_each(pos, &c->idx_gc)
i += 1;
if (i != c->idx_gc_cnt) {
- ubifs_err("idx_gc list count %d expected %d", i,
+ ubifs_err(c, "idx_gc list count %d expected %d", i,
c->idx_gc_cnt);
return -EINVAL;
}
list_for_each_entry(lprops, &c->frdi_idx_list, list) {
if (lprops->free + lprops->dirty != c->leb_size) {
- ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+ ubifs_err(c, "non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
- ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+ ubifs_err(c, "taken LEB %d on frdi_idx list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
}
if (!(lprops->flags & LPROPS_INDEX)) {
- ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+ ubifs_err(c, "non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)",
lprops->lnum, lprops->free, lprops->dirty,
lprops->flags);
return -EINVAL;
@@ -944,15 +944,15 @@ int dbg_check_cats(struct ubifs_info *c)
for (i = 0; i < heap->cnt; i++) {
lprops = heap->arr[i];
if (!lprops) {
- ubifs_err("null ptr in LPT heap cat %d", cat);
+ ubifs_err(c, "null ptr in LPT heap cat %d", cat);
return -EINVAL;
}
if (lprops->hpos != i) {
- ubifs_err("bad ptr in LPT heap cat %d", cat);
+ ubifs_err(c, "bad ptr in LPT heap cat %d", cat);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
- ubifs_err("taken LEB in LPT heap cat %d", cat);
+ ubifs_err(c, "taken LEB in LPT heap cat %d", cat);
return -EINVAL;
}
}
@@ -988,7 +988,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
goto out;
}
if (lprops != lp) {
- ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
+ ubifs_err(c, "lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
(size_t)lprops, (size_t)lp, lprops->lnum,
lp->lnum);
err = 4;
@@ -1008,7 +1008,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
}
out:
if (err) {
- ubifs_err("failed cat %d hpos %d err %d", cat, i, err);
+ ubifs_err(c, "failed cat %d hpos %d err %d", cat, i, err);
dump_stack();
ubifs_dump_heap(c, heap, cat);
}
@@ -1039,7 +1039,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (cat != LPROPS_UNCAT) {
cat = ubifs_categorize_lprops(c, lp);
if (cat != (lp->flags & LPROPS_CAT_MASK)) {
- ubifs_err("bad LEB category %d expected %d",
+ ubifs_err(c, "bad LEB category %d expected %d",
(lp->flags & LPROPS_CAT_MASK), cat);
return -EINVAL;
}
@@ -1074,7 +1074,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
if (!found) {
- ubifs_err("bad LPT list (category %d)", cat);
+ ubifs_err(c, "bad LPT list (category %d)", cat);
return -EINVAL;
}
}
@@ -1086,7 +1086,7 @@ static int scan_check_cb(struct ubifs_info *c,
if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
lp != heap->arr[lp->hpos]) {
- ubifs_err("bad LPT heap (category %d)", cat);
+ ubifs_err(c, "bad LPT heap (category %d)", cat);
return -EINVAL;
}
}
@@ -1133,7 +1133,7 @@ static int scan_check_cb(struct ubifs_info *c,
is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
if (is_idx && snod->type != UBIFS_IDX_NODE) {
- ubifs_err("indexing node in data LEB %d:%d",
+ ubifs_err(c, "indexing node in data LEB %d:%d",
lnum, snod->offs);
goto out_destroy;
}
@@ -1159,7 +1159,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
dirty < 0) {
- ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d",
+ ubifs_err(c, "bad calculated accounting for LEB %d: free %d, dirty %d",
lnum, free, dirty);
goto out_destroy;
}
@@ -1206,13 +1206,13 @@ static int scan_check_cb(struct ubifs_info *c,
/* Free but not unmapped LEB, it's fine */
is_idx = 0;
else {
- ubifs_err("indexing node without indexing flag");
+ ubifs_err(c, "indexing node without indexing flag");
goto out_print;
}
}
if (!is_idx && (lp->flags & LPROPS_INDEX)) {
- ubifs_err("data node with indexing flag");
+ ubifs_err(c, "data node with indexing flag");
goto out_print;
}
@@ -1241,7 +1241,7 @@ static int scan_check_cb(struct ubifs_info *c,
return LPT_SCAN_CONTINUE;
out_print:
- ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d",
+ ubifs_err(c, "bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d",
lnum, lp->free, lp->dirty, lp->flags, free, dirty);
ubifs_dump_leb(c, lnum);
out_destroy:
@@ -1293,11 +1293,11 @@ int dbg_check_lprops(struct ubifs_info *c)
lst.total_free != c->lst.total_free ||
lst.total_dirty != c->lst.total_dirty ||
lst.total_used != c->lst.total_used) {
- ubifs_err("bad overall accounting");
- ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
+ ubifs_err(c, "bad overall accounting");
+ ubifs_err(c, "calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
lst.empty_lebs, lst.idx_lebs, lst.total_free,
lst.total_dirty, lst.total_used);
- ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
+ ubifs_err(c, "read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
c->lst.total_dirty, c->lst.total_used);
err = -EINVAL;
@@ -1306,10 +1306,10 @@ int dbg_check_lprops(struct ubifs_info *c)
if (lst.total_dead != c->lst.total_dead ||
lst.total_dark != c->lst.total_dark) {
- ubifs_err("bad dead/dark space accounting");
- ubifs_err("calculated: total_dead %lld, total_dark %lld",
+ ubifs_err(c, "bad dead/dark space accounting");
+ ubifs_err(c, "calculated: total_dead %lld, total_dark %lld",
lst.total_dead, lst.total_dark);
- ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
+ ubifs_err(c, "read from lprops: total_dead %lld, total_dark %lld",
c->lst.total_dead, c->lst.total_dark);
err = -EINVAL;
goto out;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 421bd0a80424..dc9f27e9d61b 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -145,13 +145,13 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
if (lebs_needed > c->lpt_lebs) {
- ubifs_err("too few LPT LEBs");
+ ubifs_err(c, "too few LPT LEBs");
return -EINVAL;
}
/* Verify that ltab fits in a single LEB (since ltab is a single node */
if (c->ltab_sz > c->leb_size) {
- ubifs_err("LPT ltab too big");
+ ubifs_err(c, "LPT ltab too big");
return -EINVAL;
}
@@ -213,7 +213,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
continue;
}
if (c->ltab_sz > c->leb_size) {
- ubifs_err("LPT ltab too big");
+ ubifs_err(c, "LPT ltab too big");
return -EINVAL;
}
*main_lebs = c->main_lebs;
@@ -911,7 +911,7 @@ static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int check_lpt_crc(void *buf, int len)
+static int check_lpt_crc(const struct ubifs_info *c, void *buf, int len)
{
int pos = 0;
uint8_t *addr = buf;
@@ -921,8 +921,8 @@ static int check_lpt_crc(void *buf, int len)
calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
len - UBIFS_LPT_CRC_BYTES);
if (crc != calc_crc) {
- ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
- calc_crc);
+ ubifs_err(c, "invalid crc in LPT node: crc %hx calc %hx",
+ crc, calc_crc);
dump_stack();
return -EINVAL;
}
@@ -938,14 +938,15 @@ static int check_lpt_crc(void *buf, int len)
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int check_lpt_type(uint8_t **addr, int *pos, int type)
+static int check_lpt_type(const struct ubifs_info *c, uint8_t **addr,
+ int *pos, int type)
{
int node_type;
node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
if (node_type != type) {
- ubifs_err("invalid type (%d) in LPT node type %d", node_type,
- type);
+ ubifs_err(c, "invalid type (%d) in LPT node type %d",
+ node_type, type);
dump_stack();
return -EINVAL;
}
@@ -966,7 +967,7 @@ static int unpack_pnode(const struct ubifs_info *c, void *buf,
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
- err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE);
+ err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_PNODE);
if (err)
return err;
if (c->big_lpt)
@@ -985,7 +986,7 @@ static int unpack_pnode(const struct ubifs_info *c, void *buf,
lprops->flags = 0;
lprops->flags |= ubifs_categorize_lprops(c, lprops);
}
- err = check_lpt_crc(buf, c->pnode_sz);
+ err = check_lpt_crc(c, buf, c->pnode_sz);
return err;
}
@@ -1003,7 +1004,7 @@ int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
- err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE);
+ err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_NNODE);
if (err)
return err;
if (c->big_lpt)
@@ -1019,7 +1020,7 @@ int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos,
c->lpt_offs_bits);
}
- err = check_lpt_crc(buf, c->nnode_sz);
+ err = check_lpt_crc(c, buf, c->nnode_sz);
return err;
}
@@ -1035,7 +1036,7 @@ static int unpack_ltab(const struct ubifs_info *c, void *buf)
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
- err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB);
+ err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LTAB);
if (err)
return err;
for (i = 0; i < c->lpt_lebs; i++) {
@@ -1051,7 +1052,7 @@ static int unpack_ltab(const struct ubifs_info *c, void *buf)
c->ltab[i].tgc = 0;
c->ltab[i].cmt = 0;
}
- err = check_lpt_crc(buf, c->ltab_sz);
+ err = check_lpt_crc(c, buf, c->ltab_sz);
return err;
}
@@ -1067,7 +1068,7 @@ static int unpack_lsave(const struct ubifs_info *c, void *buf)
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
- err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE);
+ err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LSAVE);
if (err)
return err;
for (i = 0; i < c->lsave_cnt; i++) {
@@ -1077,7 +1078,7 @@ static int unpack_lsave(const struct ubifs_info *c, void *buf)
return -EINVAL;
c->lsave[i] = lnum;
}
- err = check_lpt_crc(buf, c->lsave_sz);
+ err = check_lpt_crc(c, buf, c->lsave_sz);
return err;
}
@@ -1243,7 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
return 0;
out:
- ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
+ ubifs_err(c, "error %d reading nnode at %d:%d", err, lnum, offs);
dump_stack();
kfree(nnode);
return err;
@@ -1308,10 +1309,10 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
return 0;
out:
- ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
+ ubifs_err(c, "error %d reading pnode at %d:%d", err, lnum, offs);
ubifs_dump_pnode(c, pnode, parent, iip);
dump_stack();
- ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+ ubifs_err(c, "calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
kfree(pnode);
return err;
}
@@ -2095,7 +2096,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
int i;
if (pnode->num != col) {
- ubifs_err("pnode num %d expected %d parent num %d iip %d",
+ ubifs_err(c, "pnode num %d expected %d parent num %d iip %d",
pnode->num, col, pnode->parent->num, pnode->iip);
return -EINVAL;
}
@@ -2110,13 +2111,13 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
if (lnum >= c->leb_cnt)
continue;
if (lprops->lnum != lnum) {
- ubifs_err("bad LEB number %d expected %d",
+ ubifs_err(c, "bad LEB number %d expected %d",
lprops->lnum, lnum);
return -EINVAL;
}
if (lprops->flags & LPROPS_TAKEN) {
if (cat != LPROPS_UNCAT) {
- ubifs_err("LEB %d taken but not uncat %d",
+ ubifs_err(c, "LEB %d taken but not uncat %d",
lprops->lnum, cat);
return -EINVAL;
}
@@ -2129,7 +2130,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
case LPROPS_FRDI_IDX:
break;
default:
- ubifs_err("LEB %d index but cat %d",
+ ubifs_err(c, "LEB %d index but cat %d",
lprops->lnum, cat);
return -EINVAL;
}
@@ -2142,7 +2143,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
case LPROPS_FREEABLE:
break;
default:
- ubifs_err("LEB %d not index but cat %d",
+ ubifs_err(c, "LEB %d not index but cat %d",
lprops->lnum, cat);
return -EINVAL;
}
@@ -2183,14 +2184,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
break;
}
if (!found) {
- ubifs_err("LEB %d cat %d not found in cat heap/list",
+ ubifs_err(c, "LEB %d cat %d not found in cat heap/list",
lprops->lnum, cat);
return -EINVAL;
}
switch (cat) {
case LPROPS_EMPTY:
if (lprops->free != c->leb_size) {
- ubifs_err("LEB %d cat %d free %d dirty %d",
+ ubifs_err(c, "LEB %d cat %d free %d dirty %d",
lprops->lnum, cat, lprops->free,
lprops->dirty);
return -EINVAL;
@@ -2199,7 +2200,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
case LPROPS_FREEABLE:
case LPROPS_FRDI_IDX:
if (lprops->free + lprops->dirty != c->leb_size) {
- ubifs_err("LEB %d cat %d free %d dirty %d",
+ ubifs_err(c, "LEB %d cat %d free %d dirty %d",
lprops->lnum, cat, lprops->free,
lprops->dirty);
return -EINVAL;
@@ -2236,7 +2237,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
/* cnode is a nnode */
num = calc_nnode_num(row, col);
if (cnode->num != num) {
- ubifs_err("nnode num %d expected %d parent num %d iip %d",
+ ubifs_err(c, "nnode num %d expected %d parent num %d iip %d",
cnode->num, num,
(nnode ? nnode->num : 0), cnode->iip);
return -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index d9c02928e992..ce89bdc3eb02 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -319,7 +319,7 @@ static int layout_cnodes(struct ubifs_info *c)
return 0;
no_space:
- ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+ ubifs_err(c, "LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
lnum, offs, len, done_ltab, done_lsave);
ubifs_dump_lpt_info(c);
ubifs_dump_lpt_lebs(c);
@@ -543,7 +543,7 @@ static int write_cnodes(struct ubifs_info *c)
return 0;
no_space:
- ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+ ubifs_err(c, "LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
lnum, offs, len, done_ltab, done_lsave);
ubifs_dump_lpt_info(c);
ubifs_dump_lpt_lebs(c);
@@ -1638,7 +1638,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
if (!buf) {
- ubifs_err("cannot allocate memory for ltab checking");
+ ubifs_err(c, "cannot allocate memory for ltab checking");
return 0;
}
@@ -1660,18 +1660,18 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
continue;
}
if (!dbg_is_all_ff(p, len)) {
- ubifs_err("invalid empty space in LEB %d at %d",
+ ubifs_err(c, "invalid empty space in LEB %d at %d",
lnum, c->leb_size - len);
err = -EINVAL;
}
i = lnum - c->lpt_first;
if (len != c->ltab[i].free) {
- ubifs_err("invalid free space in LEB %d (free %d, expected %d)",
+ ubifs_err(c, "invalid free space in LEB %d (free %d, expected %d)",
lnum, len, c->ltab[i].free);
err = -EINVAL;
}
if (dirty != c->ltab[i].dirty) {
- ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)",
+ ubifs_err(c, "invalid dirty space in LEB %d (dirty %d, expected %d)",
lnum, dirty, c->ltab[i].dirty);
err = -EINVAL;
}
@@ -1725,7 +1725,7 @@ int dbg_check_ltab(struct ubifs_info *c)
for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
err = dbg_check_ltab_lnum(c, lnum);
if (err) {
- ubifs_err("failed at LEB %d", lnum);
+ ubifs_err(c, "failed at LEB %d", lnum);
return err;
}
}
@@ -1757,7 +1757,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
free += c->leb_size;
}
if (free < c->lpt_sz) {
- ubifs_err("LPT space error: free %lld lpt_sz %lld",
+ ubifs_err(c, "LPT space error: free %lld lpt_sz %lld",
free, c->lpt_sz);
ubifs_dump_lpt_info(c);
ubifs_dump_lpt_lebs(c);
@@ -1797,12 +1797,12 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
d->chk_lpt_lebs = 0;
d->chk_lpt_wastage = 0;
if (c->dirty_pn_cnt > c->pnode_cnt) {
- ubifs_err("dirty pnodes %d exceed max %d",
+ ubifs_err(c, "dirty pnodes %d exceed max %d",
c->dirty_pn_cnt, c->pnode_cnt);
err = -EINVAL;
}
if (c->dirty_nn_cnt > c->nnode_cnt) {
- ubifs_err("dirty nnodes %d exceed max %d",
+ ubifs_err(c, "dirty nnodes %d exceed max %d",
c->dirty_nn_cnt, c->nnode_cnt);
err = -EINVAL;
}
@@ -1820,22 +1820,22 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
chk_lpt_sz *= d->chk_lpt_lebs;
chk_lpt_sz += len - c->nhead_offs;
if (d->chk_lpt_sz != chk_lpt_sz) {
- ubifs_err("LPT wrote %lld but space used was %lld",
+ ubifs_err(c, "LPT wrote %lld but space used was %lld",
d->chk_lpt_sz, chk_lpt_sz);
err = -EINVAL;
}
if (d->chk_lpt_sz > c->lpt_sz) {
- ubifs_err("LPT wrote %lld but lpt_sz is %lld",
+ ubifs_err(c, "LPT wrote %lld but lpt_sz is %lld",
d->chk_lpt_sz, c->lpt_sz);
err = -EINVAL;
}
if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
- ubifs_err("LPT layout size %lld but wrote %lld",
+ ubifs_err(c, "LPT layout size %lld but wrote %lld",
d->chk_lpt_sz, d->chk_lpt_sz2);
err = -EINVAL;
}
if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
- ubifs_err("LPT new nhead offs: expected %d was %d",
+ ubifs_err(c, "LPT new nhead offs: expected %d was %d",
d->new_nhead_offs, len);
err = -EINVAL;
}
@@ -1845,7 +1845,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
if (c->big_lpt)
lpt_sz += c->lsave_sz;
if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
- ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+ ubifs_err(c, "LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
err = -EINVAL;
}
@@ -1887,7 +1887,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
if (!buf) {
- ubifs_err("cannot allocate memory to dump LPT");
+ ubifs_err(c, "cannot allocate memory to dump LPT");
return;
}
@@ -1962,7 +1962,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
pr_err("LEB %d:%d, lsave len\n", lnum, offs);
break;
default:
- ubifs_err("LPT node type %d not recognized", node_type);
+ ubifs_err(c, "LPT node type %d not recognized", node_type);
goto out;
}
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 1a4bb9e8b3b8..c6a5e39e2ba5 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -82,7 +82,7 @@ out:
return -EUCLEAN;
out_dump:
- ubifs_err("unexpected node type %d master LEB %d:%d",
+ ubifs_err(c, "unexpected node type %d master LEB %d:%d",
snod->type, lnum, snod->offs);
ubifs_scan_destroy(sleb);
return -EINVAL;
@@ -240,7 +240,7 @@ static int validate_master(const struct ubifs_info *c)
return 0;
out:
- ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
+ ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err);
ubifs_dump_node(c, c->mst_node);
return -EINVAL;
}
@@ -316,7 +316,7 @@ int ubifs_read_master(struct ubifs_info *c)
if (c->leb_cnt < old_leb_cnt ||
c->leb_cnt < UBIFS_MIN_LEB_CNT) {
- ubifs_err("bad leb_cnt on master node");
+ ubifs_err(c, "bad leb_cnt on master node");
ubifs_dump_node(c, c->mst_node);
return -EINVAL;
}
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 4409f486ecef..caf2d123e9ee 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -88,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
else if (inum > o->inum)
p = &(*p)->rb_right;
else {
- ubifs_err("orphaned twice");
+ ubifs_err(c, "orphaned twice");
spin_unlock(&c->orphan_lock);
kfree(orphan);
return 0;
@@ -155,7 +155,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
}
}
spin_unlock(&c->orphan_lock);
- ubifs_err("missing orphan ino %lu", (unsigned long)inum);
+ ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
dump_stack();
}
@@ -287,7 +287,7 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
* We limit the number of orphans so that this should
* never happen.
*/
- ubifs_err("out of space in orphan area");
+ ubifs_err(c, "out of space in orphan area");
return -EINVAL;
}
}
@@ -397,7 +397,7 @@ static int consolidate(struct ubifs_info *c)
* We limit the number of orphans so that this should
* never happen.
*/
- ubifs_err("out of space in orphan area");
+ ubifs_err(c, "out of space in orphan area");
err = -EINVAL;
}
spin_unlock(&c->orphan_lock);
@@ -569,7 +569,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
list_for_each_entry(snod, &sleb->nodes, list) {
if (snod->type != UBIFS_ORPH_NODE) {
- ubifs_err("invalid node type %d in orphan area at %d:%d",
+ ubifs_err(c, "invalid node type %d in orphan area at %d:%d",
snod->type, sleb->lnum, snod->offs);
ubifs_dump_node(c, snod->node);
return -EINVAL;
@@ -596,7 +596,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
* number. That makes this orphan node, out of date.
*/
if (!first) {
- ubifs_err("out of order commit number %llu in orphan node at %d:%d",
+ ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d",
cmt_no, sleb->lnum, snod->offs);
ubifs_dump_node(c, snod->node);
return -EINVAL;
@@ -831,20 +831,20 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
if (inum != ci->last_ino) {
/* Lowest node type is the inode node, so it comes first */
if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
- ubifs_err("found orphan node ino %lu, type %d",
+ ubifs_err(c, "found orphan node ino %lu, type %d",
(unsigned long)inum, key_type(c, &zbr->key));
ci->last_ino = inum;
ci->tot_inos += 1;
err = ubifs_tnc_read_node(c, zbr, ci->node);
if (err) {
- ubifs_err("node read failed, error %d", err);
+ ubifs_err(c, "node read failed, error %d", err);
return err;
}
if (ci->node->nlink == 0)
/* Must be recorded as an orphan */
if (!dbg_find_check_orphan(&ci->root, inum) &&
!dbg_find_orphan(c, inum)) {
- ubifs_err("missing orphan, ino %lu",
+ ubifs_err(c, "missing orphan, ino %lu",
(unsigned long)inum);
ci->missing += 1;
}
@@ -887,7 +887,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
if (!buf) {
- ubifs_err("cannot allocate memory to check orphans");
+ ubifs_err(c, "cannot allocate memory to check orphans");
return 0;
}
@@ -925,7 +925,7 @@ static int dbg_check_orphans(struct ubifs_info *c)
ci.root = RB_ROOT;
ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
if (!ci.node) {
- ubifs_err("out of memory");
+ ubifs_err(c, "out of memory");
return -ENOMEM;
}
@@ -935,12 +935,12 @@ static int dbg_check_orphans(struct ubifs_info *c)
err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci);
if (err) {
- ubifs_err("cannot scan TNC, error %d", err);
+ ubifs_err(c, "cannot scan TNC, error %d", err);
goto out;
}
if (ci.missing) {
- ubifs_err("%lu missing orphan(s)", ci.missing);
+ ubifs_err(c, "%lu missing orphan(s)", ci.missing);
err = -EINVAL;
goto out;
}
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c640938f62f0..695fc71d5244 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -305,7 +305,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
mst = mst2;
}
- ubifs_msg("recovered master node from LEB %d",
+ ubifs_msg(c, "recovered master node from LEB %d",
(mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -360,13 +360,13 @@ int ubifs_recover_master_node(struct ubifs_info *c)
out_err:
err = -EINVAL;
out_free:
- ubifs_err("failed to recover master node");
+ ubifs_err(c, "failed to recover master node");
if (mst1) {
- ubifs_err("dumping first master node");
+ ubifs_err(c, "dumping first master node");
ubifs_dump_node(c, mst1);
}
if (mst2) {
- ubifs_err("dumping second master node");
+ ubifs_err(c, "dumping second master node");
ubifs_dump_node(c, mst2);
}
vfree(buf2);
@@ -682,7 +682,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
ret, lnum, offs);
break;
} else {
- ubifs_err("unexpected return value %d", ret);
+ ubifs_err(c, "unexpected return value %d", ret);
err = -EINVAL;
goto error;
}
@@ -702,7 +702,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* See header comment for this file for more
* explanations about the reasons we have this check.
*/
- ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d",
+ ubifs_err(c, "corrupt empty space LEB %d:%d, corruption starts at %d",
lnum, offs, corruption);
/* Make sure we dump interesting non-0xFF data */
offs += corruption;
@@ -788,13 +788,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
- ubifs_err("corruption %d", ret);
+ ubifs_err(c, "corruption %d", ret);
ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
error:
- ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_err(c, "LEB %d scanning failed", lnum);
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
}
@@ -826,15 +826,15 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
goto out_free;
ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
if (ret != SCANNED_A_NODE) {
- ubifs_err("Not a valid node");
+ ubifs_err(c, "Not a valid node");
goto out_err;
}
if (cs_node->ch.node_type != UBIFS_CS_NODE) {
- ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type);
+ ubifs_err(c, "Node a CS node, type is %d", cs_node->ch.node_type);
goto out_err;
}
if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
- ubifs_err("CS node cmt_no %llu != current cmt_no %llu",
+ ubifs_err(c, "CS node cmt_no %llu != current cmt_no %llu",
(unsigned long long)le64_to_cpu(cs_node->cmt_no),
c->cmt_no);
goto out_err;
@@ -847,7 +847,7 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
out_err:
err = -EINVAL;
out_free:
- ubifs_err("failed to get CS sqnum");
+ ubifs_err(c, "failed to get CS sqnum");
kfree(cs_node);
return err;
}
@@ -899,7 +899,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
}
}
if (snod->sqnum > cs_sqnum) {
- ubifs_err("unrecoverable log corruption in LEB %d",
+ ubifs_err(c, "unrecoverable log corruption in LEB %d",
lnum);
ubifs_scan_destroy(sleb);
return ERR_PTR(-EUCLEAN);
@@ -975,11 +975,8 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf)
return err;
dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
- err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
- if (err)
- return err;
- return 0;
+ return recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
}
/**
@@ -1004,10 +1001,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c,
if (len == 0) {
/* Nothing to read, just unmap it */
- err = ubifs_leb_unmap(c, lnum);
- if (err)
- return err;
- return 0;
+ return ubifs_leb_unmap(c, lnum);
}
err = ubifs_leb_read(c, lnum, buf, offs, len, 0);
@@ -1043,7 +1037,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c,
}
if (ret == SCANNED_EMPTY_SPACE) {
- ubifs_err("unexpected empty space at %d:%d",
+ ubifs_err(c, "unexpected empty space at %d:%d",
lnum, offs);
return -EUCLEAN;
}
@@ -1137,7 +1131,7 @@ static int grab_empty_leb(struct ubifs_info *c)
*/
lnum = ubifs_find_free_leb_for_idx(c);
if (lnum < 0) {
- ubifs_err("could not find an empty LEB");
+ ubifs_err(c, "could not find an empty LEB");
ubifs_dump_lprops(c);
ubifs_dump_budg(c, &c->bi);
return lnum;
@@ -1217,7 +1211,7 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
}
mutex_unlock(&wbuf->io_mutex);
if (err < 0) {
- ubifs_err("GC failed, error %d", err);
+ ubifs_err(c, "GC failed, error %d", err);
if (err == -EAGAIN)
err = -EINVAL;
return err;
@@ -1464,7 +1458,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
return 0;
out:
- ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
+ ubifs_warn(c, "inode %lu failed to fix size %lld -> %lld error %d",
(unsigned long)e->inum, e->i_size, e->d_size, err);
return err;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 9b40a1c5e160..3ca4540130b5 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -458,13 +458,13 @@ int ubifs_validate_entry(struct ubifs_info *c,
nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
strnlen(dent->name, nlen) != nlen ||
le64_to_cpu(dent->inum) > MAX_INUM) {
- ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
+ ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ?
"directory entry" : "extended attribute entry");
return -EINVAL;
}
if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
- ubifs_err("bad key type %d", key_type);
+ ubifs_err(c, "bad key type %d", key_type);
return -EINVAL;
}
@@ -589,7 +589,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
cond_resched();
if (snod->sqnum >= SQNUM_WATERMARK) {
- ubifs_err("file system's life ended");
+ ubifs_err(c, "file system's life ended");
goto out_dump;
}
@@ -647,7 +647,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
if (old_size < 0 || old_size > c->max_inode_sz ||
new_size < 0 || new_size > c->max_inode_sz ||
old_size <= new_size) {
- ubifs_err("bad truncation node");
+ ubifs_err(c, "bad truncation node");
goto out_dump;
}
@@ -662,7 +662,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
break;
}
default:
- ubifs_err("unexpected node type %d in bud LEB %d:%d",
+ ubifs_err(c, "unexpected node type %d in bud LEB %d:%d",
snod->type, lnum, snod->offs);
err = -EINVAL;
goto out_dump;
@@ -685,7 +685,7 @@ out:
return err;
out_dump:
- ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
+ ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs);
ubifs_dump_node(c, snod->node);
ubifs_scan_destroy(sleb);
return -EINVAL;
@@ -805,7 +805,7 @@ static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
if (bud) {
if (bud->jhead == jhead && bud->start <= offs)
return 1;
- ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
+ ubifs_err(c, "bud at LEB %d:%d was already referred", lnum, offs);
return -EINVAL;
}
@@ -861,12 +861,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
* numbers.
*/
if (snod->type != UBIFS_CS_NODE) {
- ubifs_err("first log node at LEB %d:%d is not CS node",
+ ubifs_err(c, "first log node at LEB %d:%d is not CS node",
lnum, offs);
goto out_dump;
}
if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
- ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu",
+ ubifs_err(c, "first CS node at LEB %d:%d has wrong commit number %llu expected %llu",
lnum, offs,
(unsigned long long)le64_to_cpu(node->cmt_no),
c->cmt_no);
@@ -891,7 +891,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
/* Make sure the first node sits at offset zero of the LEB */
if (snod->offs != 0) {
- ubifs_err("first node is not at zero offset");
+ ubifs_err(c, "first node is not at zero offset");
goto out_dump;
}
@@ -899,12 +899,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
cond_resched();
if (snod->sqnum >= SQNUM_WATERMARK) {
- ubifs_err("file system's life ended");
+ ubifs_err(c, "file system's life ended");
goto out_dump;
}
if (snod->sqnum < c->cs_sqnum) {
- ubifs_err("bad sqnum %llu, commit sqnum %llu",
+ ubifs_err(c, "bad sqnum %llu, commit sqnum %llu",
snod->sqnum, c->cs_sqnum);
goto out_dump;
}
@@ -934,12 +934,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
case UBIFS_CS_NODE:
/* Make sure it sits at the beginning of LEB */
if (snod->offs != 0) {
- ubifs_err("unexpected node in log");
+ ubifs_err(c, "unexpected node in log");
goto out_dump;
}
break;
default:
- ubifs_err("unexpected node in log");
+ ubifs_err(c, "unexpected node in log");
goto out_dump;
}
}
@@ -955,7 +955,7 @@ out:
return err;
out_dump:
- ubifs_err("log error detected while replaying the log at LEB %d:%d",
+ ubifs_err(c, "log error detected while replaying the log at LEB %d:%d",
lnum, offs + snod->offs);
ubifs_dump_node(c, snod->node);
ubifs_scan_destroy(sleb);
@@ -1017,7 +1017,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
return free; /* Error code */
if (c->ihead_offs != c->leb_size - free) {
- ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
+ ubifs_err(c, "bad index head LEB %d:%d", c->ihead_lnum,
c->ihead_offs);
return -EINVAL;
}
@@ -1040,7 +1040,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
* someting went wrong and we cannot proceed mounting
* the file-system.
*/
- ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
+ ubifs_err(c, "no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
lnum, 0);
err = -EINVAL;
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 79c6dbbc0e04..f4fbc7b6b794 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -335,7 +335,7 @@ static int create_default_filesystem(struct ubifs_info *c)
if (err)
return err;
- ubifs_msg("default file-system created");
+ ubifs_msg(c, "default file-system created");
return 0;
}
@@ -365,13 +365,13 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
}
if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
- ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
+ ubifs_err(c, "min. I/O unit mismatch: %d in superblock, %d real",
le32_to_cpu(sup->min_io_size), c->min_io_size);
goto failed;
}
if (le32_to_cpu(sup->leb_size) != c->leb_size) {
- ubifs_err("LEB size mismatch: %d in superblock, %d real",
+ ubifs_err(c, "LEB size mismatch: %d in superblock, %d real",
le32_to_cpu(sup->leb_size), c->leb_size);
goto failed;
}
@@ -393,33 +393,33 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
- ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required",
+ ubifs_err(c, "bad LEB count: %d in superblock, %d on UBI volume, %d minimum required",
c->leb_cnt, c->vi.size, min_leb_cnt);
goto failed;
}
if (c->max_leb_cnt < c->leb_cnt) {
- ubifs_err("max. LEB count %d less than LEB count %d",
+ ubifs_err(c, "max. LEB count %d less than LEB count %d",
c->max_leb_cnt, c->leb_cnt);
goto failed;
}
if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
- ubifs_err("too few main LEBs count %d, must be at least %d",
+ ubifs_err(c, "too few main LEBs count %d, must be at least %d",
c->main_lebs, UBIFS_MIN_MAIN_LEBS);
goto failed;
}
max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
if (c->max_bud_bytes < max_bytes) {
- ubifs_err("too small journal (%lld bytes), must be at least %lld bytes",
+ ubifs_err(c, "too small journal (%lld bytes), must be at least %lld bytes",
c->max_bud_bytes, max_bytes);
goto failed;
}
max_bytes = (long long)c->leb_size * c->main_lebs;
if (c->max_bud_bytes > max_bytes) {
- ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area",
+ ubifs_err(c, "too large journal size (%lld bytes), only %lld bytes available in the main area",
c->max_bud_bytes, max_bytes);
goto failed;
}
@@ -468,7 +468,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
return 0;
failed:
- ubifs_err("bad superblock, error %d", err);
+ ubifs_err(c, "bad superblock, error %d", err);
ubifs_dump_node(c, sup);
return -EINVAL;
}
@@ -549,12 +549,12 @@ int ubifs_read_superblock(struct ubifs_info *c)
ubifs_assert(!c->ro_media || c->ro_mount);
if (!c->ro_mount ||
c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
- ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+ ubifs_err(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
c->fmt_version, c->ro_compat_version,
UBIFS_FORMAT_VERSION,
UBIFS_RO_COMPAT_VERSION);
if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
- ubifs_msg("only R/O mounting is possible");
+ ubifs_msg(c, "only R/O mounting is possible");
err = -EROFS;
} else
err = -EINVAL;
@@ -570,7 +570,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
}
if (c->fmt_version < 3) {
- ubifs_err("on-flash format version %d is not supported",
+ ubifs_err(c, "on-flash format version %d is not supported",
c->fmt_version);
err = -EINVAL;
goto out;
@@ -595,7 +595,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->key_len = UBIFS_SK_LEN;
break;
default:
- ubifs_err("unsupported key format");
+ ubifs_err(c, "unsupported key format");
err = -EINVAL;
goto out;
}
@@ -785,7 +785,7 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
ubifs_assert(c->space_fixup);
ubifs_assert(!c->ro_mount);
- ubifs_msg("start fixing up free space");
+ ubifs_msg(c, "start fixing up free space");
err = fixup_free_space(c);
if (err)
@@ -804,6 +804,6 @@ int ubifs_fixup_free_space(struct ubifs_info *c)
if (err)
return err;
- ubifs_msg("free space fixup complete");
+ ubifs_msg(c, "free space fixup complete");
return err;
}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 89adbc4d08ac..aab87340d3de 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -100,7 +100,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
if (pad_len < 0 ||
offs + node_len + pad_len > c->leb_size) {
if (!quiet) {
- ubifs_err("bad pad node at LEB %d:%d",
+ ubifs_err(c, "bad pad node at LEB %d:%d",
lnum, offs);
ubifs_dump_node(c, pad);
}
@@ -110,7 +110,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
/* Make the node pads to 8-byte boundary */
if ((node_len + pad_len) & 7) {
if (!quiet)
- ubifs_err("bad padding length %d - %d",
+ ubifs_err(c, "bad padding length %d - %d",
offs, offs + node_len + pad_len);
return SCANNED_A_BAD_PAD_NODE;
}
@@ -152,7 +152,7 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0);
if (err && err != -EBADMSG) {
- ubifs_err("cannot read %d bytes from LEB %d:%d, error %d",
+ ubifs_err(c, "cannot read %d bytes from LEB %d:%d, error %d",
c->leb_size - offs, lnum, offs, err);
kfree(sleb);
return ERR_PTR(err);
@@ -240,11 +240,11 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
{
int len;
- ubifs_err("corruption at LEB %d:%d", lnum, offs);
+ ubifs_err(c, "corruption at LEB %d:%d", lnum, offs);
len = c->leb_size - offs;
if (len > 8192)
len = 8192;
- ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs);
+ ubifs_err(c, "first %d bytes from LEB %d:%d", len, lnum, offs);
print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
}
@@ -299,16 +299,16 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
switch (ret) {
case SCANNED_GARBAGE:
- ubifs_err("garbage");
+ ubifs_err(c, "garbage");
goto corrupted;
case SCANNED_A_NODE:
break;
case SCANNED_A_CORRUPT_NODE:
case SCANNED_A_BAD_PAD_NODE:
- ubifs_err("bad node");
+ ubifs_err(c, "bad node");
goto corrupted;
default:
- ubifs_err("unknown");
+ ubifs_err(c, "unknown");
err = -EINVAL;
goto error;
}
@@ -325,7 +325,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
if (offs % c->min_io_size) {
if (!quiet)
- ubifs_err("empty space starts at non-aligned offset %d",
+ ubifs_err(c, "empty space starts at non-aligned offset %d",
offs);
goto corrupted;
}
@@ -338,7 +338,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
for (; len; offs++, buf++, len--)
if (*(uint8_t *)buf != 0xff) {
if (!quiet)
- ubifs_err("corrupt empty space at LEB %d:%d",
+ ubifs_err(c, "corrupt empty space at LEB %d:%d",
lnum, offs);
goto corrupted;
}
@@ -348,14 +348,14 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
corrupted:
if (!quiet) {
ubifs_scanned_corruption(c, lnum, offs, buf);
- ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_err(c, "LEB %d scanning failed", lnum);
}
err = -EUCLEAN;
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
error:
- ubifs_err("LEB %d scanning failed, error %d", lnum, err);
+ ubifs_err(c, "LEB %d scanning failed, error %d", lnum, err);
ubifs_scan_destroy(sleb);
return ERR_PTR(err);
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 93e946561c5c..9547a27868ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -70,13 +70,13 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
const struct ubifs_inode *ui = ubifs_inode(inode);
if (inode->i_size > c->max_inode_sz) {
- ubifs_err("inode is too large (%lld)",
+ ubifs_err(c, "inode is too large (%lld)",
(long long)inode->i_size);
return 1;
}
if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
- ubifs_err("unknown compression type %d", ui->compr_type);
+ ubifs_err(c, "unknown compression type %d", ui->compr_type);
return 2;
}
@@ -90,7 +90,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
return 5;
if (!ubifs_compr_present(ui->compr_type)) {
- ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in",
+ ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in",
inode->i_ino, ubifs_compr_name(ui->compr_type));
}
@@ -195,6 +195,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
}
memcpy(ui->data, ino->data, ui->data_len);
((char *)ui->data)[ui->data_len] = '\0';
+ inode->i_link = ui->data;
break;
case S_IFBLK:
case S_IFCHR:
@@ -242,14 +243,14 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
return inode;
out_invalid:
- ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
+ ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err);
ubifs_dump_node(c, ino);
ubifs_dump_inode(c, inode);
err = -EINVAL;
out_ino:
kfree(ino);
out:
- ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err);
+ ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err);
iget_failed(inode);
return ERR_PTR(err);
}
@@ -319,7 +320,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_nlink) {
err = ubifs_jnl_write_inode(c, inode);
if (err)
- ubifs_err("can't write inode %lu, error %d",
+ ubifs_err(c, "can't write inode %lu, error %d",
inode->i_ino, err);
else
err = dbg_check_inode_size(c, inode, ui->ui_size);
@@ -363,7 +364,7 @@ static void ubifs_evict_inode(struct inode *inode)
* Worst case we have a lost orphan inode wasting space, so a
* simple error message is OK here.
*/
- ubifs_err("can't delete inode %lu, error %d",
+ ubifs_err(c, "can't delete inode %lu, error %d",
inode->i_ino, err);
out:
@@ -492,17 +493,17 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
static int init_constants_early(struct ubifs_info *c)
{
if (c->vi.corrupted) {
- ubifs_warn("UBI volume is corrupted - read-only mode");
+ ubifs_warn(c, "UBI volume is corrupted - read-only mode");
c->ro_media = 1;
}
if (c->di.ro_mode) {
- ubifs_msg("read-only UBI device");
+ ubifs_msg(c, "read-only UBI device");
c->ro_media = 1;
}
if (c->vi.vol_type == UBI_STATIC_VOLUME) {
- ubifs_msg("static UBI volume - read-only mode");
+ ubifs_msg(c, "static UBI volume - read-only mode");
c->ro_media = 1;
}
@@ -516,19 +517,19 @@ static int init_constants_early(struct ubifs_info *c)
c->max_write_shift = fls(c->max_write_size) - 1;
if (c->leb_size < UBIFS_MIN_LEB_SZ) {
- ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
+ ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes",
c->leb_size, UBIFS_MIN_LEB_SZ);
return -EINVAL;
}
if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
- ubifs_err("too few LEBs (%d), min. is %d",
+ ubifs_err(c, "too few LEBs (%d), min. is %d",
c->leb_cnt, UBIFS_MIN_LEB_CNT);
return -EINVAL;
}
if (!is_power_of_2(c->min_io_size)) {
- ubifs_err("bad min. I/O size %d", c->min_io_size);
+ ubifs_err(c, "bad min. I/O size %d", c->min_io_size);
return -EINVAL;
}
@@ -539,7 +540,7 @@ static int init_constants_early(struct ubifs_info *c)
if (c->max_write_size < c->min_io_size ||
c->max_write_size % c->min_io_size ||
!is_power_of_2(c->max_write_size)) {
- ubifs_err("bad write buffer size %d for %d min. I/O unit",
+ ubifs_err(c, "bad write buffer size %d for %d min. I/O unit",
c->max_write_size, c->min_io_size);
return -EINVAL;
}
@@ -665,7 +666,7 @@ static int init_constants_sb(struct ubifs_info *c)
tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
tmp = ALIGN(tmp, c->min_io_size);
if (tmp > c->leb_size) {
- ubifs_err("too small LEB size %d, at least %d needed",
+ ubifs_err(c, "too small LEB size %d, at least %d needed",
c->leb_size, tmp);
return -EINVAL;
}
@@ -680,7 +681,7 @@ static int init_constants_sb(struct ubifs_info *c)
tmp /= c->leb_size;
tmp += 1;
if (c->log_lebs < tmp) {
- ubifs_err("too small log %d LEBs, required min. %d LEBs",
+ ubifs_err(c, "too small log %d LEBs, required min. %d LEBs",
c->log_lebs, tmp);
return -EINVAL;
}
@@ -772,7 +773,7 @@ static int take_gc_lnum(struct ubifs_info *c)
int err;
if (c->gc_lnum == -1) {
- ubifs_err("no LEB for GC");
+ ubifs_err(c, "no LEB for GC");
return -EINVAL;
}
@@ -857,7 +858,7 @@ static void free_orphans(struct ubifs_info *c)
orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
list_del(&orph->list);
kfree(orph);
- ubifs_err("orphan list not empty at unmount");
+ ubifs_err(c, "orphan list not empty at unmount");
}
vfree(c->orph_buf);
@@ -954,7 +955,8 @@ static const match_table_t tokens = {
*/
static int parse_standard_option(const char *option)
{
- ubifs_msg("parse %s", option);
+
+ pr_notice("UBIFS: parse %s\n", option);
if (!strcmp(option, "sync"))
return MS_SYNCHRONOUS;
return 0;
@@ -1026,7 +1028,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
else if (!strcmp(name, "zlib"))
c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
else {
- ubifs_err("unknown compressor \"%s\"", name);
+ ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
kfree(name);
return -EINVAL;
}
@@ -1042,7 +1044,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
flag = parse_standard_option(p);
if (!flag) {
- ubifs_err("unrecognized mount option \"%s\" or missing value",
+ ubifs_err(c, "unrecognized mount option \"%s\" or missing value",
p);
return -EINVAL;
}
@@ -1105,7 +1107,7 @@ again:
}
/* Just disable bulk-read */
- ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it",
+ ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it",
c->max_bu_buf_len);
c->mount_opts.bulk_read = 1;
c->bulk_read = 0;
@@ -1124,7 +1126,7 @@ static int check_free_space(struct ubifs_info *c)
{
ubifs_assert(c->dark_wm > 0);
if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
- ubifs_err("insufficient free space to mount in R/W mode");
+ ubifs_err(c, "insufficient free space to mount in R/W mode");
ubifs_dump_budg(c, &c->bi);
ubifs_dump_lprops(c);
return -ENOSPC;
@@ -1166,14 +1168,14 @@ static int mount_ubifs(struct ubifs_info *c)
* This UBI volume is empty, and read-only, or the file system
* is mounted read-only - we cannot format it.
*/
- ubifs_err("can't format empty UBI volume: read-only %s",
+ ubifs_err(c, "can't format empty UBI volume: read-only %s",
c->ro_media ? "UBI volume" : "mount");
err = -EROFS;
goto out_free;
}
if (c->ro_media && !c->ro_mount) {
- ubifs_err("cannot mount read-write - read-only media");
+ ubifs_err(c, "cannot mount read-write - read-only media");
err = -EROFS;
goto out_free;
}
@@ -1221,7 +1223,7 @@ static int mount_ubifs(struct ubifs_info *c)
* or overridden by mount options is actually compiled in.
*/
if (!ubifs_compr_present(c->default_compr)) {
- ubifs_err("'compressor \"%s\" is not compiled in",
+ ubifs_err(c, "'compressor \"%s\" is not compiled in",
ubifs_compr_name(c->default_compr));
err = -ENOTSUPP;
goto out_free;
@@ -1250,7 +1252,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
- ubifs_err("cannot spawn \"%s\", error %d",
+ ubifs_err(c, "cannot spawn \"%s\", error %d",
c->bgt_name, err);
goto out_wbufs;
}
@@ -1264,7 +1266,7 @@ static int mount_ubifs(struct ubifs_info *c)
init_constants_master(c);
if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
- ubifs_msg("recovery needed");
+ ubifs_msg(c, "recovery needed");
c->need_recovery = 1;
}
@@ -1284,7 +1286,7 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_lpt;
}
- if (!c->ro_mount) {
+ if (!c->ro_mount && !c->need_recovery) {
/*
* Set the "dirty" flag so that if we reboot uncleanly we
* will notice this immediately on the next mount.
@@ -1373,10 +1375,10 @@ static int mount_ubifs(struct ubifs_info *c)
if (c->need_recovery) {
if (c->ro_mount)
- ubifs_msg("recovery deferred");
+ ubifs_msg(c, "recovery deferred");
else {
c->need_recovery = 0;
- ubifs_msg("recovery completed");
+ ubifs_msg(c, "recovery completed");
/*
* GC LEB has to be empty and taken at this point. But
* the journal head LEBs may also be accounted as
@@ -1397,20 +1399,20 @@ static int mount_ubifs(struct ubifs_info *c)
c->mounting = 0;
- ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
+ ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s",
c->vi.ubi_num, c->vi.vol_id, c->vi.name,
c->ro_mount ? ", R/O mode" : "");
x = (long long)c->main_lebs * c->leb_size;
y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
- ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
+ ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
c->leb_size, c->leb_size >> 10, c->min_io_size,
c->max_write_size);
- ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
+ ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
x, x >> 20, c->main_lebs,
y, y >> 20, c->log_lebs + c->max_bud_cnt);
- ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
+ ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)",
c->report_rp_size, c->report_rp_size >> 10);
- ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
+ ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
c->fmt_version, c->ro_compat_version,
UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid,
c->big_lpt ? ", big LPT model" : ", small LPT model");
@@ -1543,8 +1545,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
int err, lnum;
if (c->rw_incompat) {
- ubifs_err("the file-system is not R/W-compatible");
- ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+ ubifs_err(c, "the file-system is not R/W-compatible");
+ ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
c->fmt_version, c->ro_compat_version,
UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
return -EROFS;
@@ -1581,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
}
if (c->need_recovery) {
- ubifs_msg("completing deferred recovery");
+ ubifs_msg(c, "completing deferred recovery");
err = ubifs_write_rcvrd_mst_node(c);
if (err)
goto out;
@@ -1630,7 +1632,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
- ubifs_err("cannot spawn \"%s\", error %d",
+ ubifs_err(c, "cannot spawn \"%s\", error %d",
c->bgt_name, err);
goto out;
}
@@ -1664,7 +1666,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
if (c->need_recovery) {
c->need_recovery = 0;
- ubifs_msg("deferred recovery completed");
+ ubifs_msg(c, "deferred recovery completed");
} else {
/*
* Do not run the debugging space check if the were doing
@@ -1752,8 +1754,7 @@ static void ubifs_put_super(struct super_block *sb)
int i;
struct ubifs_info *c = sb->s_fs_info;
- ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
- c->vi.vol_id);
+ ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num);
/*
* The following asserts are only valid if there has not been a failure
@@ -1809,7 +1810,7 @@ static void ubifs_put_super(struct super_block *sb)
* next mount, so we just print a message and
* continue to unmount normally.
*/
- ubifs_err("failed to write master node, error %d",
+ ubifs_err(c, "failed to write master node, error %d",
err);
} else {
for (i = 0; i < c->jhead_cnt; i++)
@@ -1834,17 +1835,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
err = ubifs_parse_options(c, data, 1);
if (err) {
- ubifs_err("invalid or unknown remount parameter");
+ ubifs_err(c, "invalid or unknown remount parameter");
return err;
}
if (c->ro_mount && !(*flags & MS_RDONLY)) {
if (c->ro_error) {
- ubifs_msg("cannot re-mount R/W due to prior errors");
+ ubifs_msg(c, "cannot re-mount R/W due to prior errors");
return -EROFS;
}
if (c->ro_media) {
- ubifs_msg("cannot re-mount R/W - UBI volume is R/O");
+ ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O");
return -EROFS;
}
err = ubifs_remount_rw(c);
@@ -1852,7 +1853,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
return err;
} else if (!c->ro_mount && (*flags & MS_RDONLY)) {
if (c->ro_error) {
- ubifs_msg("cannot re-mount R/O due to prior errors");
+ ubifs_msg(c, "cannot re-mount R/O due to prior errors");
return -EROFS;
}
ubifs_remount_ro(c);
@@ -2104,8 +2105,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- ubifs_err("cannot open \"%s\", error %d",
- name, (int)PTR_ERR(ubi));
+ pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
+ current->pid, name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
}
@@ -2233,8 +2234,8 @@ static int __init ubifs_init(void)
* UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
*/
if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
- ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
- (unsigned int)PAGE_CACHE_SIZE);
+ pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
+ current->pid, (unsigned int)PAGE_CACHE_SIZE);
return -EINVAL;
}
@@ -2245,7 +2246,9 @@ static int __init ubifs_init(void)
if (!ubifs_inode_slab)
return -ENOMEM;
- register_shrinker(&ubifs_shrinker_info);
+ err = register_shrinker(&ubifs_shrinker_info);
+ if (err)
+ goto out_slab;
err = ubifs_compressors_init();
if (err)
@@ -2257,7 +2260,8 @@ static int __init ubifs_init(void)
err = register_filesystem(&ubifs_fs_type);
if (err) {
- ubifs_err("cannot register file system, error %d", err);
+ pr_err("UBIFS error (pid %d): cannot register file system, error %d",
+ current->pid, err);
goto out_dbg;
}
return 0;
@@ -2268,6 +2272,7 @@ out_compr:
ubifs_compressors_exit();
out_shrinker:
unregister_shrinker(&ubifs_shrinker_info);
+out_slab:
kmem_cache_destroy(ubifs_inode_slab);
return err;
}
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6793db0754f6..957f5757f374 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -98,7 +98,7 @@ static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
else if (offs > o->offs)
p = &(*p)->rb_right;
else {
- ubifs_err("old idx added twice!");
+ ubifs_err(c, "old idx added twice!");
kfree(old_idx);
return 0;
}
@@ -447,7 +447,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
err = ubifs_leb_read(c, lnum, buf, offs, len, 1);
if (err) {
- ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
+ ubifs_err(c, "cannot read node type %d from LEB %d:%d, error %d",
type, lnum, offs, err);
return err;
}
@@ -1684,27 +1684,27 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
int err, len;
if (ch->node_type != UBIFS_DATA_NODE) {
- ubifs_err("bad node type (%d but expected %d)",
+ ubifs_err(c, "bad node type (%d but expected %d)",
ch->node_type, UBIFS_DATA_NODE);
goto out_err;
}
err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0);
if (err) {
- ubifs_err("expected node type %d", UBIFS_DATA_NODE);
+ ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE);
goto out;
}
len = le32_to_cpu(ch->len);
if (len != zbr->len) {
- ubifs_err("bad node length %d, expected %d", len, zbr->len);
+ ubifs_err(c, "bad node length %d, expected %d", len, zbr->len);
goto out_err;
}
/* Make sure the key of the read node is correct */
key_read(c, buf + UBIFS_KEY_OFFSET, &key1);
if (!keys_eq(c, &zbr->key, &key1)) {
- ubifs_err("bad key in node at LEB %d:%d",
+ ubifs_err(c, "bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
dbg_tnck(&zbr->key, "looked for key ");
dbg_tnck(&key1, "found node's key ");
@@ -1716,7 +1716,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
out_err:
err = -EINVAL;
out:
- ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
+ ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs);
ubifs_dump_node(c, buf);
dump_stack();
return err;
@@ -1741,7 +1741,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
len = bu->zbranch[bu->cnt - 1].offs;
len += bu->zbranch[bu->cnt - 1].len - offs;
if (len > bu->buf_len) {
- ubifs_err("buffer too small %d vs %d", bu->buf_len, len);
+ ubifs_err(c, "buffer too small %d vs %d", bu->buf_len, len);
return -EINVAL;
}
@@ -1757,7 +1757,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
return -EAGAIN;
if (err && err != -EBADMSG) {
- ubifs_err("failed to read from LEB %d:%d, error %d",
+ ubifs_err(c, "failed to read from LEB %d:%d, error %d",
lnum, offs, err);
dump_stack();
dbg_tnck(&bu->key, "key ");
@@ -3313,7 +3313,7 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
out_dump:
block = key_block(c, key);
- ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+ ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld",
(unsigned long)inode->i_ino, size,
((loff_t)block) << UBIFS_BLOCK_SHIFT);
mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 7a205e046776..b45345d701e7 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -53,7 +53,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
br->offs = cpu_to_le32(zbr->offs);
br->len = cpu_to_le32(zbr->len);
if (!zbr->lnum || !zbr->len) {
- ubifs_err("bad ref in znode");
+ ubifs_err(c, "bad ref in znode");
ubifs_dump_znode(c, znode);
if (zbr->znode)
ubifs_dump_znode(c, zbr->znode);
@@ -384,7 +384,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
* Do not print scary warnings if the debugging
* option which forces in-the-gaps is enabled.
*/
- ubifs_warn("out of space");
+ ubifs_warn(c, "out of space");
ubifs_dump_budg(c, &c->bi);
ubifs_dump_lprops(c);
}
@@ -441,7 +441,7 @@ static int layout_in_empty_space(struct ubifs_info *c)
/* Determine the index node position */
if (lnum == -1) {
if (c->ileb_nxt >= c->ileb_cnt) {
- ubifs_err("out of space");
+ ubifs_err(c, "out of space");
return -ENOSPC;
}
lnum = c->ilebs[c->ileb_nxt++];
@@ -855,7 +855,7 @@ static int write_index(struct ubifs_info *c)
br->offs = cpu_to_le32(zbr->offs);
br->len = cpu_to_le32(zbr->len);
if (!zbr->lnum || !zbr->len) {
- ubifs_err("bad ref in znode");
+ ubifs_err(c, "bad ref in znode");
ubifs_dump_znode(c, znode);
if (zbr->znode)
ubifs_dump_znode(c, zbr->znode);
@@ -875,7 +875,7 @@ static int write_index(struct ubifs_info *c)
if (lnum != znode->lnum || offs != znode->offs ||
len != znode->len) {
- ubifs_err("inconsistent znode posn");
+ ubifs_err(c, "inconsistent znode posn");
return -EINVAL;
}
@@ -973,7 +973,7 @@ static int write_index(struct ubifs_info *c)
if (lnum != c->dbg->new_ihead_lnum ||
buf_offs != c->dbg->new_ihead_offs) {
- ubifs_err("inconsistent ihead");
+ ubifs_err(c, "inconsistent ihead");
return -EINVAL;
}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index f6bf8995c7b1..93f5b7859e6f 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -293,9 +293,9 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
lnum, offs, znode->level, znode->child_cnt);
if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
- ubifs_err("current fanout %d, branch count %d",
+ ubifs_err(c, "current fanout %d, branch count %d",
c->fanout, znode->child_cnt);
- ubifs_err("max levels %d, znode level %d",
+ ubifs_err(c, "max levels %d, znode level %d",
UBIFS_MAX_LEVELS, znode->level);
err = 1;
goto out_dump;
@@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
if (zbr->lnum < c->main_first ||
zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
- ubifs_err("bad branch %d", i);
+ ubifs_err(c, "bad branch %d", i);
err = 2;
goto out_dump;
}
@@ -328,7 +328,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
case UBIFS_XENT_KEY:
break;
default:
- ubifs_err("bad key type at slot %d: %d",
+ ubifs_err(c, "bad key type at slot %d: %d",
i, key_type(c, &zbr->key));
err = 3;
goto out_dump;
@@ -340,17 +340,17 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
type = key_type(c, &zbr->key);
if (c->ranges[type].max_len == 0) {
if (zbr->len != c->ranges[type].len) {
- ubifs_err("bad target node (type %d) length (%d)",
+ ubifs_err(c, "bad target node (type %d) length (%d)",
type, zbr->len);
- ubifs_err("have to be %d", c->ranges[type].len);
+ ubifs_err(c, "have to be %d", c->ranges[type].len);
err = 4;
goto out_dump;
}
} else if (zbr->len < c->ranges[type].min_len ||
zbr->len > c->ranges[type].max_len) {
- ubifs_err("bad target node (type %d) length (%d)",
+ ubifs_err(c, "bad target node (type %d) length (%d)",
type, zbr->len);
- ubifs_err("have to be in range of %d-%d",
+ ubifs_err(c, "have to be in range of %d-%d",
c->ranges[type].min_len,
c->ranges[type].max_len);
err = 5;
@@ -370,12 +370,12 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
cmp = keys_cmp(c, key1, key2);
if (cmp > 0) {
- ubifs_err("bad key order (keys %d and %d)", i, i + 1);
+ ubifs_err(c, "bad key order (keys %d and %d)", i, i + 1);
err = 6;
goto out_dump;
} else if (cmp == 0 && !is_hash_key(c, key1)) {
/* These can only be keys with colliding hash */
- ubifs_err("keys %d and %d are not hashed but equivalent",
+ ubifs_err(c, "keys %d and %d are not hashed but equivalent",
i, i + 1);
err = 7;
goto out_dump;
@@ -386,7 +386,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
return 0;
out_dump:
- ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
+ ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
ubifs_dump_node(c, idx);
kfree(idx);
return -EINVAL;
@@ -482,7 +482,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
/* Make sure the key of the read node is correct */
key_read(c, node + UBIFS_KEY_OFFSET, &key1);
if (!keys_eq(c, key, &key1)) {
- ubifs_err("bad key in node at LEB %d:%d",
+ ubifs_err(c, "bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
dbg_tnck(key, "looked for key ");
dbg_tnck(&key1, "but found node's key ");
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bc04b9c69891..de759022f3d6 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -43,15 +43,19 @@
#define UBIFS_VERSION 1
/* Normal UBIFS messages */
-#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__)
+#define ubifs_msg(c, fmt, ...) \
+ pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \
+ (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
/* UBIFS error messages */
-#define ubifs_err(fmt, ...) \
- pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
+#define ubifs_err(c, fmt, ...) \
+ pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \
+ (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
__func__, ##__VA_ARGS__)
/* UBIFS warning messages */
-#define ubifs_warn(fmt, ...) \
- pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
+#define ubifs_warn(c, fmt, ...) \
+ pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \
+ (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
+ __func__, ##__VA_ARGS__)
/*
* A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
* object as an argument.
@@ -59,7 +63,7 @@
#define ubifs_errc(c, fmt, ...) \
do { \
if (!(c)->probing) \
- ubifs_err(fmt, ##__VA_ARGS__); \
+ ubifs_err(c, fmt, ##__VA_ARGS__); \
} while (0)
/* UBIFS file system VFS magic number */
@@ -158,7 +162,7 @@
#define WORST_COMPR_FACTOR 2
/*
- * How much memory is needed for a buffer where we comress a data node.
+ * How much memory is needed for a buffer where we compress a data node.
*/
#define COMPRESSED_DATA_NODE_BUF_SZ \
(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
@@ -664,7 +668,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
* @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
* fields
* @softlimit: soft write-buffer timeout interval
- * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit
+ * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit
* and @softlimit + @delta)
* @timer: write-buffer timer
* @no_timer: non-zero if this write-buffer does not have a timer
@@ -930,9 +934,9 @@ struct ubifs_orphan {
/**
* struct ubifs_mount_opts - UBIFS-specific mount options information.
* @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
- * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable)
+ * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
* @chk_data_crc: enable/disable CRC data checking when reading data nodes
- * (%0 default, %1 disabe, %2 enable)
+ * (%0 default, %1 disable, %2 enable)
* @override_compr: override default compressor (%0 - do not override and use
* superblock compressor, %1 - override and use compressor
* specified in @compr_type)
@@ -962,9 +966,9 @@ struct ubifs_mount_opts {
* optimization)
* @nospace_rp: the same as @nospace, but additionally means that even reserved
* pool is full
- * @page_budget: budget for a page (constant, nenver changed after mount)
- * @inode_budget: budget for an inode (constant, nenver changed after mount)
- * @dent_budget: budget for a directory entry (constant, nenver changed after
+ * @page_budget: budget for a page (constant, never changed after mount)
+ * @inode_budget: budget for an inode (constant, never changed after mount)
+ * @dent_budget: budget for a directory entry (constant, never changed after
* mount)
*/
struct ubifs_budg_info {
@@ -1787,10 +1791,10 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* compressor.c */
int __init ubifs_compressors_init(void);
void ubifs_compressors_exit(void);
-void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
- int *compr_type);
-int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
- int compr_type);
+void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len,
+ void *out_buf, int *out_len, int *compr_type);
+int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
+ void *out, int *out_len, int compr_type);
#include "debug.h"
#include "misc.h"
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index a92be244a6fb..96f3448b6eb4 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -108,7 +108,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
- ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
+ ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more",
host->i_ino, host_ui->xattr_cnt);
return -ENOSPC;
}
@@ -120,7 +120,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
*/
names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
if (names_len > XATTR_LIST_MAX) {
- ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
+ ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
host->i_ino, names_len, XATTR_LIST_MAX);
return -ENOSPC;
}
@@ -288,13 +288,13 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
inode = ubifs_iget(c->vfs_sb, inum);
if (IS_ERR(inode)) {
- ubifs_err("dead extended attribute entry, error %d",
+ ubifs_err(c, "dead extended attribute entry, error %d",
(int)PTR_ERR(inode));
return inode;
}
if (ubifs_inode(inode)->xattr)
return inode;
- ubifs_err("corrupt extended attribute entry");
+ ubifs_err(c, "corrupt extended attribute entry");
iput(inode);
return ERR_PTR(-EINVAL);
}
@@ -364,15 +364,15 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
- name, dentry->d_inode->i_ino, dentry, size);
+ name, d_inode(dentry)->i_ino, dentry, size);
- return setxattr(dentry->d_inode, name, value, size, flags);
+ return setxattr(d_inode(dentry), name, value, size, flags);
}
ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
size_t size)
{
- struct inode *inode, *host = dentry->d_inode;
+ struct inode *inode, *host = d_inode(dentry);
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_inode *ui;
@@ -412,7 +412,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
- ubifs_err("buffer size %zd, xattr len %d",
+ ubifs_err(c, "buffer size %zd, xattr len %d",
size, ui->data_len);
err = -ERANGE;
goto out_iput;
@@ -432,7 +432,7 @@ out_unlock:
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
union ubifs_key key;
- struct inode *host = dentry->d_inode;
+ struct inode *host = d_inode(dentry);
struct ubifs_info *c = host->i_sb->s_fs_info;
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_dent_node *xent, *pxent = NULL;
@@ -485,7 +485,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
kfree(pxent);
if (err != -ENOENT) {
- ubifs_err("cannot find next direntry, error %d", err);
+ ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
@@ -535,7 +535,7 @@ out_cancel:
int ubifs_removexattr(struct dentry *dentry, const char *name)
{
- struct inode *inode, *host = dentry->d_inode;
+ struct inode *inode, *host = d_inode(dentry);
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
@@ -657,8 +657,10 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
&init_xattrs, 0);
mutex_unlock(&inode->i_mutex);
- if (err)
- ubifs_err("cannot initialize security for inode %lu, error %d",
+ if (err) {
+ struct ubifs_info *c = dentry->i_sb->s_fs_info;
+ ubifs_err(c, "cannot initialize security for inode %lu, error %d",
inode->i_ino, err);
+ }
return err;
}
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1ba2baaf4367..6d6a96b4e73f 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
#include "udfdecl.h"
-#include <linux/buffer_head.h>
#include <linux/bitops.h>
#include "udf_i.h"
@@ -63,15 +62,14 @@ static int __load_block_bitmap(struct super_block *sb,
block_group, nr_groups);
}
- if (bitmap->s_block_bitmap[block_group]) {
+ if (bitmap->s_block_bitmap[block_group])
return block_group;
- } else {
- retval = read_block_bitmap(sb, bitmap, block_group,
- block_group);
- if (retval < 0)
- return retval;
- return block_group;
- }
+
+ retval = read_block_bitmap(sb, bitmap, block_group, block_group);
+ if (retval < 0)
+ return retval;
+
+ return block_group;
}
static inline int load_block_bitmap(struct super_block *sb,
@@ -358,7 +356,6 @@ static void udf_table_free_blocks(struct super_block *sb,
struct kernel_lb_addr eloc;
struct extent_position oepos, epos;
int8_t etype;
- int i;
struct udf_inode_info *iinfo;
mutex_lock(&sbi->s_alloc_mutex);
@@ -425,7 +422,6 @@ static void udf_table_free_blocks(struct super_block *sb,
}
if (epos.bh != oepos.bh) {
- i = -1;
oepos.block = epos.block;
brelse(oepos.bh);
get_bh(epos.bh);
@@ -762,7 +758,7 @@ inline int udf_prealloc_blocks(struct super_block *sb,
uint32_t block_count)
{
struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
- sector_t allocated;
+ int allocated;
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
allocated = udf_bitmap_prealloc_blocks(sb,
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 05e90edd1992..541d9c65014d 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -30,7 +30,6 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -169,7 +168,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
}
flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
- if (!flen)
+ if (flen < 0)
continue;
tloc = lelb_to_cpu(cfi.icb.extLocation);
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 3e44f575fb9c..c763fda257bf 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -16,7 +16,6 @@
#include <linux/fs.h>
#include <linux/string.h>
-#include <linux/buffer_head.h>
struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
struct udf_fileident_bh *fibh,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 08f3555fbeac..bddf3d071dae 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -33,8 +33,7 @@
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -100,8 +99,7 @@ static int udf_adinicb_write_begin(struct file *file,
return 0;
}
-static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter,
+static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
/* Fallback to buffered I/O. */
@@ -121,21 +119,21 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t retval;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- int err, pos;
- size_t count = iocb->ki_nbytes;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int err;
mutex_lock(&inode->i_mutex);
+
+ retval = generic_write_checks(iocb, from);
+ if (retval <= 0)
+ goto out;
+
down_write(&iinfo->i_data_sem);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- if (file->f_flags & O_APPEND)
- pos = inode->i_size;
- else
- pos = iocb->ki_pos;
+ loff_t end = iocb->ki_pos + iov_iter_count(from);
if (inode->i_sb->s_blocksize <
- (udf_file_entry_alloc_offset(inode) +
- pos + count)) {
+ (udf_file_entry_alloc_offset(inode) + end)) {
err = udf_expand_file_adinicb(inode);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -143,21 +141,17 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return err;
}
} else {
- if (pos + count > inode->i_size)
- iinfo->i_lenAlloc = pos + count;
- else
- iinfo->i_lenAlloc = inode->i_size;
+ iinfo->i_lenAlloc = max(end, inode->i_size);
up_write(&iinfo->i_data_sem);
}
} else
up_write(&iinfo->i_data_sem);
retval = __generic_file_write_iter(iocb, from);
+out:
mutex_unlock(&inode->i_mutex);
if (retval > 0) {
- ssize_t err;
-
mark_inode_dirty(inode);
err = generic_write_sync(file, iocb->ki_pos - retval, retval);
if (err < 0)
@@ -240,12 +234,10 @@ static int udf_release_file(struct inode *inode, struct file *filp)
}
const struct file_operations udf_file_operations = {
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
.unlocked_ioctl = udf_ioctl,
.open = generic_file_open,
.mmap = generic_file_mmap,
- .write = new_sync_write,
.write_iter = udf_file_write_iter,
.release = udf_release_file,
.fsync = generic_file_fsync,
@@ -255,7 +247,7 @@ const struct file_operations udf_file_operations = {
static int udf_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
error = inode_change_ok(inode, attr);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a445d599098d..8d0b3ade0ff0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -33,12 +33,11 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -215,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
- struct iov_iter *iter,
+static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
struct file *file = iocb->ki_filp;
@@ -225,8 +223,8 @@ static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block);
- if (unlikely(ret < 0 && (rw & WRITE)))
+ ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block);
+ if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE))
udf_write_failed(mapping, offset + count);
return ret;
}
@@ -1637,7 +1635,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0));
if (!bh) {
udf_debug("getblk failure\n");
- return -ENOMEM;
+ return -EIO;
}
lock_buffer(bh);
@@ -1654,17 +1652,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
iinfo->i_ext.i_data, inode->i_sb->s_blocksize -
sizeof(struct unallocSpaceEntry));
use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE);
- use->descTag.tagLocation =
- cpu_to_le32(iinfo->i_location.logicalBlockNum);
- crclen = sizeof(struct unallocSpaceEntry) +
- iinfo->i_lenAlloc - sizeof(struct tag);
- use->descTag.descCRCLength = cpu_to_le16(crclen);
- use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
- sizeof(struct tag),
- crclen));
- use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
+ crclen = sizeof(struct unallocSpaceEntry);
- goto out;
+ goto finish;
}
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET))
@@ -1784,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE);
crclen = sizeof(struct extendedFileEntry);
}
+
+finish:
if (iinfo->i_strat4096) {
fe->icbTag.strategyType = cpu_to_le16(4096);
fe->icbTag.strategyParameter = cpu_to_le16(1);
@@ -1793,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
fe->icbTag.numEntries = cpu_to_le16(1);
}
- if (S_ISDIR(inode->i_mode))
+ if (iinfo->i_use)
+ fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE;
+ else if (S_ISDIR(inode->i_mode))
fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY;
else if (S_ISREG(inode->i_mode))
fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR;
@@ -1830,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync)
crclen));
fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
-out:
set_buffer_uptodate(bh);
unlock_buffer(bh);
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index c175b4dabc14..71d1c25f360d 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -23,7 +23,6 @@
#include <linux/fs.h>
#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/crc-itu-t.h>
#include "udf_i.h"
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 33b246b82c98..c97b5a8d1e24 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/buffer_head.h>
#include <linux/sched.h>
#include <linux/crc-itu-t.h>
#include <linux/exportfs.h>
@@ -139,6 +138,25 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
return 0;
}
+/**
+ * udf_find_entry - find entry in given directory.
+ *
+ * @dir: directory inode to search in
+ * @child: qstr of the name
+ * @fibh: buffer head / inode with file identifier descriptor we found
+ * @cfi: found file identifier descriptor with given name
+ *
+ * This function searches in the directory @dir for a file name @child. When
+ * found, @fibh points to the buffer head(s) (bh is NULL for in ICB
+ * directories) containing the file identifier descriptor (FID). In that case
+ * the function returns pointer to the FID in the buffer or inode - but note
+ * that FID may be split among two buffers (blocks) so accessing it via that
+ * pointer isn't easily possible. This pointer can be used only as an iterator
+ * for other directory manipulation functions. For inspection of the FID @cfi
+ * can be used - the found FID is copied there.
+ *
+ * Returns pointer to FID, NULL when nothing found, or error code.
+ */
static struct fileIdentDesc *udf_find_entry(struct inode *dir,
const struct qstr *child,
struct udf_fileident_bh *fibh,
@@ -168,8 +186,11 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
- &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
+ &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
+ fi = ERR_PTR(-EIO);
goto out_err;
+ }
+
block = udf_get_lb_pblock(sb, &eloc, offset);
if ((++offset << sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -180,19 +201,25 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
offset = 0;
fibh->sbh = fibh->ebh = udf_tread(sb, block);
- if (!fibh->sbh)
+ if (!fibh->sbh) {
+ fi = ERR_PTR(-EIO);
goto out_err;
+ }
}
fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
- if (!fname)
+ if (!fname) {
+ fi = ERR_PTR(-ENOMEM);
goto out_err;
+ }
while (f_pos < size) {
fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
&elen, &offset);
- if (!fi)
+ if (!fi) {
+ fi = ERR_PTR(-EIO);
goto out_err;
+ }
liu = le16_to_cpu(cfi->lengthOfImpUse);
lfi = cfi->lengthFileIdent;
@@ -235,12 +262,17 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
continue;
flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
- if (flen && udf_match(flen, fname, child->len, child->name))
+ if (flen < 0) {
+ fi = ERR_PTR(flen);
+ goto out_err;
+ }
+
+ if (udf_match(flen, fname, child->len, child->name))
goto out_ok;
}
-out_err:
fi = NULL;
+out_err:
if (fibh->sbh != fibh->ebh)
brelse(fibh->ebh);
brelse(fibh->sbh);
@@ -257,6 +289,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct fileIdentDesc cfi;
struct udf_fileident_bh fibh;
+ struct fileIdentDesc *fi;
if (dentry->d_name.len > UDF_NAME_LEN - 2)
return ERR_PTR(-ENAMETOOLONG);
@@ -276,7 +309,11 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
} else
#endif /* UDF_RECOVERY */
- if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
+ fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
+ if (IS_ERR(fi))
+ return ERR_CAST(fi);
+
+ if (fi) {
struct kernel_lb_addr loc;
if (fibh.sbh != fibh.ebh)
@@ -552,7 +589,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
{
struct udf_inode_info *iinfo = UDF_I(inode);
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
struct udf_fileident_bh fibh;
struct fileIdentDesc cfi, *fi;
int err;
@@ -569,8 +606,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
- if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
- mark_inode_dirty(dir);
+ dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ mark_inode_dirty(dir);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
@@ -683,6 +720,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
inc_nlink(dir);
+ dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
mark_inode_dirty(dir);
unlock_new_inode(inode);
d_instantiate(dentry, inode);
@@ -767,15 +805,18 @@ static int empty_dir(struct inode *dir)
static int udf_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
- if (!fi)
+ if (IS_ERR_OR_NULL(fi)) {
+ if (fi)
+ retval = PTR_ERR(fi);
goto out;
+ }
retval = -EIO;
tloc = lelb_to_cpu(cfi.icb.extLocation);
@@ -809,7 +850,7 @@ out:
static int udf_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi;
struct fileIdentDesc cfi;
@@ -817,8 +858,12 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
- if (!fi)
+
+ if (IS_ERR_OR_NULL(fi)) {
+ if (fi)
+ retval = PTR_ERR(fi);
goto out;
+ }
retval = -EIO;
tloc = lelb_to_cpu(cfi.icb.extLocation);
@@ -999,7 +1044,7 @@ out_no_entry:
static int udf_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct udf_fileident_bh fibh;
struct fileIdentDesc cfi, *fi;
int err;
@@ -1024,6 +1069,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
inode->i_ctime = current_fs_time(inode->i_sb);
mark_inode_dirty(inode);
+ dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
+ mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -1036,8 +1083,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct udf_fileident_bh ofibh, nfibh;
struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL;
struct fileIdentDesc ocfi, ncfi;
@@ -1047,24 +1094,30 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
- if (ofi) {
- if (ofibh.sbh != ofibh.ebh)
- brelse(ofibh.ebh);
- brelse(ofibh.sbh);
+ if (IS_ERR(ofi)) {
+ retval = PTR_ERR(ofi);
+ goto end_rename;
}
+
+ if (ofibh.sbh != ofibh.ebh)
+ brelse(ofibh.ebh);
+
+ brelse(ofibh.sbh);
tloc = lelb_to_cpu(ocfi.icb.extLocation);
if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
!= old_inode->i_ino)
goto end_rename;
nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
- if (nfi) {
- if (!new_inode) {
- if (nfibh.sbh != nfibh.ebh)
- brelse(nfibh.ebh);
- brelse(nfibh.sbh);
- nfi = NULL;
- }
+ if (IS_ERR(nfi)) {
+ retval = PTR_ERR(nfi);
+ goto end_rename;
+ }
+ if (nfi && !new_inode) {
+ if (nfibh.sbh != nfibh.ebh)
+ brelse(nfibh.ebh);
+ brelse(nfibh.sbh);
+ nfi = NULL;
}
if (S_ISDIR(old_inode->i_mode)) {
int offset = udf_ext0_offset(old_inode);
@@ -1127,7 +1180,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_dec_link_count(new_inode);
}
old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb);
+ new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb);
mark_inode_dirty(old_dir);
+ mark_inode_dirty(new_dir);
if (dir_fi) {
dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location);
@@ -1175,7 +1230,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
struct fileIdentDesc cfi;
struct udf_fileident_bh fibh;
- if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
+ if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi))
return ERR_PTR(-EACCES);
if (fibh.sbh != fibh.ebh)
@@ -1183,7 +1238,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
brelse(fibh.sbh);
tloc = lelb_to_cpu(cfi.icb.extLocation);
- inode = udf_iget(child->d_inode->i_sb, &tloc);
+ inode = udf_iget(d_inode(child)->i_sb, &tloc);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -1217,7 +1272,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
static struct dentry *udf_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- if ((fh_len != 3 && fh_len != 5) ||
+ if (fh_len < 3 ||
(fh_type != FILEID_UDF_WITH_PARENT &&
fh_type != FILEID_UDF_WITHOUT_PARENT))
return NULL;
@@ -1229,7 +1284,7 @@ static struct dentry *udf_fh_to_dentry(struct super_block *sb,
static struct dentry *udf_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT)
+ if (fh_len < 5 || fh_type != FILEID_UDF_WITH_PARENT)
return NULL;
return udf_nfs_get_inode(sb, fid->udf.parent_block,
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index d6caf01a2097..5f861ed287c3 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
#include <linux/fs.h>
#include <linux/string.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f169411c4ea0..b96f190bc567 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -48,7 +48,6 @@
#include <linux/stat.h>
#include <linux/cdrom.h>
#include <linux/nls.h>
-#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
@@ -928,17 +927,23 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- if (!udf_build_ustr(instr, pvoldesc->volIdent, 32))
- if (udf_CS0toUTF8(outstr, instr)) {
- strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
- outstr->u_len > 31 ? 31 : outstr->u_len);
- udf_debug("volIdent[] = '%s'\n",
- UDF_SB(sb)->s_volume_ident);
- }
+ if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) {
+ ret = udf_CS0toUTF8(outstr, instr);
+ if (ret < 0)
+ goto out_bh;
+
+ strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
+ outstr->u_len > 31 ? 31 : outstr->u_len);
+ udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
+ }
- if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128))
- if (udf_CS0toUTF8(outstr, instr))
- udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
+ if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) {
+ ret = udf_CS0toUTF8(outstr, instr);
+ if (ret < 0)
+ goto out_bh;
+
+ udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
+ }
ret = 0;
out_bh:
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index ac10ca939f26..862535b3ba58 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -27,7 +27,6 @@
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/pagemap.h>
-#include <linux/buffer_head.h>
#include "udf_i.h"
static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
@@ -83,6 +82,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
comp_len = udf_get_filename(sb, pc->componentIdent,
pc->lengthComponentIdent,
p, tolen);
+ if (comp_len < 0)
+ return comp_len;
+
p += comp_len;
tolen -= comp_len;
if (tolen == 0)
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 8a9657d7f7c6..42b8c57795cb 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -22,7 +22,6 @@
#include "udfdecl.h"
#include <linux/fs.h>
#include <linux/mm.h>
-#include <linux/buffer_head.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index b5cd8ed2aa12..b1b9a63d8cf3 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -56,7 +56,7 @@ struct udf_inode_info {
static inline struct udf_inode_info *UDF_I(struct inode *inode)
{
- return list_entry(inode, struct udf_inode_info, vfs_inode);
+ return container_of(inode, struct udf_inode_info, vfs_inode);
}
#endif /* _UDF_I_H) */
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index b84fee372734..ab478e62baae 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -68,21 +68,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
/*
* udf_build_ustr_exact
*/
-static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
+static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
{
- if ((!dest) || (!ptr) || (!exactsize))
- return -1;
-
memset(dest, 0, sizeof(struct ustr));
dest->u_cmpID = ptr[0];
dest->u_len = exactsize - 1;
memcpy(dest->u_name, ptr + 1, exactsize - 1);
-
- return 0;
}
/*
- * udf_ocu_to_utf8
+ * udf_CS0toUTF8
*
* PURPOSE
* Convert OSTA Compressed Unicode to the UTF-8 equivalent.
@@ -94,7 +89,7 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
* both of type "struct ustr *"
*
* POST-CONDITIONS
- * <return> Zero on success.
+ * <return> >= 0 on success.
*
* HISTORY
* November 12, 1997 - Andrew E. Mileski
@@ -117,7 +112,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
memset(utf_o, 0, sizeof(struct ustr));
pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
- return 0;
+ return -EINVAL;
}
ocu = ocu_i->u_name;
@@ -154,7 +149,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
/*
*
- * udf_utf8_to_ocu
+ * udf_UTF8toCS0
*
* PURPOSE
* Convert UTF-8 to the OSTA Compressed Unicode equivalent.
@@ -270,7 +265,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
memset(utf_o, 0, sizeof(struct ustr));
pr_err("unknown compression code (%d) stri=%s\n",
cmp_id, ocu_i->u_name);
- return 0;
+ return -EINVAL;
}
ocu = ocu_i->u_name;
@@ -338,43 +333,51 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
struct ustr *filename, *unifilename;
- int len = 0;
+ int ret;
+
+ if (!slen)
+ return -EIO;
filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
if (!filename)
- return 0;
+ return -ENOMEM;
unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!unifilename)
+ if (!unifilename) {
+ ret = -ENOMEM;
goto out1;
+ }
- if (udf_build_ustr_exact(unifilename, sname, slen))
- goto out2;
-
+ udf_build_ustr_exact(unifilename, sname, slen);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- if (!udf_CS0toUTF8(filename, unifilename)) {
+ ret = udf_CS0toUTF8(filename, unifilename);
+ if (ret < 0) {
udf_debug("Failed in udf_get_filename: sname = %s\n",
sname);
goto out2;
}
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
- unifilename)) {
+ ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
+ unifilename);
+ if (ret < 0) {
udf_debug("Failed in udf_get_filename: sname = %s\n",
sname);
goto out2;
}
} else
- goto out2;
+ BUG();
- len = udf_translate_to_linux(dname, dlen,
+ ret = udf_translate_to_linux(dname, dlen,
filename->u_name, filename->u_len,
unifilename->u_name, unifilename->u_len);
+ /* Zero length filename isn't valid... */
+ if (ret == 0)
+ ret = -EINVAL;
out2:
kfree(unifilename);
out1:
kfree(filename);
- return len;
+ return ret;
}
int udf_put_filename(struct super_block *sb, const uint8_t *sname,
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 2c1036080d52..a7106eda5024 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (ufs_fragnum(fragment) + count > uspi->s_fpg)
ufs_error (sb, "ufs_free_fragments", "internal error");
-
- lock_ufs(sb);
+
+ mutex_lock(&UFS_SB(sb)->s_lock);
cgno = ufs_dtog(uspi, fragment);
bit = ufs_dtogd(uspi, fragment);
@@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (sb->s_flags & MS_SYNCHRONOUS)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
-
- unlock_ufs(sb);
+
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
return;
failed:
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return;
}
@@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
goto failed;
}
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
do_more:
overflow = 0;
@@ -211,12 +211,12 @@ do_more:
}
ufs_mark_sb_dirty(sb);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
return;
failed_unlock:
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
failed:
UFSD("EXIT (FAILED)\n");
return;
@@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
usb1 = ubh_get_usb_first(uspi);
*err = -ENOSPC;
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
"fragment %llu, tmp %llu\n",
(unsigned long long)fragment,
(unsigned long long)tmp);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return INVBLOCK;
}
if (fragment < UFS_I(inode)->i_lastfrag) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
}
else {
if (tmp) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
}
@@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
* There is not enough space for user on the device
*/
if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return 0;
}
@@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
}
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
@@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
return result;
}
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT (FAILED)\n");
return 0;
}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 0ecc2cebed8f..74f2e80288bf 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page)
page_cache_release(page);
}
-static inline unsigned long ufs_dir_pages(struct inode *inode)
-{
- return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
{
ino_t res = 0;
@@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
/* Releases the page */
void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct page *page, struct inode *inode)
+ struct page *page, struct inode *inode,
+ bool update_times)
{
loff_t pos = page_offset(page) +
(char *) de - (char *) page_address(page);
@@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
err = ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ if (update_times)
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(dir);
}
@@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
int namelen = qstr->len;
unsigned reclen = UFS_DIR_REC_LEN(namelen);
unsigned long start, n;
- unsigned long npages = ufs_dir_pages(dir);
+ unsigned long npages = dir_pages(dir);
struct page *page = NULL;
struct ufs_inode_info *ui = UFS_I(dir);
struct ufs_dir_entry *de;
@@ -311,7 +308,7 @@ found:
*/
int ufs_add_link(struct dentry *dentry, struct inode *inode)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir = d_inode(dentry->d_parent);
const unsigned char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
struct super_block *sb = dir->i_sb;
@@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
unsigned short rec_len, name_len;
struct page *page = NULL;
struct ufs_dir_entry *de;
- unsigned long npages = ufs_dir_pages(dir);
+ unsigned long npages = dir_pages(dir);
unsigned long n;
char *kaddr;
loff_t pos;
@@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
struct super_block *sb = inode->i_sb;
unsigned int offset = pos & ~PAGE_CACHE_MASK;
unsigned long n = pos >> PAGE_CACHE_SHIFT;
- unsigned long npages = ufs_dir_pages(inode);
+ unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
int need_revalidate = file->f_version != inode->i_version;
unsigned flags = UFS_SB(sb)->s_flags;
@@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode)
{
struct super_block *sb = inode->i_sb;
struct page *page = NULL;
- unsigned long i, npages = ufs_dir_pages(inode);
+ unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
char *kaddr;
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index c84ec010a676..042ddbf110cc 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -35,9 +35,7 @@
const struct file_operations ufs_file_operations = {
.llseek = generic_file_llseek,
- .read = new_sync_read,
.read_iter = generic_file_read_iter,
- .write = new_sync_write,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 7caa01652888..fd0203ce1f7f 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode)
ino = inode->i_ino;
- lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return;
}
@@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode)
bit = ufs_inotocgoff (ino);
ucpi = ufs_load_cylinder (sb, cg);
if (!ucpi) {
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return;
}
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT\n");
}
@@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
sbi = UFS_SB(sb);
uspi = sbi->s_uspi;
- lock_ufs(sb);
+ mutex_lock(&sbi->s_lock);
/*
* Try to place the inode in its parent directory
@@ -331,21 +331,21 @@ cg_found:
sync_dirty_buffer(bh);
brelse(bh);
}
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
UFSD("allocating inode %lu\n", inode->i_ino);
UFSD("EXIT\n");
return inode;
fail_remove_inode:
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
failed:
- unlock_ufs(sb);
+ mutex_unlock(&sbi->s_lock);
make_bad_inode(inode);
iput (inode);
UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index be7d42c7d938..f913a6924b23 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -572,9 +572,10 @@ static void ufs_set_inode_ops(struct inode *inode)
inode->i_fop = &ufs_dir_operations;
inode->i_mapping->a_ops = &ufs_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (!inode->i_blocks)
+ if (!inode->i_blocks) {
inode->i_op = &ufs_fast_symlink_inode_operations;
- else {
+ inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ } else {
inode->i_op = &ufs_symlink_inode_operations;
inode->i_mapping->a_ops = &ufs_aops;
}
@@ -902,6 +903,9 @@ void ufs_evict_inode(struct inode * inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete)
+ if (want_delete) {
+ lock_ufs(inode->i_sb);
ufs_free_inode(inode);
+ unlock_ufs(inode->i_sb);
+ }
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index fd65deb4b5f0..47966554317c 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi
if (dentry->d_name.len > UFS_MAXNAMLEN)
return ERR_PTR(-ENAMETOOLONG);
- lock_ufs(dir->i_sb);
ino = ufs_inode_by_name(dir, &dentry->d_name);
if (ino)
inode = ufs_iget(dir->i_sb, ino);
- unlock_ufs(dir->i_sb);
return d_splice_alias(inode, dentry);
}
@@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode,
bool excl)
{
struct inode *inode;
- int err;
-
- UFSD("BEGIN\n");
inode = ufs_new_inode(dir, mode);
- err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- if (!IS_ERR(inode)) {
- inode->i_op = &ufs_file_inode_operations;
- inode->i_fop = &ufs_file_operations;
- inode->i_mapping->a_ops = &ufs_aops;
- mark_inode_dirty(inode);
- lock_ufs(dir->i_sb);
- err = ufs_add_nondir(dentry, inode);
- unlock_ufs(dir->i_sb);
- }
- UFSD("END: err=%d\n", err);
- return err;
+ inode->i_op = &ufs_file_inode_operations;
+ inode->i_fop = &ufs_file_operations;
+ inode->i_mapping->a_ops = &ufs_aops;
+ mark_inode_dirty(inode);
+ return ufs_add_nondir(dentry, inode);
}
static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
@@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev
init_special_inode(inode, mode, rdev);
ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
mark_inode_dirty(inode);
- lock_ufs(dir->i_sb);
err = ufs_add_nondir(dentry, inode);
- unlock_ufs(dir->i_sb);
}
return err;
}
@@ -121,19 +109,18 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
const char * symname)
{
struct super_block * sb = dir->i_sb;
- int err = -ENAMETOOLONG;
+ int err;
unsigned l = strlen(symname)+1;
struct inode * inode;
if (l > sb->s_blocksize)
- goto out_notlocked;
+ return -ENAMETOOLONG;
inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
- goto out_notlocked;
+ return err;
- lock_ufs(dir->i_sb);
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
inode->i_op = &ufs_symlink_inode_operations;
@@ -144,38 +131,37 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
} else {
/* fast symlink */
inode->i_op = &ufs_fast_symlink_inode_operations;
- memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
+ inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
}
mark_inode_dirty(inode);
- err = ufs_add_nondir(dentry, inode);
-out:
- unlock_ufs(dir->i_sb);
-out_notlocked:
- return err;
+ return ufs_add_nondir(dentry, inode);
out_fail:
inode_dec_link_count(inode);
unlock_new_inode(inode);
iput(inode);
- goto out;
+ return err;
}
static int ufs_link (struct dentry * old_dentry, struct inode * dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int error;
- lock_ufs(dir->i_sb);
-
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
ihold(inode);
- error = ufs_add_nondir(dentry, inode);
- unlock_ufs(dir->i_sb);
+ error = ufs_add_link(dentry, inode);
+ if (error) {
+ inode_dec_link_count(inode);
+ iput(inode);
+ } else
+ d_instantiate(dentry, inode);
return error;
}
@@ -184,9 +170,12 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
struct inode * inode;
int err;
+ inode_inc_link_count(dir);
+
inode = ufs_new_inode(dir, S_IFDIR|mode);
+ err = PTR_ERR(inode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ goto out_dir;
inode->i_op = &ufs_dir_inode_operations;
inode->i_fop = &ufs_dir_operations;
@@ -194,9 +183,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
inode_inc_link_count(inode);
- lock_ufs(dir->i_sb);
- inode_inc_link_count(dir);
-
err = ufs_make_empty(inode, dir);
if (err)
goto out_fail;
@@ -204,25 +190,24 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
err = ufs_add_link(dentry, inode);
if (err)
goto out_fail;
- unlock_ufs(dir->i_sb);
+ unlock_new_inode(inode);
d_instantiate(dentry, inode);
-out:
- return err;
+ return 0;
out_fail:
inode_dec_link_count(inode);
inode_dec_link_count(inode);
unlock_new_inode(inode);
iput (inode);
+out_dir:
inode_dec_link_count(dir);
- unlock_ufs(dir->i_sb);
- goto out;
+ return err;
}
static int ufs_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
struct ufs_dir_entry *de;
struct page *page;
int err = -ENOENT;
@@ -244,10 +229,9 @@ out:
static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
{
- struct inode * inode = dentry->d_inode;
+ struct inode * inode = d_inode(dentry);
int err= -ENOTEMPTY;
- lock_ufs(dir->i_sb);
if (ufs_empty_dir (inode)) {
err = ufs_unlink(dir, dentry);
if (!err) {
@@ -256,15 +240,14 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
inode_dec_link_count(dir);
}
}
- unlock_ufs(dir->i_sb);
return err;
}
static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *old_inode = old_dentry->d_inode;
- struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = d_inode(old_dentry);
+ struct inode *new_inode = d_inode(new_dentry);
struct page *dir_page = NULL;
struct ufs_dir_entry * dir_de = NULL;
struct page *old_page;
@@ -294,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
if (!new_de)
goto out_dir;
- ufs_set_link(new_dir, new_de, new_page, old_inode);
+ ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
new_inode->i_ctime = CURRENT_TIME_SEC;
if (dir_de)
drop_nlink(new_inode);
@@ -317,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
mark_inode_dirty(old_inode);
if (dir_de) {
- ufs_set_link(old_inode, dir_de, dir_page, new_dir);
+ if (old_dir != new_dir)
+ ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
+ else {
+ kunmap(dir_page);
+ page_cache_release(dir_page);
+ }
inode_dec_link_count(old_dir);
}
return 0;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 8092d3759a5e..250579a80d90 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -80,6 +80,7 @@
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/parser.h>
#include <linux/buffer_head.h>
@@ -144,10 +145,10 @@ static struct dentry *ufs_get_parent(struct dentry *child)
struct qstr dot_dot = QSTR_INIT("..", 2);
ino_t ino;
- ino = ufs_inode_by_name(child->d_inode, &dot_dot);
+ ino = ufs_inode_by_name(d_inode(child), &dot_dot);
if (!ino)
return ERR_PTR(-ENOENT);
- return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino));
+ return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino));
}
static const struct export_operations ufs_export_ops = {
@@ -694,6 +695,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
unsigned flags;
lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -711,6 +713,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
ufs_put_cstotal(sb);
UFSD("EXIT\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
@@ -799,6 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
mutex_init(&sbi->mutex);
+ mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
/*
@@ -1277,6 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
sync_filesystem(sb);
lock_ufs(sb);
+ mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
usb1 = ubh_get_usb_first(uspi);
@@ -1290,6 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt = 0;
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
@@ -1297,12 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
pr_err("ufstype can't be changed during remount\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
@@ -1326,6 +1334,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
*/
#ifndef CONFIG_UFS_FS_WRITE
pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
#else
@@ -1335,11 +1344,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
pr_err("this ufstype is read-only supported\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
pr_err("failed during remounting\n");
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EPERM;
}
@@ -1347,6 +1358,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
index d283628b4778..874480bb43e9 100644
--- a/fs/ufs/symlink.c
+++ b/fs/ufs/symlink.c
@@ -25,23 +25,12 @@
* ext2 symlink handling code
*/
-#include <linux/fs.h>
-#include <linux/namei.h>
-
#include "ufs_fs.h"
#include "ufs.h"
-
-static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct ufs_inode_info *p = UFS_I(dentry->d_inode);
- nd_set_link(nd, (char*)p->i_u1.i_symlink);
- return NULL;
-}
-
const struct inode_operations ufs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ufs_follow_link,
+ .follow_link = simple_follow_link,
.setattr = ufs_setattr,
};
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f04f89fbd4d9..21154704c168 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -492,7 +492,7 @@ out:
int ufs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
unsigned int ia_valid = attr->ia_valid;
int error;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 2a07396d5f9e..2e31ea2e35a3 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -30,6 +30,7 @@ struct ufs_sb_info {
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
+ struct mutex s_lock;
};
struct ufs_inode_info {
@@ -105,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page
extern int ufs_empty_dir (struct inode *);
extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct page *page, struct inode *inode);
+ struct page *page, struct inode *inode, bool update_times);
/* file.c */
extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4ef698549e31..072fee1258dd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -298,18 +298,18 @@ vfs_removexattr(struct dentry *dentry, const char *name)
mutex_lock(&inode->i_mutex);
error = security_inode_removexattr(dentry, name);
- if (error) {
- mutex_unlock(&inode->i_mutex);
- return error;
- }
+ if (error)
+ goto out;
error = inode->i_op->removexattr(dentry, name);
- mutex_unlock(&inode->i_mutex);
if (!error) {
fsnotify_xattr(dentry);
evm_inode_post_removexattr(dentry, name);
}
+
+out:
+ mutex_unlock(&inode->i_mutex);
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf4472017..f9e9ffe6fb46 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -149,13 +149,27 @@ xfs_alloc_compute_aligned(
{
xfs_agblock_t bno;
xfs_extlen_t len;
+ xfs_extlen_t diff;
/* Trim busy sections out of found extent */
xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+ /*
+ * If we have a largish extent that happens to start before min_agbno,
+ * see if we can shift it into range...
+ */
+ if (bno < args->min_agbno && bno + len > args->min_agbno) {
+ diff = args->min_agbno - bno;
+ if (len > diff) {
+ bno += diff;
+ len -= diff;
+ }
+ }
+
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
- xfs_extlen_t diff = aligned_bno - bno;
+
+ diff = aligned_bno - bno;
*resbno = aligned_bno;
*reslen = diff >= len ? 0 : len - diff;
@@ -260,6 +274,7 @@ xfs_alloc_fix_len(
rlen = rlen - (k - args->mod);
else
rlen = rlen - args->prod + (args->mod - k);
+ /* casts to (int) catch length underflows */
if ((int)rlen < (int)args->minlen)
return;
ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
@@ -286,7 +301,8 @@ xfs_alloc_fix_minleft(
if (diff >= 0)
return 1;
args->len += diff; /* shrink the allocated space */
- if (args->len >= args->minlen)
+ /* casts to (int) catch length underflows */
+ if ((int)args->len >= (int)args->minlen)
return 1;
args->agbno = NULLAGBLOCK;
return 0;
@@ -315,6 +331,9 @@ xfs_alloc_fixup_trees(
xfs_agblock_t nfbno2; /* second new free startblock */
xfs_extlen_t nflen1=0; /* first new free length */
xfs_extlen_t nflen2=0; /* second new free length */
+ struct xfs_mount *mp;
+
+ mp = cnt_cur->bc_mp;
/*
* Look up the record in the by-size tree if necessary.
@@ -323,13 +342,13 @@ xfs_alloc_fixup_trees(
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
i == 1 && nfbno1 == fbno && nflen1 == flen);
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
}
/*
* Look up the record in the by-block tree if necessary.
@@ -338,13 +357,13 @@ xfs_alloc_fixup_trees(
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
i == 1 && nfbno1 == fbno && nflen1 == flen);
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
}
#ifdef DEBUG
@@ -355,7 +374,7 @@ xfs_alloc_fixup_trees(
bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
bnoblock->bb_numrecs == cntblock->bb_numrecs);
}
#endif
@@ -386,25 +405,25 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 0);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 0);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
}
/*
* Fix up the by-block btree entry(s).
@@ -415,7 +434,7 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -429,10 +448,10 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 0);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
}
return 0;
}
@@ -682,7 +701,7 @@ xfs_alloc_ag_vextent_exact(
error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
ASSERT(fbno <= args->agbno);
/*
@@ -783,16 +802,20 @@ xfs_alloc_find_best_extent(
error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
/*
* The good extent is closer than this one.
*/
if (!dir) {
+ if (*sbnoa > args->max_agbno)
+ goto out_use_good;
if (*sbnoa >= args->agbno + gdiff)
goto out_use_good;
} else {
+ if (*sbnoa < args->min_agbno)
+ goto out_use_good;
if (*sbnoa <= args->agbno - gdiff)
goto out_use_good;
}
@@ -879,6 +902,17 @@ xfs_alloc_ag_vextent_near(
dofirst = prandom_u32() & 1;
#endif
+ /* handle unitialized agbno range so caller doesn't have to */
+ if (!args->min_agbno && !args->max_agbno)
+ args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
+ ASSERT(args->min_agbno <= args->max_agbno);
+
+ /* clamp agbno to the range if it's outside */
+ if (args->agbno < args->min_agbno)
+ args->agbno = args->min_agbno;
+ if (args->agbno > args->max_agbno)
+ args->agbno = args->max_agbno;
+
restart:
bno_cur_lt = NULL;
bno_cur_gt = NULL;
@@ -946,7 +980,7 @@ restart:
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
&ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
if (ltlen >= args->minlen)
break;
if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
@@ -966,11 +1000,13 @@ restart:
*/
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, ltbno, ltlen,
&ltbnoa, &ltlena);
if (ltlena < args->minlen)
continue;
+ if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
+ continue;
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ASSERT(args->len >= args->minlen);
@@ -999,7 +1035,7 @@ restart:
cnt_cur->bc_ptrs[0] = besti;
if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->len = blen;
if (!xfs_alloc_fix_minleft(args)) {
@@ -1088,14 +1124,14 @@ restart:
if (bno_cur_lt) {
if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, ltbno, ltlen,
&ltbnoa, &ltlena);
- if (ltlena >= args->minlen)
+ if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
break;
if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || ltbnoa < args->min_agbno) {
xfs_btree_del_cursor(bno_cur_lt,
XFS_BTREE_NOERROR);
bno_cur_lt = NULL;
@@ -1104,14 +1140,14 @@ restart:
if (bno_cur_gt) {
if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, gtbno, gtlen,
&gtbnoa, &gtlena);
- if (gtlena >= args->minlen)
+ if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
break;
if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
- if (!i) {
+ if (!i || gtbnoa > args->max_agbno) {
xfs_btree_del_cursor(bno_cur_gt,
XFS_BTREE_NOERROR);
bno_cur_gt = NULL;
@@ -1211,6 +1247,7 @@ restart:
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+ ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
args->agbno = ltnew;
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
@@ -1303,7 +1340,7 @@ restart:
error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen);
@@ -1342,7 +1379,7 @@ restart:
* This can't happen in the second case above.
*/
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+ XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
(rlen <= flen && rbno + rlen <= fbno + flen), error0);
if (rlen < args->maxlen) {
xfs_agblock_t bestfbno;
@@ -1362,13 +1399,13 @@ restart:
if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
if (flen < bestrlen)
break;
xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+ XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
(rlen <= flen && rbno + rlen <= fbno + flen),
error0);
if (rlen > bestrlen) {
@@ -1383,7 +1420,7 @@ restart:
if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
rlen = bestrlen;
rbno = bestrbno;
flen = bestflen;
@@ -1408,7 +1445,7 @@ restart:
if (!xfs_alloc_fix_minleft(args))
goto out_nominleft;
rlen = args->len;
- XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
/*
* Allocate and initialize a cursor for the by-block tree.
*/
@@ -1422,7 +1459,7 @@ restart:
cnt_cur = bno_cur = NULL;
args->len = rlen;
args->agbno = rbno;
- XFS_WANT_CORRUPTED_GOTO(
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error0);
@@ -1467,7 +1504,7 @@ xfs_alloc_ag_vextent_small(
if (i) {
if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
}
/*
* Nothing in the btree, try the freelist. Make sure
@@ -1493,7 +1530,7 @@ xfs_alloc_ag_vextent_small(
}
args->len = 1;
args->agbno = fbno;
- XFS_WANT_CORRUPTED_GOTO(
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
error0);
@@ -1579,7 +1616,7 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* It's not contiguous, though.
*/
@@ -1591,7 +1628,8 @@ xfs_free_ag_extent(
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ ltbno + ltlen <= bno, error0);
}
}
/*
@@ -1606,7 +1644,7 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* It's not contiguous, though.
*/
@@ -1618,7 +1656,7 @@ xfs_free_ag_extent(
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
}
}
/*
@@ -1635,31 +1673,31 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Delete the old by-size entry on the right.
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Delete the old by-block entry for the right block.
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Move the by-block cursor back to the left neighbor.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
#ifdef DEBUG
/*
* Check that this is the right record: delete didn't
@@ -1672,7 +1710,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(
+ XFS_WANT_CORRUPTED_GOTO(mp,
i == 1 && xxbno == ltbno && xxlen == ltlen,
error0);
}
@@ -1695,17 +1733,17 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Back up the by-block cursor to the left neighbor, and
* update its length.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
nbno = ltbno;
nlen = len + ltlen;
if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1721,10 +1759,10 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Update the starting block and length of the right
* neighbor in the by-block tree.
@@ -1743,7 +1781,7 @@ xfs_free_ag_extent(
nlen = len;
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
}
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
bno_cur = NULL;
@@ -1752,10 +1790,10 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
@@ -1819,11 +1857,11 @@ xfs_alloc_compute_maxlevels(
xfs_extlen_t
xfs_alloc_longest_free_extent(
struct xfs_mount *mp,
- struct xfs_perag *pag)
+ struct xfs_perag *pag,
+ xfs_extlen_t need)
{
- xfs_extlen_t need, delta = 0;
+ xfs_extlen_t delta = 0;
- need = XFS_MIN_FREELIST_PAG(pag, mp);
if (need > pag->pagf_flcount)
delta = need - pag->pagf_flcount;
@@ -1832,131 +1870,150 @@ xfs_alloc_longest_free_extent(
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+unsigned int
+xfs_alloc_min_freelist(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag)
+{
+ unsigned int min_free;
+
+ /* space needed by-bno freespace btree */
+ min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ mp->m_ag_maxlevels);
+ /* space needed by-size freespace btree */
+ min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ mp->m_ag_maxlevels);
+
+ return min_free;
+}
+
+/*
+ * Check if the operation we are fixing up the freelist for should go ahead or
+ * not. If we are freeing blocks, we always allow it, otherwise the allocation
+ * is dependent on whether the size and shape of free space available will
+ * permit the requested allocation to take place.
+ */
+static bool
+xfs_alloc_space_available(
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t min_free,
+ int flags)
+{
+ struct xfs_perag *pag = args->pag;
+ xfs_extlen_t longest;
+ int available;
+
+ if (flags & XFS_ALLOC_FLAG_FREEING)
+ return true;
+
+ /* do we have enough contiguous free space for the allocation? */
+ longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free);
+ if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+ return false;
+
+ /* do have enough free space remaining for the allocation? */
+ available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+ min_free - args->total);
+ if (available < (int)args->minleft)
+ return false;
+
+ return true;
+}
+
/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
STATIC int /* error */
xfs_alloc_fix_freelist(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- int flags) /* XFS_ALLOC_FLAG_... */
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ int flags) /* XFS_ALLOC_FLAG_... */
{
- xfs_buf_t *agbp; /* agf buffer pointer */
- xfs_agf_t *agf; /* a.g. freespace structure pointer */
- xfs_buf_t *agflbp;/* agfl buffer pointer */
- xfs_agblock_t bno; /* freelist block */
- xfs_extlen_t delta; /* new blocks needed in freelist */
- int error; /* error result code */
- xfs_extlen_t longest;/* longest extent in allocation group */
- xfs_mount_t *mp; /* file system mount point structure */
- xfs_extlen_t need; /* total blocks needed in freelist */
- xfs_perag_t *pag; /* per-ag information structure */
- xfs_alloc_arg_t targs; /* local allocation arguments */
- xfs_trans_t *tp; /* transaction pointer */
-
- mp = args->mp;
+ struct xfs_mount *mp = args->mp;
+ struct xfs_perag *pag = args->pag;
+ struct xfs_trans *tp = args->tp;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_buf *agflbp = NULL;
+ struct xfs_alloc_arg targs; /* local allocation arguments */
+ xfs_agblock_t bno; /* freelist block */
+ xfs_extlen_t need; /* total blocks needed in freelist */
+ int error;
- pag = args->pag;
- tp = args->tp;
if (!pag->pagf_init) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
if (!pag->pagf_init) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- } else
- agbp = NULL;
+ }
/*
- * If this is a metadata preferred pag and we are user data
- * then try somewhere else if we are not being asked to
- * try harder at this point
+ * If this is a metadata preferred pag and we are user data then try
+ * somewhere else if we are not being asked to try harder at this
+ * point
*/
if (pag->pagf_metadata && args->userdata &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
+ goto out_agbp_relse;
}
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- /*
- * If it looks like there isn't a long enough extent, or enough
- * total blocks, reject it.
- */
- need = XFS_MIN_FREELIST_PAG(pag, mp);
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(pag->pagf_freeblks + pag->pagf_flcount -
- need - args->total) < (int)args->minleft)) {
- if (agbp)
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
- }
- }
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
/*
* Get the a.g. freespace buffer.
* Can fail if we're not blocking on locks, and it's held.
*/
- if (agbp == NULL) {
- if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
- &agbp)))
- return error;
- if (agbp == NULL) {
+ if (!agbp) {
+ error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
+ if (error)
+ goto out_no_agbp;
+ if (!agbp) {
ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
- args->agbp = NULL;
- return 0;
- }
- }
- /*
- * Figure out how many blocks we should have in the freelist.
- */
- agf = XFS_BUF_TO_AGF(agbp);
- need = XFS_MIN_FREELIST(agf, mp);
- /*
- * If there isn't enough total or single-extent, reject it.
- */
- if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
- delta = need > be32_to_cpu(agf->agf_flcount) ?
- (need - be32_to_cpu(agf->agf_flcount)) : 0;
- longest = be32_to_cpu(agf->agf_longest);
- longest = (longest > delta) ? (longest - delta) :
- (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
- if ((args->minlen + args->alignment + args->minalignslop - 1) >
- longest ||
- ((int)(be32_to_cpu(agf->agf_freeblks) +
- be32_to_cpu(agf->agf_flcount) - need - args->total) <
- (int)args->minleft)) {
- xfs_trans_brelse(tp, agbp);
- args->agbp = NULL;
- return 0;
+ goto out_no_agbp;
}
}
+
+ /* If there isn't enough total space or single-extent, reject it. */
+ need = xfs_alloc_min_freelist(mp, pag);
+ if (!xfs_alloc_space_available(args, need, flags))
+ goto out_agbp_relse;
+
/*
* Make the freelist shorter if it's too long.
+ *
+ * Note that from this point onwards, we will always release the agf and
+ * agfl buffers on error. This handles the case where we error out and
+ * the buffers are clean or may not have been joined to the transaction
+ * and hence need to be released manually. If they have been joined to
+ * the transaction, then xfs_trans_brelse() will handle them
+ * appropriately based on the recursion count and dirty state of the
+ * buffer.
+ *
+ * XXX (dgc): When we have lots of free space, does this buy us
+ * anything other than extra overhead when we need to put more blocks
+ * back on the free list? Maybe we should only do this when space is
+ * getting low or the AGFL is more than half full?
*/
- while (be32_to_cpu(agf->agf_flcount) > need) {
- xfs_buf_t *bp;
+ while (pag->pagf_flcount > need) {
+ struct xfs_buf *bp;
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
- return error;
- if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
- return error;
+ goto out_agbp_relse;
+ error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1);
+ if (error)
+ goto out_agbp_relse;
bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
xfs_trans_binval(tp, bp);
}
- /*
- * Initialize the args structure.
- */
+
memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
@@ -1965,21 +2022,20 @@ xfs_alloc_fix_freelist(
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
- if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
- return error;
- /*
- * Make the freelist longer if it's too short.
- */
- while (be32_to_cpu(agf->agf_flcount) < need) {
+ error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
+ if (error)
+ goto out_agbp_relse;
+
+ /* Make the freelist longer if it's too short. */
+ while (pag->pagf_flcount < need) {
targs.agbno = 0;
- targs.maxlen = need - be32_to_cpu(agf->agf_flcount);
- /*
- * Allocate as many blocks as possible at once.
- */
- if ((error = xfs_alloc_ag_vextent(&targs))) {
- xfs_trans_brelse(tp, agflbp);
- return error;
- }
+ targs.maxlen = need - pag->pagf_flcount;
+
+ /* Allocate as many blocks as possible at once. */
+ error = xfs_alloc_ag_vextent(&targs);
+ if (error)
+ goto out_agflbp_relse;
+
/*
* Stop if we run out. Won't happen if callers are obeying
* the restrictions correctly. Can happen for free calls
@@ -1988,9 +2044,7 @@ xfs_alloc_fix_freelist(
if (targs.agbno == NULLAGBLOCK) {
if (flags & XFS_ALLOC_FLAG_FREEING)
break;
- xfs_trans_brelse(tp, agflbp);
- args->agbp = NULL;
- return 0;
+ goto out_agflbp_relse;
}
/*
* Put each allocated block on the list.
@@ -1999,12 +2053,21 @@ xfs_alloc_fix_freelist(
error = xfs_alloc_put_freelist(tp, agbp,
agflbp, bno, 0);
if (error)
- return error;
+ goto out_agflbp_relse;
}
}
xfs_trans_brelse(tp, agflbp);
args->agbp = agbp;
return 0;
+
+out_agflbp_relse:
+ xfs_trans_brelse(tp, agflbp);
+out_agbp_relse:
+ if (agbp)
+ xfs_trans_brelse(tp, agbp);
+out_no_agbp:
+ args->agbp = NULL;
+ return error;
}
/*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d1b4b6a5c894..ca1c8168373a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg {
xfs_extlen_t total; /* total blocks needed in xaction */
xfs_extlen_t alignment; /* align answer to multiple of this */
xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
+ xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */
+ xfs_agblock_t max_agbno; /* ... */
xfs_extlen_t len; /* output: actual size of extent */
xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
xfs_alloctype_t otype; /* original allocation type */
@@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg {
#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
-/*
- * Find the length of the longest extent in an AG.
- */
-xfs_extlen_t
-xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
+ struct xfs_perag *pag, xfs_extlen_t need);
+unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp,
struct xfs_perag *pag);
/*
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 0a472fbe06d4..3349c9a1e845 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -266,7 +266,7 @@ xfs_attr_set(
tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -276,7 +276,7 @@ xfs_attr_set(
XFS_QMOPT_RES_REGBLKS);
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -320,8 +320,7 @@ xfs_attr_set(
xfs_trans_ichgtime(args.trans, dp,
XFS_ICHGTIME_CHG);
}
- err2 = xfs_trans_commit(args.trans,
- XFS_TRANS_RELEASE_LOG_RES);
+ err2 = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error ? error : err2;
@@ -383,16 +382,14 @@ xfs_attr_set(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
@@ -462,7 +459,7 @@ xfs_attr_remove(
error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
XFS_ATTRRM_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(args.trans, 0);
+ xfs_trans_cancel(args.trans);
return error;
}
@@ -501,16 +498,14 @@ xfs_attr_remove(
* Commit the last in the sequence of transactions.
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
- error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
out:
- if (args.trans) {
- xfs_trans_cancel(args.trans,
- XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- }
+ if (args.trans)
+ xfs_trans_cancel(args.trans);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15105dbc9e28..e9d401ce93bb 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
int move_count);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+/*
+ * attr3 block 'firstused' conversion helpers.
+ *
+ * firstused refers to the offset of the first used byte of the nameval region
+ * of an attr leaf block. The region starts at the tail of the block and expands
+ * backwards towards the middle. As such, firstused is initialized to the block
+ * size for an empty leaf block and is reduced from there.
+ *
+ * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
+ * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
+ * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
+ * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
+ * the attr block size. The following helpers manage the conversion between the
+ * in-core and on-disk formats.
+ */
+
+static void
+xfs_attr3_leaf_firstused_from_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
+ struct xfs_attr_leafblock *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+
+ if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) from;
+ to->firstused = be16_to_cpu(hdr3->firstused);
+ } else {
+ to->firstused = be16_to_cpu(from->hdr.firstused);
+ }
+
+ /*
+ * Convert from the magic fsb size value to actual blocksize. This
+ * should only occur for empty blocks when the block size overflows
+ * 16-bits.
+ */
+ if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
+ ASSERT(!to->count && !to->usedbytes);
+ ASSERT(geo->blksize > USHRT_MAX);
+ to->firstused = geo->blksize;
+ }
+}
+
+static void
+xfs_attr3_leaf_firstused_to_disk(
+ struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
+ struct xfs_attr3_icleaf_hdr *from)
+{
+ struct xfs_attr3_leaf_hdr *hdr3;
+ uint32_t firstused;
+
+ /* magic value should only be seen on disk */
+ ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+ /*
+ * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
+ * value. This only overflows at the max supported value of 64k. Use the
+ * magic on-disk value to represent block size in this case.
+ */
+ firstused = from->firstused;
+ if (firstused > USHRT_MAX) {
+ ASSERT(from->firstused == geo->blksize);
+ firstused = XFS_ATTR3_LEAF_NULLOFF;
+ }
+
+ if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+ hdr3 = (struct xfs_attr3_leaf_hdr *) to;
+ hdr3->firstused = cpu_to_be16(firstused);
+ } else {
+ to->hdr.firstused = cpu_to_be16(firstused);
+ }
+}
+
void
xfs_attr3_leaf_hdr_from_disk(
+ struct xfs_da_geometry *geo,
struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from)
{
@@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk(
to->magic = be16_to_cpu(hdr3->info.hdr.magic);
to->count = be16_to_cpu(hdr3->count);
to->usedbytes = be16_to_cpu(hdr3->usedbytes);
- to->firstused = be16_to_cpu(hdr3->firstused);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
to->holes = hdr3->holes;
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk(
to->magic = be16_to_cpu(from->hdr.info.magic);
to->count = be16_to_cpu(from->hdr.count);
to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
- to->firstused = be16_to_cpu(from->hdr.firstused);
+ xfs_attr3_leaf_firstused_from_disk(geo, to, from);
to->holes = from->hdr.holes;
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk(
void
xfs_attr3_leaf_hdr_to_disk(
+ struct xfs_da_geometry *geo,
struct xfs_attr_leafblock *to,
struct xfs_attr3_icleaf_hdr *from)
{
- int i;
+ int i;
ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
from->magic == XFS_ATTR3_LEAF_MAGIC);
@@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk(
hdr3->info.hdr.magic = cpu_to_be16(from->magic);
hdr3->count = cpu_to_be16(from->count);
hdr3->usedbytes = cpu_to_be16(from->usedbytes);
- hdr3->firstused = cpu_to_be16(from->firstused);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
hdr3->holes = from->holes;
hdr3->pad1 = 0;
@@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk(
to->hdr.info.magic = cpu_to_be16(from->magic);
to->hdr.count = cpu_to_be16(from->count);
to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
- to->hdr.firstused = cpu_to_be16(from->firstused);
+ xfs_attr3_leaf_firstused_to_disk(geo, to, from);
to->hdr.holes = from->holes;
to->hdr.pad1 = 0;
@@ -178,7 +254,7 @@ xfs_attr3_leaf_verify(
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr3_icleaf_hdr ichdr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -498,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
* After the last attribute is removed revert to original inode format,
* making all literal area available to the data fork once more.
*/
-STATIC void
-xfs_attr_fork_reset(
+void
+xfs_attr_fork_remove(
struct xfs_inode *ip,
struct xfs_trans *tp)
{
@@ -565,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
(mp->m_flags & XFS_MOUNT_ATTR2) &&
(dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & XFS_DA_OP_ADDNAME)) {
- xfs_attr_fork_reset(dp, args->trans);
+ xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
@@ -757,9 +833,10 @@ xfs_attr_shortform_allfit(
struct xfs_attr3_icleaf_hdr leafhdr;
int bytes;
int i;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
entry = xfs_attr3_leaf_entryp(leaf);
bytes = sizeof(struct xfs_attr_sf_hdr);
@@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform(
memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entry = xfs_attr3_leaf_entryp(leaf);
/* XXX (dgc): buffer is about to be marked stale - why zero it? */
@@ -828,7 +905,7 @@ xfs_attr3_leaf_to_shortform(
if (forkoff == -1) {
ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_reset(dp, args->trans);
+ xfs_attr_fork_remove(dp, args->trans);
goto out;
}
@@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node(
btree = dp->d_ops->node_tree_p(node);
leaf = bp2->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
/* both on-disk, don't endian-flip twice */
@@ -988,7 +1065,7 @@ xfs_attr3_leaf_create(
}
ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
- xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
*bpp = bp;
@@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add(
trace_xfs_attr_leaf_add(args);
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
ASSERT(args->index >= 0 && args->index <= ichdr.count);
entsize = xfs_attr_leaf_newentsize(args, NULL);
@@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add(
tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
out_log_hdr:
- xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, &leaf->hdr,
xfs_attr3_leaf_hdr_size(leaf)));
@@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact(
ichdr_dst->freemap[0].base;
/* write the header back to initialise the underlying buffer */
- xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
/*
* Copy all entry's in the same (sorted) order,
@@ -1344,9 +1421,10 @@ xfs_attr_leaf_order(
{
struct xfs_attr3_icleaf_hdr ichdr1;
struct xfs_attr3_icleaf_hdr ichdr2;
+ struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
- xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
- xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
}
@@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance(
ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
- xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
ASSERT(ichdr2.count == 0);
args = state->args;
@@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance(
ichdr1.count, count);
}
- xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
- xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
@@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall(
*/
blk = &state->path.blk[ state->path.active-1 ];
leaf = blk->bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
bytes = xfs_attr3_leaf_hdr_size(leaf) +
ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
ichdr.usedbytes;
@@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall(
if (error)
return error;
- xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
bytes = state->args->geo->blksize -
(state->args->geo->blksize >> 2) -
@@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove(
trace_xfs_attr_leaf_remove(args);
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
ASSERT(args->index >= 0 && args->index < ichdr.count);
@@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove(
tmp = be16_to_cpu(entry->nameidx);
}
ichdr.firstused = tmp;
- if (!ichdr.firstused)
- ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+ ASSERT(ichdr.firstused != 0);
} else {
ichdr.holes = 1; /* mark as needing compaction */
}
- xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+ xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, &leaf->hdr,
xfs_attr3_leaf_hdr_size(leaf)));
@@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance(
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
- xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+ xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
entry = xfs_attr3_leaf_entryp(drop_leaf);
/*
@@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance(
tmphdr.firstused = state->args->geo->blksize;
/* write the header to the temp buffer to initialise it */
- xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
@@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance(
kmem_free(tmp_leaf);
}
- xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+ xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
state->args->geo->blksize - 1);
@@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int(
trace_xfs_attr_leaf_lookup(args);
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
ASSERT(ichdr.count < args->geo->blksize / 8);
@@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue(
int valuelen;
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
ASSERT(ichdr.count < args->geo->blksize / 8);
ASSERT(args->index < ichdr.count);
@@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash(
{
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
entries = xfs_attr3_leaf_entryp(bp->b_addr);
if (count)
*count = ichdr.count;
@@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag(
ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
#ifdef DEBUG
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
ASSERT(args->index < ichdr.count);
ASSERT(args->index >= 0);
@@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag(
leaf = bp->b_addr;
#ifdef DEBUG
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
ASSERT(args->index < ichdr.count);
ASSERT(args->index >= 0);
#endif
@@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags(
entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
#ifdef DEBUG
- xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
ASSERT(args->index < ichdr1.count);
ASSERT(args->index >= 0);
- xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+ xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
ASSERT(args->index2 < ichdr2.count);
ASSERT(args->index2 >= 0);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..882c8d338891 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args);
int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
-
+void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
* Internal routines when attribute fork size == XFS_LBSIZE(mp).
@@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp);
-void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from);
-void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+ struct xfs_attr_leafblock *to,
struct xfs_attr3_icleaf_hdr *from);
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 20de88d1bf86..dd714037c322 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- struct xfs_buf_log_item *bip = bp->b_fspriv;
+ int blksize = mp->m_attr_geo->blksize;
char *ptr;
int len;
xfs_daddr_t bno;
- int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify(
ASSERT(len >= blksize);
while (len > 0) {
+ struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
+
if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
}
- if (bip) {
- struct xfs_attr3_rmt_hdr *rmt;
- rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+ /*
+ * Ensure we aren't writing bogus LSNs to disk. See
+ * xfs_attr3_rmt_hdr_set() for the explanation.
+ */
+ if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) {
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
@@ -221,6 +226,18 @@ xfs_attr3_rmt_hdr_set(
rmt->rm_owner = cpu_to_be64(ino);
rmt->rm_blkno = cpu_to_be64(bno);
+ /*
+ * Remote attribute blocks are written synchronously, so we don't
+ * have an LSN that we can stamp in them that makes any sense to log
+ * recovery. To ensure that log recovery handles overwrites of these
+ * blocks sanely (i.e. once they've been freed and reallocated as some
+ * other type of metadata) we need to ensure that the LSN has a value
+ * that tells log recovery to ignore the LSN and overwrite the buffer
+ * with whatever is in it's log. To do this, we use the magic
+ * NULLCOMMITLSN to indicate that the LSN is invalid.
+ */
+ rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN);
+
return sizeof(struct xfs_attr3_rmt_hdr);
}
@@ -434,14 +451,21 @@ xfs_attr_rmtval_set(
/*
* Allocate a single extent, up to the size of the value.
+ *
+ * Note that we have to consider this a data allocation as we
+ * write the remote attribute without logging the contents.
+ * Hence we must ensure that we aren't using blocks that are on
+ * the busy list so that we don't overwrite blocks which have
+ * recently been freed but their transactions are not yet
+ * committed to disk. If we overwrite the contents of a busy
+ * extent and then crash then the block may not contain the
+ * correct metadata after log recovery occurs.
*/
xfs_bmap_init(args->flist, args->firstblock);
nmap = 1;
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
- blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- args->firstblock, args->total, &map, &nmap,
- args->flist);
+ blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
+ args->total, &map, &nmap, args->flist);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 61ec015dca16..63e05b663380 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset(
}
}
-/*
- * Debug/sanity checking code
- */
-
-STATIC int
-xfs_bmap_sanity_check(
- struct xfs_mount *mp,
- struct xfs_buf *bp,
- int level)
-{
- struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
-
- if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
- block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
- return 0;
-
- if (be16_to_cpu(block->bb_level) != level ||
- be16_to_cpu(block->bb_numrecs) == 0 ||
- be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
- return 0;
-
- return 1;
-}
-
#ifdef DEBUG
STATIC struct xfs_buf *
xfs_bmap_get_bp(
@@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents(
goto error_norelse;
}
block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
if (level == 0)
break;
@@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents(
xfs_check_block(block, mp, 0, 0);
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ XFS_FSB_SANITY_CHECK(mp, bno), error0);
if (bp_release) {
bp_release = 0;
xfs_trans_brelse(NULL, bp);
@@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree(
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
/* must be at least one entry */
- XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
@@ -1138,7 +1112,6 @@ xfs_bmap_add_attrfork(
int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
- int cancel_flags = 0;
ASSERT(XFS_IFORK_Q(ip) == 0);
@@ -1150,17 +1123,15 @@ xfs_bmap_add_attrfork(
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
if (error)
goto trans_cancel;
- cancel_flags |= XFS_TRANS_ABORT;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -1244,14 +1215,14 @@ xfs_bmap_add_attrfork(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
goto bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
bmap_cancel:
xfs_bmap_cancel(&flist);
trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1311,14 +1282,12 @@ xfs_bmap_read_extents(
if (error)
return error;
block = XFS_BUF_TO_BLOCK(bp);
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, level),
- error0);
if (level == 0)
break;
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ XFS_FSB_SANITY_CHECK(mp, bno), error0);
xfs_trans_brelse(tp, bp);
}
/*
@@ -1345,9 +1314,6 @@ xfs_bmap_read_extents(
XFS_ERRLEVEL_LOW, ip->i_mount, block);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(
- xfs_bmap_sanity_check(mp, bp, 0),
- error0);
/*
* Read-ahead the next leaf block, if any.
*/
@@ -1755,7 +1721,9 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
+ struct xfs_mount *mp;
+ mp = bma->tp ? bma->tp->t_mountp : NULL;
ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
ASSERT(bma->idx >= 0);
@@ -1866,15 +1834,15 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_btree_delete(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_btree_decrement(bma->cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -1907,7 +1875,7 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -1938,7 +1906,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
new->br_startblock,
PREV.br_blockcount +
@@ -1968,12 +1936,12 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
break;
@@ -2001,7 +1969,7 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -2038,12 +2006,12 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2084,7 +2052,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
@@ -2122,12 +2090,12 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2191,12 +2159,12 @@ xfs_bmap_add_extent_delay_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2212,9 +2180,8 @@ xfs_bmap_add_extent_delay_real(
diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
(bma->cur ? bma->cur->bc_private.b.allocated : 0));
if (diff > 0) {
- error = xfs_icsb_modify_counters(bma->ip->i_mount,
- XFS_SBS_FDBLOCKS,
- -((int64_t)diff), 0);
+ error = xfs_mod_fdblocks(bma->ip->i_mount,
+ -((int64_t)diff), false);
ASSERT(!error);
if (error)
goto done;
@@ -2265,9 +2232,8 @@ xfs_bmap_add_extent_delay_real(
temp += bma->cur->bc_private.b.allocated;
ASSERT(temp <= da_old);
if (temp < da_old)
- xfs_icsb_modify_counters(bma->ip->i_mount,
- XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - temp), 0);
+ xfs_mod_fdblocks(bma->ip->i_mount,
+ (int64_t)(da_old - temp), false);
}
/* clear out the allocated field, done with it now in any case. */
@@ -2309,6 +2275,7 @@ xfs_bmap_add_extent_unwritten_real(
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
int state = 0;/* state bits, accessed thru macros */
+ struct xfs_mount *mp = tp->t_mountp;
*logflagsp = 0;
@@ -2421,19 +2388,19 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount +
@@ -2464,13 +2431,13 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount,
@@ -2499,13 +2466,13 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
@@ -2532,7 +2499,7 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
newext)))
@@ -2569,7 +2536,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
@@ -2611,7 +2578,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
@@ -2621,7 +2588,7 @@ xfs_bmap_add_extent_unwritten_real(
cur->bc_rec.b = *new;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
break;
@@ -2651,7 +2618,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock,
PREV.br_blockcount, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
@@ -2689,7 +2656,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
@@ -2699,11 +2666,11 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
break;
@@ -2737,7 +2704,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
/* new right extent - oldext */
if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
r[1].br_startblock, r[1].br_blockcount,
@@ -2749,7 +2716,7 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startoff - PREV.br_startoff;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
/*
* Reset the cursor to the position of the new extent
* we are about to insert as we can't trust it after
@@ -2759,12 +2726,12 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
/* new middle extent - newext */
cur->bc_rec.b.br_state = new->br_state;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
break;
@@ -2944,8 +2911,8 @@ xfs_bmap_add_extent_hole_delay(
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
- xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(oldlen - newlen), 0);
+ xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
+ false);
/*
* Nothing to do for disk quota accounting here.
*/
@@ -2968,7 +2935,9 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
int state; /* state bits, accessed thru macros */
+ struct xfs_mount *mp;
+ mp = bma->tp ? bma->tp->t_mountp : NULL;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
@@ -3056,15 +3025,15 @@ xfs_bmap_add_extent_hole_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_btree_delete(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_btree_decrement(bma->cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
@@ -3097,7 +3066,7 @@ xfs_bmap_add_extent_hole_real(
&i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
@@ -3131,7 +3100,7 @@ xfs_bmap_add_extent_hole_real(
right.br_blockcount, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
error = xfs_bmbt_update(bma->cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
@@ -3161,12 +3130,12 @@ xfs_bmap_add_extent_hole_real(
new->br_blockcount, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
bma->cur->bc_rec.b.br_state = new->br_state;
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
break;
}
@@ -3252,12 +3221,24 @@ xfs_bmap_extsize_align(
align_alen += temp;
align_off -= temp;
}
+
+ /* Same adjustment for the end of the requested area. */
+ temp = (align_alen % extsz);
+ if (temp)
+ align_alen += extsz - temp;
+
/*
- * Same adjustment for the end of the requested area.
+ * For large extent hint sizes, the aligned extent might be larger than
+ * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
+ * the length back under MAXEXTLEN. The outer allocation loops handle
+ * short allocation just fine, so it is safe to do this. We only want to
+ * do it when we are forced to, though, because it means more allocation
+ * operations are required.
*/
- if ((temp = (align_alen % extsz))) {
- align_alen += extsz - temp;
- }
+ while (align_alen > MAXEXTLEN)
+ align_alen -= extsz;
+ ASSERT(align_alen <= MAXEXTLEN);
+
/*
* If the previous block overlaps with this proposed allocation
* then move the start forward without adjusting the length.
@@ -3346,7 +3327,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- ASSERT(orig_end <= align_off + align_alen);
+ /* see MAXEXTLEN handling above */
+ ASSERT(orig_end <= align_off + align_alen ||
+ align_alen + extsz > MAXEXTLEN);
}
#ifdef DEBUG
@@ -3535,7 +3518,8 @@ xfs_bmap_longest_free_extent(
}
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (*blen < longest)
*blen = longest;
@@ -4127,13 +4111,6 @@ xfs_bmapi_reserve_delalloc(
/* Figure out the extent size, adjust alen */
extsz = xfs_get_extsz_hint(ip);
if (extsz) {
- /*
- * Make sure we don't exceed a single extent length when we
- * align the extent by reducing length we are going to
- * allocate by the maximum amount extent size aligment may
- * require.
- */
- alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1));
error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
1, 0, &aoff, &alen);
ASSERT(!error);
@@ -4160,18 +4137,15 @@ xfs_bmapi_reserve_delalloc(
ASSERT(indlen > 0);
if (rt) {
- error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
- -((int64_t)extsz), 0);
+ error = xfs_mod_frextents(mp, -((int64_t)extsz));
} else {
- error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- -((int64_t)alen), 0);
+ error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
}
if (error)
goto out_unreserve_quota;
- error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- -((int64_t)indlen), 0);
+ error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
if (error)
goto out_unreserve_blocks;
@@ -4198,9 +4172,9 @@ xfs_bmapi_reserve_delalloc(
out_unreserve_blocks:
if (rt)
- xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+ xfs_mod_frextents(mp, extsz);
else
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+ xfs_mod_fdblocks(mp, alen, false);
out_unreserve_quota:
if (XFS_IS_QUOTA_ON(mp))
xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
@@ -4448,7 +4422,15 @@ xfs_bmapi_convert_unwritten(
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
- bma->logflags |= tmp_logflags;
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+ * path because the conversion might not have done so (e.g., if the
+ * extent count hasn't changed). We need to make sure the inode is dirty
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
+ */
+ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
if (error)
return error;
@@ -4801,7 +4783,7 @@ xfs_bmap_del_extent(
got.br_startblock, got.br_blockcount,
&i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
da_old = da_new = 0;
} else {
@@ -4835,7 +4817,7 @@ xfs_bmap_del_extent(
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
break;
case 2:
@@ -4935,7 +4917,8 @@ xfs_bmap_del_extent(
got.br_startblock,
temp, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ i == 1, done);
/*
* Update the btree record back
* to the original value.
@@ -4956,7 +4939,7 @@ xfs_bmap_del_extent(
error = -ENOSPC;
goto done;
}
- XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
} else
flags |= xfs_ilog_fext(whichfork);
XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5012,10 +4995,8 @@ xfs_bmap_del_extent(
* Nothing to do for disk quota accounting here.
*/
ASSERT(da_old >= da_new);
- if (da_old > da_new) {
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - da_new), 0);
- }
+ if (da_old > da_new)
+ xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
done:
*logflagsp = flags;
return error;
@@ -5284,14 +5265,13 @@ xfs_bunmapi(
rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
- xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
- (int64_t)rtexts, 0);
+ xfs_mod_frextents(mp, (int64_t)rtexts);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)del.br_blockcount, 0);
+ xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
+ false);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
@@ -5453,6 +5433,7 @@ xfs_bmse_merge(
struct xfs_bmbt_irec left;
xfs_filblks_t blockcount;
int error, i;
+ struct xfs_mount *mp = ip->i_mount;
xfs_bmbt_get_all(gotp, &got);
xfs_bmbt_get_all(leftp, &left);
@@ -5487,19 +5468,19 @@ xfs_bmse_merge(
got.br_blockcount, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
left.br_blockcount, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
left.br_blockcount = blockcount;
@@ -5518,50 +5499,92 @@ xfs_bmse_shift_one(
int *current_ext,
struct xfs_bmbt_rec_host *gotp,
struct xfs_btree_cur *cur,
- int *logflags)
+ int *logflags,
+ enum shift_direction direction)
{
struct xfs_ifork *ifp;
+ struct xfs_mount *mp;
xfs_fileoff_t startoff;
- struct xfs_bmbt_rec_host *leftp;
+ struct xfs_bmbt_rec_host *adj_irecp;
struct xfs_bmbt_irec got;
- struct xfs_bmbt_irec left;
+ struct xfs_bmbt_irec adj_irec;
int error;
int i;
+ int total_extents;
+ mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
xfs_bmbt_get_all(gotp, &got);
- startoff = got.br_startoff - offset_shift_fsb;
/* delalloc extents should be prevented by caller */
- XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock));
+ XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
- /*
- * Check for merge if we've got an extent to the left, otherwise make
- * sure there's enough room at the start of the file for the shift.
- */
- if (*current_ext) {
- /* grab the left extent and check for a large enough hole */
- leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
- xfs_bmbt_get_all(leftp, &left);
+ if (direction == SHIFT_LEFT) {
+ startoff = got.br_startoff - offset_shift_fsb;
+
+ /*
+ * Check for merge if we've got an extent to the left,
+ * otherwise make sure there's enough room at the start
+ * of the file for the shift.
+ */
+ if (!*current_ext) {
+ if (got.br_startoff < offset_shift_fsb)
+ return -EINVAL;
+ goto update_current_ext;
+ }
+ /*
+ * grab the left extent and check for a large
+ * enough hole.
+ */
+ adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
+ xfs_bmbt_get_all(adj_irecp, &adj_irec);
- if (startoff < left.br_startoff + left.br_blockcount)
+ if (startoff <
+ adj_irec.br_startoff + adj_irec.br_blockcount)
return -EINVAL;
/* check whether to merge the extent or shift it down */
- if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
+ if (xfs_bmse_can_merge(&adj_irec, &got,
+ offset_shift_fsb)) {
return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
- *current_ext, gotp, leftp, cur,
- logflags);
+ *current_ext, gotp, adj_irecp,
+ cur, logflags);
}
- } else if (got.br_startoff < offset_shift_fsb)
- return -EINVAL;
-
+ } else {
+ startoff = got.br_startoff + offset_shift_fsb;
+ /* nothing to move if this is the last extent */
+ if (*current_ext >= (total_extents - 1))
+ goto update_current_ext;
+ /*
+ * If this is not the last extent in the file, make sure there
+ * is enough room between current extent and next extent for
+ * accommodating the shift.
+ */
+ adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
+ xfs_bmbt_get_all(adj_irecp, &adj_irec);
+ if (startoff + got.br_blockcount > adj_irec.br_startoff)
+ return -EINVAL;
+ /*
+ * Unlike a left shift (which involves a hole punch),
+ * a right shift does not modify extent neighbors
+ * in any way. We should never find mergeable extents
+ * in this scenario. Check anyways and warn if we
+ * encounter two extents that could be one.
+ */
+ if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
+ WARN_ON_ONCE(1);
+ }
/*
* Increment the extent index for the next iteration, update the start
* offset of the in-core extent and update the btree if applicable.
*/
- (*current_ext)++;
+update_current_ext:
+ if (direction == SHIFT_LEFT)
+ (*current_ext)++;
+ else
+ (*current_ext)--;
xfs_bmbt_set_startoff(gotp, startoff);
*logflags |= XFS_ILOG_CORE;
if (!cur) {
@@ -5573,18 +5596,18 @@ xfs_bmse_shift_one(
got.br_blockcount, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
got.br_startoff = startoff;
return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
- got.br_blockcount, got.br_state);
+ got.br_blockcount, got.br_state);
}
/*
- * Shift extent records to the left to cover a hole.
+ * Shift extent records to the left/right to cover/create a hole.
*
* The maximum number of extents to be shifted in a single operation is
- * @num_exts. @start_fsb specifies the file offset to start the shift and the
+ * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the
* file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
* is the length by which each extent is shifted. If there is no hole to shift
* the extents into, this will be considered invalid operation and we abort
@@ -5594,12 +5617,13 @@ int
xfs_bmap_shift_extents(
struct xfs_trans *tp,
struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
+ xfs_fileoff_t *next_fsb,
xfs_fileoff_t offset_shift_fsb,
int *done,
- xfs_fileoff_t *next_fsb,
+ xfs_fileoff_t stop_fsb,
xfs_fsblock_t *firstblock,
struct xfs_bmap_free *flist,
+ enum shift_direction direction,
int num_exts)
{
struct xfs_btree_cur *cur = NULL;
@@ -5609,10 +5633,11 @@ xfs_bmap_shift_extents(
struct xfs_ifork *ifp;
xfs_extnum_t nexts = 0;
xfs_extnum_t current_ext;
+ xfs_extnum_t total_extents;
+ xfs_extnum_t stop_extent;
int error = 0;
int whichfork = XFS_DATA_FORK;
int logflags = 0;
- int total_extents;
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5628,6 +5653,8 @@ xfs_bmap_shift_extents(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+ ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
ifp = XFS_IFORK_PTR(ip, whichfork);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5645,43 +5672,83 @@ xfs_bmap_shift_extents(
}
/*
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot use the count of real extents here.
+ * Instead we have to calculate it from the incore fork.
+ */
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ if (total_extents == 0) {
+ *done = 1;
+ goto del_cursor;
+ }
+
+ /*
+ * In case of first right shift, we need to initialize next_fsb
+ */
+ if (*next_fsb == NULLFSBLOCK) {
+ gotp = xfs_iext_get_ext(ifp, total_extents - 1);
+ xfs_bmbt_get_all(gotp, &got);
+ *next_fsb = got.br_startoff;
+ if (stop_fsb > *next_fsb) {
+ *done = 1;
+ goto del_cursor;
+ }
+ }
+
+ /* Lookup the extent index at which we have to stop */
+ if (direction == SHIFT_RIGHT) {
+ gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
+ /* Make stop_extent exclusive of shift range */
+ stop_extent--;
+ } else
+ stop_extent = total_extents;
+
+ /*
* Look up the extent index for the fsb where we start shifting. We can
* henceforth iterate with current_ext as extent list changes are locked
* out via ilock.
*
* gotp can be null in 2 cases: 1) if there are no extents or 2)
- * start_fsb lies in a hole beyond which there are no extents. Either
+ * *next_fsb lies in a hole beyond which there are no extents. Either
* way, we are done.
*/
- gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
+ gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
if (!gotp) {
*done = 1;
goto del_cursor;
}
- /*
- * There may be delalloc extents in the data fork before the range we
- * are collapsing out, so we cannot use the count of real extents here.
- * Instead we have to calculate it from the incore fork.
- */
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- while (nexts++ < num_exts && current_ext < total_extents) {
+ /* some sanity checking before we finally start shifting extents */
+ if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
+ (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
+ error = -EIO;
+ goto del_cursor;
+ }
+
+ while (nexts++ < num_exts) {
error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
- &current_ext, gotp, cur, &logflags);
+ &current_ext, gotp, cur, &logflags,
+ direction);
if (error)
goto del_cursor;
+ /*
+ * If there was an extent merge during the shift, the extent
+ * count can change. Update the total and grade the next record.
+ */
+ if (direction == SHIFT_LEFT) {
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ stop_extent = total_extents;
+ }
- /* update total extent count and grab the next record */
- total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
- if (current_ext >= total_extents)
+ if (current_ext == stop_extent) {
+ *done = 1;
+ *next_fsb = NULLFSBLOCK;
break;
+ }
gotp = xfs_iext_get_ext(ifp, current_ext);
}
- /* Check if we are done */
- if (current_ext == total_extents) {
- *done = 1;
- } else if (next_fsb) {
+ if (!*done) {
xfs_bmbt_get_all(gotp, &got);
*next_fsb = got.br_startoff;
}
@@ -5696,3 +5763,188 @@ del_cursor:
return error;
}
+
+/*
+ * Splits an extent into two extents at split_fsb block such that it is
+ * the first block of the current_ext. @current_ext is a target extent
+ * to be split. @split_fsb is a block where the extents is split.
+ * If split_fsb lies in a hole or the first block of extents, just return 0.
+ */
+STATIC int
+xfs_bmap_split_extent_at(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_fileoff_t split_fsb,
+ xfs_fsblock_t *firstfsb,
+ struct xfs_bmap_free *free_list)
+{
+ int whichfork = XFS_DATA_FORK;
+ struct xfs_btree_cur *cur = NULL;
+ struct xfs_bmbt_rec_host *gotp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec new; /* split extent */
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ xfs_fsblock_t gotblkcnt; /* new block count for got */
+ xfs_extnum_t current_ext;
+ int error = 0;
+ int logflags = 0;
+ int i = 0;
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
+ XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ /*
+ * gotp can be null in 2 cases: 1) if there are no extents
+ * or 2) split_fsb lies in a hole beyond which there are
+ * no extents. Either way, we are done.
+ */
+ gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
+ if (!gotp)
+ return 0;
+
+ xfs_bmbt_get_all(gotp, &got);
+
+ /*
+ * Check split_fsb lies in a hole or the start boundary offset
+ * of the extent.
+ */
+ if (got.br_startoff >= split_fsb)
+ return 0;
+
+ gotblkcnt = split_fsb - got.br_startoff;
+ new.br_startoff = split_fsb;
+ new.br_startblock = got.br_startblock + gotblkcnt;
+ new.br_blockcount = got.br_blockcount - gotblkcnt;
+ new.br_state = got.br_state;
+
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstfsb;
+ cur->bc_private.b.flist = free_list;
+ cur->bc_private.b.flags = 0;
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ }
+
+ xfs_bmbt_set_blockcount(gotp, gotblkcnt);
+ got.br_blockcount = gotblkcnt;
+
+ logflags = XFS_ILOG_CORE;
+ if (cur) {
+ error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state);
+ if (error)
+ goto del_cursor;
+ } else
+ logflags |= XFS_ILOG_DEXT;
+
+ /* Add new extent */
+ current_ext++;
+ xfs_iext_insert(ip, current_ext, 1, &new, 0);
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
+ new.br_startblock, new.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
+ cur->bc_rec.b.br_state = new.br_state;
+
+ error = xfs_btree_insert(cur, &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ }
+
+ /*
+ * Convert to a btree if necessary.
+ */
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+ &cur, 0, &tmp_logflags, whichfork);
+ logflags |= tmp_logflags;
+ }
+
+del_cursor:
+ if (cur) {
+ cur->bc_private.b.allocated = 0;
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+ }
+
+ if (logflags)
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
+
+int
+xfs_bmap_split_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t split_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t firstfsb;
+ int committed;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ xfs_bmap_init(&free_list, &firstfsb);
+
+ error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
+ &firstfsb, &free_list);
+ if (error)
+ goto out;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out;
+
+ return xfs_trans_commit(tp);
+
+out:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b9d8a499d2c4..6aaa0c1c7200 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
*/
#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
+enum shift_direction {
+ SHIFT_LEFT = 0,
+ SHIFT_RIGHT,
+};
+
#ifdef DEBUG
void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int whichfork, unsigned long caller_ip);
@@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
- int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
- struct xfs_bmap_free *flist, int num_exts);
+ xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+ int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist, enum shift_direction direction,
+ int num_exts);
+int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 81cad433df85..c72283dd8d44 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -168,7 +168,7 @@ xfs_btree_check_lptr(
xfs_fsblock_t bno, /* btree block disk address */
int level) /* btree block level */
{
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
level > 0 &&
bno != NULLFSBLOCK &&
XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
@@ -187,7 +187,7 @@ xfs_btree_check_sptr(
{
xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
level > 0 &&
bno != NULLAGBLOCK &&
bno != 0 &&
@@ -1825,7 +1825,7 @@ xfs_btree_lookup(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
return 0;
@@ -2285,7 +2285,7 @@ xfs_btree_rshift(
if (error)
goto error0;
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
error = xfs_btree_increment(tcur, level, &i);
if (error)
@@ -3138,7 +3138,7 @@ xfs_btree_insert(
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
level++;
/*
@@ -3582,15 +3582,15 @@ xfs_btree_delrec(
* Actually any entry but the first would suffice.
*/
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
error = xfs_btree_increment(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
/* Grab a pointer to the block. */
right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3634,12 +3634,12 @@ xfs_btree_delrec(
rrecs = xfs_btree_get_numrecs(right);
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
}
}
@@ -3653,13 +3653,13 @@ xfs_btree_delrec(
* previous block.
*/
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
/* Grab a pointer to the block. */
left = xfs_btree_get_block(tcur, level, &lbp);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9cb0115c6bd1..2385f8cd08ab 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -538,12 +538,12 @@ xfs_da3_root_split(
oldroot = blk1->bp->b_addr;
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
- struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_da3_icnode_hdr icnodehdr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+ dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
btree = dp->d_ops->node_tree_p(oldroot);
- size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
- level = nodehdr.level;
+ size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+ level = icnodehdr.level;
/*
* we are about to copy oldroot to bp, so set up the type
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..74bcbabfa523 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr {
__uint16_t magic;
__uint16_t count;
__uint16_t usedbytes;
- __uint16_t firstused;
+ /*
+ * firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ __uint32_t firstused;
__u8 holes;
struct {
__uint16_t base;
@@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr {
};
/*
+ * Special value to represent fs block size in the leaf header firstused field.
+ * Only used when block size overflows the 2-bytes available on disk.
+ */
+#define XFS_ATTR3_LEAF_NULLOFF 0
+
+/*
* Flags used in the leaf_entry[i].flags field.
* NOTE: the INCOMPLETE bit must not collide with the flags bits specified
* on the system call, they are "or"ed together for various operations.
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 5ff31be9b1cd..de1ea16f5748 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -89,7 +89,7 @@ __xfs_dir3_data_check(
* so just ensure that the count falls somewhere inside the
* block right now.
*/
- XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+ XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
break;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -107,21 +107,21 @@ __xfs_dir3_data_check(
bf = ops->data_bestfree_p(hdr);
count = lastfree = freeseen = 0;
if (!bf[0].length) {
- XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
+ XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
freeseen |= 1 << 0;
}
if (!bf[1].length) {
- XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
+ XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
freeseen |= 1 << 1;
}
if (!bf[2].length) {
- XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
+ XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
freeseen |= 1 << 2;
}
- XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+ XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
be16_to_cpu(bf[1].length));
- XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+ XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
be16_to_cpu(bf[2].length));
/*
* Loop over the data/unused entries.
@@ -134,18 +134,18 @@ __xfs_dir3_data_check(
* doesn't need to be there.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
+ XFS_WANT_CORRUPTED_RETURN(mp,
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
(char *)dup - (char *)hdr);
dfp = xfs_dir2_data_freefind(hdr, bf, dup);
if (dfp) {
i = (int)(dfp - bf);
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
(freeseen & (1 << i)) == 0);
freeseen |= 1 << i;
} else {
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
be16_to_cpu(dup->length) <=
be16_to_cpu(bf[2].length));
}
@@ -160,13 +160,13 @@ __xfs_dir3_data_check(
* The linear search is crude but this is DEBUG code.
*/
dep = (xfs_dir2_data_entry_t *)p;
- XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
+ XFS_WANT_CORRUPTED_RETURN(mp,
!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
(char *)dep - (char *)hdr);
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
count++;
lastfree = 0;
@@ -183,14 +183,15 @@ __xfs_dir3_data_check(
be32_to_cpu(lep[i].hashval) == hash)
break;
}
- XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
+ XFS_WANT_CORRUPTED_RETURN(mp,
+ i < be32_to_cpu(btp->count));
}
p += ops->data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
*/
- XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+ XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
@@ -198,13 +199,13 @@ __xfs_dir3_data_check(
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
if (i > 0)
- XFS_WANT_CORRUPTED_RETURN(
+ XFS_WANT_CORRUPTED_RETURN(mp,
be32_to_cpu(lep[i].hashval) >=
be32_to_cpu(lep[i - 1].hashval));
}
- XFS_WANT_CORRUPTED_RETURN(count ==
+ XFS_WANT_CORRUPTED_RETURN(mp, count ==
be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
- XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
}
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8eb718979383..a0ae572051de 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -170,7 +170,7 @@ typedef struct xfs_sb {
__uint32_t sb_features_log_incompat;
__uint32_t sb_crc; /* superblock crc */
- __uint32_t sb_pad;
+ xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */
xfs_ino_t sb_pquotino; /* project quota inode */
xfs_lsn_t sb_lsn; /* last write sequence */
@@ -256,7 +256,7 @@ typedef struct xfs_dsb {
__be32 sb_features_log_incompat;
__le32 sb_crc; /* superblock crc */
- __be32 sb_pad;
+ __be32 sb_spino_align; /* sparse inode chunk alignment */
__be64 sb_pquotino; /* project quota inode */
__be64 sb_lsn; /* last write sequence */
@@ -264,68 +264,6 @@ typedef struct xfs_dsb {
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
-/*
- * Sequence number values for the fields.
- */
-typedef enum {
- XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
- XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
- XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
- XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
- XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
- XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
- XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
- XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
- XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
- XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
- XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
- XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
- XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
- XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
- XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
- XFS_SBS_PQUOTINO, XFS_SBS_LSN,
- XFS_SBS_FIELDCOUNT
-} xfs_sb_field_t;
-
-/*
- * Mask values, defined based on the xfs_sb_field_t values.
- * Only define the ones we're using.
- */
-#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
-#define XFS_SB_UUID XFS_SB_MVAL(UUID)
-#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
-#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
-#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
-#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
-#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
-#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
-#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
-#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
-#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
-#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
-#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
-#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
-#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
-#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
- XFS_SB_MVAL(BAD_FEATURES2))
-#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
-#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
-#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
-#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
-#define XFS_SB_CRC XFS_SB_MVAL(CRC)
-#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
-#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
-#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
-#define XFS_SB_MOD_BITS \
- (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
- XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
- XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
- XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
- XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
- XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
- XFS_SB_PQUOTINO)
-
/*
* Misc. Flags - warning - these will be cleared by xfs_repair unless
@@ -519,8 +457,10 @@ xfs_sb_has_ro_compat_feature(
}
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
+#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
#define XFS_SB_FEAT_INCOMPAT_ALL \
- (XFS_SB_FEAT_INCOMPAT_FTYPE)
+ (XFS_SB_FEAT_INCOMPAT_FTYPE| \
+ XFS_SB_FEAT_INCOMPAT_SPINODES)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -568,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
}
+static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES);
+}
+
/*
* end of superblock version macros
*/
@@ -820,19 +766,6 @@ typedef struct xfs_agfl {
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
-
-#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
-#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
- (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
-#define XFS_MIN_FREELIST(a,mp) \
- (XFS_MIN_FREELIST_RAW( \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \
- be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp))
-#define XFS_MIN_FREELIST_PAG(pag,mp) \
- (XFS_MIN_FREELIST_RAW( \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
- (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp))
-
#define XFS_AGB_TO_FSB(mp,agno,agbno) \
(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
#define XFS_FSB_TO_AGNO(mp,fsbno) \
@@ -1278,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t;
#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */
+#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t))
+#define XFS_INODES_PER_HOLEMASK_BIT \
+ (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t)))
+
static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
{
return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
}
/*
- * Data record structure
+ * The on-disk inode record structure has two formats. The original "full"
+ * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount
+ * and replaces the 3 high-order freecount bytes wth the holemask and inode
+ * count.
+ *
+ * The holemask of the sparse record format allows an inode chunk to have holes
+ * that refer to blocks not owned by the inode record. This facilitates inode
+ * allocation in the event of severe free space fragmentation.
*/
typedef struct xfs_inobt_rec {
__be32 ir_startino; /* starting inode number */
- __be32 ir_freecount; /* count of free inodes (set bits) */
+ union {
+ struct {
+ __be32 ir_freecount; /* count of free inodes */
+ } f;
+ struct {
+ __be16 ir_holemask;/* hole mask for sparse chunks */
+ __u8 ir_count; /* total inode count */
+ __u8 ir_freecount; /* count of free inodes */
+ } sp;
+ } ir_u;
__be64 ir_free; /* free inode mask */
} xfs_inobt_rec_t;
typedef struct xfs_inobt_rec_incore {
xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
+ __uint16_t ir_holemask; /* hole mask for sparse chunks */
+ __uint8_t ir_count; /* total inode count */
+ __uint8_t ir_freecount; /* count of free inodes (set bits) */
xfs_inofree_t ir_free; /* free inode mask */
} xfs_inobt_rec_incore_t;
+static inline bool xfs_inobt_issparse(uint16_t holemask)
+{
+ /* non-zero holemask represents a sparse rec. */
+ return holemask;
+}
/*
* Key structure
@@ -1515,8 +1476,8 @@ struct xfs_acl {
sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
/* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE "SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..89689c6a43e2 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 116ef1ddb3e3..66efc702452a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -65,6 +65,8 @@ xfs_inobt_lookup(
int *stat) /* success/failure */
{
cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_holemask = 0;
+ cur->bc_rec.i.ir_count = 0;
cur->bc_rec.i.ir_freecount = 0;
cur->bc_rec.i.ir_free = 0;
return xfs_btree_lookup(cur, dir, stat);
@@ -82,7 +84,14 @@ xfs_inobt_update(
union xfs_btree_rec rec;
rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
- rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
+ rec.inobt.ir_u.sp.ir_count = irec->ir_count;
+ rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ }
rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
return xfs_btree_update(cur, &rec);
}
@@ -100,12 +109,27 @@ xfs_inobt_get_rec(
int error;
error = xfs_btree_get_rec(cur, &rec, stat);
- if (!error && *stat == 1) {
- irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
- irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
- irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ if (error || *stat == 0)
+ return error;
+
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
+ irec->ir_count = rec->inobt.ir_u.sp.ir_count;
+ irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
+ } else {
+ /*
+ * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
+ * values for full inode chunks.
+ */
+ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
+ irec->ir_count = XFS_INODES_PER_CHUNK;
+ irec->ir_freecount =
+ be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
}
- return error;
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+
+ return 0;
}
/*
@@ -114,10 +138,14 @@ xfs_inobt_get_rec(
STATIC int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
+ __uint16_t holemask,
+ __uint8_t count,
__int32_t freecount,
xfs_inofree_t free,
int *stat)
{
+ cur->bc_rec.i.ir_holemask = holemask;
+ cur->bc_rec.i.ir_count = count;
cur->bc_rec.i.ir_freecount = freecount;
cur->bc_rec.i.ir_free = free;
return xfs_btree_insert(cur, stat);
@@ -154,7 +182,9 @@ xfs_inobt_insert(
}
ASSERT(i == 0);
- error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
+ XFS_INODES_PER_CHUNK,
+ XFS_INODES_PER_CHUNK,
XFS_INOBT_ALL_FREE, &i);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -220,6 +250,7 @@ xfs_ialloc_inode_init(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct list_head *buffer_list,
+ int icount,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_agblock_t length,
@@ -275,7 +306,7 @@ xfs_ialloc_inode_init(
* they track in the AIL as if they were physically logged.
*/
if (tp)
- xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ xfs_icreate_log(tp, agno, agbno, icount,
mp->m_sb.sb_inodesize, length, gen);
} else
version = 2;
@@ -347,6 +378,214 @@ xfs_ialloc_inode_init(
}
/*
+ * Align startino and allocmask for a recently allocated sparse chunk such that
+ * they are fit for insertion (or merge) into the on-disk inode btrees.
+ *
+ * Background:
+ *
+ * When enabled, sparse inode support increases the inode alignment from cluster
+ * size to inode chunk size. This means that the minimum range between two
+ * non-adjacent inode records in the inobt is large enough for a full inode
+ * record. This allows for cluster sized, cluster aligned block allocation
+ * without need to worry about whether the resulting inode record overlaps with
+ * another record in the tree. Without this basic rule, we would have to deal
+ * with the consequences of overlap by potentially undoing recent allocations in
+ * the inode allocation codepath.
+ *
+ * Because of this alignment rule (which is enforced on mount), there are two
+ * inobt possibilities for newly allocated sparse chunks. One is that the
+ * aligned inode record for the chunk covers a range of inodes not already
+ * covered in the inobt (i.e., it is safe to insert a new sparse record). The
+ * other is that a record already exists at the aligned startino that considers
+ * the newly allocated range as sparse. In the latter case, record content is
+ * merged in hope that sparse inode chunks fill to full chunks over time.
+ */
+STATIC void
+xfs_align_sparse_ino(
+ struct xfs_mount *mp,
+ xfs_agino_t *startino,
+ uint16_t *allocmask)
+{
+ xfs_agblock_t agbno;
+ xfs_agblock_t mod;
+ int offset;
+
+ agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
+ mod = agbno % mp->m_sb.sb_inoalignmt;
+ if (!mod)
+ return;
+
+ /* calculate the inode offset and align startino */
+ offset = mod << mp->m_sb.sb_inopblog;
+ *startino -= offset;
+
+ /*
+ * Since startino has been aligned down, left shift allocmask such that
+ * it continues to represent the same physical inodes relative to the
+ * new startino.
+ */
+ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+}
+
+/*
+ * Determine whether the source inode record can merge into the target. Both
+ * records must be sparse, the inode ranges must match and there must be no
+ * allocation overlap between the records.
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+ struct xfs_inobt_rec_incore *trec, /* tgt record */
+ struct xfs_inobt_rec_incore *srec) /* src record */
+{
+ uint64_t talloc;
+ uint64_t salloc;
+
+ /* records must cover the same inode range */
+ if (trec->ir_startino != srec->ir_startino)
+ return false;
+
+ /* both records must be sparse */
+ if (!xfs_inobt_issparse(trec->ir_holemask) ||
+ !xfs_inobt_issparse(srec->ir_holemask))
+ return false;
+
+ /* both records must track some inodes */
+ if (!trec->ir_count || !srec->ir_count)
+ return false;
+
+ /* can't exceed capacity of a full record */
+ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+ return false;
+
+ /* verify there is no allocation overlap */
+ talloc = xfs_inobt_irec_to_allocmask(trec);
+ salloc = xfs_inobt_irec_to_allocmask(srec);
+ if (talloc & salloc)
+ return false;
+
+ return true;
+}
+
+/*
+ * Merge the source inode record into the target. The caller must call
+ * __xfs_inobt_can_merge() to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+ struct xfs_inobt_rec_incore *trec, /* target */
+ struct xfs_inobt_rec_incore *srec) /* src */
+{
+ ASSERT(trec->ir_startino == srec->ir_startino);
+
+ /* combine the counts */
+ trec->ir_count += srec->ir_count;
+ trec->ir_freecount += srec->ir_freecount;
+
+ /*
+ * Merge the holemask and free mask. For both fields, 0 bits refer to
+ * allocated inodes. We combine the allocated ranges with bitwise AND.
+ */
+ trec->ir_holemask &= srec->ir_holemask;
+ trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Insert a new sparse inode chunk into the associated inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * This function supports two modes of handling preexisting records depending on
+ * the merge flag. If merge is true, the provided record is merged with the
+ * existing record and updated in place. The merged record is returned in nrec.
+ * If merge is false, an existing record is replaced with the provided record.
+ * If no preexisting record exists, the provided record is always inserted.
+ *
+ * It is considered corruption if a merge is requested and not possible. Given
+ * the sparse inode alignment constraints, this should never happen.
+ */
+STATIC int
+xfs_inobt_insert_sprec(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ int btnum,
+ struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
+ bool merge) /* merge or replace */
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ int error;
+ int i;
+ struct xfs_inobt_rec_incore rec;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ /* the new record is pre-aligned so we know where to look */
+ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ /* if nothing there, insert a new record and return */
+ if (i == 0) {
+ error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+ nrec->ir_count, nrec->ir_freecount,
+ nrec->ir_free, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+ goto out;
+ }
+
+ /*
+ * A record exists at this startino. Merge or replace the record
+ * depending on what we've been asked to do.
+ */
+ if (merge) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ XFS_WANT_CORRUPTED_GOTO(mp,
+ rec.ir_startino == nrec->ir_startino,
+ error);
+
+ /*
+ * This should never fail. If we have coexisting records that
+ * cannot merge, something is seriously wrong.
+ */
+ XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
+ error);
+
+ trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
+ rec.ir_holemask, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ /* merge to nrec to output the updated record */
+ __xfs_inobt_rec_merge(nrec, &rec);
+
+ trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+ nrec->ir_holemask);
+
+ error = xfs_inobt_rec_check_count(mp, nrec);
+ if (error)
+ goto error;
+ }
+
+ error = xfs_inobt_update(cur, nrec);
+ if (error)
+ goto error;
+
+out:
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc(
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
+ uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_inobt_rec_incore rec;
struct xfs_perag *pag;
+ int do_sparse = 0;
memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
+ args.fsbno = NULLFSBLOCK;
+
+#ifdef DEBUG
+ /* randomly do sparse inode allocations */
+ if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+ do_sparse = prandom_u32() & 1;
+#endif
/*
* Locking will ensure that we don't have two callers in here
@@ -376,7 +626,8 @@ xfs_ialloc_ag_alloc(
*/
newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
- args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+ percpu_counter_read_positive(&args.mp->m_icount) + newlen >
+ args.mp->m_maxicount)
return -ENOSPC;
args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
@@ -389,6 +640,8 @@ xfs_ialloc_ag_alloc(
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
args.mp->m_ialloc_blks;
+ if (do_sparse)
+ goto sparse_alloc;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -427,8 +680,7 @@ xfs_ialloc_ag_alloc(
* subsequent requests.
*/
args.minalignslop = 0;
- } else
- args.fsbno = NULLFSBLOCK;
+ }
if (unlikely(args.fsbno == NULLFSBLOCK)) {
/*
@@ -479,6 +731,47 @@ xfs_ialloc_ag_alloc(
return error;
}
+ /*
+ * Finally, try a sparse allocation if the filesystem supports it and
+ * the sparse allocation length is smaller than a full chunk.
+ */
+ if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+ args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ args.fsbno == NULLFSBLOCK) {
+sparse_alloc:
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ args.agbno = be32_to_cpu(agi->agi_root);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = args.mp->m_sb.sb_spino_align;
+ args.prod = 1;
+
+ args.minlen = args.mp->m_ialloc_min_blks;
+ args.maxlen = args.minlen;
+
+ /*
+ * The inode record will be aligned to full chunk size. We must
+ * prevent sparse allocation from AG boundaries that result in
+ * invalid inode records, such as records that start at agbno 0
+ * or extend beyond the AG.
+ *
+ * Set min agbno to the first aligned, non-zero agbno and max to
+ * the last aligned agbno that is at least one full chunk from
+ * the end of the AG.
+ */
+ args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+ args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+ args.mp->m_sb.sb_inoalignmt) -
+ args.mp->m_ialloc_blks;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ return error;
+
+ newlen = args.len << args.mp->m_sb.sb_inopblog;
+ ASSERT(newlen <= XFS_INODES_PER_CHUNK);
+ allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+ }
+
if (args.fsbno == NULLFSBLOCK) {
*alloc = 0;
return 0;
@@ -494,8 +787,8 @@ xfs_ialloc_ag_alloc(
* rather than a linear progression to prevent the next generation
* number from being easily guessable.
*/
- error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
- args.len, prandom_u32());
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
+ args.agbno, args.len, prandom_u32());
if (error)
return error;
@@ -503,6 +796,73 @@ xfs_ialloc_ag_alloc(
* Convert the results.
*/
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+ if (xfs_inobt_issparse(~allocmask)) {
+ /*
+ * We've allocated a sparse chunk. Align the startino and mask.
+ */
+ xfs_align_sparse_ino(args.mp, &newino, &allocmask);
+
+ rec.ir_startino = newino;
+ rec.ir_holemask = ~allocmask;
+ rec.ir_count = newlen;
+ rec.ir_freecount = newlen;
+ rec.ir_free = XFS_INOBT_ALL_FREE;
+
+ /*
+ * Insert the sparse record into the inobt and allow for a merge
+ * if necessary. If a merge does occur, rec is updated to the
+ * merged record.
+ */
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
+ &rec, true);
+ if (error == -EFSCORRUPTED) {
+ xfs_alert(args.mp,
+ "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+ XFS_AGINO_TO_INO(args.mp, agno,
+ rec.ir_startino),
+ rec.ir_holemask, rec.ir_count);
+ xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+ }
+ if (error)
+ return error;
+
+ /*
+ * We can't merge the part we've just allocated as for the inobt
+ * due to finobt semantics. The original record may or may not
+ * exist independent of whether physical inodes exist in this
+ * sparse chunk.
+ *
+ * We must update the finobt record based on the inobt record.
+ * rec contains the fully merged and up to date inobt record
+ * from the previous call. Set merge false to replace any
+ * existing record with this one.
+ */
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
+ XFS_BTNUM_FINO, &rec,
+ false);
+ if (error)
+ return error;
+ }
+ } else {
+ /* full chunk - insert new records to both btrees */
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+ newlen, XFS_BTNUM_FINO);
+ if (error)
+ return error;
+ }
+ }
+
+ /*
+ * Update AGI counts and newino.
+ */
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
pag = xfs_perag_get(args.mp, agno);
@@ -511,20 +871,6 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btrees.
- */
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_INO);
- if (error)
- return error;
-
- if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
- error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
- XFS_BTNUM_FINO);
- if (error)
- return error;
- }
- /*
* Log allocation group header fields
*/
xfs_ialloc_log_agi(tp, agbp,
@@ -644,7 +990,7 @@ xfs_ialloc_ag_select(
* if we fail allocation due to alignment issues then it is most
* likely a real ENOSPC condition.
*/
- ineed = mp->m_ialloc_blks;
+ ineed = mp->m_ialloc_min_blks;
if (flags && ineed > 1)
ineed += xfs_ialloc_cluster_alignment(mp);
longest = pag->pagf_longest;
@@ -700,7 +1046,7 @@ xfs_ialloc_next_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
}
return 0;
@@ -724,13 +1070,34 @@ xfs_ialloc_get_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
}
return 0;
}
/*
+ * Return the offset of the first free inode in the record. If the inode chunk
+ * is sparsely allocated, we convert the record holemask to inode granularity
+ * and mask off the unallocated regions from the inode free mask.
+ */
+STATIC int
+xfs_inobt_first_free_inode(
+ struct xfs_inobt_rec_incore *rec)
+{
+ xfs_inofree_t realfree;
+
+ /* if there are no holes, return the first available offset */
+ if (!xfs_inobt_issparse(rec->ir_holemask))
+ return xfs_lowbit64(rec->ir_free);
+
+ realfree = xfs_inobt_irec_to_allocmask(rec);
+ realfree &= rec->ir_free;
+
+ return xfs_lowbit64(realfree);
+}
+
+/*
* Allocate an inode using the inobt-only algorithm.
*/
STATIC int
@@ -783,12 +1150,12 @@ xfs_dialloc_ag_inobt(
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
if (rec.ir_freecount > 0) {
/*
@@ -944,23 +1311,23 @@ newino:
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
for (;;) {
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
if (rec.ir_freecount > 0)
break;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
}
alloc_inode:
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1016,7 +1383,7 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
/*
* See if we've landed in the parent inode record. The finobt
@@ -1039,10 +1406,10 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(rcur, &rrec, &j);
if (error)
goto error_rcur;
- XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
}
- XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
if (i == 1 && j == 1) {
/*
* Both the left and right records are valid. Choose the closer
@@ -1095,7 +1462,7 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
return 0;
}
}
@@ -1106,12 +1473,12 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
return 0;
}
@@ -1133,19 +1500,19 @@ xfs_dialloc_ag_update_inobt(
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
(rec.ir_freecount == frec->ir_freecount));
return xfs_inobt_update(cur, &rec);
@@ -1209,7 +1576,7 @@ xfs_dialloc_ag(
if (error)
goto error_cur;
- offset = xfs_lowbit64(rec.ir_free);
+ offset = xfs_inobt_first_free_inode(&rec);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1338,9 +1705,13 @@ xfs_dialloc(
* If we have already hit the ceiling of inode blocks then clear
* okalloc so we scan all available agi structures for a free
* inode.
+ *
+ * Read rough value of mp->m_icount by percpu_counter_read_positive,
+ * which will sacrifice the preciseness but improve the performance.
*/
if (mp->m_maxicount &&
- mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+ percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
+ > mp->m_maxicount) {
noroom = 1;
okalloc = 0;
}
@@ -1434,6 +1805,83 @@ out_error:
return error;
}
+/*
+ * Free the blocks of an inode chunk. We must consider that the inode chunk
+ * might be sparse and only free the regions that are allocated as part of the
+ * chunk.
+ */
+STATIC void
+xfs_difree_inode_chunk(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *rec,
+ struct xfs_bmap_free *flist)
+{
+ xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+ int startidx, endidx;
+ int nextbit;
+ xfs_agblock_t agbno;
+ int contigblk;
+ DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
+
+ if (!xfs_inobt_issparse(rec->ir_holemask)) {
+ /* not sparse, calculate extent info directly */
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
+ return;
+ }
+
+ /* holemask is only 16-bits (fits in an unsigned long) */
+ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
+ holemask[0] = rec->ir_holemask;
+
+ /*
+ * Find contiguous ranges of zeroes (i.e., allocated regions) in the
+ * holemask and convert the start/end index of each range to an extent.
+ * We start with the start and end index both pointing at the first 0 in
+ * the mask.
+ */
+ startidx = endidx = find_first_zero_bit(holemask,
+ XFS_INOBT_HOLEMASK_BITS);
+ nextbit = startidx + 1;
+ while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+ nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
+ nextbit);
+ /*
+ * If the next zero bit is contiguous, update the end index of
+ * the current range and continue.
+ */
+ if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
+ nextbit == endidx + 1) {
+ endidx = nextbit;
+ goto next;
+ }
+
+ /*
+ * nextbit is not contiguous with the current end index. Convert
+ * the current start/end to an extent and add it to the free
+ * list.
+ */
+ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+ contigblk = ((endidx - startidx + 1) *
+ XFS_INODES_PER_HOLEMASK_BIT) /
+ mp->m_sb.sb_inopblock;
+
+ ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
+ ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
+ flist, mp);
+
+ /* reset range to current bit and carry on... */
+ startidx = endidx = nextbit;
+
+next:
+ nextbit++;
+ }
+}
+
STATIC int
xfs_difree_inobt(
struct xfs_mount *mp,
@@ -1441,8 +1889,7 @@ xfs_difree_inobt(
struct xfs_buf *agbp,
xfs_agino_t agino,
struct xfs_bmap_free *flist,
- int *deleted,
- xfs_ino_t *first_ino,
+ struct xfs_icluster *xic,
struct xfs_inobt_rec_incore *orec)
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
@@ -1475,14 +1922,14 @@ xfs_difree_inobt(
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error) {
xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
/*
* Get the offset in the inode chunk.
*/
@@ -1496,20 +1943,23 @@ xfs_difree_inobt(
rec.ir_freecount++;
/*
- * When an inode cluster is free, it becomes eligible for removal
+ * When an inode chunk is free, it becomes eligible for removal. Don't
+ * remove the chunk if the block size is large enough for multiple inode
+ * chunks (that might not be free).
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == mp->m_ialloc_inos)) {
-
- *deleted = 1;
- *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
+ xic->deleted = 1;
+ xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+ xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
/*
* Remove the inode cluster from the AGI B+Tree, adjust the
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = mp->m_ialloc_inos;
+ ilen = rec.ir_freecount;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1525,11 +1975,9 @@ xfs_difree_inobt(
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
- XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
- mp->m_ialloc_blks, flist, mp);
+ xfs_difree_inode_chunk(mp, agno, &rec, flist);
} else {
- *deleted = 0;
+ xic->deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1592,9 +2040,11 @@ xfs_difree_finobt(
* freed an inode in a previously fully allocated chunk. If not,
* something is out of sync.
*/
- XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+ XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
- error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+ ibtrec->ir_count,
+ ibtrec->ir_freecount,
ibtrec->ir_free, &i);
if (error)
goto error;
@@ -1613,12 +2063,12 @@ xfs_difree_finobt(
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
rec.ir_free |= XFS_INOBT_MASK(offset);
rec.ir_freecount++;
- XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
(rec.ir_freecount == ibtrec->ir_freecount),
error);
@@ -1629,8 +2079,13 @@ xfs_difree_finobt(
* free inode. Hence, if all of the inodes are free and we aren't
* keeping inode chunks permanently on disk, remove the record.
* Otherwise, update the record with the new information.
+ *
+ * Note that we currently can't free chunks when the block size is large
+ * enough for multiple chunks. Leave the finobt record to remain in sync
+ * with the inobt.
*/
- if (rec.ir_freecount == mp->m_ialloc_inos &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
+ mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -1666,8 +2121,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted,/* set if inode cluster was deleted */
- xfs_ino_t *first_ino)/* first inode in deleted cluster */
+ struct xfs_icluster *xic) /* cluster info if deleted */
{
/* REFERENCED */
xfs_agblock_t agbno; /* block number containing inode */
@@ -1718,8 +2172,7 @@ xfs_difree(
/*
* Fix up the inode allocation btree.
*/
- error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
- &rec);
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec);
if (error)
goto error0;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 100007d56449..6e450df2979b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -28,6 +28,13 @@ struct xfs_btree_cur;
/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
+struct xfs_icluster {
+ bool deleted; /* record is deleted */
+ xfs_ino_t first_ino; /* first inode number */
+ uint64_t alloc; /* inode phys. allocation bitmap for
+ * sparse chunks */
+};
+
/* Calculate and return the number of filesystem blocks per inode cluster */
static inline int
xfs_icluster_size_fsb(
@@ -44,8 +51,7 @@ xfs_icluster_size_fsb(
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (struct xfs_dinode *)
- (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
+ return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog);
}
/*
@@ -90,8 +96,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *deleted, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino); /* first inode in deleted cluster */
+ struct xfs_icluster *ifree); /* cluster info if deleted */
/*
* Return the location of the inode in imap, for mapping it into a buffer.
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
* Inode chunk initialisation routine
*/
int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
- struct list_head *buffer_list,
+ struct list_head *buffer_list, int icount,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 964c465ca69c..674ad8f760be 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur(
union xfs_btree_rec *rec)
{
rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
+ rec->inobt.ir_u.sp.ir_holemask =
+ cpu_to_be16(cur->bc_rec.i.ir_holemask);
+ rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count;
+ rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount;
+ } else {
+ /* ir_holemask/ir_count not supported on-disk */
+ rec->inobt.ir_u.f.ir_freecount =
+ cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ }
rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}
@@ -418,3 +427,85 @@ xfs_inobt_maxrecs(
return blocklen / sizeof(xfs_inobt_rec_t);
return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
}
+
+/*
+ * Convert the inode record holemask to an inode allocation bitmap. The inode
+ * allocation bitmap is inode granularity and specifies whether an inode is
+ * physically allocated on disk (not whether the inode is considered allocated
+ * or free by the fs).
+ *
+ * A bit value of 1 means the inode is allocated, a value of 0 means it is free.
+ */
+uint64_t
+xfs_inobt_irec_to_allocmask(
+ struct xfs_inobt_rec_incore *rec)
+{
+ uint64_t bitmap = 0;
+ uint64_t inodespbit;
+ int nextbit;
+ uint allocbitmap;
+
+ /*
+ * The holemask has 16-bits for a 64 inode record. Therefore each
+ * holemask bit represents multiple inodes. Create a mask of bits to set
+ * in the allocmask for each holemask bit.
+ */
+ inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1;
+
+ /*
+ * Allocated inodes are represented by 0 bits in holemask. Invert the 0
+ * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask
+ * anything beyond the 16 holemask bits since this casts to a larger
+ * type.
+ */
+ allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1);
+
+ /*
+ * allocbitmap is the inverted holemask so every set bit represents
+ * allocated inodes. To expand from 16-bit holemask granularity to
+ * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target
+ * bitmap for every holemask bit.
+ */
+ nextbit = xfs_next_bit(&allocbitmap, 1, 0);
+ while (nextbit != -1) {
+ ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY));
+
+ bitmap |= (inodespbit <<
+ (nextbit * XFS_INODES_PER_HOLEMASK_BIT));
+
+ nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1);
+ }
+
+ return bitmap;
+}
+
+#if defined(DEBUG) || defined(XFS_WARN)
+/*
+ * Verify that an in-core inode record has a valid inode count.
+ */
+int
+xfs_inobt_rec_check_count(
+ struct xfs_mount *mp,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int inocount = 0;
+ int nextbit = 0;
+ uint64_t allocbmap;
+ int wordsz;
+
+ wordsz = sizeof(allocbmap) / sizeof(unsigned int);
+ allocbmap = xfs_inobt_irec_to_allocmask(rec);
+
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit);
+ while (nextbit != -1) {
+ inocount++;
+ nextbit = xfs_next_bit((uint *) &allocbmap, wordsz,
+ nextbit + 1);
+ }
+
+ if (inocount != rec->ir_count)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+#endif /* DEBUG */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..bd88453217ce 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+/* ir_holemask to inode allocation bitmap conversion */
+uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *);
+
+#if defined(DEBUG) || defined(XFS_WARN)
+int xfs_inobt_rec_check_count(struct xfs_mount *,
+ struct xfs_inobt_rec_incore *);
+#else
+#define xfs_inobt_rec_check_count(mp, rec) 0
+#endif /* DEBUG */
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 002b6b3a1988..6526e7696184 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -46,8 +46,7 @@ xfs_inobp_check(
j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
for (i = 0; i < j; i++) {
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- i * mp->m_sb.sb_inodesize);
+ dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
@@ -86,8 +85,7 @@ xfs_inode_buf_verify(
int di_ok;
xfs_dinode_t *dip;
- dip = (struct xfs_dinode *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
+ dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
XFS_DINODE_GOOD_VERSION(dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
@@ -186,7 +184,7 @@ xfs_imap_to_bp(
}
*bpp = bp;
- *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
+ *dipp = xfs_buf_offset(bp, imap->im_boffset);
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b0a5fe95a3e2..df9851c46b5c 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -111,14 +111,6 @@ xfs_mount_validate_sb(
bool check_inprogress,
bool check_version)
{
-
- /*
- * If the log device and data device have the
- * same device number, the log is internal.
- * Consequently, the sb_logstart should be non-zero. If
- * we have a zero sb_logstart in this case, we may be trying to mount
- * a volume filesystem in a non-volume manner.
- */
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
@@ -182,6 +174,27 @@ xfs_mount_validate_sb(
return -EFSCORRUPTED;
}
+ /*
+ * Full inode chunks must be aligned to inode chunk size when
+ * sparse inodes are enabled to support the sparse chunk
+ * allocation algorithm and prevent overlapping inode records.
+ */
+ if (xfs_sb_version_hassparseinodes(sbp)) {
+ uint32_t align;
+
+ xfs_alert(mp,
+ "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
+ align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
+ >> sbp->sb_blocklog;
+ if (sbp->sb_inoalignmt != align) {
+ xfs_warn(mp,
+"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.",
+ sbp->sb_inoalignmt, align);
+ return -EINVAL;
+ }
+ }
+
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
xfs_warn(mp,
@@ -382,7 +395,7 @@ __xfs_sb_from_disk(
be32_to_cpu(from->sb_features_log_incompat);
/* crc is only used on disk, not in memory; just init to 0 here. */
to->sb_crc = 0;
- to->sb_pad = 0;
+ to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
/* Convert on-disk flags to in-memory flags? */
@@ -524,7 +537,7 @@ xfs_sb_to_disk(
cpu_to_be32(from->sb_features_incompat);
to->sb_features_log_incompat =
cpu_to_be32(from->sb_features_log_incompat);
- to->sb_pad = 0;
+ to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
to->sb_lsn = cpu_to_be64(from->sb_lsn);
}
}
@@ -697,6 +710,11 @@ xfs_sb_mount_common(
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
sbp->sb_inopblock);
mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ mp->m_ialloc_min_blks = sbp->sb_spino_align;
+ else
+ mp->m_ialloc_min_blks = mp->m_ialloc_blks;
}
/*
@@ -743,17 +761,15 @@ xfs_initialize_perag_data(
btree += pag->pagf_btreeblks;
xfs_perag_put(pag);
}
- /*
- * Overwrite incore superblock counters with just-read data
- */
+
+ /* Overwrite incore superblock counters with just-read data */
spin_lock(&mp->m_sb_lock);
sbp->sb_ifree = ifree;
sbp->sb_icount = ialloc;
sbp->sb_fdblocks = bfree + bfreelst + btree;
spin_unlock(&mp->m_sb_lock);
- /* Fixup the per-cpu counters as well. */
- xfs_icsb_reinit_counters(mp);
+ xfs_reinit_percpu_counters(mp);
return 0;
}
@@ -771,6 +787,10 @@ xfs_log_sb(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
@@ -798,12 +818,12 @@ xfs_sync_sb(
tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_log_sb(tp);
if (wait)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 8dda4b321343..5be529707903 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *);
#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
count in superblock */
/*
- * Values for call flags parameter.
- */
-#define XFS_TRANS_RELEASE_LOG_RES 0x4
-#define XFS_TRANS_ABORT 0x8
-
-/*
* Field values for xfs_trans_mod_sb.
*/
#define XFS_TRANS_SB_ICOUNT 0x00000001
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 2d5bdfce6d8f..797815012c0e 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,9 +73,9 @@ struct xfs_trans_resv {
* 2 trees * (2 blocks/level * max depth - 1) * block size
*/
#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
- ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1)))
#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
- ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+ ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1)))
/*
* Per-directory log reservation for any directory change.
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..41e0428d8175 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -67,7 +67,7 @@
#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
#define XFS_GROWFS_SPACE_RES(mp) \
- (2 * XFS_AG_MAXLEVELS(mp))
+ (2 * (mp)->m_ag_maxlevels)
#define XFS_GROWFSRT_SPACE_RES(mp,b) \
((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
#define XFS_LINK_SPACE_RES(mp,nl) \
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..3859f5e27a4d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,7 +31,6 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
@@ -110,7 +109,7 @@ xfs_setfilesize_trans_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -146,7 +145,7 @@ xfs_setfilesize(
isize = xfs_new_eof(ip, offset + size);
if (!isize) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return 0;
}
@@ -156,7 +155,7 @@ xfs_setfilesize(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
STATIC int
@@ -357,7 +356,6 @@ xfs_end_bio(
{
xfs_ioend_t *ioend = bio->bi_private;
- ASSERT(atomic_read(&bio->bi_cnt) >= 1);
ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
/* Toss bio and pass work off to an xfsdatad thread */
@@ -1233,13 +1231,124 @@ xfs_vm_releasepage(
return try_to_free_buffers(page);
}
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ * ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+ struct inode *inode,
+ struct buffer_head *bh_result,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset)
+{
+ struct xfs_ioend *ioend;
+ xfs_off_t size = bh_result->b_size;
+ int type;
+
+ if (ISUNWRITTEN(imap))
+ type = XFS_IO_UNWRITTEN;
+ else
+ type = XFS_IO_OVERWRITE;
+
+ trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+ if (bh_result->b_private) {
+ ioend = bh_result->b_private;
+ ASSERT(ioend->io_size > 0);
+ ASSERT(offset >= ioend->io_offset);
+ if (offset + size > ioend->io_offset + ioend->io_size)
+ ioend->io_size = offset - ioend->io_offset + size;
+
+ if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+ ioend->io_type = XFS_IO_UNWRITTEN;
+
+ trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+ ioend->io_size, ioend->io_type,
+ imap);
+ } else if (type == XFS_IO_UNWRITTEN ||
+ offset + size > i_size_read(inode)) {
+ ioend = xfs_alloc_ioend(inode, type);
+ ioend->io_offset = offset;
+ ioend->io_size = size;
+
+ bh_result->b_private = ioend;
+ set_buffer_defer_completion(bh_result);
+
+ trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+ imap);
+ } else {
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ }
+}
+
+/*
+ * If this is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ struct xfs_bmbt_irec *imap,
+ xfs_off_t offset,
+ ssize_t size)
+{
+ xfs_off_t mapping_size;
+
+ mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+ mapping_size <<= inode->i_blkbits;
+
+ ASSERT(mapping_size > 0);
+ if (mapping_size > size)
+ mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
+ if (mapping_size > LONG_MAX)
+ mapping_size = LONG_MAX;
+
+ bh_result->b_size = mapping_size;
+}
+
STATIC int
__xfs_get_blocks(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create,
- int direct)
+ bool direct)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1304,6 +1413,7 @@ __xfs_get_blocks(
if (error)
return error;
new = 1;
+
} else {
/*
* Delalloc reservations do not require a transaction,
@@ -1321,31 +1431,37 @@ __xfs_get_blocks(
xfs_iunlock(ip, lockmode);
}
-
- trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+ trace_xfs_get_blocks_alloc(ip, offset, size,
+ ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+ : XFS_IO_DELALLOC, &imap);
} else if (nimaps) {
- trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+ trace_xfs_get_blocks_found(ip, offset, size,
+ ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+ : XFS_IO_OVERWRITE, &imap);
xfs_iunlock(ip, lockmode);
} else {
trace_xfs_get_blocks_notfound(ip, offset, size);
goto out_unlock;
}
+ /* trim mapping down to size requested */
+ if (direct || size > (1 << inode->i_blkbits))
+ xfs_map_trim_size(inode, iblock, bh_result,
+ &imap, offset, size);
+
+ /*
+ * For unwritten extents do not report a disk address in the buffered
+ * read case (treat as if we're reading into a hole).
+ */
if (imap.br_startblock != HOLESTARTBLOCK &&
- imap.br_startblock != DELAYSTARTBLOCK) {
- /*
- * For unwritten extents do not report a disk address on
- * the read case (treat as if we're reading into a hole).
- */
- if (create || !ISUNWRITTEN(&imap))
- xfs_map_buffer(inode, bh_result, &imap, offset);
- if (create && ISUNWRITTEN(&imap)) {
- if (direct) {
- bh_result->b_private = inode;
- set_buffer_defer_completion(bh_result);
- }
+ imap.br_startblock != DELAYSTARTBLOCK &&
+ (create || !ISUNWRITTEN(&imap))) {
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
- }
+ /* direct IO needs special help */
+ if (create && direct)
+ xfs_map_direct(inode, bh_result, &imap, offset);
}
/*
@@ -1378,39 +1494,6 @@ __xfs_get_blocks(
}
}
- /*
- * If this is O_DIRECT or the mpage code calling tell them how large
- * the mapping is, so that we can avoid repeated get_blocks calls.
- *
- * If the mapping spans EOF, then we have to break the mapping up as the
- * mapping for blocks beyond EOF must be marked new so that sub block
- * regions can be correctly zeroed. We can't do this for mappings within
- * EOF unless the mapping was just allocated or is unwritten, otherwise
- * the callers would overwrite existing data with zeros. Hence we have
- * to split the mapping into a range up to and including EOF, and a
- * second mapping for beyond EOF.
- */
- if (direct || size > (1 << inode->i_blkbits)) {
- xfs_off_t mapping_size;
-
- mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
- mapping_size <<= inode->i_blkbits;
-
- ASSERT(mapping_size > 0);
- if (mapping_size > size)
- mapping_size = size;
- if (offset < i_size_read(inode) &&
- offset + mapping_size >= i_size_read(inode)) {
- /* limit mapping to block that spans EOF */
- mapping_size = roundup_64(i_size_read(inode) - offset,
- 1 << inode->i_blkbits);
- }
- if (mapping_size > LONG_MAX)
- mapping_size = LONG_MAX;
-
- bh_result->b_size = mapping_size;
- }
-
return 0;
out_unlock:
@@ -1425,25 +1508,93 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false);
}
-STATIC int
+int
xfs_get_blocks_direct(
struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+}
+
+static void
+__xfs_end_io_direct_write(
+ struct inode *inode,
+ struct xfs_ioend *ioend,
+ loff_t offset,
+ ssize_t size)
+{
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
+
+ if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
+ goto out_end_io;
+
+ /*
+ * dio completion end_io functions are only called on writes if more
+ * than 0 bytes was written.
+ */
+ ASSERT(size > 0);
+
+ /*
+ * The ioend only maps whole blocks, while the IO may be sector aligned.
+ * Hence the ioend offset/size may not match the IO offset/size exactly.
+ * Because we don't map overwrites within EOF into the ioend, the offset
+ * may not match, but only if the endio spans EOF. Either way, write
+ * the IO sizes into the ioend so that completion processing does the
+ * right thing.
+ */
+ ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+ ioend->io_size = size;
+ ioend->io_offset = offset;
+
+ /*
+ * The ioend tells us whether we are doing unwritten extent conversion
+ * or an append transaction that updates the on-disk file size. These
+ * cases are the only cases where we should *potentially* be needing
+ * to update the VFS inode size.
+ *
+ * We need to update the in-core inode size here so that we don't end up
+ * with the on-disk inode size being outside the in-core inode size. We
+ * have no other method of updating EOF for AIO, so always do it here
+ * if necessary.
+ *
+ * We need to lock the test/set EOF update as we can be racing with
+ * other IO completions here to update the EOF. Failing to serialise
+ * here can result in EOF moving backwards and Bad Things Happen when
+ * that occurs.
+ */
+ spin_lock(&XFS_I(inode)->i_flags_lock);
+ if (offset + size > i_size_read(inode))
+ i_size_write(inode, offset + size);
+ spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+ /*
+ * If we are doing an append IO that needs to update the EOF on disk,
+ * do the transaction reserve now so we can use common end io
+ * processing. Stashing the error (if there is one) in the ioend will
+ * result in the ioend processing passing on the error if it is
+ * possible as we can't return it from here.
+ */
+ if (ioend->io_type == XFS_IO_OVERWRITE)
+ ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+
+out_end_io:
+ xfs_end_io(&ioend->io_work);
+ return;
}
/*
* Complete a direct I/O write request.
*
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
*/
STATIC void
xfs_end_io_direct_write(
@@ -1453,66 +1604,93 @@ xfs_end_io_direct_write(
void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ioend *ioend = private;
- if (XFS_FORCED_SHUTDOWN(mp))
+ trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+ ioend ? ioend->io_type : 0, NULL);
+
+ if (!ioend) {
+ ASSERT(offset + size <= i_size_read(inode));
return;
+ }
- /*
- * While the generic direct I/O code updates the inode size, it does
- * so only after the end_io handler is called, which means our
- * end_io handler thinks the on-disk size is outside the in-core
- * size. To prevent this just update it a little bit earlier here.
- */
- if (offset + size > i_size_read(inode))
- i_size_write(inode, offset + size);
+ __xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+ struct buffer_head *bh,
+ int uptodate)
+{
+ struct xfs_ioend *ioend = bh->b_private;
+ struct inode *inode = ioend->io_inode;
+ ssize_t size = ioend->io_size;
+
+ ASSERT(IS_DAX(ioend->io_inode));
+
+ /* if there was an error zeroing, then don't convert it */
+ if (!uptodate)
+ ioend->io_error = -EIO;
/*
- * For direct I/O we do not know if we need to allocate blocks or not,
- * so we can't preallocate an append transaction, as that results in
- * nested reservations and log space deadlocks. Hence allocate the
- * transaction here. While this is sub-optimal and can block IO
- * completion for some time, we're stuck with doing it this way until
- * we can pass the ioend to the direct IO allocation callbacks and
- * avoid nesting that way.
+ * Trim update to EOF, so we don't extend EOF during unwritten extent
+ * conversion of partial EOF blocks.
*/
- if (private && size > 0) {
- xfs_iomap_write_unwritten(ip, offset, size);
- } else if (offset + size > ip->i_d.di_size) {
- struct xfs_trans *tp;
- int error;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return;
- }
+ spin_lock(&XFS_I(inode)->i_flags_lock);
+ if (ioend->io_offset + size > i_size_read(inode))
+ size = i_size_read(inode) - ioend->io_offset;
+ spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+ __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
- xfs_setfilesize(ip, tp, offset, size);
- }
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+ struct inode *inode,
+ struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset,
+ void (*endio)(struct kiocb *iocb,
+ loff_t offset,
+ ssize_t size,
+ void *private),
+ int flags)
+{
+ struct block_device *bdev;
+
+ if (IS_DAX(inode))
+ return dax_do_io(iocb, inode, iter, offset,
+ xfs_get_blocks_direct, endio, 0);
+
+ bdev = xfs_find_bdev_for_inode(inode);
+ return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+ xfs_get_blocks_direct, endio, NULL, flags);
}
STATIC ssize_t
xfs_vm_direct_IO(
- int rw,
struct kiocb *iocb,
struct iov_iter *iter,
loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- if (rw & WRITE) {
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
- offset, xfs_get_blocks_direct,
- xfs_end_io_direct_write, NULL,
- DIO_ASYNC_EXTEND);
- }
- return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
- offset, xfs_get_blocks_direct,
- NULL, NULL, 0);
+ if (iov_iter_rw(iter) == WRITE)
+ return xfs_vm_do_dio(inode, iocb, iter, offset,
+ xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+ return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
}
/*
@@ -1763,6 +1941,7 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
+ struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1782,6 +1961,11 @@ xfs_vm_set_page_dirty(
offset += 1 << inode->i_blkbits;
} while (bh != head);
}
+ /*
+ * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
+ * per-memcg dirty page counters.
+ */
+ memcg = mem_cgroup_begin_page_stat(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1792,13 +1976,15 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
+ mem_cgroup_end_page_stat(memcg);
+ if (newly_dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index ac644e0137a4..86afd1ac7895 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int xfs_get_blocks(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 83af4c149635..2bb959ada45b 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive(
int size;
int tmp;
int i;
+ struct xfs_mount *mp = bp->b_target->bt_mount;
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
/*
* Count the number of "remote" value extents.
@@ -379,23 +380,30 @@ xfs_attr3_root_inactive(
return error;
}
+/*
+ * xfs_attr_inactive kills all traces of an attribute fork on an inode. It
+ * removes both the on-disk and in-memory inode fork. Note that this also has to
+ * handle the condition of inodes without attributes but with an attribute fork
+ * configured, so we can't use xfs_inode_hasattr() here.
+ *
+ * The in-memory attribute fork is removed even on error.
+ */
int
-xfs_attr_inactive(xfs_inode_t *dp)
+xfs_attr_inactive(
+ struct xfs_inode *dp)
{
- xfs_trans_t *trans;
- xfs_mount_t *mp;
- int error;
+ struct xfs_trans *trans;
+ struct xfs_mount *mp;
+ int lock_mode = XFS_ILOCK_SHARED;
+ int error = 0;
mp = dp->i_mount;
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (!xfs_inode_hasattr(dp) ||
- dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return 0;
- }
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ xfs_ilock(dp, lock_mode);
+ if (!XFS_IFORK_Q(dp))
+ goto out_destroy_fork;
+ xfs_iunlock(dp, lock_mode);
/*
* Start our first transaction of the day.
@@ -407,13 +415,17 @@ xfs_attr_inactive(xfs_inode_t *dp)
* the inode in every transaction to let it float upward through
* the log.
*/
+ lock_mode = 0;
trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
- if (error) {
- xfs_trans_cancel(trans, 0);
- return error;
- }
- xfs_ilock(dp, XFS_ILOCK_EXCL);
+ if (error)
+ goto out_cancel;
+
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(dp, lock_mode);
+
+ if (!XFS_IFORK_Q(dp))
+ goto out_cancel;
/*
* No need to make quota reservations here. We expect to release some
@@ -422,28 +434,36 @@ xfs_attr_inactive(xfs_inode_t *dp)
xfs_trans_ijoin(trans, dp, 0);
/*
- * Decide on what work routines to call based on the inode size.
+ * Invalidate and truncate the attribute fork extents. Make sure the
+ * fork actually has attributes as otherwise the invalidation has no
+ * blocks to read and returns an error. In this case, just do the fork
+ * removal below.
*/
- if (!xfs_inode_hasattr(dp) ||
- dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- error = 0;
- goto out;
- }
- error = xfs_attr3_root_inactive(&trans, dp);
- if (error)
- goto out;
+ if (xfs_inode_hasattr(dp) &&
+ dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) {
+ error = xfs_attr3_root_inactive(&trans, dp);
+ if (error)
+ goto out_cancel;
- error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
- if (error)
- goto out;
+ error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
+ if (error)
+ goto out_cancel;
+ }
- error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ /* Reset the attribute fork - this also destroys the in-core fork */
+ xfs_attr_fork_remove(dp, trans);
+ error = xfs_trans_commit(trans);
+ xfs_iunlock(dp, lock_mode);
return error;
-out:
- xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+out_cancel:
+ xfs_trans_cancel(trans);
+out_destroy_fork:
+ /* kill the in-core attr fork before we drop the inode lock */
+ if (dp->i_afp)
+ xfs_idestroy_fork(dp, XFS_ATTR_FORK);
+ if (lock_mode)
+ xfs_iunlock(dp, lock_mode);
return error;
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a43d370d2c58..65fb37a18e92 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
int error, i;
struct xfs_buf *bp;
struct xfs_inode *dp = context->dp;
+ struct xfs_mount *mp = dp->i_mount;
trace_xfs_attr_node_list(context);
@@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
case XFS_ATTR_LEAF_MAGIC:
case XFS_ATTR3_LEAF_MAGIC:
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+ &leafhdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
if (cursor->hashval > be32_to_cpu(
entries[leafhdr.count - 1].hashval)) {
@@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
xfs_trans_brelse(NULL, bp);
return error;
}
- xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
if (context->seen_enough || leafhdr.forw == 0)
break;
cursor->blkno = leafhdr.forw;
@@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int(
struct xfs_attr_leaf_entry *entry;
int retval;
int i;
+ struct xfs_mount *mp = context->dp->i_mount;
trace_xfs_attr_list_leaf(context);
leaf = bp->b_addr;
- xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+ xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
cursor = context->cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 22a5dcb70b32..0f34886cf726 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,28 +75,20 @@ xfs_bmap_finish(
xfs_efi_log_item_t *efi; /* extent free intention */
int error; /* error return value */
xfs_bmap_free_item_t *free; /* free extent item */
- struct xfs_trans_res tres; /* new log reservation */
xfs_mount_t *mp; /* filesystem mount structure */
xfs_bmap_free_item_t *next; /* next item on free list */
- xfs_trans_t *ntp; /* new transaction pointer */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
if (flist->xbf_count == 0) {
*committed = 0;
return 0;
}
- ntp = *tp;
- efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+ efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
- xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+ xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- tres.tr_logres = ntp->t_log_res;
- tres.tr_logcount = ntp->t_log_count;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- ntp = xfs_trans_dup(*tp);
- error = xfs_trans_commit(*tp, 0);
- *tp = ntp;
+ error = xfs_trans_roll(tp, NULL);
*committed = 1;
/*
* We have a new transaction, so we should return committed=1,
@@ -105,19 +97,10 @@ xfs_bmap_finish(
if (error)
return error;
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(ntp->t_ticket);
-
- error = xfs_trans_reserve(ntp, &tres, 0, 0);
- if (error)
- return error;
- efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
for (free = flist->xbf_first; free != NULL; free = next) {
next = free->xbfi_next;
- if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+ if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
free->xbfi_blockcount))) {
/*
* The bmap free list will be cleaned up at a
@@ -127,7 +110,7 @@ xfs_bmap_finish(
* happens, since this transaction may not be
* dirty yet.
*/
- mp = ntp->t_mountp;
+ mp = (*tp)->t_mountp;
if (!XFS_FORCED_SHUTDOWN(mp))
xfs_force_shutdown(mp,
(error == -EFSCORRUPTED) ?
@@ -135,7 +118,7 @@ xfs_bmap_finish(
SHUTDOWN_META_IO_ERROR);
return error;
}
- xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+ xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
free->xbfi_blockcount);
xfs_bmap_del_free(flist, NULL, free);
}
@@ -878,7 +861,7 @@ xfs_free_eofblocks(
if (need_iolock) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return -EAGAIN;
}
}
@@ -886,7 +869,7 @@ xfs_free_eofblocks(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
if (need_iolock)
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return error;
@@ -908,12 +891,9 @@ xfs_free_eofblocks(
* If we get an error at this point we simply don't
* bother truncating the file.
*/
- xfs_trans_cancel(tp,
- (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
} else {
- error = xfs_trans_commit(tp,
- XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (!error)
xfs_inode_clear_eofblocks_tag(ip);
}
@@ -1026,7 +1006,7 @@ xfs_alloc_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1053,7 +1033,7 @@ xfs_alloc_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
break;
@@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
error1: /* Just cancel transaction */
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes(
break;
ASSERT(imap.br_blockcount >= 1);
ASSERT(imap.br_startoff == offset_fsb);
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (imap.br_startblock == HOLESTARTBLOCK ||
+ imap.br_state == XFS_EXT_UNWRITTEN) {
+ /* skip the entire extent */
+ lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+ imap.br_blockcount) - 1;
+ continue;
+ }
+
lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
if (lastoffset > endoff)
lastoffset = endoff;
- if (imap.br_startblock == HOLESTARTBLOCK)
- continue;
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+ /* DAX can just zero the backing device directly */
+ if (IS_DAX(VFS_I(ip))) {
+ error = dax_zero_page_range(VFS_I(ip), offset,
+ lastoffset - offset + 1,
+ xfs_get_blocks_direct);
+ if (error)
+ return error;
continue;
+ }
error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -1289,7 +1284,7 @@ xfs_free_file_space(
* Free the transaction structure.
*/
ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -1320,7 +1315,7 @@ xfs_free_file_space(
goto error0;
}
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
@@ -1330,7 +1325,7 @@ xfs_free_file_space(
error0:
xfs_bmap_cancel(&free_list);
error1:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out;
}
@@ -1376,22 +1371,19 @@ out:
}
/*
- * xfs_collapse_file_space()
- * This routine frees disk space and shift extent for the given file.
- * The first thing we do is to free data blocks in the specified range
- * by calling xfs_free_file_space(). It would also sync dirty data
- * and invalidate page cache over the region on which collapse range
- * is working. And Shift extent records to the left to cover a hole.
- * RETURNS:
- * 0 on success
- * errno on error
- *
+ * @next_fsb will keep track of the extent currently undergoing shift.
+ * @stop_fsb will keep track of the extent at which we have to stop.
+ * If we are shifting left, we will start with block (offset + len) and
+ * shift each extent till last extent.
+ * If we are shifting right, we will start with last extent inside file space
+ * and continue until we reach the block corresponding to offset.
*/
-int
-xfs_collapse_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
+static int
+xfs_shift_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ enum shift_direction direction)
{
int done = 0;
struct xfs_mount *mp = ip->i_mount;
@@ -1400,21 +1392,26 @@ xfs_collapse_file_space(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
int committed;
- xfs_fileoff_t start_fsb;
+ xfs_fileoff_t stop_fsb;
xfs_fileoff_t next_fsb;
xfs_fileoff_t shift_fsb;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
- trace_xfs_collapse_file_space(ip);
+ if (direction == SHIFT_LEFT) {
+ next_fsb = XFS_B_TO_FSB(mp, offset + len);
+ stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+ } else {
+ /*
+ * If right shift, delegate the work of initialization of
+ * next_fsb to xfs_bmap_shift_extent as it has ilock held.
+ */
+ next_fsb = NULLFSBLOCK;
+ stop_fsb = XFS_B_TO_FSB(mp, offset);
+ }
- next_fsb = XFS_B_TO_FSB(mp, offset + len);
shift_fsb = XFS_B_TO_FSB(mp, len);
- error = xfs_free_file_space(ip, offset, len);
- if (error)
- return error;
-
/*
* Trim eofblocks to avoid shifting uninitialized post-eof preallocation
* into the accessible region of the file.
@@ -1427,20 +1424,28 @@ xfs_collapse_file_space(
/*
* Writeback and invalidate cache for the remainder of the file as we're
- * about to shift down every extent from the collapse range to EOF. The
- * free of the collapse range above might have already done some of
- * this, but we shouldn't rely on it to do anything outside of the range
- * that was freed.
+ * about to shift down every extent from offset to EOF.
*/
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- offset + len, -1);
+ offset, -1);
if (error)
return error;
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- (offset + len) >> PAGE_CACHE_SHIFT, -1);
+ offset >> PAGE_CACHE_SHIFT, -1);
if (error)
return error;
+ /*
+ * The extent shiting code works on extent granularity. So, if
+ * stop_fsb is not the starting block of extent, we need to split
+ * the extent at stop_fsb.
+ */
+ if (direction == SHIFT_RIGHT) {
+ error = xfs_bmap_split_extent(ip, stop_fsb);
+ if (error)
+ return error;
+ }
+
while (!error && !done) {
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
/*
@@ -1452,7 +1457,7 @@ xfs_collapse_file_space(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
break;
}
@@ -1464,7 +1469,7 @@ xfs_collapse_file_space(
if (error)
goto out;
- xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_bmap_init(&free_list, &first_block);
@@ -1472,10 +1477,9 @@ xfs_collapse_file_space(
* We are using the write transaction in which max 2 bmbt
* updates are allowed
*/
- start_fsb = next_fsb;
- error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
- &done, &next_fsb, &first_block, &free_list,
- XFS_BMAP_MAX_SHIFT_EXTENTS);
+ error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
+ &done, stop_fsb, &first_block, &free_list,
+ direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
goto out;
@@ -1483,19 +1487,71 @@ xfs_collapse_file_space(
if (error)
goto out;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_commit(tp);
}
return error;
out:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_cancel(tp);
return error;
}
/*
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shift extent for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). It would also sync dirty data
+ * and invalidate page cache over the region on which collapse range
+ * is working. And Shift extent records to the left to cover a hole.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
+{
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ trace_xfs_collapse_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
+
+ return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
+}
+
+/*
+ * xfs_insert_file_space()
+ * This routine create hole space by shifting extents for the given file.
+ * The first thing we do is to sync dirty data and invalidate page cache
+ * over the region on which insert range is working. And split an extent
+ * to two extents at given offset by calling xfs_bmap_split_extent.
+ * And shift all extent records which are laying between [offset,
+ * last allocated extent] to the right to reserve hole range.
+ * RETURNS:
+ * 0 on success
+ * errno on error
+ */
+int
+xfs_insert_file_space(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t len)
+{
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ trace_xfs_insert_file_space(ip);
+
+ return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+}
+
+/*
* We need to check that the format of the data fork in the temporary inode is
* valid for the target inode before doing the swap. This is not a problem with
* attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1655,6 @@ xfs_swap_extent_flush(
/* Verify O_DIRECT for ftmp */
if (VFS_I(ip)->i_mapping->nrpages)
return -EINVAL;
-
- /*
- * Don't try to swap extents on mmap()d files because we can't lock
- * out races against page faults safely.
- */
- if (mapping_mapped(VFS_I(ip)->i_mapping))
- return -EBUSY;
return 0;
}
@@ -1633,13 +1682,14 @@ xfs_swap_extents(
}
/*
- * Lock up the inodes against other IO and truncate to begin with.
- * Then we can ensure the inodes are flushed and have no page cache
- * safely. Once we have done this we can take the ilocks and do the rest
- * of the checks.
+ * Lock the inodes against other IO, page faults and truncate to
+ * begin with. Then we can ensure the inodes are flushed and have no
+ * page cache safely. Once we have done this we can take the ilocks and
+ * do the rest of the checks.
*/
- lock_flags = XFS_IOLOCK_EXCL;
+ lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
/* Verify that both files have the same format */
if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1663,11 +1713,19 @@ xfs_swap_extents(
tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
+
+ /*
+ * Lock and join the inodes to the tansaction so that transaction commit
+ * or cancel will unlock the inodes from this point onwards.
+ */
xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
lock_flags |= XFS_ILOCK_EXCL;
+ xfs_trans_ijoin(tp, ip, lock_flags);
+ xfs_trans_ijoin(tp, tip, lock_flags);
+
/* Verify all data are being swapped */
if (sxp->sx_offset != 0 ||
@@ -1720,9 +1778,6 @@ xfs_swap_extents(
goto out_trans_cancel;
}
- xfs_trans_ijoin(tp, ip, lock_flags);
- xfs_trans_ijoin(tp, tip, lock_flags);
-
/*
* Before we've swapped the forks, lets set the owners of the forks
* appropriately. We have to do this as we are demand paging the btree
@@ -1841,7 +1896,7 @@ xfs_swap_extents(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
trace_xfs_swap_extent_after(ip, 0);
trace_xfs_swap_extent_after(tip, 1);
@@ -1855,6 +1910,6 @@ out_unlock:
goto out;
out_trans_cancel:
- xfs_trans_cancel(tp, 0);
- goto out_unlock;
+ xfs_trans_cancel(tp);
+ goto out;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 736429a72a12..af97d9a1dfb4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
+int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1790b00bea7a..a4b7d92e946c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1419,9 +1419,9 @@ xfs_buf_submit_wait(
return error;
}
-xfs_caddr_t
+void *
xfs_buf_offset(
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
size_t offset)
{
struct page *page;
@@ -1431,7 +1431,7 @@ xfs_buf_offset(
offset += bp->b_offset;
page = bp->b_pages[offset >> PAGE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+ return page_address(page) + (offset & (PAGE_SIZE-1));
}
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 75ff5d5a7d2e..331c1ccf8264 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+extern void *xfs_buf_offset(struct xfs_buf *, size_t);
/* Delayed Write Buffer Routines */
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 507d96a57ac7..092d652bc03d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -537,9 +537,9 @@ xfs_buf_item_push(
/* has a previous flush failed due to IO errors? */
if ((bp->b_flags & XBF_WRITE_FAIL) &&
- ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+ ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
xfs_warn(bp->b_target->bt_mount,
-"Detected failing async write on buffer block 0x%llx. Retrying async write.",
+"Failing async write on buffer block 0x%llx. Retrying async write.",
(long long)bp->b_bn);
}
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 799e5a2d334d..e85a9519a5ae 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -84,7 +84,7 @@ xfs_trim_extents(
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
goto out_del_cursor;
- XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+ XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
/*
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 02c01bbbc789..4143dc75dca4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -568,8 +568,6 @@ xfs_qm_dqread(
struct xfs_buf *bp;
struct xfs_trans *tp = NULL;
int error;
- int cancelflags = 0;
-
dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
@@ -617,7 +615,6 @@ xfs_qm_dqread(
XFS_QM_DQALLOC_SPACE_RES(mp), 0);
if (error)
goto error1;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
}
/*
@@ -632,7 +629,6 @@ xfs_qm_dqread(
* allocate (ENOENT).
*/
trace_xfs_dqread_fail(dqp);
- cancelflags |= XFS_TRANS_ABORT;
goto error1;
}
@@ -670,7 +666,7 @@ xfs_qm_dqread(
xfs_trans_brelse(tp, bp);
if (tp) {
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
}
@@ -680,7 +676,7 @@ xfs_qm_dqread(
error1:
if (tp)
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
error0:
xfs_qm_dqdestroy(dqp);
*O_dqpp = NULL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 3ee186ac1093..74d0e5966ebc 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -127,11 +127,11 @@ xfs_error_report(
struct xfs_mount *mp,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level) {
xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
- "Internal error %s at line %d of file %s. Caller %pF",
+ "Internal error %s at line %d of file %s. Caller %pS",
tag, linenum, filename, ra);
xfs_stack_trace();
@@ -146,7 +146,7 @@ xfs_corruption_error(
void *p,
const char *filename,
int linenum,
- inst_t *ra)
+ void *ra)
{
if (level <= xfs_error_level)
xfs_hex_dump(p, 64);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 279a76e52791..4ed3042a0f16 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -21,10 +21,10 @@
struct xfs_mount;
extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum, inst_t *ra);
+ const char *filename, int linenum, void *ra);
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
- int linenum, inst_t *ra);
+ int linenum, void *ra);
extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
@@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
/*
* Macros to set EFSCORRUPTED & return/branch.
*/
-#define XFS_WANT_CORRUPTED_GOTO(x,l) \
+#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
{ \
int fs_is_ok = (x); \
ASSERT(fs_is_ok); \
if (unlikely(!fs_is_ok)) { \
XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
- XFS_ERRLEVEL_LOW, NULL); \
+ XFS_ERRLEVEL_LOW, mp); \
error = -EFSCORRUPTED; \
goto l; \
} \
}
-#define XFS_WANT_CORRUPTED_RETURN(x) \
+#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
{ \
int fs_is_ok = (x); \
ASSERT(fs_is_ok); \
if (unlikely(!fs_is_ok)) { \
XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
- XFS_ERRLEVEL_LOW, NULL); \
+ XFS_ERRLEVEL_LOW, mp); \
return -EFSCORRUPTED; \
} \
}
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index b97359ba2648..652cd3c5b58c 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -215,7 +215,7 @@ xfs_fs_get_parent(
int error;
struct xfs_inode *cip;
- error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
+ error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL);
if (unlikely(error))
return ERR_PTR(error);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index cb7fe64cdbfa..adc8f8fdd145 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -239,7 +239,7 @@ xfs_efi_init(
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents;
- efip->efi_format.efi_id = (__psint_t)(void*)efip;
+ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
atomic_set(&efip->efi_next_extent, 0);
atomic_set(&efip->efi_refcount, 2);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8a568b..db4acc1c3e73 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,10 +38,10 @@
#include "xfs_icache.h"
#include "xfs_pnfs.h"
-#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
+#include <linux/backing-dev.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -80,14 +80,15 @@ xfs_rw_ilock_demote(
}
/*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
*
- * xfs_iozero clears the specified range of buffer supplied,
- * and marks all the affected blocks as valid and modified. If
- * an affected block is not allocated, it will be allocated. If
- * an affected block is not completely overwritten, and is not
- * valid before the operation, it will be read from disk before
- * being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
*/
int
xfs_iozero(
@@ -97,7 +98,8 @@ xfs_iozero(
{
struct page *page;
struct address_space *mapping;
- int status;
+ int status = 0;
+
mapping = VFS_I(ip)->i_mapping;
do {
@@ -109,23 +111,30 @@ xfs_iozero(
if (bytes > count)
bytes = count;
- status = pagecache_write_begin(NULL, mapping, pos, bytes,
- AOP_FLAG_UNINTERRUPTIBLE,
- &page, &fsdata);
- if (status)
- break;
+ if (IS_DAX(VFS_I(ip))) {
+ status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+ xfs_get_blocks_direct);
+ if (status)
+ break;
+ } else {
+ status = pagecache_write_begin(NULL, mapping, pos, bytes,
+ AOP_FLAG_UNINTERRUPTIBLE,
+ &page, &fsdata);
+ if (status)
+ break;
- zero_user(page, offset, bytes);
+ zero_user(page, offset, bytes);
- status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
- page, fsdata);
- WARN_ON(status <= 0); /* can't return less than zero! */
+ status = pagecache_write_end(NULL, mapping, pos, bytes,
+ bytes, page, fsdata);
+ WARN_ON(status <= 0); /* can't return less than zero! */
+ status = 0;
+ }
pos += bytes;
count -= bytes;
- status = 0;
} while (count);
- return (-status);
+ return status;
}
int
@@ -139,7 +148,7 @@ xfs_update_prealloc_flags(
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -161,7 +170,7 @@ xfs_update_prealloc_flags(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (flags & XFS_PREALLOC_SYNC)
xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
/*
@@ -280,12 +289,12 @@ xfs_file_read_iter(
XFS_STATS_INC(xs_read_calls);
- if (unlikely(file->f_flags & O_DIRECT))
+ if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
- if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+ if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -379,7 +388,11 @@ xfs_file_splice_read(
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+ /* for dax, we need to avoid the page cache */
+ if (IS_DAX(VFS_I(ip)))
+ ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+ else
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -545,44 +558,74 @@ xfs_zero_eof(
*/
STATIC ssize_t
xfs_file_aio_write_checks(
- struct file *file,
- loff_t *pos,
- size_t *count,
+ struct kiocb *iocb,
+ struct iov_iter *from,
int *iolock)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int error = 0;
+ ssize_t error = 0;
+ size_t count = iov_iter_count(from);
restart:
- error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
- if (error)
+ error = generic_write_checks(iocb, from);
+ if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock);
+ error = xfs_break_layouts(inode, iolock, true);
if (error)
return error;
+ /* For changing security info in file_remove_privs() we need i_mutex */
+ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ goto restart;
+ }
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
* write. If zeroing is needed and we are currently holding the
* iolock shared, we need to update it to exclusive which implies
* having to redo all checks before.
+ *
+ * We need to serialise against EOF updates that occur in IO
+ * completions here. We want to make sure that nobody is changing the
+ * size while we do this check until we have placed an IO barrier (i.e.
+ * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+ * The spinlock effectively forms a memory barrier once we have the
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+ * and hence be able to correctly determine if we need to run zeroing.
*/
- if (*pos > i_size_read(inode)) {
+ spin_lock(&ip->i_flags_lock);
+ if (iocb->ki_pos > i_size_read(inode)) {
bool zero = false;
+ spin_unlock(&ip->i_flags_lock);
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+
+ /*
+ * We now have an IO submission barrier in place, but
+ * AIO can do EOF updates during IO completion and hence
+ * we now need to wait for all of them to drain. Non-AIO
+ * DIO will have drained before we are given the
+ * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+ * no-op.
+ */
+ inode_dio_wait(inode);
goto restart;
}
- error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
+ error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
if (error)
return error;
- }
+ } else
+ spin_unlock(&ip->i_flags_lock);
/*
* Updating the timestamps will grab the ilock again from
@@ -601,7 +644,9 @@ restart:
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
*/
- return file_remove_suid(file);
+ if (!IS_NOSEC(inode))
+ return file_remove_privs(file);
+ return 0;
}
/*
@@ -644,11 +689,13 @@ xfs_file_dio_aio_write(
int iolock;
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
+ loff_t end;
+ struct iov_iter data;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
/* DIO must be aligned to device logical sector size */
- if ((pos | count) & target->bt_logical_sectormask)
+ if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
return -EINVAL;
/* "unaligned" here means not aligned to a filesystem block */
@@ -679,14 +726,16 @@ xfs_file_dio_aio_write(
xfs_rw_ilock(ip, iolock);
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
- iov_iter_truncate(from, count);
+ count = iov_iter_count(from);
+ pos = iocb->ki_pos;
+ end = pos + count - 1;
if (mapping->nrpages) {
ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, pos + count - 1);
+ pos, end);
if (ret)
goto out;
/*
@@ -696,7 +745,7 @@ xfs_file_dio_aio_write(
*/
ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
pos >> PAGE_CACHE_SHIFT,
- (pos + count - 1) >> PAGE_CACHE_SHIFT);
+ end >> PAGE_CACHE_SHIFT);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -713,13 +762,30 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, from, pos);
+ data = *from;
+ ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+
+ /* see generic_file_direct_write() for why this is necessary */
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+ }
+
+ if (ret > 0) {
+ pos += ret;
+ iov_iter_advance(from, ret);
+ iocb->ki_pos = pos;
+ }
out:
xfs_rw_iunlock(ip, iolock);
- /* No fallback to buffered IO on errors for XFS. */
- ASSERT(ret < 0 || ret == count);
+ /*
+ * No fallback to buffered IO on errors for XFS. DAX can result in
+ * partial writes, but direct IO will either complete fully or fail.
+ */
+ ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
return ret;
}
@@ -735,24 +801,22 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- loff_t pos = iocb->ki_pos;
- size_t count = iov_iter_count(from);
xfs_rw_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
+ ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
- iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
write_retry:
- trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_perform_write(file, from, pos);
+ trace_xfs_file_buffered_write(ip, iov_iter_count(from),
+ iocb->ki_pos, 0);
+ ret = generic_perform_write(file, from, iocb->ki_pos);
if (likely(ret >= 0))
- iocb->ki_pos = pos + ret;
+ iocb->ki_pos += ret;
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -804,7 +868,7 @@ xfs_file_write_iter(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- if (unlikely(file->f_flags & O_DIRECT))
+ if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
ret = xfs_file_dio_aio_write(iocb, from);
else
ret = xfs_file_buffered_aio_write(iocb, from);
@@ -822,6 +886,11 @@ xfs_file_write_iter(
return ret;
}
+#define XFS_FALLOC_FL_SUPPORTED \
+ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
+ FALLOC_FL_INSERT_RANGE)
+
STATIC long
xfs_file_fallocate(
struct file *file,
@@ -835,18 +904,21 @@ xfs_file_fallocate(
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
loff_t new_size = 0;
+ bool do_file_insert = 0;
if (!S_ISREG(inode->i_mode))
return -EINVAL;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, false);
if (error)
goto out_unlock;
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock |= XFS_MMAPLOCK_EXCL;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -873,6 +945,27 @@ xfs_file_fallocate(
error = xfs_collapse_file_space(ip, offset, len);
if (error)
goto out_unlock;
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ new_size = i_size_read(inode) + len;
+ if (offset & blksize_mask || len & blksize_mask) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* check the new inode size does not wrap through zero */
+ if (new_size > inode->i_sb->s_maxbytes) {
+ error = -EFBIG;
+ goto out_unlock;
+ }
+
+ /* Offset should be less than i_size */
+ if (offset >= i_size_read(inode)) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+ do_file_insert = 1;
} else {
flags |= XFS_PREALLOC_SET;
@@ -907,8 +1000,19 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
error = xfs_setattr_size(ip, &iattr);
+ if (error)
+ goto out_unlock;
}
+ /*
+ * Perform hole insertion now that the file size has been
+ * updated so that if we crash during the operation we don't
+ * leave shifted extents past EOF and hence losing access to
+ * the data that is contained within them.
+ */
+ if (do_file_insert)
+ error = xfs_insert_file_space(ip, offset, len);
+
out_unlock:
xfs_iunlock(ip, iolock);
return error;
@@ -985,31 +1089,6 @@ xfs_file_readdir(
return xfs_readdir(ip, ctx, bufsize);
}
-STATIC int
-xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
-{
- vma->vm_ops = &xfs_file_vm_ops;
-
- file_accessed(filp);
- return 0;
-}
-
-/*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
- struct vm_area_struct *vma,
- struct vm_fault *vmf)
-{
- return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
/*
* This type is designed to indicate the type of offset we would like
* to search from page cache for xfs_seek_hole_data().
@@ -1385,10 +1464,101 @@ xfs_file_llseek(
}
}
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ * sb_start_pagefault(vfs, freeze)
+ * i_mmap_lock (XFS - truncate serialisation)
+ * page_lock (MM)
+ * i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
+
+ trace_xfs_filemap_page_mkwrite(XFS_I(inode));
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ if (IS_DAX(inode)) {
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ } else {
+ ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite_return(ret);
+ }
+
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+STATIC int
+xfs_filemap_fault(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int ret;
+
+ trace_xfs_filemap_fault(XFS_I(inode));
+
+ /* DAX can shortcut the normal fault path on write faults! */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
+ return xfs_filemap_page_mkwrite(vma, vmf);
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ if (IS_DAX(inode)) {
+ /*
+ * we do not want to trigger unwritten extent conversion on read
+ * faults - that is unnecessary overhead and would also require
+ * changes to xfs_get_blocks_direct() to map unwritten extent
+ * ioend for conversion on read-only mappings.
+ */
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+ } else
+ ret = filemap_fault(vma, vmf);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+ return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+ .fault = xfs_filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+ struct file *filp,
+ struct vm_area_struct *vma)
+{
+ file_accessed(filp);
+ vma->vm_ops = &xfs_file_vm_ops;
+ if (IS_DAX(file_inode(filp)))
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
+}
+
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = xfs_file_read_iter,
.write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
@@ -1415,9 +1585,3 @@ const struct file_operations xfs_dir_file_operations = {
#endif
.fsync = xfs_dir_fsync,
};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
- .fault = filemap_fault,
- .map_pages = filemap_map_pages,
- .page_mkwrite = xfs_vm_page_mkwrite,
-};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a2e86e8a0fea..c4c130f9bfb6 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -196,7 +196,8 @@ xfs_filestream_pick_ag(
goto next_ag;
}
- longest = xfs_alloc_longest_free_extent(mp, pag);
+ longest = xfs_alloc_longest_free_extent(mp, pag,
+ xfs_alloc_min_freelist(mp, pag));
if (((minlen && longest >= minlen) ||
(!minlen && pag->pagf_freeblks >= minfree)) &&
(!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
@@ -294,7 +295,7 @@ xfs_filestream_get_parent(
if (!parent)
goto out_dput;
- dir = igrab(parent->d_inode);
+ dir = igrab(d_inode(parent));
dput(parent);
out_dput:
@@ -322,7 +323,7 @@ xfs_filestream_lookup_ag(
pip = xfs_filestream_get_parent(ip);
if (!pip)
- goto out;
+ return NULLAGNUMBER;
mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
if (mru) {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 74efe5b760dc..9b3438a7680f 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -101,7 +101,9 @@ xfs_fs_geometry(
(xfs_sb_version_hasftype(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
(xfs_sb_version_hasfinobt(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0) |
+ (xfs_sb_version_hassparseinodes(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_SPINODES : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -201,7 +203,7 @@ xfs_growfs_data_private(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
XFS_GROWFS_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -489,7 +491,7 @@ xfs_growfs_data_private(
if (dpct)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
return error;
@@ -557,7 +559,7 @@ xfs_growfs_data_private(
return saved_error ? saved_error : error;
error0:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -637,12 +639,13 @@ xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
{
- xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
+ cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
+ cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+ XFS_ALLOC_SET_ASIDE(mp);
+
spin_lock(&mp->m_sb_lock);
- cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
cnt->freertx = mp->m_sb.sb_frextents;
- cnt->freeino = mp->m_sb.sb_ifree;
- cnt->allocino = mp->m_sb.sb_icount;
spin_unlock(&mp->m_sb_lock);
return 0;
}
@@ -692,14 +695,9 @@ xfs_reserve_blocks(
* what to do. This means that the amount of free space can
* change while we do this, so we need to retry if we end up
* trying to reserve more space than is available.
- *
- * We also use the xfs_mod_incore_sb() interface so that we
- * don't have to care about whether per cpu counter are
- * enabled, disabled or even compiled in....
*/
retry:
spin_lock(&mp->m_sb_lock);
- xfs_icsb_sync_counters_locked(mp, 0);
/*
* If our previous reservation was larger than the current value,
@@ -716,7 +714,8 @@ retry:
} else {
__int64_t free;
- free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ XFS_ALLOC_SET_ASIDE(mp);
if (!free)
goto out; /* ENOSPC and fdblks_delta = 0 */
@@ -755,8 +754,7 @@ out:
* the extra reserve blocks from the reserve.....
*/
int error;
- error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- fdblks_delta, 0);
+ error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
if (error == -ENOSPC)
goto retry;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9771b7ef62ed..76a9f2783282 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -439,11 +439,11 @@ again:
*ipp = ip;
/*
- * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * If we have a real type for an on-disk inode, we can setup the inode
* now. If it's a new inode being created, xfs_ialloc will handle it.
*/
if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
- xfs_setup_inode(ip);
+ xfs_setup_existing_inode(ip);
return 0;
out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6163767aa856..3da9f4da4f3d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
}
/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
+ * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
+ * the i_lock. This routine allows various combinations of the locks to be
+ * obtained.
*
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
*
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- * to be locked. It can be:
- * XFS_IOLOCK_SHARED,
- * XFS_IOLOCK_EXCL,
- * XFS_ILOCK_SHARED,
- * XFS_ILOCK_EXCL,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order mean that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
*/
void
xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
else if (lock_flags & XFS_IOLOCK_SHARED)
mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_MMAPLOCK_SHARED)
+ mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
if (lock_flags & XFS_ILOCK_EXCL)
mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
if (!mrtryaccess(&ip->i_iolock))
goto out;
}
+
+ if (lock_flags & XFS_MMAPLOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_mmaplock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_mmaplock))
+ goto out_undo_iolock;
+ }
+
if (lock_flags & XFS_ILOCK_EXCL) {
if (!mrtryupdate(&ip->i_lock))
- goto out_undo_iolock;
+ goto out_undo_mmaplock;
} else if (lock_flags & XFS_ILOCK_SHARED) {
if (!mrtryaccess(&ip->i_lock))
- goto out_undo_iolock;
+ goto out_undo_mmaplock;
}
return 1;
- out_undo_iolock:
+out_undo_mmaplock:
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ mrunlock_excl(&ip->i_mmaplock);
+ else if (lock_flags & XFS_MMAPLOCK_SHARED)
+ mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
if (lock_flags & XFS_IOLOCK_EXCL)
mrunlock_excl(&ip->i_iolock);
else if (lock_flags & XFS_IOLOCK_SHARED)
mrunlock_shared(&ip->i_iolock);
- out:
+out:
return 0;
}
@@ -244,6 +277,8 @@ xfs_iunlock(
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
else if (lock_flags & XFS_IOLOCK_SHARED)
mrunlock_shared(&ip->i_iolock);
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ mrunlock_excl(&ip->i_mmaplock);
+ else if (lock_flags & XFS_MMAPLOCK_SHARED)
+ mrunlock_shared(&ip->i_mmaplock);
+
if (lock_flags & XFS_ILOCK_EXCL)
mrunlock_excl(&ip->i_lock);
else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
xfs_inode_t *ip,
uint lock_flags)
{
- ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags &
+ ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
if (lock_flags & XFS_ILOCK_EXCL)
mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_MMAPLOCK_EXCL)
+ mrdemote(&ip->i_mmaplock);
if (lock_flags & XFS_IOLOCK_EXCL)
mrdemote(&ip->i_iolock);
@@ -294,6 +337,12 @@ xfs_isilocked(
return rwsem_is_locked(&ip->i_lock.mr_lock);
}
+ if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+ if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+ return !!ip->i_mmaplock.mr_writer;
+ return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+ }
+
if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
if (!(lock_flags & XFS_IOLOCK_SHARED))
return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
#endif
/*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
*/
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+ ASSERT(subclass + XFS_LOCK_INUMORDER <
+ (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ }
+
+ if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+ ASSERT(subclass + XFS_LOCK_INUMORDER <
+ (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+ lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+ XFS_MMAPLOCK_SHIFT;
+ }
+
if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
@@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass)
}
/*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
+ * The following routine will lock n inodes in exclusive mode. We assume the
+ * caller calls us with the inodes in i_ino order.
*
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
+ * We need to detect deadlock where an inode that we lock is in the AIL and we
+ * start waiting for another inode that is locked by a thread in a long running
+ * transaction (such as truncate). This can result in deadlock since the long
+ * running trans might need to wait for the inode we just locked in order to
+ * push the tail and free space in the log.
*/
void
xfs_lock_inodes(
@@ -348,30 +409,27 @@ xfs_lock_inodes(
int attempts = 0, i, j, try_lock;
xfs_log_item_t *lp;
- ASSERT(ips && (inodes >= 2)); /* we need at least two */
+ /* currently supports between 2 and 5 inodes */
+ ASSERT(ips && inodes >= 2 && inodes <= 5);
try_lock = 0;
i = 0;
-
again:
for (; i < inodes; i++) {
ASSERT(ips[i]);
- if (i && (ips[i] == ips[i-1])) /* Already locked */
+ if (i && (ips[i] == ips[i - 1])) /* Already locked */
continue;
/*
- * If try_lock is not set yet, make sure all locked inodes
- * are not in the AIL.
- * If any are, set try_lock to be used later.
+ * If try_lock is not set yet, make sure all locked inodes are
+ * not in the AIL. If any are, set try_lock to be used later.
*/
-
if (!try_lock) {
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = (xfs_log_item_t *)ips[j]->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL))
try_lock++;
- }
}
}
@@ -381,51 +439,42 @@ again:
* we can't get any, we must release all we have
* and try again.
*/
+ if (!try_lock) {
+ xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+ continue;
+ }
+
+ /* try_lock means we have an inode locked that is in the AIL. */
+ ASSERT(i != 0);
+ if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
+ continue;
- if (try_lock) {
- /* try_lock must be 0 if i is 0. */
+ /*
+ * Unlock all previous guys and try again. xfs_iunlock will try
+ * to push the tail if the inode is in the AIL.
+ */
+ attempts++;
+ for (j = i - 1; j >= 0; j--) {
/*
- * try_lock means we have an inode locked
- * that is in the AIL.
+ * Check to see if we've already unlocked this one. Not
+ * the first one going back, and the inode ptr is the
+ * same.
*/
- ASSERT(i != 0);
- if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
- attempts++;
-
- /*
- * Unlock all previous guys and try again.
- * xfs_iunlock will try to push the tail
- * if the inode is in the AIL.
- */
-
- for(j = i - 1; j >= 0; j--) {
-
- /*
- * Check to see if we've already
- * unlocked this one.
- * Not the first one going back,
- * and the inode ptr is the same.
- */
- if ((j != (i - 1)) && ips[j] ==
- ips[j+1])
- continue;
-
- xfs_iunlock(ips[j], lock_mode);
- }
+ if (j != (i - 1) && ips[j] == ips[j + 1])
+ continue;
- if ((attempts % 5) == 0) {
- delay(1); /* Don't just spin the CPU */
+ xfs_iunlock(ips[j], lock_mode);
+ }
+
+ if ((attempts % 5) == 0) {
+ delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
- xfs_lock_delays++;
+ xfs_lock_delays++;
#endif
- }
- i = 0;
- try_lock = 0;
- goto again;
- }
- } else {
- xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
}
+ i = 0;
+ try_lock = 0;
+ goto again;
}
#ifdef DEBUG
@@ -440,10 +489,10 @@ again:
}
/*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
*/
void
xfs_lock_two_inodes(
@@ -455,8 +504,12 @@ xfs_lock_two_inodes(
int attempts = 0;
xfs_log_item_t *lp;
- if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
- ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+ ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+ ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+ } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+ ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
ASSERT(ip0->i_ino != ip1->i_ino);
if (ip0->i_ino > ip1->i_ino) {
@@ -818,7 +871,7 @@ xfs_ialloc(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, flags);
- /* now that we have an i_mode we can setup inode ops and unlock */
+ /* now that we have an i_mode we can setup the inode structure */
xfs_setup_inode(ip);
*ipp = ip;
@@ -852,7 +905,6 @@ xfs_dir_ialloc(
{
xfs_trans_t *tp;
- xfs_trans_t *ntp;
xfs_inode_t *ip;
xfs_buf_t *ialloc_context = NULL;
int code;
@@ -901,8 +953,6 @@ xfs_dir_ialloc(
* to succeed the second time.
*/
if (ialloc_context) {
- struct xfs_trans_res tres;
-
/*
* Normally, xfs_trans_commit releases all the locks.
* We call bhold to hang on to the ialloc_context across
@@ -911,12 +961,6 @@ xfs_dir_ialloc(
* allocation group.
*/
xfs_trans_bhold(tp, ialloc_context);
- /*
- * Save the log reservation so we can use
- * them in the next transaction.
- */
- tres.tr_logres = xfs_trans_get_log_res(tp);
- tres.tr_logcount = xfs_trans_get_log_count(tp);
/*
* We want the quota changes to be associated with the next
@@ -932,35 +976,9 @@ xfs_dir_ialloc(
tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
}
- ntp = xfs_trans_dup(tp);
- code = xfs_trans_commit(tp, 0);
- tp = ntp;
- if (committed != NULL) {
+ code = xfs_trans_roll(&tp, 0);
+ if (committed != NULL)
*committed = 1;
- }
- /*
- * If we get an error during the commit processing,
- * release the buffer that is still held and return
- * to the caller.
- */
- if (code) {
- xfs_buf_relse(ialloc_context);
- if (dqinfo) {
- tp->t_dqinfo = dqinfo;
- xfs_trans_free_dqinfo(tp);
- }
- *tpp = ntp;
- *ipp = NULL;
- return code;
- }
-
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- code = xfs_trans_reserve(tp, &tres, 0, 0);
/*
* Re-attach the quota info that we detached from prev trx.
@@ -972,7 +990,7 @@ xfs_dir_ialloc(
if (code) {
xfs_buf_relse(ialloc_context);
- *tpp = ntp;
+ *tpp = tp;
*ipp = NULL;
return code;
}
@@ -1074,7 +1092,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- uint cancel_flags;
int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
@@ -1111,8 +1128,6 @@ xfs_create(
tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
}
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-
/*
* Initially assume that the file does not exist and
* reserve the resources for that case. If that is not
@@ -1130,10 +1145,9 @@ xfs_create(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
+
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -1164,7 +1178,7 @@ xfs_create(
if (error) {
if (error == -ENOSPC)
goto out_trans_cancel;
- goto out_trans_abort;
+ goto out_trans_cancel;
}
/*
@@ -1182,7 +1196,7 @@ xfs_create(
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
if (error) {
ASSERT(error != -ENOSPC);
- goto out_trans_abort;
+ goto out_trans_cancel;
}
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -1216,7 +1230,7 @@ xfs_create(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1229,18 +1243,18 @@ xfs_create(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
- * Wait until after the current transaction is aborted to
- * release the inode. This prevents recursive transactions
- * and deadlocks from xfs_inactive.
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
*/
- if (ip)
+ if (ip) {
+ xfs_finish_inode_setup(ip);
IRELE(ip);
+ }
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1262,7 +1276,6 @@ xfs_create_tmpfile(
struct xfs_inode *ip = NULL;
struct xfs_trans *tp = NULL;
int error;
- uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1295,10 +1308,8 @@ xfs_create_tmpfile(
resblks = 0;
error = xfs_trans_reserve(tp, tres, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto out_trans_cancel;
- }
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
@@ -1310,7 +1321,7 @@ xfs_create_tmpfile(
if (error) {
if (error == -ENOSPC)
goto out_trans_cancel;
- goto out_trans_abort;
+ goto out_trans_cancel;
}
if (mp->m_flags & XFS_MOUNT_WSYNC)
@@ -1326,9 +1337,9 @@ xfs_create_tmpfile(
ip->i_d.di_nlink--;
error = xfs_iunlink(tp, ip);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_release_inode;
@@ -1339,18 +1350,18 @@ xfs_create_tmpfile(
*ipp = ip;
return 0;
- out_trans_abort:
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
out_release_inode:
/*
- * Wait until after the current transaction is aborted to
- * release the inode. This prevents recursive transactions
- * and deadlocks from xfs_inactive.
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
*/
- if (ip)
+ if (ip) {
+ xfs_finish_inode_setup(ip);
IRELE(ip);
+ }
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1370,7 +1381,6 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
int resblks;
@@ -1390,17 +1400,14 @@ xfs_link(
goto std_return;
tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
if (error == -ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
}
- if (error) {
- cancel_flags = 0;
+ if (error)
goto error_return;
- }
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
@@ -1429,19 +1436,19 @@ xfs_link(
if (sip->i_d.di_nlink == 0) {
error = xfs_iunlink_remove(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
}
error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
&first_block, &free_list, resblks);
if (error)
- goto abort_return;
+ goto error_return;
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
error = xfs_bumplink(tp, sip);
if (error)
- goto abort_return;
+ goto error_return;
/*
* If this is a synchronous mount, make sure that the
@@ -1455,15 +1462,13 @@ xfs_link(
error = xfs_bmap_finish (&tp, &free_list, &committed);
if (error) {
xfs_bmap_cancel(&free_list);
- goto abort_return;
+ goto error_return;
}
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return xfs_trans_commit(tp);
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
error_return:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -1498,7 +1503,6 @@ xfs_itruncate_extents(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
- struct xfs_trans *ntp;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
xfs_fileoff_t first_unmap_block;
@@ -1556,29 +1560,7 @@ xfs_itruncate_extents(
if (error)
goto out_bmap_cancel;
- if (committed) {
- /*
- * Mark the inode dirty so it will be logged and
- * moved forward in the log as part of every commit.
- */
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
-
- ntp = xfs_trans_dup(tp);
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
-
- xfs_trans_ijoin(tp, ip, 0);
-
- if (error)
- goto out;
-
- /*
- * Transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ error = xfs_trans_roll(&tp, ip);
if (error)
goto out;
}
@@ -1699,7 +1681,7 @@ xfs_inactive_truncate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1720,7 +1702,7 @@ xfs_inactive_truncate(
ASSERT(ip->i_d.di_nextents == 0);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error_unlock;
@@ -1728,7 +1710,7 @@ xfs_inactive_truncate(
return 0;
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -1778,7 +1760,7 @@ xfs_inactive_ifree(
} else {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1798,7 +1780,7 @@ xfs_inactive_ifree(
__func__, error);
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
}
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1817,7 +1799,7 @@ xfs_inactive_ifree(
if (error)
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
__func__, error);
@@ -1889,21 +1871,17 @@ xfs_inactive(
/*
* If there are attributes associated with the file then blow them away
* now. The code calls a routine that recursively deconstructs the
- * attribute fork. We need to just commit the current transaction
- * because we can't use it for xfs_attr_inactive().
+ * attribute fork. If also blows away the in-core attribute fork.
*/
- if (ip->i_d.di_anextents > 0) {
- ASSERT(ip->i_d.di_forkoff != 0);
-
+ if (XFS_IFORK_Q(ip)) {
error = xfs_attr_inactive(ip);
if (error)
return;
}
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
+ ASSERT(!ip->i_afp);
ASSERT(ip->i_d.di_anextents == 0);
+ ASSERT(ip->i_d.di_forkoff == 0);
/*
* Free the inode.
@@ -2182,28 +2160,42 @@ xfs_iunlink_remove(
*/
STATIC int
xfs_ifree_cluster(
- xfs_inode_t *free_ip,
- xfs_trans_t *tp,
- xfs_ino_t inum)
+ xfs_inode_t *free_ip,
+ xfs_trans_t *tp,
+ struct xfs_icluster *xic)
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
int inodes_per_cluster;
int nbufs;
int i, j;
+ int ioffset;
xfs_daddr_t blkno;
xfs_buf_t *bp;
xfs_inode_t *ip;
xfs_inode_log_item_t *iip;
xfs_log_item_t *lip;
struct xfs_perag *pag;
+ xfs_ino_t inum;
+ inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
blks_per_cluster = xfs_icluster_size_fsb(mp);
inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
nbufs = mp->m_ialloc_blks / blks_per_cluster;
for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
+ /*
+ * The allocation bitmap tells us which inodes of the chunk were
+ * physically allocated. Skip the cluster if an inode falls into
+ * a sparse region.
+ */
+ ioffset = inum - xic->first_ino;
+ if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
+ ASSERT(do_mod(ioffset, inodes_per_cluster) == 0);
+ continue;
+ }
+
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -2361,8 +2353,7 @@ xfs_ifree(
xfs_bmap_free_t *flist)
{
int error;
- int delete;
- xfs_ino_t first_ino;
+ struct xfs_icluster xic = { 0 };
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(ip->i_d.di_nlink == 0);
@@ -2378,7 +2369,7 @@ xfs_ifree(
if (error)
return error;
- error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
+ error = xfs_difree(tp, ip->i_ino, flist, &xic);
if (error)
return error;
@@ -2395,8 +2386,8 @@ xfs_ifree(
ip->i_d.di_gen++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (delete)
- error = xfs_ifree_cluster(ip, tp, first_ino);
+ if (xic.deleted)
+ error = xfs_ifree_cluster(ip, tp, &xic);
return error;
}
@@ -2483,7 +2474,6 @@ xfs_remove(
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int cancel_flags;
int committed;
uint resblks;
@@ -2504,7 +2494,6 @@ xfs_remove(
tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
else
tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* We try to get the real space reservation first,
@@ -2523,7 +2512,6 @@ xfs_remove(
}
if (error) {
ASSERT(error != -ENOSPC);
- cancel_flags = 0;
goto out_trans_cancel;
}
@@ -2535,7 +2523,6 @@ xfs_remove(
/*
* If we're removing a directory perform some additional validation.
*/
- cancel_flags |= XFS_TRANS_ABORT;
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
@@ -2591,7 +2578,7 @@ xfs_remove(
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto std_return;
@@ -2603,7 +2590,7 @@ xfs_remove(
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
out_trans_cancel:
- xfs_trans_cancel(tp, cancel_flags);
+ xfs_trans_cancel(tp);
std_return:
return error;
}
@@ -2611,19 +2598,22 @@ xfs_remove(
/*
* Enter all inodes for a rename transaction into a sorted array.
*/
+#define __XFS_SORT_INODES 5
STATIC void
xfs_sort_for_rename(
- xfs_inode_t *dp1, /* in: old (source) directory inode */
- xfs_inode_t *dp2, /* in: new (target) directory inode */
- xfs_inode_t *ip1, /* in: inode of old entry */
- xfs_inode_t *ip2, /* in: inode of new entry, if it
- already exists, NULL otherwise. */
- xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
- int *num_inodes) /* out: number of inodes in array */
+ struct xfs_inode *dp1, /* in: old (source) directory inode */
+ struct xfs_inode *dp2, /* in: new (target) directory inode */
+ struct xfs_inode *ip1, /* in: inode of old entry */
+ struct xfs_inode *ip2, /* in: inode of new entry */
+ struct xfs_inode *wip, /* in: whiteout inode */
+ struct xfs_inode **i_tab,/* out: sorted array of inodes */
+ int *num_inodes) /* in/out: inodes in array */
{
- xfs_inode_t *temp;
int i, j;
+ ASSERT(*num_inodes == __XFS_SORT_INODES);
+ memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
+
/*
* i_tab contains a list of pointers to inodes. We initialize
* the table here & we'll sort it. We will then use it to
@@ -2631,25 +2621,24 @@ xfs_sort_for_rename(
*
* Note that the table may contain duplicates. e.g., dp1 == dp2.
*/
- i_tab[0] = dp1;
- i_tab[1] = dp2;
- i_tab[2] = ip1;
- if (ip2) {
- *num_inodes = 4;
- i_tab[3] = ip2;
- } else {
- *num_inodes = 3;
- i_tab[3] = NULL;
- }
+ i = 0;
+ i_tab[i++] = dp1;
+ i_tab[i++] = dp2;
+ i_tab[i++] = ip1;
+ if (ip2)
+ i_tab[i++] = ip2;
+ if (wip)
+ i_tab[i++] = wip;
+ *num_inodes = i;
/*
* Sort the elements via bubble sort. (Remember, there are at
- * most 4 elements to sort, so this is adequate.)
+ * most 5 elements to sort, so this is adequate.)
*/
for (i = 0; i < *num_inodes; i++) {
for (j = 1; j < *num_inodes; j++) {
if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
- temp = i_tab[j];
+ struct xfs_inode *temp = i_tab[j];
i_tab[j] = i_tab[j-1];
i_tab[j-1] = temp;
}
@@ -2657,6 +2646,31 @@ xfs_sort_for_rename(
}
}
+static int
+xfs_finish_rename(
+ struct xfs_trans *tp,
+ struct xfs_bmap_free *free_list)
+{
+ int committed = 0;
+ int error;
+
+ /*
+ * If this is a synchronous mount, make sure that the rename transaction
+ * goes to disk before returning to the user.
+ */
+ if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+ xfs_trans_set_sync(tp);
+
+ error = xfs_bmap_finish(&tp, free_list, &committed);
+ if (error) {
+ xfs_bmap_cancel(free_list);
+ xfs_trans_cancel(tp);
+ return error;
+ }
+
+ return xfs_trans_commit(tp);
+}
+
/*
* xfs_cross_rename()
*
@@ -2685,14 +2699,14 @@ xfs_cross_rename(
ip2->i_ino,
first_block, free_list, spaceres);
if (error)
- goto out;
+ goto out_trans_abort;
/* Swap inode number for dirent in second parent */
error = xfs_dir_replace(tp, dp2, name2,
ip1->i_ino,
first_block, free_list, spaceres);
if (error)
- goto out;
+ goto out_trans_abort;
/*
* If we're renaming one or more directories across different parents,
@@ -2707,16 +2721,16 @@ xfs_cross_rename(
dp1->i_ino, first_block,
free_list, spaceres);
if (error)
- goto out;
+ goto out_trans_abort;
/* transfer ip2 ".." reference to dp1 */
if (!S_ISDIR(ip1->i_d.di_mode)) {
error = xfs_droplink(tp, dp2);
if (error)
- goto out;
+ goto out_trans_abort;
error = xfs_bumplink(tp, dp1);
if (error)
- goto out;
+ goto out_trans_abort;
}
/*
@@ -2734,16 +2748,16 @@ xfs_cross_rename(
dp2->i_ino, first_block,
free_list, spaceres);
if (error)
- goto out;
+ goto out_trans_abort;
/* transfer ip1 ".." reference to dp2 */
if (!S_ISDIR(ip2->i_d.di_mode)) {
error = xfs_droplink(tp, dp1);
if (error)
- goto out;
+ goto out_trans_abort;
error = xfs_bumplink(tp, dp2);
if (error)
- goto out;
+ goto out_trans_abort;
}
/*
@@ -2771,66 +2785,112 @@ xfs_cross_rename(
}
xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
-out:
+ return xfs_finish_rename(tp, free_list);
+
+out_trans_abort:
+ xfs_bmap_cancel(free_list);
+ xfs_trans_cancel(tp);
return error;
}
/*
+ * xfs_rename_alloc_whiteout()
+ *
+ * Return a referenced, unlinked, unlocked inode that that can be used as a
+ * whiteout in a rename transaction. We use a tmpfile inode here so that if we
+ * crash between allocating the inode and linking it into the rename transaction
+ * recovery will free the inode and we won't leak it.
+ */
+static int
+xfs_rename_alloc_whiteout(
+ struct xfs_inode *dp,
+ struct xfs_inode **wip)
+{
+ struct xfs_inode *tmpfile;
+ int error;
+
+ error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+ if (error)
+ return error;
+
+ /*
+ * Prepare the tmpfile inode as if it were created through the VFS.
+ * Otherwise, the link increment paths will complain about nlink 0->1.
+ * Drop the link count as done by d_tmpfile(), complete the inode setup
+ * and flag it as linkable.
+ */
+ drop_nlink(VFS_I(tmpfile));
+ xfs_finish_inode_setup(tmpfile);
+ VFS_I(tmpfile)->i_state |= I_LINKABLE;
+
+ *wip = tmpfile;
+ return 0;
+}
+
+/*
* xfs_rename
*/
int
xfs_rename(
- xfs_inode_t *src_dp,
- struct xfs_name *src_name,
- xfs_inode_t *src_ip,
- xfs_inode_t *target_dp,
- struct xfs_name *target_name,
- xfs_inode_t *target_ip,
- unsigned int flags)
+ struct xfs_inode *src_dp,
+ struct xfs_name *src_name,
+ struct xfs_inode *src_ip,
+ struct xfs_inode *target_dp,
+ struct xfs_name *target_name,
+ struct xfs_inode *target_ip,
+ unsigned int flags)
{
- xfs_trans_t *tp = NULL;
- xfs_mount_t *mp = src_dp->i_mount;
- int new_parent; /* moving to a new dir */
- int src_is_directory; /* src_name is a directory */
- int error;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int cancel_flags;
- int committed;
- xfs_inode_t *inodes[4];
- int spaceres;
- int num_inodes;
+ struct xfs_mount *mp = src_dp->i_mount;
+ struct xfs_trans *tp;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t first_block;
+ struct xfs_inode *wip = NULL; /* whiteout inode */
+ struct xfs_inode *inodes[__XFS_SORT_INODES];
+ int num_inodes = __XFS_SORT_INODES;
+ bool new_parent = (src_dp != target_dp);
+ bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+ int spaceres;
+ int error;
trace_xfs_rename(src_dp, target_dp, src_name, target_name);
- new_parent = (src_dp != target_dp);
- src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+ if ((flags & RENAME_EXCHANGE) && !target_ip)
+ return -EINVAL;
- xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+ /*
+ * If we are doing a whiteout operation, allocate the whiteout inode
+ * we will be placing at the target and ensure the type is set
+ * appropriately.
+ */
+ if (flags & RENAME_WHITEOUT) {
+ ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
+ error = xfs_rename_alloc_whiteout(target_dp, &wip);
+ if (error)
+ return error;
+
+ /* setup target dirent info as whiteout */
+ src_name->type = XFS_DIR3_FT_CHRDEV;
+ }
+
+ xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
- xfs_bmap_init(&free_list, &first_block);
tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
if (error == -ENOSPC) {
spaceres = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
}
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto std_return;
- }
+ if (error)
+ goto out_trans_cancel;
/*
* Attach the dquots to the inodes
*/
error = xfs_qm_vop_rename_dqattach(inodes);
- if (error) {
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
- }
+ if (error)
+ goto out_trans_cancel;
/*
* Lock all the participating inodes. Depending upon whether
@@ -2851,6 +2911,8 @@ xfs_rename(
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+ if (wip)
+ xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow renames
@@ -2860,24 +2922,16 @@ xfs_rename(
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
(xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
error = -EXDEV;
- goto error_return;
+ goto out_trans_cancel;
}
- /*
- * Handle RENAME_EXCHANGE flags
- */
- if (flags & RENAME_EXCHANGE) {
- if (target_ip == NULL) {
- error = -EINVAL;
- goto error_return;
- }
- error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
- target_dp, target_name, target_ip,
- &free_list, &first_block, spaceres);
- if (error)
- goto abort_return;
- goto finish_rename;
- }
+ xfs_bmap_init(&free_list, &first_block);
+
+ /* RENAME_EXCHANGE is unique from here on. */
+ if (flags & RENAME_EXCHANGE)
+ return xfs_cross_rename(tp, src_dp, src_name, src_ip,
+ target_dp, target_name, target_ip,
+ &free_list, &first_block, spaceres);
/*
* Set up the target.
@@ -2890,7 +2944,7 @@ xfs_rename(
if (!spaceres) {
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
- goto error_return;
+ goto out_trans_cancel;
}
/*
* If target does not exist and the rename crosses
@@ -2900,10 +2954,8 @@ xfs_rename(
error = xfs_dir_createname(tp, target_dp, target_name,
src_ip->i_ino, &first_block,
&free_list, spaceres);
- if (error == -ENOSPC)
- goto error_return;
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2911,7 +2963,7 @@ xfs_rename(
if (new_parent && src_is_directory) {
error = xfs_bumplink(tp, target_dp);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
}
} else { /* target_ip != NULL */
/*
@@ -2926,7 +2978,7 @@ xfs_rename(
if (!(xfs_dir_isempty(target_ip)) ||
(target_ip->i_d.di_nlink > 2)) {
error = -EEXIST;
- goto error_return;
+ goto out_trans_cancel;
}
}
@@ -2943,7 +2995,7 @@ xfs_rename(
src_ip->i_ino,
&first_block, &free_list, spaceres);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, target_dp,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2954,7 +3006,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
if (src_is_directory) {
/*
@@ -2962,7 +3014,7 @@ xfs_rename(
*/
error = xfs_droplink(tp, target_ip);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
}
} /* target_ip != NULL */
@@ -2979,7 +3031,7 @@ xfs_rename(
&first_block, &free_list, spaceres);
ASSERT(error != -EEXIST);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
}
/*
@@ -3005,49 +3057,65 @@ xfs_rename(
*/
error = xfs_droplink(tp, src_dp);
if (error)
- goto abort_return;
+ goto out_bmap_cancel;
}
- error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ /*
+ * For whiteouts, we only need to update the source dirent with the
+ * inode number of the whiteout inode rather than removing it
+ * altogether.
+ */
+ if (wip) {
+ error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
&first_block, &free_list, spaceres);
+ } else
+ error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+ &first_block, &free_list, spaceres);
if (error)
- goto abort_return;
-
- xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
- if (new_parent)
- xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+ goto out_bmap_cancel;
-finish_rename:
/*
- * If this is a synchronous mount, make sure that the
- * rename transaction goes to disk before returning to
- * the user.
+ * For whiteouts, we need to bump the link count on the whiteout inode.
+ * This means that failures all the way up to this point leave the inode
+ * on the unlinked list and so cleanup is a simple matter of dropping
+ * the remaining reference to it. If we fail here after bumping the link
+ * count, we're shutting down the filesystem so we'll never see the
+ * intermediate state on disk.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
+ if (wip) {
+ ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
+ error = xfs_bumplink(tp, wip);
+ if (error)
+ goto out_bmap_cancel;
+ error = xfs_iunlink_remove(tp, wip);
+ if (error)
+ goto out_bmap_cancel;
+ xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
- goto std_return;
+ /*
+ * Now we have a real link, clear the "I'm a tmpfile" state
+ * flag from the inode so it doesn't accidentally get misused in
+ * future.
+ */
+ VFS_I(wip)->i_state &= ~I_LINKABLE;
}
- /*
- * trans_commit will unlock src_ip, target_ip & decrement
- * the vnode references.
- */
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+ if (new_parent)
+ xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
- abort_return:
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
+ error = xfs_finish_rename(tp, &free_list);
+ if (wip)
+ IRELE(wip);
+ return error;
+
+out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, cancel_flags);
- std_return:
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ if (wip)
+ IRELE(wip);
return error;
}
@@ -3324,7 +3392,7 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
+ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a1cd55f3f351..8f22d20368d8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
+ mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
#define XFS_IOLOCK_SHARED (1<<1)
#define XFS_ILOCK_EXCL (1<<2)
#define XFS_ILOCK_SHARED (1<<3)
+#define XFS_MMAPLOCK_EXCL (1<<4)
+#define XFS_MMAPLOCK_SHARED (1<<5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
- | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+ | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+ | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
#define XFS_LOCK_FLAGS \
{ XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
{ XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
{ XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
- { XFS_ILOCK_SHARED, "ILOCK_SHARED" }
+ { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
+ { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
+ { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
/*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_SHIFT 20
+
#define XFS_ILOCK_SHIFT 24
#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
-#define XFS_IOLOCK_DEP_MASK 0x00ff0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
#define XFS_ILOCK_DEP_MASK 0xff000000
-#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
+#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
+ XFS_MMAPLOCK_DEP_MASK | \
+ XFS_ILOCK_DEP_MASK)
-#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
-#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \
+ >> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
+ >> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
+ >> XFS_ILOCK_SHIFT)
/*
* For multiple groups support: if S_ISGID bit is set in the parent
@@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+/* from xfs_iops.c */
+/*
+ * When setting up a newly allocated inode, we need to call
+ * xfs_finish_inode_setup() once the inode is fully instantiated at
+ * the VFS level to prevent the rest of the world seeing the inode
+ * before we've completed instantiation. Otherwise we can do it
+ * the moment the inode lookup is complete.
+ */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_INEW);
+ barrier();
+ unlock_new_inode(VFS_I(ip));
+}
+
+static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
+{
+ xfs_setup_inode(ip);
+ xfs_finish_inode_setup(ip);
+}
+
#define IHOLD(ip) \
do { \
ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae45eb3..ea7d85af5310 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -82,7 +82,7 @@ xfs_find_handle(
error = user_lpath((const char __user *)hreq->path, &path);
if (error)
return error;
- inode = path.dentry->d_inode;
+ inode = d_inode(path.dentry);
}
ip = XFS_I(inode);
@@ -210,7 +210,7 @@ xfs_open_by_handle(
dentry = xfs_handlereq_to_dentry(parfilp, hreq);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- inode = dentry->d_inode;
+ inode = d_inode(dentry);
/* Restrict xfs_open_by_handle to directories & regular files. */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
@@ -303,7 +303,7 @@ xfs_readlink_by_handle(
goto out_dput;
}
- error = xfs_readlink(XFS_I(dentry->d_inode), link);
+ error = xfs_readlink(XFS_I(d_inode(dentry)), link);
if (error)
goto out_kfree;
error = readlink_copy(hreq->ohandle, olen, link);
@@ -336,7 +336,7 @@ xfs_set_dmattrs(
tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -346,7 +346,7 @@ xfs_set_dmattrs(
ip->i_d.di_dmstate = state;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
}
@@ -376,7 +376,7 @@ xfs_fssetdm_by_handle(
return PTR_ERR(dentry);
}
- if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+ if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
error = -EPERM;
goto out;
}
@@ -386,7 +386,7 @@ xfs_fssetdm_by_handle(
goto out;
}
- error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
fsd.fsd_dmstate);
out:
@@ -429,7 +429,7 @@ xfs_attrlist_by_handle(
goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
al_hreq.flags, cursor);
if (error)
goto out_kfree;
@@ -559,7 +559,7 @@ xfs_attrmulti_by_handle(
switch (ops[i].am_opcode) {
case ATTR_OP_GET:
ops[i].am_error = xfs_attrmulti_attr_get(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
ops[i].am_attrvalue, &ops[i].am_length,
ops[i].am_flags);
break;
@@ -568,7 +568,7 @@ xfs_attrmulti_by_handle(
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
ops[i].am_attrvalue, ops[i].am_length,
ops[i].am_flags);
mnt_drop_write_file(parfilp);
@@ -578,7 +578,7 @@ xfs_attrmulti_by_handle(
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
ops[i].am_flags);
mnt_drop_write_file(parfilp);
break;
@@ -631,7 +631,7 @@ xfs_ioc_space(
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
- if (ioflags & XFS_IO_INVIS)
+ if (ioflags & XFS_IO_INVIS)
flags |= XFS_PREALLOC_INVISIBLE;
error = mnt_want_write_file(filp);
@@ -639,10 +639,13 @@ xfs_ioc_space(
return error;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, false);
if (error)
goto out_unlock;
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock |= XFS_MMAPLOCK_EXCL;
+
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1073,7 +1076,7 @@ xfs_ioctl_setattr_get_trans(
return tp;
out_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return ERR_PTR(error);
}
@@ -1250,7 +1253,7 @@ xfs_ioctl_setattr(
else
ip->i_d.di_extsize = 0;
- code = xfs_trans_commit(tp, 0);
+ code = xfs_trans_commit(tp);
/*
* Release any dquot(s) the inode had kept before chown.
@@ -1262,7 +1265,7 @@ xfs_ioctl_setattr(
return code;
error_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
error_free_dquots:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(pdqp);
@@ -1335,11 +1338,11 @@ xfs_ioc_setxflags(
error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_write;
}
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_write:
mnt_drop_write_file(filp);
return error;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index bfc7c7c8a0c8..b88bdc85dd3d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -375,7 +375,7 @@ xfs_compat_attrlist_by_handle(
goto out_dput;
cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
- error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+ error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
al_hreq.flags, cursor);
if (error)
goto out_kfree;
@@ -445,7 +445,7 @@ xfs_compat_attrmulti_by_handle(
switch (ops[i].am_opcode) {
case ATTR_OP_GET:
ops[i].am_error = xfs_attrmulti_attr_get(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
compat_ptr(ops[i].am_attrvalue),
&ops[i].am_length, ops[i].am_flags);
break;
@@ -454,7 +454,7 @@ xfs_compat_attrmulti_by_handle(
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_set(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
compat_ptr(ops[i].am_attrvalue),
ops[i].am_length, ops[i].am_flags);
mnt_drop_write_file(parfilp);
@@ -464,7 +464,7 @@ xfs_compat_attrmulti_by_handle(
if (ops[i].am_error)
break;
ops[i].am_error = xfs_attrmulti_attr_remove(
- dentry->d_inode, attr_name,
+ d_inode(dentry), attr_name,
ops[i].am_flags);
mnt_drop_write_file(parfilp);
break;
@@ -504,7 +504,7 @@ xfs_compat_fssetdm_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+ if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
error = -EPERM;
goto out;
}
@@ -514,7 +514,7 @@ xfs_compat_fssetdm_by_handle(
goto out;
}
- error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+ error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
fsd.fsd_dmstate);
out:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccb1dd0d509e..1f86033171c8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -183,7 +183,7 @@ xfs_iomap_write_direct(
* Check for running out of space, note: need lock to return
*/
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -213,7 +213,7 @@ xfs_iomap_write_direct(
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto out_bmap_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -236,7 +236,7 @@ out_bmap_cancel:
xfs_bmap_cancel(&free_list);
xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
out_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -460,8 +460,7 @@ xfs_iomap_prealloc_size(
alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
alloc_blocks);
- xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
- freesp = mp->m_sb.sb_fdblocks;
+ freesp = percpu_counter_read_positive(&mp->m_fdblocks);
if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
shift = 2;
if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
@@ -691,7 +690,7 @@ xfs_iomap_write_allocate(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
nres, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -761,7 +760,7 @@ xfs_iomap_write_allocate(
if (error)
goto trans_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error0;
@@ -792,7 +791,7 @@ xfs_iomap_write_allocate(
trans_cancel:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error0:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -854,7 +853,7 @@ xfs_iomap_write_unwritten(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -891,7 +890,7 @@ xfs_iomap_write_unwritten(
if (error)
goto error_on_bmapi_transaction;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
@@ -915,7 +914,7 @@ xfs_iomap_write_unwritten(
error_on_bmapi_transaction:
xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a90331422..766b23f86ce9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -41,7 +41,6 @@
#include <linux/capability.h>
#include <linux/xattr.h>
-#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
#include <linux/fiemap.h>
@@ -187,6 +186,8 @@ xfs_generic_create(
else
d_instantiate(dentry, inode);
+ xfs_finish_inode_setup(ip);
+
out_free_acl:
if (default_acl)
posix_acl_release(default_acl);
@@ -195,6 +196,7 @@ xfs_generic_create(
return error;
out_cleanup_inode:
+ xfs_finish_inode_setup(ip);
if (!tmpfile)
xfs_cleanup_inode(dir, inode, dentry);
iput(inode);
@@ -301,7 +303,7 @@ xfs_vn_link(
struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
struct xfs_name name;
int error;
@@ -326,7 +328,7 @@ xfs_vn_unlink(
xfs_dentry_to_name(&name, dentry, 0);
- error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+ error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
if (error)
return error;
@@ -367,9 +369,11 @@ xfs_vn_symlink(
goto out_cleanup_inode;
d_instantiate(dentry, inode);
+ xfs_finish_inode_setup(cip);
return 0;
out_cleanup_inode:
+ xfs_finish_inode_setup(cip);
xfs_cleanup_inode(dir, inode, dentry);
iput(inode);
out:
@@ -384,22 +388,22 @@ xfs_vn_rename(
struct dentry *ndentry,
unsigned int flags)
{
- struct inode *new_inode = ndentry->d_inode;
+ struct inode *new_inode = d_inode(ndentry);
int omode = 0;
struct xfs_name oname;
struct xfs_name nname;
- if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
/* if we are exchanging files, we need to set i_mode of both files */
if (flags & RENAME_EXCHANGE)
- omode = ndentry->d_inode->i_mode;
+ omode = d_inode(ndentry)->i_mode;
xfs_dentry_to_name(&oname, odentry, omode);
- xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
+ xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode);
- return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+ return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
XFS_I(ndir), &nname,
new_inode ? XFS_I(new_inode) : NULL, flags);
}
@@ -409,10 +413,10 @@ xfs_vn_rename(
* we need to be very careful about how much stack we use.
* uio is kmalloced for this reason...
*/
-STATIC void *
+STATIC const char *
xfs_vn_follow_link(
struct dentry *dentry,
- struct nameidata *nd)
+ void **cookie)
{
char *link;
int error = -ENOMEM;
@@ -421,18 +425,16 @@ xfs_vn_follow_link(
if (!link)
goto out_err;
- error = xfs_readlink(XFS_I(dentry->d_inode), link);
+ error = xfs_readlink(XFS_I(d_inode(dentry)), link);
if (unlikely(error))
goto out_kfree;
- nd_set_link(nd, link);
- return NULL;
+ return *cookie = link;
out_kfree:
kfree(link);
out_err:
- nd_set_link(nd, ERR_PTR(error));
- return NULL;
+ return ERR_PTR(error);
}
STATIC int
@@ -441,7 +443,7 @@ xfs_vn_getattr(
struct dentry *dentry,
struct kstat *stat)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -697,7 +699,7 @@ xfs_setattr_nonsize(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -728,7 +730,7 @@ xfs_setattr_nonsize(
return 0;
out_trans_cancel:
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_dqrele:
xfs_qm_dqrele(udqp);
@@ -750,7 +752,6 @@ xfs_setattr_size(
struct xfs_trans *tp;
int error;
uint lock_flags = 0;
- uint commit_flags = 0;
bool did_zeroing = false;
trace_xfs_setattr(ip);
@@ -766,6 +767,7 @@ xfs_setattr_size(
return error;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
ASSERT(S_ISREG(ip->i_d.di_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -829,61 +831,36 @@ xfs_setattr_size(
inode_dio_wait(inode);
/*
- * Do all the page cache truncate work outside the transaction context
- * as the "lock" order is page lock->log space reservation. i.e.
- * locking pages inside the transaction can ABBA deadlock with
- * writeback. We have to do the VFS inode size update before we truncate
- * the pagecache, however, to avoid racing with page faults beyond the
- * new EOF they are not serialised against truncate operations except by
- * page locks and size updates.
+ * We've already locked out new page faults, so now we can safely remove
+ * pages from the page cache knowing they won't get refaulted until we
+ * drop the XFS_MMAP_EXCL lock after the extent manipulations are
+ * complete. The truncate_setsize() call also cleans partial EOF page
+ * PTEs on extending truncates and hence ensures sub-page block size
+ * filesystems are correctly handled, too.
*
- * Hence we are in a situation where a truncate can fail with ENOMEM
- * from xfs_trans_reserve(), but having already truncated the in-memory
- * version of the file (i.e. made user visible changes). There's not
- * much we can do about this, except to hope that the caller sees ENOMEM
- * and retries the truncate operation.
+ * We have to do all the page cache truncate work outside the
+ * transaction context as the "lock" order is page lock->log space
+ * reservation as defined by extent allocation in the writeback path.
+ * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+ * having already truncated the in-memory version of the file (i.e. made
+ * user visible changes). There's not much we can do about this, except
+ * to hope that the caller sees ENOMEM and retries the truncate
+ * operation.
*/
- error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+ if (IS_DAX(inode))
+ error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+ else
+ error = block_truncate_page(inode->i_mapping, newsize,
+ xfs_get_blocks);
if (error)
return error;
truncate_setsize(inode, newsize);
- /*
- * The "we can't serialise against page faults" pain gets worse.
- *
- * If the file is mapped then we have to clean the page at the old EOF
- * when extending the file. Extending the file can expose changes the
- * underlying page mapping (e.g. from beyond EOF to a hole or
- * unwritten), and so on the next attempt to write to that page we need
- * to remap it for write. i.e. we need .page_mkwrite() to be called.
- * Hence we need to clean the page to clean the pte and so a new write
- * fault will be triggered appropriately.
- *
- * If we do it before we change the inode size, then we can race with a
- * page fault that maps the page with exactly the same problem. If we do
- * it after we change the file size, then a new page fault can come in
- * and allocate space before we've run the rest of the truncate
- * transaction. That's kinda grotesque, but it's better than have data
- * over a hole, and so that's the lesser evil that has been chosen here.
- *
- * The real solution, however, is to have some mechanism for locking out
- * page faults while a truncate is in progress.
- */
- if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
- error = filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- round_down(oldsize, PAGE_CACHE_SIZE),
- round_up(oldsize, PAGE_CACHE_SIZE) - 1);
- if (error)
- return error;
- }
-
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto out_trans_cancel;
- commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
@@ -923,7 +900,7 @@ xfs_setattr_size(
if (newsize <= oldsize) {
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
- goto out_trans_abort;
+ goto out_trans_cancel;
/*
* Truncated "down", so we're removing references to old data
@@ -950,16 +927,14 @@ xfs_setattr_size(
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
if (lock_flags)
xfs_iunlock(ip, lock_flags);
return error;
-out_trans_abort:
- commit_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
- xfs_trans_cancel(tp, commit_flags);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
@@ -968,16 +943,20 @@ xfs_vn_setattr(
struct dentry *dentry,
struct iattr *iattr)
{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error;
if (iattr->ia_valid & ATTR_SIZE) {
uint iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(dentry->d_inode, &iolock);
- if (!error)
+ error = xfs_break_layouts(d_inode(dentry), &iolock, true);
+ if (!error) {
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock |= XFS_MMAPLOCK_EXCL;
+
error = xfs_setattr_size(ip, iattr);
+ }
xfs_iunlock(ip, iolock);
} else {
error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1002,7 +981,7 @@ xfs_vn_update_time(
tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -1024,7 +1003,7 @@ xfs_vn_update_time(
}
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
- return xfs_trans_commit(tp, 0);
+ return xfs_trans_commit(tp);
}
#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1209,35 +1188,31 @@ xfs_diflags_to_iflags(
struct inode *inode,
struct xfs_inode *ip)
{
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ uint16_t flags = ip->i_d.di_flags;
+
+ inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+ S_NOATIME | S_DAX);
+
+ if (flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ if (flags & XFS_DIFLAG_APPEND)
inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+ if (flags & XFS_DIFLAG_SYNC)
inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+ if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
+ /* XXX: Also needs an on-disk per inode flag! */
+ if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+ inode->i_flags |= S_DAX;
}
/*
- * Initialize the Linux inode, set up the operation vectors and
- * unlock the inode.
- *
- * When reading existing inodes from disk this is called directly
- * from xfs_iget, when creating a new inode it is called from
- * xfs_ialloc after setting up the inode.
+ * Initialize the Linux inode and set up the operation vectors.
*
- * We are always called with an uninitialised linux inode here.
- * We need to initialise the necessary fields and take a reference
- * on it.
+ * When reading existing inodes from disk this is called directly from xfs_iget,
+ * when creating a new inode it is called from xfs_ialloc after setting up the
+ * inode. These callers have different criteria for clearing XFS_INEW, so leave
+ * it up to the caller to deal with unlocking the inode appropriately.
*/
void
xfs_setup_inode(
@@ -1324,9 +1299,4 @@ xfs_setup_inode(
inode_has_no_xattr(inode);
cache_no_acl(inode);
}
-
- xfs_iflags_clear(ip, XFS_INEW);
- barrier();
-
- unlock_new_inode(inode);
}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ea7a98e9cb70..a0f84abb0d09 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-extern void xfs_setup_inode(struct xfs_inode *);
-
/*
* Internal setattr interfaces.
*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 82e314258f73..f41b0c3fddab 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk(
error = xfs_inobt_get_rec(cur, irec, &stat);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(stat == 1);
+ XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
/* Check if the record contains the inode in request */
if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
@@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk(
}
irec->ir_free |= xfs_inobt_maskn(0, idx);
- *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+ *icount = irec->ir_count - irec->ir_freecount;
}
return 0;
@@ -415,6 +415,8 @@ xfs_bulkstat(
goto del_cursor;
if (icount) {
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
@@ -447,13 +449,15 @@ xfs_bulkstat(
* If this chunk has any allocated inodes, save it.
* Also start read-ahead now for this chunk.
*/
- if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
+ if (r.ir_freecount < r.ir_count) {
xfs_bulkstat_ichunk_ra(mp, agno, &r);
irbp->ir_startino = r.ir_startino;
+ irbp->ir_holemask = r.ir_holemask;
+ irbp->ir_count = r.ir_count;
irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free;
irbp++;
- icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
+ icount += r.ir_count - r.ir_freecount;
}
error = xfs_btree_increment(cur, 0, &stat);
if (error || stat == 0) {
@@ -599,8 +603,7 @@ xfs_inumbers(
agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
buffer[bufidx].xi_startino =
XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
- buffer[bufidx].xi_alloccount =
- XFS_INODES_PER_CHUNK - r.ir_freecount;
+ buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
buffer[bufidx].xi_allocmask = ~r.ir_free;
if (++bufidx == bcount) {
long written;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c31d2c2eadc4..85f883dd6207 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,26 +32,12 @@ typedef unsigned int __uint32_t;
typedef signed long long int __int64_t;
typedef unsigned long long int __uint64_t;
-typedef __uint32_t inst_t; /* an instruction */
-
typedef __s64 xfs_off_t; /* <file offset> type */
typedef unsigned long long xfs_ino_t; /* <inode> type */
typedef __s64 xfs_daddr_t; /* <disk address> type */
-typedef char * xfs_caddr_t; /* <core address> type */
typedef __u32 xfs_dev_t;
typedef __u32 xfs_nlink_t;
-/* __psint_t is the same size as a pointer */
-#if (BITS_PER_LONG == 32)
-typedef __int32_t __psint_t;
-typedef __uint32_t __psunsigned_t;
-#elif (BITS_PER_LONG == 64)
-typedef __int64_t __psint_t;
-typedef __uint64_t __psunsigned_t;
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-
#include "xfs_types.h"
#include "kmem.h"
@@ -116,15 +102,6 @@ typedef __uint64_t __psunsigned_t;
#undef XFS_NATIVE_HOST
#endif
-/*
- * Feature macros (disable/enable)
- */
-#ifdef CONFIG_SMP
-#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#else
-#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#endif
-
#define irix_sgid_inherit xfs_params.sgid_inherit.val
#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bcc7cfabb787..08d4fe46f0fa 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,7 +109,7 @@ xlog_ungrant_log_space(
STATIC void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr);
+ void *ptr);
STATIC void
xlog_verify_grant_tail(
struct xlog *log);
@@ -513,7 +513,7 @@ xfs_log_done(
struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
xfs_lsn_t lsn = 0;
@@ -526,14 +526,11 @@ xfs_log_done(
(((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
(xlog_commit_record(log, ticket, iclog, &lsn)))) {
lsn = (xfs_lsn_t) -1;
- if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
- flags |= XFS_LOG_REL_PERM_RESERV;
- }
+ regrant = false;
}
- if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
- (flags & XFS_LOG_REL_PERM_RESERV)) {
+ if (!regrant) {
trace_xfs_log_done_nonperm(log, ticket);
/*
@@ -541,7 +538,6 @@ xfs_log_done(
* request has been made to release a permanent reservation.
*/
xlog_ungrant_log_space(log, ticket);
- xfs_log_ticket_put(ticket);
} else {
trace_xfs_log_done_perm(log, ticket);
@@ -553,6 +549,7 @@ xfs_log_done(
ticket->t_flags |= XLOG_TIC_INITED;
}
+ xfs_log_ticket_put(ticket);
return lsn;
}
@@ -1447,7 +1444,7 @@ xlog_alloc_log(
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
#ifdef DEBUG
- log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+ log->l_iclog_bak[i] = &iclog->ic_header;
#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
@@ -1602,7 +1599,7 @@ xlog_pack_data(
int i, j, k;
int size = iclog->ic_offset + roundoff;
__be32 cycle_lsn;
- xfs_caddr_t dp;
+ char *dp;
cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
@@ -3664,7 +3661,7 @@ xlog_ticket_alloc(
void
xlog_verify_dest_ptr(
struct xlog *log,
- char *ptr)
+ void *ptr)
{
int i;
int good_ptr = 0;
@@ -3767,9 +3764,8 @@ xlog_verify_iclog(
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
xlog_in_core_2_t *xhdr;
- xfs_caddr_t ptr;
- xfs_caddr_t base_ptr;
- __psint_t field_offset;
+ void *base_ptr, *ptr, *p;
+ ptrdiff_t field_offset;
__uint8_t clientid;
int len, i, j, k, op_len;
int idx;
@@ -3788,9 +3784,9 @@ xlog_verify_iclog(
if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
- ptr = (xfs_caddr_t) &iclog->ic_header;
- for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
- ptr += BBSIZE) {
+ base_ptr = ptr = &iclog->ic_header;
+ p = &iclog->ic_header;
+ for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: unexpected magic num",
__func__);
@@ -3798,20 +3794,19 @@ xlog_verify_iclog(
/* check fields */
len = be32_to_cpu(iclog->ic_header.h_num_logops);
- ptr = iclog->ic_datap;
- base_ptr = ptr;
- ophead = (xlog_op_header_t *)ptr;
+ base_ptr = ptr = iclog->ic_datap;
+ ophead = ptr;
xhdr = iclog->ic_data;
for (i = 0; i < len; i++) {
- ophead = (xlog_op_header_t *)ptr;
+ ophead = ptr;
/* clientid is only 1 byte */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+ p = &ophead->oh_clientid;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+ idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,13 +3824,13 @@ xlog_verify_iclog(
(unsigned long)field_offset);
/* check length */
- field_offset = (__psint_t)
- ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+ p = &ophead->oh_len;
+ field_offset = p - base_ptr;
if (!syncing || (field_offset & 0x1ff)) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((__psint_t)&ophead->oh_len -
- (__psint_t)iclog->ic_datap);
+ idx = BTOBBT((uintptr_t)&ophead->oh_len -
+ (uintptr_t)iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e0deb95abd..fa27aaec72cb 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
/*
- * Macros, structures, prototypes for interface to the log manager.
- */
-
-/*
- * Flags to xfs_log_done()
- */
-#define XFS_LOG_REL_PERM_RESERV 0x1
-
-/*
* Flags to xfs_log_force()
*
* XFS_LOG_SYNC: Synchronous force in-core log to disk
@@ -138,7 +129,7 @@ struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
- uint flags);
+ bool regrant);
int _xfs_log_force(struct xfs_mount *mp,
uint flags,
int *log_forced);
@@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, int flags);
+ xfs_lsn_t *commit_lsn, bool regrant);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 45cc0ce18adf..abc2ccbff739 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -624,7 +624,7 @@ restart:
spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
- commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+ commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
if (commit_lsn == -1)
goto out_abort;
@@ -773,14 +773,10 @@ xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_lsn_t *commit_lsn,
- int flags)
+ bool regrant)
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
- int log_flags = 0;
-
- if (flags & XFS_TRANS_RELEASE_LOG_RES)
- log_flags = XFS_LOG_REL_PERM_RESERV;
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
@@ -795,7 +791,7 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = tp->t_commit_lsn;
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(mp, tp->t_ticket, NULL, regrant);
xfs_trans_unreserve_and_mod_sb(tp);
/*
@@ -809,7 +805,7 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
+ xfs_trans_free_items(tp, tp->t_commit_lsn, false);
xlog_cil_push_background(log);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index db7cbdeb2b42..1c87c8abfbed 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -409,7 +409,7 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
- char *l_iclog_bak[XLOG_MAX_ICLOGS];
+ void *l_iclog_bak[XLOG_MAX_ICLOGS];
#endif
};
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5a945fc3bdc..480ebba8464f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -147,7 +147,7 @@ xlog_put_bp(
* Return the address of the start of the given block number's data
* in a log buffer. The buffer covers a log sector-aligned region.
*/
-STATIC xfs_caddr_t
+STATIC char *
xlog_align(
struct xlog *log,
xfs_daddr_t blk_no,
@@ -203,7 +203,7 @@ xlog_bread(
xfs_daddr_t blk_no,
int nbblks,
struct xfs_buf *bp,
- xfs_caddr_t *offset)
+ char **offset)
{
int error;
@@ -225,9 +225,9 @@ xlog_bread_offset(
xfs_daddr_t blk_no, /* block to read from */
int nbblks, /* blocks to read */
struct xfs_buf *bp,
- xfs_caddr_t offset)
+ char *offset)
{
- xfs_caddr_t orig_offset = bp->b_addr;
+ char *orig_offset = bp->b_addr;
int orig_len = BBTOB(bp->b_length);
int error, error2;
@@ -396,7 +396,7 @@ xlog_find_cycle_start(
xfs_daddr_t *last_blk,
uint cycle)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t mid_blk;
xfs_daddr_t end_blk;
uint mid_cycle;
@@ -443,7 +443,7 @@ xlog_find_verify_cycle(
uint cycle;
xfs_buf_t *bp;
xfs_daddr_t bufblks;
- xfs_caddr_t buf = NULL;
+ char *buf = NULL;
int error = 0;
/*
@@ -509,7 +509,7 @@ xlog_find_verify_log_record(
{
xfs_daddr_t i;
xfs_buf_t *bp;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xlog_rec_header_t *head = NULL;
int error = 0;
int smallmem = 0;
@@ -616,7 +616,7 @@ xlog_find_head(
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
int num_scan_bblks;
uint first_half_cycle, last_half_cycle;
@@ -891,7 +891,7 @@ xlog_find_tail(
{
xlog_rec_header_t *rhead;
xlog_op_header_t *op_head;
- xfs_caddr_t offset = NULL;
+ char *offset = NULL;
xfs_buf_t *bp;
int error, i, found;
xfs_daddr_t umount_data_blk;
@@ -1099,7 +1099,7 @@ xlog_find_zeroed(
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
- xfs_caddr_t offset;
+ char *offset;
uint first_cycle, last_cycle;
xfs_daddr_t new_blk, last_blk, start_blk;
xfs_daddr_t num_scan_bblks;
@@ -1199,7 +1199,7 @@ bp_err:
STATIC void
xlog_add_record(
struct xlog *log,
- xfs_caddr_t buf,
+ char *buf,
int cycle,
int block,
int tail_cycle,
@@ -1227,7 +1227,7 @@ xlog_write_log_records(
int tail_cycle,
int tail_block)
{
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *bp;
int balign, ealign;
int sectbb = log->l_sectBBsize;
@@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer(
return -EFSCORRUPTED;
}
- buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
- next_unlinked_offset);
+ buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
*buffer_nextp = *logged_nextp;
/*
@@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer(
* have to leave the inode in a consistent state for whoever
* reads it next....
*/
- xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_dinode_calc_crc(mp,
xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
}
@@ -1887,9 +1886,14 @@ xlog_recover_get_buf_lsn(
uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
break;
case XFS_ATTR3_RMT_MAGIC:
- lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
- uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
- break;
+ /*
+ * Remote attr blocks are written synchronously, rather than
+ * being logged. That means they do not contain a valid LSN
+ * (i.e. transactionally ordered) in them, and hence any time we
+ * see a buffer to replay over the top of a remote attribute
+ * block we should simply do so.
+ */
+ goto recover_immediately;
case XFS_SB_MAGIC:
lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
uuid = &((struct xfs_dsb *)blk)->sb_uuid;
@@ -2503,8 +2507,8 @@ xlog_recover_inode_pass2(
xfs_buf_t *bp;
xfs_dinode_t *dip;
int len;
- xfs_caddr_t src;
- xfs_caddr_t dest;
+ char *src;
+ char *dest;
int error;
int attr_index;
uint fields;
@@ -2546,7 +2550,7 @@ xlog_recover_inode_pass2(
goto out_release;
}
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
+ dip = xfs_buf_offset(bp, in_f->ilf_boffset);
/*
* Make sure the place we're flushing out to really looks
@@ -2885,7 +2889,7 @@ xlog_recover_dquot_pass2(
return error;
ASSERT(bp);
- ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+ ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
/*
* If the dquot has an LSN in it, recover the dquot only if it's less
@@ -3068,12 +3072,22 @@ xlog_recover_do_icreate_pass2(
return -EINVAL;
}
- /* existing allocation is fixed value */
- ASSERT(count == mp->m_ialloc_inos);
- ASSERT(length == mp->m_ialloc_blks);
- if (count != mp->m_ialloc_inos ||
- length != mp->m_ialloc_blks) {
- xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ /*
+ * The inode chunk is either full or sparse and we only support
+ * m_ialloc_min_blks sized sparse allocations at this time.
+ */
+ if (length != mp->m_ialloc_blks &&
+ length != mp->m_ialloc_min_blks) {
+ xfs_warn(log->l_mp,
+ "%s: unsupported chunk length", __FUNCTION__);
+ return -EINVAL;
+ }
+
+ /* verify inode count is consistent with extent length */
+ if ((count >> mp->m_sb.sb_inopblog) != length) {
+ xfs_warn(log->l_mp,
+ "%s: inconsistent inode count and chunk length",
+ __FUNCTION__);
return -EINVAL;
}
@@ -3091,8 +3105,8 @@ xlog_recover_do_icreate_pass2(
XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
return 0;
- xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
- be32_to_cpu(icl->icl_gen));
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
return 0;
}
@@ -3364,17 +3378,17 @@ STATIC int
xlog_recover_add_to_cont_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xlog_recover_item_t *item;
- xfs_caddr_t ptr, old_ptr;
+ char *ptr, *old_ptr;
int old_len;
if (list_empty(&trans->r_itemq)) {
/* finish copying rest of trans header */
xlog_recover_add_item(&trans->r_itemq);
- ptr = (xfs_caddr_t) &trans->r_theader +
+ ptr = (char *)&trans->r_theader +
sizeof(xfs_trans_header_t) - len;
memcpy(ptr, dp, len);
return 0;
@@ -3410,12 +3424,12 @@ STATIC int
xlog_recover_add_to_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
int len)
{
xfs_inode_log_format_t *in_f; /* any will do */
xlog_recover_item_t *item;
- xfs_caddr_t ptr;
+ char *ptr;
if (!len)
return 0;
@@ -3504,7 +3518,7 @@ STATIC int
xlog_recovery_process_trans(
struct xlog *log,
struct xlog_recover *trans,
- xfs_caddr_t dp,
+ char *dp,
unsigned int len,
unsigned int flags,
int pass)
@@ -3611,8 +3625,8 @@ xlog_recover_process_ophdr(
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
struct xlog_op_header *ohead,
- xfs_caddr_t dp,
- xfs_caddr_t end,
+ char *dp,
+ char *end,
int pass)
{
struct xlog_recover *trans;
@@ -3661,11 +3675,11 @@ xlog_recover_process_data(
struct xlog *log,
struct hlist_head rhash[],
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
int pass)
{
struct xlog_op_header *ohead;
- xfs_caddr_t end;
+ char *end;
int num_logops;
int error;
@@ -3751,11 +3765,11 @@ xlog_recover_process_efi(
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
return error;
abort_error:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
@@ -3857,13 +3871,13 @@ xlog_recover_clear_agi_bucket(
xfs_trans_log_buf(tp, agibp, offset,
(offset + sizeof(xfs_agino_t) - 1));
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out_error;
return;
out_abort:
- xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
out_error:
xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
return;
@@ -4010,7 +4024,7 @@ xlog_recover_process_iunlinks(
STATIC int
xlog_unpack_data_crc(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
__le32 crc;
@@ -4040,7 +4054,7 @@ xlog_unpack_data_crc(
STATIC int
xlog_unpack_data(
struct xlog_rec_header *rhead,
- xfs_caddr_t dp,
+ char *dp,
struct xlog *log)
{
int i, j, k;
@@ -4122,7 +4136,7 @@ xlog_do_recovery_pass(
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
- xfs_caddr_t offset;
+ char *offset;
xfs_buf_t *hbp, *dbp;
int error = 0, h_size;
int bblks, split_bblks;
@@ -4463,10 +4477,10 @@ xlog_do_recover(
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
ASSERT(xfs_sb_good_version(sbp));
+ xfs_reinit_percpu_counters(log->l_mp);
+
xfs_buf_relse(bp);
- /* We've re-read the superblock so re-initialize per-cpu counters */
- xfs_icsb_reinit_counters(log->l_mp);
xlog_recover_check_summary(log);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4fa80e63eea2..461e791efad7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,18 +43,6 @@
#include "xfs_sysfs.h"
-#ifdef HAVE_PERCPU_SB
-STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
- int);
-STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
- int);
-STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
-#else
-
-#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
-#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
-#endif
-
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
@@ -347,8 +335,7 @@ reread:
goto reread;
}
- /* Initialize per-cpu counters */
- xfs_icsb_reinit_counters(mp);
+ xfs_reinit_percpu_counters(mp);
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
@@ -738,6 +725,22 @@ xfs_mountfs(
}
/*
+ * If enabled, sparse inode chunk alignment is expected to match the
+ * cluster size. Full inode chunk alignment must match the chunk size,
+ * but that is checked on sb read verification...
+ */
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
+ mp->m_sb.sb_spino_align !=
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+ xfs_warn(mp,
+ "Sparse inode block alignment (%u) must match cluster size (%llu).",
+ mp->m_sb.sb_spino_align,
+ XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+ error = -EINVAL;
+ goto out_remove_uuid;
+ }
+
+ /*
* Set inode alignment fields
*/
xfs_set_inoalignment(mp);
@@ -1087,8 +1090,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
return 0;
- xfs_icsb_sync_counters(mp, 0);
-
/*
* we don't need to do this if we are updating the superblock
* counters on every modification.
@@ -1100,252 +1101,141 @@ xfs_log_sbcount(xfs_mount_t *mp)
}
/*
- * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
- * a delta to a specified field in the in-core superblock. Simply
- * switch on the field indicated and apply the delta to that field.
- * Fields are not allowed to dip below zero, so if the delta would
- * do this do not apply it and return EINVAL.
- *
- * The m_sb_lock must be held when this routine is called.
+ * Deltas for the inode count are +/-64, hence we use a large batch size
+ * of 128 so we don't need to take the counter lock on every update.
*/
-STATIC int
-xfs_mod_incore_sb_unlocked(
- xfs_mount_t *mp,
- xfs_sb_field_t field,
- int64_t delta,
- int rsvd)
+#define XFS_ICOUNT_BATCH 128
+int
+xfs_mod_icount(
+ struct xfs_mount *mp,
+ int64_t delta)
{
- int scounter; /* short counter for 32 bit fields */
- long long lcounter; /* long counter for 64 bit fields */
- long long res_used, rem;
-
- /*
- * With the in-core superblock spin lock held, switch
- * on the indicated field. Apply the delta to the
- * proper field. If the fields value would dip below
- * 0, then do not apply the delta and return EINVAL.
- */
- switch (field) {
- case XFS_SBS_ICOUNT:
- lcounter = (long long)mp->m_sb.sb_icount;
- lcounter += delta;
- if (lcounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_icount = lcounter;
- return 0;
- case XFS_SBS_IFREE:
- lcounter = (long long)mp->m_sb.sb_ifree;
- lcounter += delta;
- if (lcounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_ifree = lcounter;
- return 0;
- case XFS_SBS_FDBLOCKS:
- lcounter = (long long)
- mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
- res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
-
- if (delta > 0) { /* Putting blocks back */
- if (res_used > delta) {
- mp->m_resblks_avail += delta;
- } else {
- rem = delta - res_used;
- mp->m_resblks_avail = mp->m_resblks;
- lcounter += rem;
- }
- } else { /* Taking blocks away */
- lcounter += delta;
- if (lcounter >= 0) {
- mp->m_sb.sb_fdblocks = lcounter +
- XFS_ALLOC_SET_ASIDE(mp);
- return 0;
- }
-
- /*
- * We are out of blocks, use any available reserved
- * blocks if were allowed to.
- */
- if (!rsvd)
- return -ENOSPC;
-
- lcounter = (long long)mp->m_resblks_avail + delta;
- if (lcounter >= 0) {
- mp->m_resblks_avail = lcounter;
- return 0;
- }
- printk_once(KERN_WARNING
- "Filesystem \"%s\": reserve blocks depleted! "
- "Consider increasing reserve pool size.",
- mp->m_fsname);
- return -ENOSPC;
- }
-
- mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
- return 0;
- case XFS_SBS_FREXTENTS:
- lcounter = (long long)mp->m_sb.sb_frextents;
- lcounter += delta;
- if (lcounter < 0) {
- return -ENOSPC;
- }
- mp->m_sb.sb_frextents = lcounter;
- return 0;
- case XFS_SBS_DBLOCKS:
- lcounter = (long long)mp->m_sb.sb_dblocks;
- lcounter += delta;
- if (lcounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_dblocks = lcounter;
- return 0;
- case XFS_SBS_AGCOUNT:
- scounter = mp->m_sb.sb_agcount;
- scounter += delta;
- if (scounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_agcount = scounter;
- return 0;
- case XFS_SBS_IMAX_PCT:
- scounter = mp->m_sb.sb_imax_pct;
- scounter += delta;
- if (scounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_imax_pct = scounter;
- return 0;
- case XFS_SBS_REXTSIZE:
- scounter = mp->m_sb.sb_rextsize;
- scounter += delta;
- if (scounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_rextsize = scounter;
- return 0;
- case XFS_SBS_RBMBLOCKS:
- scounter = mp->m_sb.sb_rbmblocks;
- scounter += delta;
- if (scounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_rbmblocks = scounter;
- return 0;
- case XFS_SBS_RBLOCKS:
- lcounter = (long long)mp->m_sb.sb_rblocks;
- lcounter += delta;
- if (lcounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_rblocks = lcounter;
- return 0;
- case XFS_SBS_REXTENTS:
- lcounter = (long long)mp->m_sb.sb_rextents;
- lcounter += delta;
- if (lcounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_rextents = lcounter;
- return 0;
- case XFS_SBS_REXTSLOG:
- scounter = mp->m_sb.sb_rextslog;
- scounter += delta;
- if (scounter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- mp->m_sb.sb_rextslog = scounter;
- return 0;
- default:
+ __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+ if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
ASSERT(0);
+ percpu_counter_add(&mp->m_icount, -delta);
return -EINVAL;
}
+ return 0;
}
-/*
- * xfs_mod_incore_sb() is used to change a field in the in-core
- * superblock structure by the specified delta. This modification
- * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
- * routine to do the work.
- */
int
-xfs_mod_incore_sb(
+xfs_mod_ifree(
struct xfs_mount *mp,
- xfs_sb_field_t field,
- int64_t delta,
- int rsvd)
+ int64_t delta)
{
- int status;
-
-#ifdef HAVE_PERCPU_SB
- ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
-#endif
- spin_lock(&mp->m_sb_lock);
- status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
- spin_unlock(&mp->m_sb_lock);
-
- return status;
+ percpu_counter_add(&mp->m_ifree, delta);
+ if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
+ ASSERT(0);
+ percpu_counter_add(&mp->m_ifree, -delta);
+ return -EINVAL;
+ }
+ return 0;
}
/*
- * Change more than one field in the in-core superblock structure at a time.
- *
- * The fields and changes to those fields are specified in the array of
- * xfs_mod_sb structures passed in. Either all of the specified deltas
- * will be applied or none of them will. If any modified field dips below 0,
- * then all modifications will be backed out and EINVAL will be returned.
- *
- * Note that this function may not be used for the superblock values that
- * are tracked with the in-memory per-cpu counters - a direct call to
- * xfs_icsb_modify_counters is required for these.
+ * Deltas for the block count can vary from 1 to very large, but lock contention
+ * only occurs on frequent small block count updates such as in the delayed
+ * allocation path for buffered writes (page a time updates). Hence we set
+ * a large batch count (1024) to minimise global counter updates except when
+ * we get near to ENOSPC and we have to be very accurate with our updates.
*/
+#define XFS_FDBLOCKS_BATCH 1024
int
-xfs_mod_incore_sb_batch(
+xfs_mod_fdblocks(
struct xfs_mount *mp,
- xfs_mod_sb_t *msb,
- uint nmsb,
- int rsvd)
+ int64_t delta,
+ bool rsvd)
{
- xfs_mod_sb_t *msbp;
- int error = 0;
+ int64_t lcounter;
+ long long res_used;
+ s32 batch;
+
+ if (delta > 0) {
+ /*
+ * If the reserve pool is depleted, put blocks back into it
+ * first. Most of the time the pool is full.
+ */
+ if (likely(mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(&mp->m_fdblocks, delta);
+ return 0;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+ if (res_used > delta) {
+ mp->m_resblks_avail += delta;
+ } else {
+ delta -= res_used;
+ mp->m_resblks_avail = mp->m_resblks;
+ percpu_counter_add(&mp->m_fdblocks, delta);
+ }
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
+ }
/*
- * Loop through the array of mod structures and apply each individually.
- * If any fail, then back out all those which have already been applied.
- * Do all of this within the scope of the m_sb_lock so that all of the
- * changes will be atomic.
+ * Taking blocks away, need to be more accurate the closer we
+ * are to zero.
+ *
+ * If the counter has a value of less than 2 * max batch size,
+ * then make everything serialise as we are real close to
+ * ENOSPC.
+ */
+ if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ XFS_FDBLOCKS_BATCH) < 0)
+ batch = 1;
+ else
+ batch = XFS_FDBLOCKS_BATCH;
+
+ __percpu_counter_add(&mp->m_fdblocks, delta, batch);
+ if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp),
+ XFS_FDBLOCKS_BATCH) >= 0) {
+ /* we had space! */
+ return 0;
+ }
+
+ /*
+ * lock up the sb for dipping into reserves before releasing the space
+ * that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- for (msbp = msb; msbp < (msb + nmsb); msbp++) {
- ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
- msbp->msb_field > XFS_SBS_FDBLOCKS);
+ percpu_counter_add(&mp->m_fdblocks, -delta);
+ if (!rsvd)
+ goto fdblocks_enospc;
- error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
- msbp->msb_delta, rsvd);
- if (error)
- goto unwind;
+ lcounter = (long long)mp->m_resblks_avail + delta;
+ if (lcounter >= 0) {
+ mp->m_resblks_avail = lcounter;
+ spin_unlock(&mp->m_sb_lock);
+ return 0;
}
+ printk_once(KERN_WARNING
+ "Filesystem \"%s\": reserve blocks depleted! "
+ "Consider increasing reserve pool size.",
+ mp->m_fsname);
+fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
- return 0;
+ return -ENOSPC;
+}
-unwind:
- while (--msbp >= msb) {
- error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
- -msbp->msb_delta, rsvd);
- ASSERT(error == 0);
- }
+int
+xfs_mod_frextents(
+ struct xfs_mount *mp,
+ int64_t delta)
+{
+ int64_t lcounter;
+ int ret = 0;
+
+ spin_lock(&mp->m_sb_lock);
+ lcounter = mp->m_sb.sb_frextents + delta;
+ if (lcounter < 0)
+ ret = -ENOSPC;
+ else
+ mp->m_sb.sb_frextents = lcounter;
spin_unlock(&mp->m_sb_lock);
- return error;
+ return ret;
}
/*
@@ -1407,573 +1297,3 @@ xfs_dev_is_read_only(
}
return 0;
}
-
-#ifdef HAVE_PERCPU_SB
-/*
- * Per-cpu incore superblock counters
- *
- * Simple concept, difficult implementation
- *
- * Basically, replace the incore superblock counters with a distributed per cpu
- * counter for contended fields (e.g. free block count).
- *
- * Difficulties arise in that the incore sb is used for ENOSPC checking, and
- * hence needs to be accurately read when we are running low on space. Hence
- * there is a method to enable and disable the per-cpu counters based on how
- * much "stuff" is available in them.
- *
- * Basically, a counter is enabled if there is enough free resource to justify
- * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
- * ENOSPC), then we disable the counters to synchronise all callers and
- * re-distribute the available resources.
- *
- * If, once we redistributed the available resources, we still get a failure,
- * we disable the per-cpu counter and go through the slow path.
- *
- * The slow path is the current xfs_mod_incore_sb() function. This means that
- * when we disable a per-cpu counter, we need to drain its resources back to
- * the global superblock. We do this after disabling the counter to prevent
- * more threads from queueing up on the counter.
- *
- * Essentially, this means that we still need a lock in the fast path to enable
- * synchronisation between the global counters and the per-cpu counters. This
- * is not a problem because the lock will be local to a CPU almost all the time
- * and have little contention except when we get to ENOSPC conditions.
- *
- * Basically, this lock becomes a barrier that enables us to lock out the fast
- * path while we do things like enabling and disabling counters and
- * synchronising the counters.
- *
- * Locking rules:
- *
- * 1. m_sb_lock before picking up per-cpu locks
- * 2. per-cpu locks always picked up via for_each_online_cpu() order
- * 3. accurate counter sync requires m_sb_lock + per cpu locks
- * 4. modifying per-cpu counters requires holding per-cpu lock
- * 5. modifying global counters requires holding m_sb_lock
- * 6. enabling or disabling a counter requires holding the m_sb_lock
- * and _none_ of the per-cpu locks.
- *
- * Disabled counters are only ever re-enabled by a balance operation
- * that results in more free resources per CPU than a given threshold.
- * To ensure counters don't remain disabled, they are rebalanced when
- * the global resource goes above a higher threshold (i.e. some hysteresis
- * is present to prevent thrashing).
- */
-
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * hot-plug CPU notifier support.
- *
- * We need a notifier per filesystem as we need to be able to identify
- * the filesystem to balance the counters out. This is achieved by
- * having a notifier block embedded in the xfs_mount_t and doing pointer
- * magic to get the mount pointer from the notifier block address.
- */
-STATIC int
-xfs_icsb_cpu_notify(
- struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- xfs_icsb_cnts_t *cntp;
- xfs_mount_t *mp;
-
- mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
- cntp = (xfs_icsb_cnts_t *)
- per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- /* Easy Case - initialize the area and locks, and
- * then rebalance when online does everything else for us. */
- memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- xfs_icsb_lock(mp);
- xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
- xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
- xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
- xfs_icsb_unlock(mp);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- /* Disable all the counters, then fold the dead cpu's
- * count into the total on the global superblock and
- * re-enable the counters. */
- xfs_icsb_lock(mp);
- spin_lock(&mp->m_sb_lock);
- xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
- xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
- xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
-
- mp->m_sb.sb_icount += cntp->icsb_icount;
- mp->m_sb.sb_ifree += cntp->icsb_ifree;
- mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
-
- memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
-
- xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
- xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
- xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
- spin_unlock(&mp->m_sb_lock);
- xfs_icsb_unlock(mp);
- break;
- }
-
- return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-int
-xfs_icsb_init_counters(
- xfs_mount_t *mp)
-{
- xfs_icsb_cnts_t *cntp;
- int i;
-
- mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
- if (mp->m_sb_cnts == NULL)
- return -ENOMEM;
-
- for_each_online_cpu(i) {
- cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
- memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
- }
-
- mutex_init(&mp->m_icsb_mutex);
-
- /*
- * start with all counters disabled so that the
- * initial balance kicks us off correctly
- */
- mp->m_icsb_counters = -1;
-
-#ifdef CONFIG_HOTPLUG_CPU
- mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
- mp->m_icsb_notifier.priority = 0;
- register_hotcpu_notifier(&mp->m_icsb_notifier);
-#endif /* CONFIG_HOTPLUG_CPU */
-
- return 0;
-}
-
-void
-xfs_icsb_reinit_counters(
- xfs_mount_t *mp)
-{
- xfs_icsb_lock(mp);
- /*
- * start with all counters disabled so that the
- * initial balance kicks us off correctly
- */
- mp->m_icsb_counters = -1;
- xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
- xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
- xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
- xfs_icsb_unlock(mp);
-}
-
-void
-xfs_icsb_destroy_counters(
- xfs_mount_t *mp)
-{
- if (mp->m_sb_cnts) {
- unregister_hotcpu_notifier(&mp->m_icsb_notifier);
- free_percpu(mp->m_sb_cnts);
- }
- mutex_destroy(&mp->m_icsb_mutex);
-}
-
-STATIC void
-xfs_icsb_lock_cntr(
- xfs_icsb_cnts_t *icsbp)
-{
- while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
- ndelay(1000);
- }
-}
-
-STATIC void
-xfs_icsb_unlock_cntr(
- xfs_icsb_cnts_t *icsbp)
-{
- clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
-}
-
-
-STATIC void
-xfs_icsb_lock_all_counters(
- xfs_mount_t *mp)
-{
- xfs_icsb_cnts_t *cntp;
- int i;
-
- for_each_online_cpu(i) {
- cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
- xfs_icsb_lock_cntr(cntp);
- }
-}
-
-STATIC void
-xfs_icsb_unlock_all_counters(
- xfs_mount_t *mp)
-{
- xfs_icsb_cnts_t *cntp;
- int i;
-
- for_each_online_cpu(i) {
- cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
- xfs_icsb_unlock_cntr(cntp);
- }
-}
-
-STATIC void
-xfs_icsb_count(
- xfs_mount_t *mp,
- xfs_icsb_cnts_t *cnt,
- int flags)
-{
- xfs_icsb_cnts_t *cntp;
- int i;
-
- memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
-
- if (!(flags & XFS_ICSB_LAZY_COUNT))
- xfs_icsb_lock_all_counters(mp);
-
- for_each_online_cpu(i) {
- cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
- cnt->icsb_icount += cntp->icsb_icount;
- cnt->icsb_ifree += cntp->icsb_ifree;
- cnt->icsb_fdblocks += cntp->icsb_fdblocks;
- }
-
- if (!(flags & XFS_ICSB_LAZY_COUNT))
- xfs_icsb_unlock_all_counters(mp);
-}
-
-STATIC int
-xfs_icsb_counter_disabled(
- xfs_mount_t *mp,
- xfs_sb_field_t field)
-{
- ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
- return test_bit(field, &mp->m_icsb_counters);
-}
-
-STATIC void
-xfs_icsb_disable_counter(
- xfs_mount_t *mp,
- xfs_sb_field_t field)
-{
- xfs_icsb_cnts_t cnt;
-
- ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
-
- /*
- * If we are already disabled, then there is nothing to do
- * here. We check before locking all the counters to avoid
- * the expensive lock operation when being called in the
- * slow path and the counter is already disabled. This is
- * safe because the only time we set or clear this state is under
- * the m_icsb_mutex.
- */
- if (xfs_icsb_counter_disabled(mp, field))
- return;
-
- xfs_icsb_lock_all_counters(mp);
- if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
- /* drain back to superblock */
-
- xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
- switch(field) {
- case XFS_SBS_ICOUNT:
- mp->m_sb.sb_icount = cnt.icsb_icount;
- break;
- case XFS_SBS_IFREE:
- mp->m_sb.sb_ifree = cnt.icsb_ifree;
- break;
- case XFS_SBS_FDBLOCKS:
- mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
- break;
- default:
- BUG();
- }
- }
-
- xfs_icsb_unlock_all_counters(mp);
-}
-
-STATIC void
-xfs_icsb_enable_counter(
- xfs_mount_t *mp,
- xfs_sb_field_t field,
- uint64_t count,
- uint64_t resid)
-{
- xfs_icsb_cnts_t *cntp;
- int i;
-
- ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
-
- xfs_icsb_lock_all_counters(mp);
- for_each_online_cpu(i) {
- cntp = per_cpu_ptr(mp->m_sb_cnts, i);
- switch (field) {
- case XFS_SBS_ICOUNT:
- cntp->icsb_icount = count + resid;
- break;
- case XFS_SBS_IFREE:
- cntp->icsb_ifree = count + resid;
- break;
- case XFS_SBS_FDBLOCKS:
- cntp->icsb_fdblocks = count + resid;
- break;
- default:
- BUG();
- break;
- }
- resid = 0;
- }
- clear_bit(field, &mp->m_icsb_counters);
- xfs_icsb_unlock_all_counters(mp);
-}
-
-void
-xfs_icsb_sync_counters_locked(
- xfs_mount_t *mp,
- int flags)
-{
- xfs_icsb_cnts_t cnt;
-
- xfs_icsb_count(mp, &cnt, flags);
-
- if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
- mp->m_sb.sb_icount = cnt.icsb_icount;
- if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
- mp->m_sb.sb_ifree = cnt.icsb_ifree;
- if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
- mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
-}
-
-/*
- * Accurate update of per-cpu counters to incore superblock
- */
-void
-xfs_icsb_sync_counters(
- xfs_mount_t *mp,
- int flags)
-{
- spin_lock(&mp->m_sb_lock);
- xfs_icsb_sync_counters_locked(mp, flags);
- spin_unlock(&mp->m_sb_lock);
-}
-
-/*
- * Balance and enable/disable counters as necessary.
- *
- * Thresholds for re-enabling counters are somewhat magic. inode counts are
- * chosen to be the same number as single on disk allocation chunk per CPU, and
- * free blocks is something far enough zero that we aren't going thrash when we
- * get near ENOSPC. We also need to supply a minimum we require per cpu to
- * prevent looping endlessly when xfs_alloc_space asks for more than will
- * be distributed to a single CPU but each CPU has enough blocks to be
- * reenabled.
- *
- * Note that we can be called when counters are already disabled.
- * xfs_icsb_disable_counter() optimises the counter locking in this case to
- * prevent locking every per-cpu counter needlessly.
- */
-
-#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
-#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
- (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
-STATIC void
-xfs_icsb_balance_counter_locked(
- xfs_mount_t *mp,
- xfs_sb_field_t field,
- int min_per_cpu)
-{
- uint64_t count, resid;
- int weight = num_online_cpus();
- uint64_t min = (uint64_t)min_per_cpu;
-
- /* disable counter and sync counter */
- xfs_icsb_disable_counter(mp, field);
-
- /* update counters - first CPU gets residual*/
- switch (field) {
- case XFS_SBS_ICOUNT:
- count = mp->m_sb.sb_icount;
- resid = do_div(count, weight);
- if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
- return;
- break;
- case XFS_SBS_IFREE:
- count = mp->m_sb.sb_ifree;
- resid = do_div(count, weight);
- if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
- return;
- break;
- case XFS_SBS_FDBLOCKS:
- count = mp->m_sb.sb_fdblocks;
- resid = do_div(count, weight);
- if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
- return;
- break;
- default:
- BUG();
- count = resid = 0; /* quiet, gcc */
- break;
- }
-
- xfs_icsb_enable_counter(mp, field, count, resid);
-}
-
-STATIC void
-xfs_icsb_balance_counter(
- xfs_mount_t *mp,
- xfs_sb_field_t fields,
- int min_per_cpu)
-{
- spin_lock(&mp->m_sb_lock);
- xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
- spin_unlock(&mp->m_sb_lock);
-}
-
-int
-xfs_icsb_modify_counters(
- xfs_mount_t *mp,
- xfs_sb_field_t field,
- int64_t delta,
- int rsvd)
-{
- xfs_icsb_cnts_t *icsbp;
- long long lcounter; /* long counter for 64 bit fields */
- int ret = 0;
-
- might_sleep();
-again:
- preempt_disable();
- icsbp = this_cpu_ptr(mp->m_sb_cnts);
-
- /*
- * if the counter is disabled, go to slow path
- */
- if (unlikely(xfs_icsb_counter_disabled(mp, field)))
- goto slow_path;
- xfs_icsb_lock_cntr(icsbp);
- if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
- xfs_icsb_unlock_cntr(icsbp);
- goto slow_path;
- }
-
- switch (field) {
- case XFS_SBS_ICOUNT:
- lcounter = icsbp->icsb_icount;
- lcounter += delta;
- if (unlikely(lcounter < 0))
- goto balance_counter;
- icsbp->icsb_icount = lcounter;
- break;
-
- case XFS_SBS_IFREE:
- lcounter = icsbp->icsb_ifree;
- lcounter += delta;
- if (unlikely(lcounter < 0))
- goto balance_counter;
- icsbp->icsb_ifree = lcounter;
- break;
-
- case XFS_SBS_FDBLOCKS:
- BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
-
- lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
- lcounter += delta;
- if (unlikely(lcounter < 0))
- goto balance_counter;
- icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
- break;
- default:
- BUG();
- break;
- }
- xfs_icsb_unlock_cntr(icsbp);
- preempt_enable();
- return 0;
-
-slow_path:
- preempt_enable();
-
- /*
- * serialise with a mutex so we don't burn lots of cpu on
- * the superblock lock. We still need to hold the superblock
- * lock, however, when we modify the global structures.
- */
- xfs_icsb_lock(mp);
-
- /*
- * Now running atomically.
- *
- * If the counter is enabled, someone has beaten us to rebalancing.
- * Drop the lock and try again in the fast path....
- */
- if (!(xfs_icsb_counter_disabled(mp, field))) {
- xfs_icsb_unlock(mp);
- goto again;
- }
-
- /*
- * The counter is currently disabled. Because we are
- * running atomically here, we know a rebalance cannot
- * be in progress. Hence we can go straight to operating
- * on the global superblock. We do not call xfs_mod_incore_sb()
- * here even though we need to get the m_sb_lock. Doing so
- * will cause us to re-enter this function and deadlock.
- * Hence we get the m_sb_lock ourselves and then call
- * xfs_mod_incore_sb_unlocked() as the unlocked path operates
- * directly on the global counters.
- */
- spin_lock(&mp->m_sb_lock);
- ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
- spin_unlock(&mp->m_sb_lock);
-
- /*
- * Now that we've modified the global superblock, we
- * may be able to re-enable the distributed counters
- * (e.g. lots of space just got freed). After that
- * we are done.
- */
- if (ret != -ENOSPC)
- xfs_icsb_balance_counter(mp, field, 0);
- xfs_icsb_unlock(mp);
- return ret;
-
-balance_counter:
- xfs_icsb_unlock_cntr(icsbp);
- preempt_enable();
-
- /*
- * We may have multiple threads here if multiple per-cpu
- * counters run dry at the same time. This will mean we can
- * do more balances than strictly necessary but it is not
- * the common slowpath case.
- */
- xfs_icsb_lock(mp);
-
- /*
- * running atomically.
- *
- * This will leave the counter in the correct state for future
- * accesses. After the rebalance, we simply try again and our retry
- * will either succeed through the fast path or slow path without
- * another balance operation being required.
- */
- xfs_icsb_balance_counter(mp, field, delta);
- xfs_icsb_unlock(mp);
- goto again;
-}
-
-#endif
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0d8abd6364d9..7999e91cd49a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,8 +18,6 @@
#ifndef __XFS_MOUNT_H__
#define __XFS_MOUNT_H__
-#ifdef __KERNEL__
-
struct xlog;
struct xfs_inode;
struct xfs_mru_cache;
@@ -29,44 +27,6 @@ struct xfs_quotainfo;
struct xfs_dir_ops;
struct xfs_da_geometry;
-#ifdef HAVE_PERCPU_SB
-
-/*
- * Valid per-cpu incore superblock counters. Note that if you add new counters,
- * you may need to define new counter disabled bit field descriptors as there
- * are more possible fields in the superblock that can fit in a bitfield on a
- * 32 bit platform. The XFS_SBS_* values for the current current counters just
- * fit.
- */
-typedef struct xfs_icsb_cnts {
- uint64_t icsb_fdblocks;
- uint64_t icsb_ifree;
- uint64_t icsb_icount;
- unsigned long icsb_flags;
-} xfs_icsb_cnts_t;
-
-#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */
-
-#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
-
-extern int xfs_icsb_init_counters(struct xfs_mount *);
-extern void xfs_icsb_reinit_counters(struct xfs_mount *);
-extern void xfs_icsb_destroy_counters(struct xfs_mount *);
-extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
-extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
-extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
- int64_t, int);
-
-#else
-#define xfs_icsb_init_counters(mp) (0)
-#define xfs_icsb_destroy_counters(mp) do { } while (0)
-#define xfs_icsb_reinit_counters(mp) do { } while (0)
-#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
-#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
-#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
- xfs_mod_incore_sb(mp, field, delta, rsvd)
-#endif
-
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
XFS_LOWSP_1_PCNT = 0,
@@ -81,8 +41,13 @@ typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
struct xfs_ail *m_ail; /* fs active log item list */
- xfs_sb_t m_sb; /* copy of fs superblock */
+
+ struct xfs_sb m_sb; /* copy of fs superblock */
spinlock_t m_sb_lock; /* sb counter lock */
+ struct percpu_counter m_icount; /* allocated inodes counter */
+ struct percpu_counter m_ifree; /* free inodes counter */
+ struct percpu_counter m_fdblocks; /* free block counter */
+
struct xfs_buf *m_sb_bp; /* buffer for superblock */
char *m_fsname; /* filesystem name */
int m_fsname_len; /* strlen of fs name */
@@ -136,6 +101,8 @@ typedef struct xfs_mount {
__uint64_t m_flags; /* global mount flags */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
+ int m_ialloc_min_blks;/* min blocks in sparse inode
+ * allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
@@ -152,12 +119,6 @@ typedef struct xfs_mount {
const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
uint m_chsize; /* size of next field */
atomic_t m_active_trans; /* number trans frozen */
-#ifdef HAVE_PERCPU_SB
- xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
- unsigned long m_icsb_counters; /* disabled per-cpu counters */
- struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
- struct mutex m_icsb_mutex; /* balancer sync lock */
-#endif
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct delayed_work m_reclaim_work; /* background inode reclaim */
struct delayed_work m_eofblocks_work; /* background eof blocks
@@ -220,6 +181,8 @@ typedef struct xfs_mount {
allocator */
#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
+#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
+
/*
* Default minimum read and write sizes.
@@ -301,35 +264,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
/*
- * Per-cpu superblock locking functions
- */
-#ifdef HAVE_PERCPU_SB
-static inline void
-xfs_icsb_lock(xfs_mount_t *mp)
-{
- mutex_lock(&mp->m_icsb_mutex);
-}
-
-static inline void
-xfs_icsb_unlock(xfs_mount_t *mp)
-{
- mutex_unlock(&mp->m_icsb_mutex);
-}
-#else
-#define xfs_icsb_lock(mp)
-#define xfs_icsb_unlock(mp)
-#endif
-
-/*
- * This structure is for use by the xfs_mod_incore_sb_batch() routine.
- * xfs_growfs can specify a few fields which are more than int limit
- */
-typedef struct xfs_mod_sb {
- xfs_sb_field_t msb_field; /* Field to modify, see below */
- int64_t msb_delta; /* Change to make to specified field */
-} xfs_mod_sb_t;
-
-/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
* performance of allocation group selection.
*/
@@ -383,11 +317,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
xfs_agnumber_t *maxagi);
-
extern void xfs_unmountfs(xfs_mount_t *);
-extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
- uint, int);
+
+extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
+extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
+extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+ bool reserved);
+extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+
extern int xfs_mount_log_sb(xfs_mount_t *);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
@@ -399,6 +336,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
-#endif /* __KERNEL__ */
-
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 30ecca3037e3..f8a674d7f092 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -437,7 +437,7 @@ xfs_mru_cache_insert(
if (!mru || !mru->lists)
return -EINVAL;
- if (radix_tree_preload(GFP_KERNEL))
+ if (radix_tree_preload(GFP_NOFS))
return -ENOMEM;
INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57ea760..ab4a6066f7ca 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@
int
xfs_break_layouts(
struct inode *inode,
- uint *iolock)
+ uint *iolock,
+ bool with_imutex)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
+ if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
+ mutex_unlock(&inode->i_mutex);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
+ if (with_imutex)
+ mutex_lock(&inode->i_mutex);
xfs_ilock(ip, *iolock);
}
@@ -301,7 +306,7 @@ xfs_fs_commit_blocks(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_drop_iolock;
}
@@ -316,7 +321,7 @@ xfs_fs_commit_blocks(
}
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_drop_iolock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce660f6..8147ac108820 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr);
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
#else
-static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
+static inline int
+xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
{
return 0;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fbbb9e62e274..eac9549efd52 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
xfs_trans_t *tp;
int error;
int committed;
+ bool need_alloc = true;
*ip = NULL;
/*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
return error;
mp->m_sb.sb_gquotino = NULLFSINO;
mp->m_sb.sb_pquotino = NULLFSINO;
+ need_alloc = false;
}
}
@@ -754,16 +756,15 @@ xfs_qm_qino_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
- if (!*ip) {
+ if (need_alloc) {
error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
&committed);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
return error;
}
}
@@ -794,11 +795,14 @@ xfs_qm_qino_alloc(
spin_unlock(&mp->m_sb_lock);
xfs_log_sb(tp);
- if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
+ error = xfs_trans_commit(tp);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
xfs_alert(mp, "%s failed (error %d)!", __func__, error);
- return error;
}
- return 0;
+ if (need_alloc)
+ xfs_finish_inode_setup(*ip);
+ return error;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 0d4d3590cf85..996a04064894 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -168,10 +168,6 @@ extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
uint, struct qc_dqblk *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
struct qc_dqblk *);
-extern int xfs_qm_scall_getqstat(struct xfs_mount *,
- struct fs_quota_stat *);
-extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
- struct fs_quota_statv *);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9b965db45800..3640c6e896af 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -38,7 +38,6 @@
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
uint);
-STATIC uint xfs_qm_export_flags(uint);
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -240,7 +239,7 @@ xfs_qm_scall_trunc_qfile(
tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
goto out_put;
}
@@ -253,15 +252,14 @@ xfs_qm_scall_trunc_qfile(
error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
goto out_unlock;
}
ASSERT(ip->i_d.di_nextents == 0);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
@@ -389,159 +387,6 @@ xfs_qm_scall_quotaon(
return 0;
}
-
-/*
- * Return quota status information, such as uquota-off, enforcements, etc.
- * for Q_XGETQSTAT command.
- */
-int
-xfs_qm_scall_getqstat(
- struct xfs_mount *mp,
- struct fs_quota_stat *out)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip = NULL;
- struct xfs_inode *gip = NULL;
- struct xfs_inode *pip = NULL;
- bool tempuqip = false;
- bool tempgqip = false;
- bool temppqip = false;
-
- memset(out, 0, sizeof(fs_quota_stat_t));
-
- out->qs_version = FS_QSTAT_VERSION;
- out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
- (XFS_ALL_QUOTA_ACCT|
- XFS_ALL_QUOTA_ENFD));
- uip = q->qi_uquotaip;
- gip = q->qi_gquotaip;
- pip = q->qi_pquotaip;
- if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip) == 0)
- tempuqip = true;
- }
- if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip) == 0)
- tempgqip = true;
- }
- /*
- * Q_XGETQSTAT doesn't have room for both group and project quotas.
- * So, allow the project quota values to be copied out only if
- * there is no group quota information available.
- */
- if (!gip) {
- if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
- 0, 0, &pip) == 0)
- temppqip = true;
- }
- } else
- pip = NULL;
- if (uip) {
- out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
- out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
- out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
- if (tempuqip)
- IRELE(uip);
- }
-
- if (gip) {
- out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
- out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
- out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
- if (tempgqip)
- IRELE(gip);
- }
- if (pip) {
- out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
- out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
- out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
- if (temppqip)
- IRELE(pip);
- }
- out->qs_incoredqs = q->qi_dquots;
- out->qs_btimelimit = q->qi_btimelimit;
- out->qs_itimelimit = q->qi_itimelimit;
- out->qs_rtbtimelimit = q->qi_rtbtimelimit;
- out->qs_bwarnlimit = q->qi_bwarnlimit;
- out->qs_iwarnlimit = q->qi_iwarnlimit;
-
- return 0;
-}
-
-/*
- * Return quota status information, such as uquota-off, enforcements, etc.
- * for Q_XGETQSTATV command, to support separate project quota field.
- */
-int
-xfs_qm_scall_getqstatv(
- struct xfs_mount *mp,
- struct fs_quota_statv *out)
-{
- struct xfs_quotainfo *q = mp->m_quotainfo;
- struct xfs_inode *uip = NULL;
- struct xfs_inode *gip = NULL;
- struct xfs_inode *pip = NULL;
- bool tempuqip = false;
- bool tempgqip = false;
- bool temppqip = false;
-
- out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
- (XFS_ALL_QUOTA_ACCT|
- XFS_ALL_QUOTA_ENFD));
- out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
- out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
- out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
-
- uip = q->qi_uquotaip;
- gip = q->qi_gquotaip;
- pip = q->qi_pquotaip;
- if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
- 0, 0, &uip) == 0)
- tempuqip = true;
- }
- if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
- 0, 0, &gip) == 0)
- tempgqip = true;
- }
- if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
- if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
- 0, 0, &pip) == 0)
- temppqip = true;
- }
- if (uip) {
- out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
- out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
- if (tempuqip)
- IRELE(uip);
- }
-
- if (gip) {
- out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
- out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
- if (tempgqip)
- IRELE(gip);
- }
- if (pip) {
- out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
- out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
- if (temppqip)
- IRELE(pip);
- }
- out->qs_incoredqs = q->qi_dquots;
- out->qs_btimelimit = q->qi_btimelimit;
- out->qs_itimelimit = q->qi_itimelimit;
- out->qs_rtbtimelimit = q->qi_rtbtimelimit;
- out->qs_bwarnlimit = q->qi_bwarnlimit;
- out->qs_iwarnlimit = q->qi_iwarnlimit;
-
- return 0;
-}
-
#define XFS_QC_MASK \
(QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
@@ -591,7 +436,7 @@ xfs_qm_scall_setqlim(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out_rele;
}
@@ -702,7 +547,7 @@ xfs_qm_scall_setqlim(
dqp->dq_flags |= XFS_DQ_DIRTY;
xfs_trans_log_dquot(tp, dqp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
out_rele:
xfs_qm_dqrele(dqp);
@@ -725,7 +570,7 @@ xfs_qm_log_quotaoff_end(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -739,8 +584,7 @@ xfs_qm_log_quotaoff_end(
* We don't care about quotoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
- return error;
+ return xfs_trans_commit(tp);
}
@@ -759,7 +603,7 @@ xfs_qm_log_quotaoff(
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
goto out;
}
@@ -778,7 +622,7 @@ xfs_qm_log_quotaoff(
* We don't care about quotoff's performance.
*/
xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto out;
@@ -873,28 +717,6 @@ out_put:
return error;
}
-STATIC uint
-xfs_qm_export_flags(
- uint flags)
-{
- uint uflags;
-
- uflags = 0;
- if (flags & XFS_UQUOTA_ACCT)
- uflags |= FS_QUOTA_UDQ_ACCT;
- if (flags & XFS_GQUOTA_ACCT)
- uflags |= FS_QUOTA_GDQ_ACCT;
- if (flags & XFS_PQUOTA_ACCT)
- uflags |= FS_QUOTA_PDQ_ACCT;
- if (flags & XFS_UQUOTA_ENFD)
- uflags |= FS_QUOTA_UDQ_ENFD;
- if (flags & XFS_GQUOTA_ENFD)
- uflags |= FS_QUOTA_GDQ_ENFD;
- if (flags & XFS_PQUOTA_ENFD)
- uflags |= FS_QUOTA_PDQ_ENFD;
- return uflags;
-}
-
STATIC int
xfs_dqrele_inode(
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 5376dd406ba2..ce6506adab7b 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -55,7 +55,6 @@ struct xfs_trans;
typedef struct xfs_dqtrx {
struct xfs_dquot *qt_dquot; /* the dquot this refers to */
ulong qt_blk_res; /* blks reserved on a dquot */
- ulong qt_blk_res_used; /* blks used from the reservation */
ulong qt_ino_res; /* inode reserved on a dquot */
ulong qt_ino_res_used; /* inodes used from the reservation */
long qt_bcount_delta; /* dquot blk count changes */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 6923905ab33d..7795e0d01382 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -23,10 +23,81 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
#include "xfs_qm.h"
#include <linux/quota.h>
+static void
+xfs_qm_fill_state(
+ struct qc_type_state *tstate,
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ xfs_ino_t ino)
+{
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+ bool tempqip = false;
+
+ tstate->ino = ino;
+ if (!ip && ino == NULLFSINO)
+ return;
+ if (!ip) {
+ if (xfs_iget(mp, NULL, ino, 0, 0, &ip))
+ return;
+ tempqip = true;
+ }
+ tstate->flags |= QCI_SYSFILE;
+ tstate->blocks = ip->i_d.di_nblocks;
+ tstate->nextents = ip->i_d.di_nextents;
+ tstate->spc_timelimit = q->qi_btimelimit;
+ tstate->ino_timelimit = q->qi_itimelimit;
+ tstate->rt_spc_timelimit = q->qi_rtbtimelimit;
+ tstate->spc_warnlimit = q->qi_bwarnlimit;
+ tstate->ino_warnlimit = q->qi_iwarnlimit;
+ tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit;
+ if (tempqip)
+ IRELE(ip);
+}
+
+/*
+ * Return quota status information, such as enforcements, quota file inode
+ * numbers etc.
+ */
+static int
+xfs_fs_get_quota_state(
+ struct super_block *sb,
+ struct qc_state *state)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_quotainfo *q = mp->m_quotainfo;
+
+ memset(state, 0, sizeof(*state));
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return 0;
+ state->s_incoredqs = q->qi_dquots;
+ if (XFS_IS_UQUOTA_RUNNING(mp))
+ state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_UQUOTA_ENFORCED(mp))
+ state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
+ if (XFS_IS_GQUOTA_RUNNING(mp))
+ state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_GQUOTA_ENFORCED(mp))
+ state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
+ if (XFS_IS_PQUOTA_RUNNING(mp))
+ state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED;
+ if (XFS_IS_PQUOTA_ENFORCED(mp))
+ state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED;
+
+ xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip,
+ mp->m_sb.sb_uquotino);
+ xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip,
+ mp->m_sb.sb_gquotino);
+ xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip,
+ mp->m_sb.sb_pquotino);
+ return 0;
+}
+
STATIC int
xfs_quota_type(int type)
{
@@ -40,28 +111,40 @@ xfs_quota_type(int type)
}
}
-STATIC int
-xfs_fs_get_xstate(
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+
+/*
+ * Adjust quota timers & warnings
+ */
+static int
+xfs_fs_set_info(
struct super_block *sb,
- struct fs_quota_stat *fqs)
+ int type,
+ struct qc_info *info)
{
- struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_mount *mp = XFS_M(sb);
+ struct qc_dqblk newlim;
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
if (!XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
- return xfs_qm_scall_getqstat(mp, fqs);
-}
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+ if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK)
+ return -EINVAL;
+ if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0)
+ return 0;
-STATIC int
-xfs_fs_get_xstatev(
- struct super_block *sb,
- struct fs_quota_statv *fqs)
-{
- struct xfs_mount *mp = XFS_M(sb);
+ newlim.d_fieldmask = info->i_fieldmask;
+ newlim.d_spc_timer = info->i_spc_timelimit;
+ newlim.d_ino_timer = info->i_ino_timelimit;
+ newlim.d_rt_spc_timer = info->i_rt_spc_timelimit;
+ newlim.d_ino_warns = info->i_ino_warnlimit;
+ newlim.d_spc_warns = info->i_spc_warnlimit;
+ newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit;
- if (!XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
- return xfs_qm_scall_getqstatv(mp, fqs);
+ return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim);
}
static unsigned int
@@ -178,8 +261,8 @@ xfs_fs_set_dqblk(
}
const struct quotactl_ops xfs_quotactl_operations = {
- .get_xstatev = xfs_fs_get_xstatev,
- .get_xstate = xfs_fs_get_xstate,
+ .get_state = xfs_fs_get_quota_state,
+ .set_info = xfs_fs_set_info,
.quota_enable = xfs_quota_enable,
.quota_disable = xfs_quota_disable,
.rm_xquota = xfs_fs_rm_xquota,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f2079b6911cc..f4e8c06eee26 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -780,7 +780,6 @@ xfs_growfs_rt_alloc(
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- int cancelflags = 0;
xfs_trans_t *tp;
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
@@ -792,7 +791,6 @@ xfs_growfs_rt_alloc(
resblks, 0);
if (error)
goto error_cancel;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
/*
* Lock the inode.
*/
@@ -804,7 +802,6 @@ xfs_growfs_rt_alloc(
* Allocate blocks to the bitmap file.
*/
nmap = 1;
- cancelflags |= XFS_TRANS_ABORT;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
XFS_BMAPI_METADATA, &firstblock,
resblks, &map, &nmap, &flist);
@@ -818,14 +815,13 @@ xfs_growfs_rt_alloc(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
goto error_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error)
goto error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
*/
- cancelflags = 0;
for (bno = map.br_startoff, fsbno = map.br_startblock;
bno < map.br_startoff + map.br_blockcount;
bno++, fsbno++) {
@@ -851,7 +847,7 @@ xfs_growfs_rt_alloc(
if (bp == NULL) {
error = -EIO;
error_cancel:
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
goto error;
}
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
@@ -859,7 +855,7 @@ error_cancel:
/*
* Commit the transaction.
*/
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
goto error;
}
@@ -973,7 +969,6 @@ xfs_growfs_rt(
bmbno < nrbmblocks;
bmbno++) {
xfs_trans_t *tp;
- int cancelflags = 0;
*nmp = *mp;
nsbp = &nmp->m_sb;
@@ -1015,7 +1010,6 @@ xfs_growfs_rt(
mp->m_rbmip->i_d.di_size =
nsbp->sb_rbmblocks * nsbp->sb_blocksize;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- cancelflags |= XFS_TRANS_ABORT;
/*
* Get the summary inode into the transaction.
*/
@@ -1062,7 +1056,7 @@ xfs_growfs_rt(
nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
if (error) {
error_cancel:
- xfs_trans_cancel(tp, cancelflags);
+ xfs_trans_cancel(tp);
break;
}
/*
@@ -1076,7 +1070,7 @@ error_cancel:
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
- error = xfs_trans_commit(tp, 0);
+ error = xfs_trans_commit(tp);
if (error)
break;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8fcc4ccc5c79..1fb16562c159 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -109,11 +109,11 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
-#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
+#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
+
/*
* Table driven mount option parser.
*
@@ -361,28 +361,14 @@ xfs_parseargs(
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_GQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
- xfs_warn(mp,
- "delaylog is the default now, option is deprecated.");
- } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
- xfs_warn(mp,
- "nodelaylog support has been removed, option is deprecated.");
} else if (!strcmp(this_char, MNTOPT_DISCARD)) {
mp->m_flags |= XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
mp->m_flags &= ~XFS_MOUNT_DISCARD;
- } else if (!strcmp(this_char, "ihashsize")) {
- xfs_warn(mp,
- "ihashsize no longer used, option is deprecated.");
- } else if (!strcmp(this_char, "osyncisdsync")) {
- xfs_warn(mp,
- "osyncisdsync has no effect, option is deprecated.");
- } else if (!strcmp(this_char, "osyncisosync")) {
- xfs_warn(mp,
- "osyncisosync has no effect, option is deprecated.");
- } else if (!strcmp(this_char, "irixsgid")) {
- xfs_warn(mp,
- "irixsgid is now a sysctl(2) variable, option is deprecated.");
+#ifdef CONFIG_FS_DAX
+ } else if (!strcmp(this_char, MNTOPT_DAX)) {
+ mp->m_flags |= XFS_MOUNT_DAX;
+#endif
} else {
xfs_warn(mp, "unknown mount option [%s].", this_char);
return -EINVAL;
@@ -472,8 +458,8 @@ done:
}
struct proc_xfs_info {
- int flag;
- char *str;
+ uint64_t flag;
+ char *str;
};
STATIC int
@@ -494,6 +480,7 @@ xfs_showargs(
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
+ { XFS_MOUNT_DAX, "," MNTOPT_DAX },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -986,6 +973,8 @@ xfs_fs_inode_init_once(
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
+ mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+ "xfsino", ip->i_ino);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
}
@@ -1033,23 +1022,6 @@ xfs_free_fsname(
kfree(mp->m_logname);
}
-STATIC void
-xfs_fs_put_super(
- struct super_block *sb)
-{
- struct xfs_mount *mp = XFS_M(sb);
-
- xfs_filestream_unmount(mp);
- xfs_unmountfs(mp);
-
- xfs_freesb(mp);
- xfs_icsb_destroy_counters(mp);
- xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
- xfs_free_fsname(mp);
- kfree(mp);
-}
-
STATIC int
xfs_fs_sync_fs(
struct super_block *sb,
@@ -1083,8 +1055,11 @@ xfs_fs_statfs(
{
struct xfs_mount *mp = XFS_M(dentry->d_sb);
xfs_sb_t *sbp = &mp->m_sb;
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
__uint64_t fakeinos, id;
+ __uint64_t icount;
+ __uint64_t ifree;
+ __uint64_t fdblocks;
xfs_extlen_t lsize;
__int64_t ffree;
@@ -1095,17 +1070,21 @@ xfs_fs_statfs(
statp->f_fsid.val[0] = (u32)id;
statp->f_fsid.val[1] = (u32)(id >> 32);
- xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+ icount = percpu_counter_sum(&mp->m_icount);
+ ifree = percpu_counter_sum(&mp->m_ifree);
+ fdblocks = percpu_counter_sum(&mp->m_fdblocks);
spin_lock(&mp->m_sb_lock);
statp->f_bsize = sbp->sb_blocksize;
lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
statp->f_blocks = sbp->sb_dblocks - lsize;
- statp->f_bfree = statp->f_bavail =
- sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ spin_unlock(&mp->m_sb_lock);
+
+ statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+ statp->f_bavail = statp->f_bfree;
+
fakeinos = statp->f_bfree << sbp->sb_inopblog;
- statp->f_files =
- MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+ statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
if (mp->m_maxicount)
statp->f_files = min_t(typeof(statp->f_files),
statp->f_files,
@@ -1117,10 +1096,9 @@ xfs_fs_statfs(
sbp->sb_icount);
/* make sure statp->f_ffree does not underflow */
- ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+ ffree = statp->f_files - (icount - ifree);
statp->f_ffree = max_t(__int64_t, ffree, 0);
- spin_unlock(&mp->m_sb_lock);
if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
@@ -1256,6 +1234,12 @@ xfs_fs_remount(
/* ro -> rw */
if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on norecovery mount");
+ return -EINVAL;
+ }
+
mp->m_flags &= ~XFS_MOUNT_RDONLY;
/*
@@ -1401,6 +1385,51 @@ xfs_finish_flags(
return 0;
}
+static int
+xfs_init_percpu_counters(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
+ if (error)
+ return -ENOMEM;
+
+ error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
+ if (error)
+ goto free_icount;
+
+ error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
+ if (error)
+ goto free_ifree;
+
+ return 0;
+
+free_ifree:
+ percpu_counter_destroy(&mp->m_ifree);
+free_icount:
+ percpu_counter_destroy(&mp->m_icount);
+ return -ENOMEM;
+}
+
+void
+xfs_reinit_percpu_counters(
+ struct xfs_mount *mp)
+{
+ percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
+ percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
+ percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+}
+
+static void
+xfs_destroy_percpu_counters(
+ struct xfs_mount *mp)
+{
+ percpu_counter_destroy(&mp->m_icount);
+ percpu_counter_destroy(&mp->m_ifree);
+ percpu_counter_destroy(&mp->m_fdblocks);
+}
+
STATIC int
xfs_fs_fill_super(
struct super_block *sb,
@@ -1449,7 +1478,7 @@ xfs_fs_fill_super(
if (error)
goto out_close_devices;
- error = xfs_icsb_init_counters(mp);
+ error = xfs_init_percpu_counters(mp);
if (error)
goto out_destroy_workqueues;
@@ -1485,6 +1514,20 @@ xfs_fs_fill_super(
if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
sb->s_flags |= MS_I_VERSION;
+ if (mp->m_flags & XFS_MOUNT_DAX) {
+ xfs_warn(mp,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ if (sb->s_blocksize != PAGE_SIZE) {
+ xfs_alert(mp,
+ "Filesystem block size invalid for DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ } else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ xfs_alert(mp,
+ "Block device does not support DAX Turning DAX off.");
+ mp->m_flags &= ~XFS_MOUNT_DAX;
+ }
+ }
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
@@ -1507,7 +1550,7 @@ xfs_fs_fill_super(
out_free_sb:
xfs_freesb(mp);
out_destroy_counters:
- xfs_icsb_destroy_counters(mp);
+ xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
@@ -1524,6 +1567,24 @@ out_destroy_workqueues:
goto out_free_sb;
}
+STATIC void
+xfs_fs_put_super(
+ struct super_block *sb)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ xfs_notice(mp, "Unmounting Filesystem");
+ xfs_filestream_unmount(mp);
+ xfs_unmountfs(mp);
+
+ xfs_freesb(mp);
+ xfs_destroy_percpu_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
+ xfs_close_devices(mp);
+ xfs_free_fsname(mp);
+ kfree(mp);
+}
+
STATIC struct dentry *
xfs_fs_mount(
struct file_system_type *fs_type,
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2b830c2f322e..499058fea303 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
+extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
+
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 25791df6f638..4be27b0210af 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -104,7 +104,7 @@ xfs_readlink_bmap(
cur_chunk += sizeof(struct xfs_dsymlink_hdr);
}
- memcpy(link + offset, bp->b_addr, byte_cnt);
+ memcpy(link + offset, cur_chunk, byte_cnt);
pathlen -= byte_cnt;
offset += byte_cnt;
@@ -177,8 +177,7 @@ xfs_symlink(
int pathlen;
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
- bool unlock_dp_on_error = false;
- uint cancel_flags;
+ bool unlock_dp_on_error = false;
int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
@@ -221,10 +220,9 @@ xfs_symlink(
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
- goto std_return;
+ return error;
tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
/*
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
@@ -239,10 +237,8 @@ xfs_symlink(
resblks = 0;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
}
- if (error) {
- cancel_flags = 0;
- goto error_return;
- }
+ if (error)
+ goto out_trans_cancel;
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
@@ -252,7 +248,7 @@ xfs_symlink(
*/
if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
error = -EPERM;
- goto error_return;
+ goto out_trans_cancel;
}
/*
@@ -261,7 +257,7 @@ xfs_symlink(
error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
pdqp, resblks, 1, 0);
if (error)
- goto error_return;
+ goto out_trans_cancel;
/*
* Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +265,7 @@ xfs_symlink(
if (!resblks) {
error = xfs_dir_canenter(tp, dp, link_name);
if (error)
- goto error_return;
+ goto out_trans_cancel;
}
/*
* Initialize the bmap freelist prior to calling either
@@ -282,15 +278,14 @@ xfs_symlink(
*/
error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == -ENOSPC)
- goto error_return;
- goto error1;
- }
+ if (error)
+ goto out_trans_cancel;
/*
- * An error after we've joined dp to the transaction will result in the
- * transaction cancel unlocking dp so don't do it explicitly in the
+ * Now we join the directory inode to the transaction. We do not do it
+ * earlier because xfs_dir_ialloc might commit the previous transaction
+ * (and release all the locks). An error from here on will result in
+ * the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +325,7 @@ xfs_symlink(
XFS_BMAPI_METADATA, &first_block, resblks,
mval, &nmaps, &free_list);
if (error)
- goto error2;
+ goto out_bmap_cancel;
if (resblks)
resblks -= fs_blocks;
@@ -348,7 +343,7 @@ xfs_symlink(
BTOBB(byte_cnt), 0);
if (!bp) {
error = -ENOMEM;
- goto error2;
+ goto out_bmap_cancel;
}
bp->b_ops = &xfs_symlink_buf_ops;
@@ -378,7 +373,7 @@ xfs_symlink(
error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
&first_block, &free_list, resblks);
if (error)
- goto error2;
+ goto out_bmap_cancel;
xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
@@ -392,10 +387,13 @@ xfs_symlink(
}
error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- goto error2;
- }
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_bmap_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_release_inode;
+
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
@@ -403,20 +401,27 @@ xfs_symlink(
*ipp = ip;
return 0;
- error2:
- IRELE(ip);
- error1:
+out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to finish the
+ * setup of the inode and release the inode. This prevents recursive
+ * transactions and deadlocks from xfs_inactive.
+ */
+ if (ip) {
+ xfs_finish_inode_setup(ip);
+ IRELE(ip);
+ }
+
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
return error;
}
@@ -454,7 +459,7 @@ xfs_inactive_symlink_rmt(
tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error) {
- xfs_trans_cancel(tp, 0);
+ xfs_trans_cancel(tp);
return error;
}
@@ -523,7 +528,7 @@ xfs_inactive_symlink_rmt(
/*
* Commit the transaction containing extent freeing and EFDs.
*/
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
goto error_unlock;
@@ -542,7 +547,7 @@ xfs_inactive_symlink_rmt(
error_bmap_cancel:
xfs_bmap_cancel(&free_list);
error_trans_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_trans_cancel(tp);
error_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..8d916d33d93d 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
__entry->refcount = refcount;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+ TP_printk("dev %d:%d agno %u refcount %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->refcount,
@@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %lld count %lld flag %d caller %pf",
+ "offset %lld block %lld count %lld flag %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
- "offset %lld block %lld count %lld flag %d caller %pf",
+ "offset %lld block %lld count %lld flag %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pf",
+ "lock %d flags %s caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
- "lock %d flags %s caller %pf",
+ "lock %d flags %s caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->buffer_length,
@@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror,
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
- "lock %d error %d flags %s caller %pf",
+ "lock %d error %d flags %s caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->buffer_length,
@@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class,
__entry->lock_flags = lock_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
+ TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
@@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
DEFINE_INODE_EVENT(xfs_zero_file_space);
DEFINE_INODE_EVENT(xfs_collapse_file_space);
+DEFINE_INODE_EVENT(xfs_insert_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL
DEFINE_INODE_EVENT(xfs_get_acl);
@@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
TP_ARGS(ip, caller_ip),
@@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__entry->pincount = atomic_read(&ip->i_pincount);
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->count,
@@ -734,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
+TRACE_EVENT(xfs_irec_merge_pre,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+ TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ __field(xfs_agino_t, nagino)
+ __field(uint16_t, nholemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ __entry->nagino = nagino;
+ __entry->nholemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->agino, __entry->holemask, __entry->nagino,
+ __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ uint16_t holemask),
+ TP_ARGS(mp, agno, agino, holemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->holemask = holemask;
+ ),
+ TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->agno, __entry->agino,
+ __entry->holemask)
+)
+
#define DEFINE_IREF_EVENT(name) \
DEFINE_EVENT(xfs_iref_class, name, \
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
@@ -1217,6 +1268,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1333,7 +1389,7 @@ TRACE_EVENT(xfs_bunmap,
__entry->flags = flags;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
- "flags %s caller %pf",
+ "flags %s caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
@@ -1466,7 +1522,7 @@ TRACE_EVENT(xfs_agf,
),
TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
"levels b %u c %u flfirst %u fllast %u flcount %u "
- "freeblks %u longest %u caller %pf",
+ "freeblks %u longest %u caller %ps",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index eb90cd59a0ec..0582a27107d4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -113,7 +113,7 @@ xfs_trans_free(
* blocks. Locks and log items, however, are no inherited. They must
* be added to the new transaction explicitly.
*/
-xfs_trans_t *
+STATIC xfs_trans_t *
xfs_trans_dup(
xfs_trans_t *tp)
{
@@ -173,7 +173,7 @@ xfs_trans_reserve(
uint rtextents)
{
int error = 0;
- int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -184,8 +184,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
- error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
- -((int64_t)blocks), rsvd);
+ error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
return -ENOSPC;
@@ -236,8 +235,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
- error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
- -((int64_t)rtextents), rsvd);
+ error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
@@ -253,14 +251,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
- int log_flags;
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+ xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
@@ -268,8 +259,7 @@ undo_log:
undo_blocks:
if (blocks > 0) {
- xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
- (int64_t)blocks, rsvd);
+ xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
tp->t_blk_res = 0;
}
@@ -488,6 +478,54 @@ xfs_trans_apply_sb_deltas(
sizeof(sbp->sb_frextents) - 1);
}
+STATIC int
+xfs_sb_mod8(
+ uint8_t *field,
+ int8_t delta)
+{
+ int8_t counter = *field;
+
+ counter += delta;
+ if (counter < 0) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+ *field = counter;
+ return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+ uint32_t *field,
+ int32_t delta)
+{
+ int32_t counter = *field;
+
+ counter += delta;
+ if (counter < 0) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+ *field = counter;
+ return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+ uint64_t *field,
+ int64_t delta)
+{
+ int64_t counter = *field;
+
+ counter += delta;
+ if (counter < 0) {
+ ASSERT(0);
+ return -EINVAL;
+ }
+ *field = counter;
+ return 0;
+}
+
/*
* xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
* and apply superblock counter changes to the in-core superblock. The
@@ -495,13 +533,6 @@ xfs_trans_apply_sb_deltas(
* applied to the in-core superblock. The idea is that that has already been
* done.
*
- * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
- * However, we have to ensure that we only modify each superblock field only
- * once because the application of the delta values may not be atomic. That can
- * lead to ENOSPC races occurring if we have two separate modifcations of the
- * free space counter to put back the entire reservation and then take away
- * what we used.
- *
* If we are not logging superblock counters, then the inode allocated/free and
* used block counts are not updated in the on disk superblock. In this case,
* XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
@@ -509,21 +540,15 @@ xfs_trans_apply_sb_deltas(
*/
void
xfs_trans_unreserve_and_mod_sb(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- xfs_mod_sb_t msb[9]; /* If you add cases, add entries */
- xfs_mod_sb_t *msbp;
- xfs_mount_t *mp = tp->t_mountp;
- /* REFERENCED */
- int error;
- int rsvd;
- int64_t blkdelta = 0;
- int64_t rtxdelta = 0;
- int64_t idelta = 0;
- int64_t ifreedelta = 0;
-
- msbp = msb;
- rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ int64_t blkdelta = 0;
+ int64_t rtxdelta = 0;
+ int64_t idelta = 0;
+ int64_t ifreedelta = 0;
+ int error;
/* calculate deltas */
if (tp->t_blk_res > 0)
@@ -547,97 +572,115 @@ xfs_trans_unreserve_and_mod_sb(
/* apply the per-cpu counters */
if (blkdelta) {
- error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- blkdelta, rsvd);
+ error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
if (error)
goto out;
}
if (idelta) {
- error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
- idelta, rsvd);
+ error = xfs_mod_icount(mp, idelta);
if (error)
goto out_undo_fdblocks;
}
if (ifreedelta) {
- error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
- ifreedelta, rsvd);
+ error = xfs_mod_ifree(mp, ifreedelta);
if (error)
goto out_undo_icount;
}
+ if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ return;
+
/* apply remaining deltas */
- if (rtxdelta != 0) {
- msbp->msb_field = XFS_SBS_FREXTENTS;
- msbp->msb_delta = rtxdelta;
- msbp++;
+ spin_lock(&mp->m_sb_lock);
+ if (rtxdelta) {
+ error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+ if (error)
+ goto out_undo_ifree;
}
- if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
- if (tp->t_dblocks_delta != 0) {
- msbp->msb_field = XFS_SBS_DBLOCKS;
- msbp->msb_delta = tp->t_dblocks_delta;
- msbp++;
- }
- if (tp->t_agcount_delta != 0) {
- msbp->msb_field = XFS_SBS_AGCOUNT;
- msbp->msb_delta = tp->t_agcount_delta;
- msbp++;
- }
- if (tp->t_imaxpct_delta != 0) {
- msbp->msb_field = XFS_SBS_IMAX_PCT;
- msbp->msb_delta = tp->t_imaxpct_delta;
- msbp++;
- }
- if (tp->t_rextsize_delta != 0) {
- msbp->msb_field = XFS_SBS_REXTSIZE;
- msbp->msb_delta = tp->t_rextsize_delta;
- msbp++;
- }
- if (tp->t_rbmblocks_delta != 0) {
- msbp->msb_field = XFS_SBS_RBMBLOCKS;
- msbp->msb_delta = tp->t_rbmblocks_delta;
- msbp++;
- }
- if (tp->t_rblocks_delta != 0) {
- msbp->msb_field = XFS_SBS_RBLOCKS;
- msbp->msb_delta = tp->t_rblocks_delta;
- msbp++;
- }
- if (tp->t_rextents_delta != 0) {
- msbp->msb_field = XFS_SBS_REXTENTS;
- msbp->msb_delta = tp->t_rextents_delta;
- msbp++;
- }
- if (tp->t_rextslog_delta != 0) {
- msbp->msb_field = XFS_SBS_REXTSLOG;
- msbp->msb_delta = tp->t_rextslog_delta;
- msbp++;
- }
+ if (tp->t_dblocks_delta != 0) {
+ error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+ if (error)
+ goto out_undo_frextents;
}
-
- /*
- * If we need to change anything, do it.
- */
- if (msbp > msb) {
- error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
- (uint)(msbp - msb), rsvd);
+ if (tp->t_agcount_delta != 0) {
+ error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
if (error)
- goto out_undo_ifreecount;
+ goto out_undo_dblocks;
}
-
+ if (tp->t_imaxpct_delta != 0) {
+ error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+ if (error)
+ goto out_undo_agcount;
+ }
+ if (tp->t_rextsize_delta != 0) {
+ error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+ tp->t_rextsize_delta);
+ if (error)
+ goto out_undo_imaxpct;
+ }
+ if (tp->t_rbmblocks_delta != 0) {
+ error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+ tp->t_rbmblocks_delta);
+ if (error)
+ goto out_undo_rextsize;
+ }
+ if (tp->t_rblocks_delta != 0) {
+ error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+ if (error)
+ goto out_undo_rbmblocks;
+ }
+ if (tp->t_rextents_delta != 0) {
+ error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+ tp->t_rextents_delta);
+ if (error)
+ goto out_undo_rblocks;
+ }
+ if (tp->t_rextslog_delta != 0) {
+ error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+ tp->t_rextslog_delta);
+ if (error)
+ goto out_undo_rextents;
+ }
+ spin_unlock(&mp->m_sb_lock);
return;
-out_undo_ifreecount:
+out_undo_rextents:
+ if (tp->t_rextents_delta)
+ xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+ if (tp->t_rblocks_delta)
+ xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+ if (tp->t_rbmblocks_delta)
+ xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+ if (tp->t_rextsize_delta)
+ xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+ if (tp->t_rextsize_delta)
+ xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+ if (tp->t_agcount_delta)
+ xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+ if (tp->t_dblocks_delta)
+ xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+ if (rtxdelta)
+ xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+ spin_unlock(&mp->m_sb_lock);
if (ifreedelta)
- xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+ xfs_mod_ifree(mp, -ifreedelta);
out_undo_icount:
if (idelta)
- xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+ xfs_mod_icount(mp, -idelta);
out_undo_fdblocks:
if (blkdelta)
- xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+ xfs_mod_fdblocks(mp, -blkdelta, rsvd);
out:
ASSERT(error == 0);
return;
@@ -694,7 +737,7 @@ void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
- int flags)
+ bool abort)
{
struct xfs_log_item_desc *lidp, *next;
@@ -705,7 +748,7 @@ xfs_trans_free_items(
if (commit_lsn != NULLCOMMITLSN)
lip->li_ops->iop_committing(lip, commit_lsn);
- if (flags & XFS_TRANS_ABORT)
+ if (abort)
lip->li_flags |= XFS_LI_ABORTED;
lip->li_ops->iop_unlock(lip);
@@ -842,27 +885,17 @@ xfs_trans_committed_bulk(
* have already been unlocked as if the commit had succeeded.
* Do not reference the transaction structure after this call.
*/
-int
-xfs_trans_commit(
+static int
+__xfs_trans_commit(
struct xfs_trans *tp,
- uint flags)
+ bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_lsn_t commit_lsn = -1;
int error = 0;
- int log_flags = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
/*
- * Determine whether this commit is releasing a permanent
- * log reservation or not.
- */
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- }
-
- /*
* If there is nothing to be logged by the transaction,
* then unlock all of the items associated with the
* transaction and free the transaction structure.
@@ -886,7 +919,7 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+ xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free(tp);
@@ -914,18 +947,25 @@ out_unreserve:
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
- commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+ commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
}
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
XFS_STATS_INC(xs_trans_empty);
return error;
}
+int
+xfs_trans_commit(
+ struct xfs_trans *tp)
+{
+ return __xfs_trans_commit(tp, false);
+}
+
/*
* Unlock all of the transaction's items and free the transaction.
* The transaction must not have modified any of its items, because
@@ -936,29 +976,22 @@ out_unreserve:
*/
void
xfs_trans_cancel(
- xfs_trans_t *tp,
- int flags)
+ struct xfs_trans *tp)
{
- int log_flags;
- xfs_mount_t *mp = tp->t_mountp;
+ struct xfs_mount *mp = tp->t_mountp;
+ bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
/*
- * See if the caller is being too lazy to figure out if
- * the transaction really needs an abort.
- */
- if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
- flags &= ~XFS_TRANS_ABORT;
- /*
* See if the caller is relying on us to shut down the
* filesystem. This happens in paths where we detect
* corruption and decide to give up.
*/
- if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
#ifdef DEBUG
- if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+ if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
struct xfs_log_item_desc *lidp;
list_for_each_entry(lidp, &tp->t_items, lid_trans)
@@ -968,27 +1001,20 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket) {
- if (flags & XFS_TRANS_RELEASE_LOG_RES) {
- ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- log_flags = XFS_LOG_REL_PERM_RESERV;
- } else {
- log_flags = 0;
- }
- xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
- }
+ if (tp->t_ticket)
+ xfs_log_done(mp, tp->t_ticket, NULL, false);
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+ xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
xfs_trans_free(tp);
}
/*
* Roll from one trans in the sequence of PERMANENT transactions to
* the next: permanent transactions are only flushed out when
- * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * committed with xfs_trans_commit(), but we still want as soon
* as possible to let chunks of it go to the log. So we commit the
* chunk we've been working on and get a new transaction to continue.
*/
@@ -1005,7 +1031,8 @@ xfs_trans_roll(
* Ensure that the inode is always logged.
*/
trans = *tpp;
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+ if (dp)
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
/*
* Copy the critical parameters from one trans to the next.
@@ -1021,20 +1048,13 @@ xfs_trans_roll(
* is in progress. The caller takes the responsibility to cancel
* the duplicate transaction that gets returned.
*/
- error = xfs_trans_commit(trans, 0);
+ error = __xfs_trans_commit(trans, true);
if (error)
return error;
trans = *tpp;
/*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(trans->t_ticket);
-
-
- /*
* Reserve space in the log for th next transaction.
* This also pushes items in the "AIL", the list of logged items,
* out to disk if they are taking up space at the tail of the log
@@ -1050,6 +1070,7 @@ xfs_trans_roll(
if (error)
return error;
- xfs_trans_ijoin(trans, dp, 0);
+ if (dp)
+ xfs_trans_ijoin(trans, dp, 0);
return 0;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index b5bc1ab3c4da..3b21b4e5e467 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,8 +133,6 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces that are
* actually macros.
*/
-#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
-#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
@@ -153,7 +151,6 @@ typedef struct xfs_trans {
*/
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
-xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
uint, uint);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
struct xfs_efd_log_item *,
xfs_fsblock_t,
xfs_extlen_t);
-int xfs_trans_commit(xfs_trans_t *, uint flags);
+int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
-void xfs_trans_cancel(xfs_trans_t *, int);
+void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 573aefb5a573..1098cf490189 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next(
{
struct xfs_log_item *lip = cur->item;
- if ((__psint_t)lip & 1)
+ if ((uintptr_t)lip & 1)
lip = xfs_ail_min(ailp);
if (lip)
cur->item = xfs_ail_next(ailp, lip);
@@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear(
list_for_each_entry(cur, &ailp->xa_cursors, list) {
if (cur->item == lip)
cur->item = (struct xfs_log_item *)
- ((__psint_t)cur->item | 1);
+ ((uintptr_t)cur->item | 1);
}
}
@@ -287,7 +287,7 @@ xfs_ail_splice(
* find the place in the AIL where the items belong.
*/
lip = cur ? cur->item : NULL;
- if (!lip || (__psint_t) lip & 1)
+ if (!lip || (uintptr_t)lip & 1)
lip = __xfs_trans_ail_cursor_last(ailp, lsn);
/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 76a16df55ef7..ce78534a047e 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo(
xfs_trans_t *ntp)
{
xfs_dqtrx_t *oq, *nq;
- int i,j;
+ int i, j;
xfs_dqtrx_t *oqa, *nqa;
+ ulong blk_res_used;
if (!otp->t_dqinfo)
return;
@@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo(
* Because the quota blk reservation is carried forward,
* it is also necessary to carry forward the DQ_DIRTY flag.
*/
- if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+ if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
oqa = otp->t_dqinfo->dqs[j];
nqa = ntp->t_dqinfo->dqs[j];
for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+ blk_res_used = 0;
+
if (oqa[i].qt_dquot == NULL)
break;
oq = &oqa[i];
nq = &nqa[i];
+ if (oq->qt_blk_res && oq->qt_bcount_delta > 0)
+ blk_res_used = oq->qt_bcount_delta;
+
nq->qt_dquot = oq->qt_dquot;
nq->qt_bcount_delta = nq->qt_icount_delta = 0;
nq->qt_rtbcount_delta = 0;
@@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo(
/*
* Transfer whatever is left of the reservations.
*/
- nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
- oq->qt_blk_res = oq->qt_blk_res_used;
+ nq->qt_blk_res = oq->qt_blk_res - blk_res_used;
+ oq->qt_blk_res = blk_res_used;
nq->qt_rtblk_res = oq->qt_rtblk_res -
oq->qt_rtblk_res_used;
@@ -239,10 +245,6 @@ xfs_trans_mod_dquot(
* disk blocks used.
*/
case XFS_TRANS_DQ_BCOUNT:
- if (qtrx->qt_blk_res && delta > 0) {
- qtrx->qt_blk_res_used += (ulong)delta;
- ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
- }
qtrx->qt_bcount_delta += delta;
break;
@@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas(
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
- if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
- if (qtrx->qt_blk_res >
- qtrx->qt_blk_res_used)
+ ulong blk_res_used = 0;
+
+ if (qtrx->qt_bcount_delta > 0)
+ blk_res_used = qtrx->qt_bcount_delta;
+
+ if (qtrx->qt_blk_res != blk_res_used) {
+ if (qtrx->qt_blk_res > blk_res_used)
dqp->q_res_bcount -= (xfs_qcnt_t)
(qtrx->qt_blk_res -
- qtrx->qt_blk_res_used);
+ blk_res_used);
else
dqp->q_res_bcount -= (xfs_qcnt_t)
- (qtrx->qt_blk_res_used -
+ (blk_res_used -
qtrx->qt_blk_res);
}
} else {
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd1281862ad7..1b736294558a 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- int flags);
+ bool abort);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 69f6e475de97..c036815183cb 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -35,7 +35,7 @@ static int
xfs_xattr_get(struct dentry *dentry, const char *name,
void *value, size_t size, int xflags)
{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
if (strcmp(name, "") == 0)
@@ -57,7 +57,7 @@ static int
xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags, int xflags)
{
- struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -197,7 +197,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
{
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
int error;
/*