Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 3
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/affs/affs.h | 16
-rw-r--r--  fs/affs/super.c | 10
-rw-r--r--  fs/afs/callback.c | 1
-rw-r--r--  fs/afs/cmservice.c | 6
-rw-r--r--  fs/afs/dir.c | 7
-rw-r--r--  fs/afs/dir_edit.c | 12
-rw-r--r--  fs/afs/file.c | 6
-rw-r--r--  fs/afs/flock.c | 4
-rw-r--r--  fs/afs/fsclient.c | 16
-rw-r--r--  fs/afs/inode.c | 13
-rw-r--r--  fs/afs/internal.h | 16
-rw-r--r--  fs/afs/rxrpc.c | 13
-rw-r--r--  fs/afs/server.c | 3
-rw-r--r--  fs/afs/super.c | 1
-rw-r--r--  fs/afs/vl_list.c | 4
-rw-r--r--  fs/afs/vlclient.c | 6
-rw-r--r--  fs/afs/xattr.c | 16
-rw-r--r--  fs/afs/yfsclient.c | 15
-rw-r--r--  fs/aio.c | 12
-rw-r--r--  fs/autofs/expire.c | 5
-rw-r--r--  fs/binfmt_elf.c | 68
-rw-r--r--  fs/binfmt_elf_fdpic.c | 12
-rw-r--r--  fs/block_dev.c | 69
-rw-r--r--  fs/btrfs/Kconfig | 2
-rw-r--r--  fs/btrfs/async-thread.c | 113
-rw-r--r--  fs/btrfs/async-thread.h | 37
-rw-r--r--  fs/btrfs/block-group.c | 589
-rw-r--r--  fs/btrfs/block-group.h | 51
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/compression.c | 269
-rw-r--r--  fs/btrfs/compression.h | 46
-rw-r--r--  fs/btrfs/ctree.c | 287
-rw-r--r--  fs/btrfs/ctree.h | 51
-rw-r--r--  fs/btrfs/delalloc-space.c | 21
-rw-r--r--  fs/btrfs/delayed-inode.c | 18
-rw-r--r--  fs/btrfs/dev-replace.c | 2
-rw-r--r--  fs/btrfs/dev-replace.h | 2
-rw-r--r--  fs/btrfs/disk-io.c | 365
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-io-tree.h | 248
-rw-r--r--  fs/btrfs/extent-tree.c | 146
-rw-r--r--  fs/btrfs/extent_io.c | 120
-rw-r--r--  fs/btrfs/extent_io.h | 231
-rw-r--r--  fs/btrfs/extent_map.c | 6
-rw-r--r--  fs/btrfs/extent_map.h | 11
-rw-r--r--  fs/btrfs/file-item.c | 1
-rw-r--r--  fs/btrfs/file.c | 74
-rw-r--r--  fs/btrfs/free-space-cache.c | 118
-rw-r--r--  fs/btrfs/free-space-cache.h | 39
-rw-r--r--  fs/btrfs/free-space-tree.c | 133
-rw-r--r--  fs/btrfs/free-space-tree.h | 18
-rw-r--r--  fs/btrfs/inode.c | 185
-rw-r--r--  fs/btrfs/ioctl.c | 49
-rw-r--r--  fs/btrfs/locking.c | 309
-rw-r--r--  fs/btrfs/locking.h | 13
-rw-r--r--  fs/btrfs/lzo.c | 53
-rw-r--r--  fs/btrfs/misc.h | 11
-rw-r--r--  fs/btrfs/ordered-data.c | 7
-rw-r--r--  fs/btrfs/ordered-data.h | 2
-rw-r--r--  fs/btrfs/print-tree.c | 6
-rw-r--r--  fs/btrfs/props.c | 6
-rw-r--r--  fs/btrfs/qgroup.c | 11
-rw-r--r--  fs/btrfs/qgroup.h | 2
-rw-r--r--  fs/btrfs/raid56.c | 101
-rw-r--r--  fs/btrfs/reada.c | 19
-rw-r--r--  fs/btrfs/relocation.c | 43
-rw-r--r--  fs/btrfs/scrub.c | 100
-rw-r--r--  fs/btrfs/send.c | 45
-rw-r--r--  fs/btrfs/space-info.c | 8
-rw-r--r--  fs/btrfs/space-info.h | 3
-rw-r--r--  fs/btrfs/super.c | 28
-rw-r--r--  fs/btrfs/sysfs.c | 47
-rw-r--r--  fs/btrfs/sysfs.h | 2
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 11
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 4
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 15
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 101
-rw-r--r--  fs/btrfs/transaction.c | 98
-rw-r--r--  fs/btrfs/transaction.h | 5
-rw-r--r--  fs/btrfs/tree-checker.c | 211
-rw-r--r--  fs/btrfs/tree-log.c | 136
-rw-r--r--  fs/btrfs/volumes.c | 494
-rw-r--r--  fs/btrfs/volumes.h | 24
-rw-r--r--  fs/btrfs/zlib.c | 52
-rw-r--r--  fs/btrfs/zstd.c | 47
-rw-r--r--  fs/buffer.c | 54
-rw-r--r--  fs/ceph/dir.c | 1
-rw-r--r--  fs/ceph/file.c | 31
-rw-r--r--  fs/cifs/cifs_debug.c | 43
-rw-r--r--  fs/cifs/cifs_spnego.c | 2
-rw-r--r--  fs/cifs/cifsacl.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 46
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 90
-rw-r--r--  fs/cifs/cifsproto.h | 8
-rw-r--r--  fs/cifs/connect.c | 191
-rw-r--r--  fs/cifs/dfs_cache.c | 3
-rw-r--r--  fs/cifs/dir.c | 6
-rw-r--r--  fs/cifs/file.c | 159
-rw-r--r--  fs/cifs/inode.c | 333
-rw-r--r--  fs/cifs/misc.c | 17
-rw-r--r--  fs/cifs/sess.c | 230
-rw-r--r--  fs/cifs/smb1ops.c | 8
-rw-r--r--  fs/cifs/smb2misc.c | 175
-rw-r--r--  fs/cifs/smb2ops.c | 141
-rw-r--r--  fs/cifs/smb2pdu.c | 168
-rw-r--r--  fs/cifs/smb2pdu.h | 2
-rw-r--r--  fs/cifs/smb2proto.h | 6
-rw-r--r--  fs/cifs/smb2transport.c | 165
-rw-r--r--  fs/cifs/smbdirect.c | 36
-rw-r--r--  fs/cifs/transport.c | 37
-rw-r--r--  fs/compat_binfmt_elf.c | 4
-rw-r--r--  fs/compat_ioctl.c | 931
-rw-r--r--  fs/cramfs/inode.c | 4
-rw-r--r--  fs/crypto/bio.c | 29
-rw-r--r--  fs/crypto/crypto.c | 124
-rw-r--r--  fs/crypto/fscrypt_private.h | 25
-rw-r--r--  fs/crypto/keyring.c | 6
-rw-r--r--  fs/crypto/keysetup.c | 158
-rw-r--r--  fs/crypto/keysetup_v1.c | 4
-rw-r--r--  fs/crypto/policy.c | 41
-rw-r--r--  fs/dax.c | 13
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/debugfs/file.c | 87
-rw-r--r--  fs/direct-io.c | 21
-rw-r--r--  fs/ecryptfs/file.c | 1
-rw-r--r--  fs/ecryptfs/inode.c | 84
-rw-r--r--  fs/erofs/Kconfig | 1
-rw-r--r--  fs/erofs/decompressor.c | 2
-rw-r--r--  fs/erofs/erofs_fs.h | 3
-rw-r--r--  fs/erofs/internal.h | 7
-rw-r--r--  fs/erofs/super.c | 39
-rw-r--r--  fs/erofs/utils.c | 17
-rw-r--r--  fs/erofs/zdata.c | 288
-rw-r--r--  fs/erofs/zdata.h | 8
-rw-r--r--  fs/erofs/zmap.c | 28
-rw-r--r--  fs/eventpoll.c | 52
-rw-r--r--  fs/exec.c | 5
-rw-r--r--  fs/exportfs/expfs.c | 31
-rw-r--r--  fs/ext2/balloc.c | 75
-rw-r--r--  fs/ext2/ext2.h | 12
-rw-r--r--  fs/ext2/inode.c | 9
-rw-r--r--  fs/ext2/ioctl.c | 5
-rw-r--r--  fs/ext2/super.c | 13
-rw-r--r--  fs/ext4/Kconfig | 17
-rw-r--r--  fs/ext4/Makefile | 1
-rw-r--r--  fs/ext4/ext4.h | 24
-rw-r--r--  fs/ext4/ext4_jbd2.c | 32
-rw-r--r--  fs/ext4/ext4_jbd2.h | 106
-rw-r--r--  fs/ext4/extents.c | 149
-rw-r--r--  fs/ext4/file.c | 412
-rw-r--r--  fs/ext4/fsync.c | 72
-rw-r--r--  fs/ext4/ialloc.c | 7
-rw-r--r--  fs/ext4/indirect.c | 125
-rw-r--r--  fs/ext4/inode-test.c | 272
-rw-r--r--  fs/ext4/inode.c | 931
-rw-r--r--  fs/ext4/ioctl.c | 1
-rw-r--r--  fs/ext4/migrate.c | 103
-rw-r--r--  fs/ext4/namei.c | 50
-rw-r--r--  fs/ext4/page-io.c | 167
-rw-r--r--  fs/ext4/readpage.c | 6
-rw-r--r--  fs/ext4/resize.c | 46
-rw-r--r--  fs/ext4/super.c | 73
-rw-r--r--  fs/ext4/xattr.c | 94
-rw-r--r--  fs/f2fs/checkpoint.c | 2
-rw-r--r--  fs/f2fs/data.c | 190
-rw-r--r--  fs/f2fs/dir.c | 7
-rw-r--r--  fs/f2fs/f2fs.h | 63
-rw-r--r--  fs/f2fs/file.c | 53
-rw-r--r--  fs/f2fs/gc.c | 46
-rw-r--r--  fs/f2fs/inode.c | 8
-rw-r--r--  fs/f2fs/namei.c | 15
-rw-r--r--  fs/f2fs/node.c | 3
-rw-r--r--  fs/f2fs/recovery.c | 2
-rw-r--r--  fs/f2fs/segment.c | 67
-rw-r--r--  fs/f2fs/segment.h | 2
-rw-r--r--  fs/f2fs/super.c | 129
-rw-r--r--  fs/f2fs/sysfs.c | 4
-rw-r--r--  fs/f2fs/xattr.c | 14
-rw-r--r--  fs/fat/file.c | 13
-rw-r--r--  fs/fcntl.c | 2
-rw-r--r--  fs/fuse/dev.c | 33
-rw-r--r--  fs/gfs2/bmap.c | 3
-rw-r--r--  fs/gfs2/file.c | 36
-rw-r--r--  fs/hpfs/dir.c | 1
-rw-r--r--  fs/hpfs/file.c | 1
-rw-r--r--  fs/hugetlbfs/inode.c | 63
-rw-r--r--  fs/io-wq.c | 1094
-rw-r--r--  fs/io-wq.h | 127
-rw-r--r--  fs/io_uring.c | 2639
-rw-r--r--  fs/ioctl.c | 92
-rw-r--r--  fs/iomap/Makefile | 16
-rw-r--r--  fs/iomap/apply.c | 32
-rw-r--r--  fs/iomap/buffered-io.c | 756
-rw-r--r--  fs/iomap/direct-io.c | 63
-rw-r--r--  fs/iomap/fiemap.c | 10
-rw-r--r--  fs/iomap/seek.c | 4
-rw-r--r--  fs/iomap/swapfile.c | 3
-rw-r--r--  fs/iomap/trace.c | 12
-rw-r--r--  fs/iomap/trace.h | 191
-rw-r--r--  fs/jbd2/checkpoint.c | 2
-rw-r--r--  fs/jbd2/commit.c | 26
-rw-r--r--  fs/jbd2/journal.c | 65
-rw-r--r--  fs/jbd2/revoke.c | 6
-rw-r--r--  fs/jbd2/transaction.c | 400
-rw-r--r--  fs/jffs2/nodelist.c | 2
-rw-r--r--  fs/kernfs/dir.c | 105
-rw-r--r--  fs/kernfs/file.c | 4
-rw-r--r--  fs/kernfs/inode.c | 4
-rw-r--r--  fs/kernfs/kernfs-internal.h | 2
-rw-r--r--  fs/kernfs/mount.c | 102
-rw-r--r--  fs/namei.c | 8
-rw-r--r--  fs/namespace.c | 15
-rw-r--r--  fs/nilfs2/ioctl.c | 1
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 2
-rw-r--r--  fs/notify/fdinfo.c | 2
-rw-r--r--  fs/notify/fsnotify.c | 2
-rw-r--r--  fs/notify/fsnotify.h | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 32
-rw-r--r--  fs/ocfs2/aops.c | 1
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 1
-rw-r--r--  fs/ocfs2/journal.c | 8
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 19
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/ocfs2/xattr.c | 56
-rw-r--r--  fs/pipe.c | 238
-rw-r--r--  fs/proc/Kconfig | 8
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  fs/proc/generic.c | 37
-rw-r--r--  fs/proc/internal.h | 2
-rw-r--r--  fs/proc/stat.c | 56
-rw-r--r--  fs/pstore/platform.c | 2
-rw-r--r--  fs/quota/dquot.c | 289
-rw-r--r--  fs/quota/quota.c | 7
-rw-r--r--  fs/quota/quota_v1.c | 1
-rw-r--r--  fs/reiserfs/file.c | 10
-rw-r--r--  fs/reiserfs/inode.c | 12
-rw-r--r--  fs/reiserfs/namei.c | 7
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/super.c | 2
-rw-r--r--  fs/reiserfs/xattr.c | 19
-rw-r--r--  fs/reiserfs/xattr_acl.c | 4
-rw-r--r--  fs/select.c | 10
-rw-r--r--  fs/splice.c | 199
-rw-r--r--  fs/timerfd.c | 14
-rw-r--r--  fs/ubifs/debug.c | 12
-rw-r--r--  fs/ubifs/journal.c | 4
-rw-r--r--  fs/ubifs/orphan.c | 17
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/ubifs/super.c | 4
-rw-r--r--  fs/ubifs/tnc_commit.c | 34
-rw-r--r--  fs/userfaultfd.c | 23
-rw-r--r--  fs/utimes.c | 8
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/kmem.c | 2
-rw-r--r--  fs/xfs/kmem.h | 30
-rw-r--r--  fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 1236
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h | 16
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 134
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 30
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_bit.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 700
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 3
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 97
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h | 37
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 668
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h | 73
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.c | 888
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 59
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c | 72
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h | 90
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c | 131
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 282
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c | 307
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c | 431
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h | 114
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c | 424
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 14
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 117
-rw-r--r--  fs/xfs/libxfs/xfs_iext_tree.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c | 21
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c | 22
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h | 18
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h | 4
-rw-r--r--  fs/xfs/libxfs/xfs_refcount.c | 174
-rw-r--r--  fs/xfs/libxfs/xfs_rmap.c | 377
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_trans_inode.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 6
-rw-r--r--  fs/xfs/libxfs/xfs_types.h | 2
-rw-r--r--  fs/xfs/scrub/attr.c | 11
-rw-r--r--  fs/xfs/scrub/bitmap.c | 3
-rw-r--r--  fs/xfs/scrub/common.h | 9
-rw-r--r--  fs/xfs/scrub/dabtree.c | 62
-rw-r--r--  fs/xfs/scrub/dabtree.h | 3
-rw-r--r--  fs/xfs/scrub/dir.c | 132
-rw-r--r--  fs/xfs/scrub/fscounters.c | 8
-rw-r--r--  fs/xfs/scrub/health.c | 1
-rw-r--r--  fs/xfs/scrub/parent.c | 27
-rw-r--r--  fs/xfs/scrub/quota.c | 7
-rw-r--r--  fs/xfs/scrub/scrub.c | 1
-rw-r--r--  fs/xfs/xfs_acl.c | 18
-rw-r--r--  fs/xfs/xfs_aops.c | 791
-rw-r--r--  fs/xfs/xfs_aops.h | 20
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 76
-rw-r--r--  fs/xfs/xfs_attr_list.c | 75
-rw-r--r--  fs/xfs/xfs_bmap_item.c | 11
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 255
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 4
-rw-r--r--  fs/xfs/xfs_buf.c | 32
-rw-r--r--  fs/xfs/xfs_buf.h | 1
-rw-r--r--  fs/xfs/xfs_buf_item.c | 6
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 137
-rw-r--r--  fs/xfs/xfs_discard.c | 6
-rw-r--r--  fs/xfs/xfs_dquot.c | 46
-rw-r--r--  fs/xfs/xfs_dquot.h | 98
-rw-r--r--  fs/xfs/xfs_dquot_item.h | 34
-rw-r--r--  fs/xfs/xfs_error.c | 31
-rw-r--r--  fs/xfs/xfs_error.h | 33
-rw-r--r--  fs/xfs/xfs_extent_busy.c | 2
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 9
-rw-r--r--  fs/xfs/xfs_file.c | 113
-rw-r--r--  fs/xfs/xfs_filestream.c | 3
-rw-r--r--  fs/xfs/xfs_fsmap.c | 1
-rw-r--r--  fs/xfs/xfs_icache.c | 8
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 48
-rw-r--r--  fs/xfs/xfs_inode.h | 31
-rw-r--r--  fs/xfs/xfs_inode_item.c | 15
-rw-r--r--  fs/xfs/xfs_ioctl.c | 203
-rw-r--r--  fs/xfs/xfs_ioctl.h | 7
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 49
-rw-r--r--  fs/xfs/xfs_ioctl32.h | 13
-rw-r--r--  fs/xfs/xfs_iomap.c | 865
-rw-r--r--  fs/xfs/xfs_iomap.h | 13
-rw-r--r--  fs/xfs/xfs_iops.c | 70
-rw-r--r--  fs/xfs/xfs_itable.c | 6
-rw-r--r--  fs/xfs/xfs_iwalk.c | 3
-rw-r--r--  fs/xfs/xfs_linux.h | 14
-rw-r--r--  fs/xfs/xfs_log.c | 434
-rw-r--r--  fs/xfs/xfs_log_cil.c | 6
-rw-r--r--  fs/xfs/xfs_log_priv.h | 33
-rw-r--r--  fs/xfs/xfs_log_recover.c | 148
-rw-r--r--  fs/xfs/xfs_message.c | 22
-rw-r--r--  fs/xfs/xfs_message.h | 6
-rw-r--r--  fs/xfs/xfs_mount.c | 58
-rw-r--r--  fs/xfs/xfs_mount.h | 57
-rw-r--r--  fs/xfs/xfs_pnfs.c | 58
-rw-r--r--  fs/xfs/xfs_qm.c | 67
-rw-r--r--  fs/xfs/xfs_qm.h | 6
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 8
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 139
-rw-r--r--  fs/xfs/xfs_quotaops.c | 3
-rw-r--r--  fs/xfs/xfs_refcount_item.c | 9
-rw-r--r--  fs/xfs/xfs_reflink.c | 138
-rw-r--r--  fs/xfs/xfs_reflink.h | 4
-rw-r--r--  fs/xfs/xfs_rmap_item.c | 13
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 3
-rw-r--r--  fs/xfs/xfs_super.c | 1471
-rw-r--r--  fs/xfs/xfs_super.h | 10
-rw-r--r--  fs/xfs/xfs_symlink.c | 1
-rw-r--r--  fs/xfs/xfs_symlink.h | 2
-rw-r--r--  fs/xfs/xfs_trace.h | 100
-rw-r--r--  fs/xfs/xfs_trans.c | 2
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 10
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 56
-rw-r--r--  fs/xfs/xfs_xattr.c | 1
382 files changed, 17997 insertions, 14992 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 2501e6f1f965..7b623e9fc1b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
source "fs/unicode/Kconfig"
+config IO_WQ
+ bool
+
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 14231b4cf383..1148c555c4d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
+obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a92eb6ae2ae2..a755bef7c4c7 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -43,8 +43,8 @@ struct affs_ext_key {
*/
struct affs_inode_info {
atomic_t i_opencnt;
- struct semaphore i_link_lock; /* Protects internal inode access. */
- struct semaphore i_ext_lock; /* Protects internal inode access. */
+ struct mutex i_link_lock; /* Protects internal inode access. */
+ struct mutex i_ext_lock; /* Protects internal inode access. */
#define i_hash_lock i_ext_lock
u32 i_blkcnt; /* block count */
u32 i_extcnt; /* extended block count */
@@ -293,30 +293,30 @@ affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val)
static inline void
affs_lock_link(struct inode *inode)
{
- down(&AFFS_I(inode)->i_link_lock);
+ mutex_lock(&AFFS_I(inode)->i_link_lock);
}
static inline void
affs_unlock_link(struct inode *inode)
{
- up(&AFFS_I(inode)->i_link_lock);
+ mutex_unlock(&AFFS_I(inode)->i_link_lock);
}
static inline void
affs_lock_dir(struct inode *inode)
{
- down(&AFFS_I(inode)->i_hash_lock);
+ mutex_lock_nested(&AFFS_I(inode)->i_hash_lock, SINGLE_DEPTH_NESTING);
}
static inline void
affs_unlock_dir(struct inode *inode)
{
- up(&AFFS_I(inode)->i_hash_lock);
+ mutex_unlock(&AFFS_I(inode)->i_hash_lock);
}
static inline void
affs_lock_ext(struct inode *inode)
{
- down(&AFFS_I(inode)->i_ext_lock);
+ mutex_lock(&AFFS_I(inode)->i_ext_lock);
}
static inline void
affs_unlock_ext(struct inode *inode)
{
- up(&AFFS_I(inode)->i_ext_lock);
+ mutex_unlock(&AFFS_I(inode)->i_ext_lock);
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index cc463ae47c12..47107c6712a6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -121,8 +121,8 @@ static void init_once(void *foo)
{
struct affs_inode_info *ei = (struct affs_inode_info *) foo;
- sema_init(&ei->i_link_lock, 1);
- sema_init(&ei->i_ext_lock, 1);
+ mutex_init(&ei->i_link_lock);
+ mutex_init(&ei->i_ext_lock);
inode_init_once(&ei->vfs_inode);
}
@@ -561,14 +561,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
int root_block;
unsigned long mount_flags;
int res = 0;
- char *new_opts;
char volume[32];
char *prefix = NULL;
- new_opts = kstrdup(data, GFP_KERNEL);
- if (data && !new_opts)
- return -ENOMEM;
-
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
@@ -579,7 +574,6 @@ affs_remount(struct super_block *sb, int *flags, char *data)
&blocksize, &prefix, volume,
&mount_flags)) {
kfree(prefix);
- kfree(new_opts);
return -EINVAL;
}
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 6cdd7047c809..2dca8df1a18d 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -312,7 +312,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
_enter("%p,%zu,", server, count);
ASSERT(server != NULL);
- ASSERTCMP(count, <=, AFSCBMAX);
/* TODO: Sort the callback break list by volume ID */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index b86195e4dc6c..ff3994a6be23 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -342,14 +342,14 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (call->count2 != call->count && call->count2 != 0)
return afs_protocol_error(call, -EBADMSG,
afs_eproto_cb_count);
- call->_iter = &call->iter;
- iov_iter_discard(&call->iter, READ, call->count2 * 3 * 4);
+ call->iter = &call->def_iter;
+ iov_iter_discard(&call->def_iter, READ, call->count2 * 3 * 4);
call->unmarshall++;
/* Fall through */
case 4:
_debug("extract discard %zu/%u",
- iov_iter_count(&call->iter), call->count2 * 3 * 4);
+ iov_iter_count(call->iter), call->count2 * 3 * 4);
ret = afs_extract_data(call, false);
if (ret < 0)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cc12772d0a4d..497f979018c2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -803,7 +803,12 @@ success:
continue;
if (cookie->inodes[i]) {
- afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]),
+ struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]);
+
+ if (test_bit(AFS_VNODE_UNSET, &iv->flags))
+ continue;
+
+ afs_vnode_commit_status(&fc, iv,
scb->cb_break, NULL, scb);
continue;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d4fbe5f85f1b..b108528bf010 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -68,13 +68,11 @@ static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_
static void afs_set_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
@@ -83,8 +81,6 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
@@ -93,13 +89,11 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
@@ -108,8 +102,6 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index dd3c55c9101c..8415733f7bc1 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -223,7 +223,7 @@ static void afs_file_readpage_read_complete(struct page *page,
/*
* Fetch file data from the volume.
*/
-int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *desc)
+int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req)
{
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
@@ -246,7 +246,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(vnode);
- afs_fs_fetch_data(&fc, scb, desc);
+ afs_fs_fetch_data(&fc, scb, req);
}
afs_check_for_remote_deletion(&fc, vnode);
@@ -257,7 +257,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *de
if (ret == 0) {
afs_stat_v(vnode, n_fetches);
- atomic_long_add(desc->actual_len,
+ atomic_long_add(req->actual_len,
&afs_v2net(vnode)->n_fetch_bytes);
}
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index d5e5a6ddc847..0f2a94ba73cb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -346,8 +346,8 @@ again:
if (ret < 0) {
trace_afs_flock_ev(vnode, NULL, afs_flock_extend_fail,
ret);
- pr_warning("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
- vnode->fid.vid, vnode->fid.vnode, ret);
+ pr_warn("AFS: Failed to extend lock on {%llx:%llx} error %d\n",
+ vnode->fid.vid, vnode->fid.vnode, ret);
}
spin_lock(&vnode->lock);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 6f84231f11a5..1f9c5d8e6fe5 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -323,7 +323,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
int ret;
_enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+ call->unmarshall, iov_iter_count(call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
@@ -363,14 +363,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
call->bvec[0].bv_len = size;
call->bvec[0].bv_offset = req->offset;
call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
/* Fall through */
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(&call->iter), req->remain);
+ iov_iter_count(call->iter), req->remain);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -398,7 +398,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
case 3:
_debug("extract discard %zu/%llu",
- iov_iter_count(&call->iter), req->actual_len - req->len);
+ iov_iter_count(call->iter), req->actual_len - req->len);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -490,7 +490,7 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -503,7 +503,6 @@ static int afs_fs_fetch_data64(struct afs_fs_cursor *fc,
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
@@ -540,7 +539,7 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -551,7 +550,6 @@ int afs_fs_fetch_data(struct afs_fs_cursor *fc,
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
@@ -1852,7 +1850,7 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
u32 count;
int ret;
- _enter("{%u,%zu}", call->unmarshall, iov_iter_count(&call->iter));
+ _enter("{%u,%zu}", call->unmarshall, iov_iter_count(call->iter));
switch (call->unmarshall) {
case 0:
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 46d2d7cb461d..281470fe1183 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -34,8 +34,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
{
static unsigned long once_only;
- pr_warn("kAFS: AFS vnode with undefined type %u\n",
- vnode->status.type);
+ pr_warn("kAFS: AFS vnode with undefined type %u\n", vnode->status.type);
pr_warn("kAFS: A=%d m=%o s=%llx v=%llx\n",
vnode->status.abort_code,
vnode->status.mode,
@@ -175,11 +174,11 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags));
if (status->type != vnode->status.type) {
- pr_warning("Vnode %llx:%llx:%x changed type %u to %u\n",
- vnode->fid.vid,
- vnode->fid.vnode,
- vnode->fid.unique,
- status->type, vnode->status.type);
+ pr_warn("Vnode %llx:%llx:%x changed type %u to %u\n",
+ vnode->fid.vid,
+ vnode->fid.vnode,
+ vnode->fid.unique,
+ status->type, vnode->status.type);
afs_protocol_error(NULL, -EBADMSG, afs_eproto_bad_status);
return;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 759e0578012c..1d81fc4c3058 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -115,9 +115,9 @@ struct afs_call {
struct afs_vnode *lvnode; /* vnode being locked */
void *request; /* request data (first part) */
struct address_space *mapping; /* Pages being written from */
- struct iov_iter iter; /* Buffer iterator */
- struct iov_iter *_iter; /* Iterator currently in use */
- union { /* Convenience for ->iter */
+ struct iov_iter def_iter; /* Default buffer/data iterator */
+ struct iov_iter *iter; /* Iterator currently in use */
+ union { /* Convenience for ->def_iter */
struct kvec kvec[1];
struct bio_vec bvec[1];
};
@@ -934,6 +934,12 @@ extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
extern int afs_page_filler(void *, struct page *);
extern void afs_put_read(struct afs_read *);
+static inline struct afs_read *afs_get_read(struct afs_read *req)
+{
+ refcount_inc(&req->usage);
+ return req;
+}
+
/*
* flock.c
*/
@@ -1136,7 +1142,7 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
{
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
- iov_iter_kvec(&call->iter, READ, call->kvec, 1, size);
+ iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
}
static inline void afs_extract_to_tmp(struct afs_call *call)
@@ -1151,7 +1157,7 @@ static inline void afs_extract_to_tmp64(struct afs_call *call)
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
- iov_iter_discard(&call->iter, READ, size);
+ iov_iter_discard(&call->def_iter, READ, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 0e5269374ac1..58d396592250 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -152,7 +152,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
INIT_WORK(&call->async_work, afs_process_async_call);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
- call->_iter = &call->iter;
+ call->iter = &call->def_iter;
o = atomic_inc_return(&net->nr_outstanding_calls);
trace_afs_call(call, afs_call_trace_alloc, 1, o,
@@ -513,12 +513,12 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
- iov_iter_kvec(&call->iter, READ, NULL, 0, 0);
+ iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
- call->rxcall, &call->iter,
+ call->rxcall, &call->def_iter,
false, &remote_abort,
&call->service_id);
- trace_afs_receive_data(call, &call->iter, false, ret);
+ trace_afs_receive_data(call, &call->def_iter, false, ret);
if (ret == -EINPROGRESS || ret == -EAGAIN)
return;
@@ -637,6 +637,7 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
call->need_attention = false;
__set_current_state(TASK_RUNNING);
afs_deliver_to_call(call);
+ timeout = rtt2;
continue;
}
@@ -858,7 +859,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
{
int ret;
- _enter("{%zu}", iov_iter_count(call->_iter));
+ _enter("{%zu}", iov_iter_count(call->iter));
/* the operation ID forms the first four bytes of the request data */
ret = afs_extract_data(call, true);
@@ -974,7 +975,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
int afs_extract_data(struct afs_call *call, bool want_more)
{
struct afs_net *net = call->net;
- struct iov_iter *iter = call->_iter;
+ struct iov_iter *iter = call->iter;
enum afs_call_state state;
u32 remote_abort = 0;
int ret;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 64d440aaabc0..1686bf188ccd 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -151,7 +151,7 @@ static struct afs_server *afs_install_server(struct afs_net *net,
const struct afs_addr_list *alist;
struct afs_server *server;
struct rb_node **pp, *p;
- int ret = -EEXIST, diff;
+ int diff;
_enter("%p", candidate);
@@ -196,7 +196,6 @@ static struct afs_server *afs_install_server(struct afs_net *net,
hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
write_sequnlock(&net->fs_addr_lock);
- ret = 0;
exists:
afs_get_server(server, afs_server_trace_get_install);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f18911e8d770..488641b1a418 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -435,6 +435,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* fill in the superblock */
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
if (!as->dyn_root)
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 21eb0c0be912..8fea54eba0c2 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -279,8 +279,8 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
struct afs_addr_list *old = addrs;
write_lock(&server->lock);
- rcu_swap_protected(server->addresses, old,
- lockdep_is_held(&server->lock));
+ old = rcu_replace_pointer(server->addresses, old,
+ lockdep_is_held(&server->lock));
write_unlock(&server->lock);
afs_put_addrlist(old);
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index cfb0ac4bd039..516e9a3bb5b4 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -185,7 +185,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
int i, ret;
_enter("{%u,%zu/%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count);
+ call->unmarshall, iov_iter_count(call->iter), call->count);
switch (call->unmarshall) {
case 0:
@@ -316,7 +316,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
int ret;
_enter("{%u,%zu/%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count);
+ call->unmarshall, iov_iter_count(call->iter), call->count);
switch (call->unmarshall) {
case 0:
@@ -425,7 +425,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
int ret;
_enter("{%u,%zu,%u}",
- call->unmarshall, iov_iter_count(call->_iter), call->count2);
+ call->unmarshall, iov_iter_count(call->iter), call->count2);
switch (call->unmarshall) {
case 0:
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 5552d034090a..7af41fd5f3ee 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -228,11 +228,11 @@ static int afs_xattr_get_yfs(const struct xattr_handler *handler,
break;
case 1:
data = buf;
- dsize = snprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
+ dsize = scnprintf(buf, sizeof(buf), "%u", yacl->inherit_flag);
break;
case 2:
data = buf;
- dsize = snprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
+ dsize = scnprintf(buf, sizeof(buf), "%u", yacl->num_cleaned);
break;
case 3:
data = yacl->vol_acl->data;
@@ -370,13 +370,15 @@ static int afs_xattr_get_fid(const struct xattr_handler *handler,
/* The volume ID is 64-bit, the vnode ID is 96-bit and the
* uniquifier is 32-bit.
*/
- len = sprintf(text, "%llx:", vnode->fid.vid);
+ len = scnprintf(text, sizeof(text), "%llx:", vnode->fid.vid);
if (vnode->fid.vnode_hi)
- len += sprintf(text + len, "%x%016llx",
- vnode->fid.vnode_hi, vnode->fid.vnode);
+ len += scnprintf(text + len, sizeof(text) - len, "%x%016llx",
+ vnode->fid.vnode_hi, vnode->fid.vnode);
else
- len += sprintf(text + len, "%llx", vnode->fid.vnode);
- len += sprintf(text + len, ":%x", vnode->fid.unique);
+ len += scnprintf(text + len, sizeof(text) - len, "%llx",
+ vnode->fid.vnode);
+ len += scnprintf(text + len, sizeof(text) - len, ":%x",
+ vnode->fid.unique);
if (size == 0)
return len;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 3ee7abf4b2d0..a26126ac7bf1 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -152,8 +152,8 @@ static void yfs_check_req(struct afs_call *call, __be32 *bp)
pr_err("kAFS: %s: Request buffer overflow (%zu>%u)\n",
call->type->name, len, call->request_size);
else if (len < call->request_size)
- pr_warning("kAFS: %s: Request buffer underflow (%zu<%u)\n",
- call->type->name, len, call->request_size);
+ pr_warn("kAFS: %s: Request buffer underflow (%zu<%u)\n",
+ call->type->name, len, call->request_size);
}
/*
@@ -441,7 +441,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
int ret;
_enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(&call->iter), req->actual_len);
+ call->unmarshall, iov_iter_count(call->iter), req->actual_len);
switch (call->unmarshall) {
case 0:
@@ -476,14 +476,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
call->bvec[0].bv_len = size;
call->bvec[0].bv_offset = req->offset;
call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->iter, READ, call->bvec, 1, size);
+ iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
ASSERTCMP(size, <=, PAGE_SIZE);
/* Fall through */
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(&call->iter), req->remain);
+ iov_iter_count(call->iter), req->remain);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -511,7 +511,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
case 3:
_debug("extract discard %zu/%llu",
- iov_iter_count(&call->iter), req->actual_len - req->len);
+ iov_iter_count(call->iter), req->actual_len - req->len);
ret = afs_extract_data(call, true);
if (ret < 0)
@@ -605,7 +605,7 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
call->key = fc->key;
call->out_scb = scb;
call->out_volsync = NULL;
- call->read_request = req;
+ call->read_request = afs_get_read(req);
/* marshall the parameters */
bp = call->request;
@@ -616,7 +616,6 @@ int yfs_fs_fetch_data(struct afs_fs_cursor *fc, struct afs_status_cb *scb,
bp = xdr_encode_u64(bp, req->len);
yfs_check_req(call, bp);
- refcount_inc(&req->usage);
afs_use_fs_server(call, fc->cbi);
trace_afs_make_fs_call(call, &vnode->fid);
afs_set_fc_call(call, fc);
diff --git a/fs/aio.c b/fs/aio.c
index 01e0fb9ae45a..a9fbad2ce5e6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2056,7 +2056,7 @@ static long do_io_getevents(aio_context_t ctx_id,
* specifies an infinite timeout. Note that the timeout pointed to by
* timeout is relative. Will fail with -ENOSYS if not implemented.
*/
-#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+#ifdef CONFIG_64BIT
SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
long, min_nr,
@@ -2179,7 +2179,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
#ifdef CONFIG_COMPAT
struct __compat_aio_sigset {
- compat_sigset_t __user *sigmask;
+ compat_uptr_t sigmask;
compat_size_t sigsetsize;
};
@@ -2193,7 +2193,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct old_timespec32 __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2204,7 +2204,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
@@ -2228,7 +2228,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __kernel_timespec __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2239,7 +2239,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 2866fabf497f..91f5787dae7c 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -459,9 +459,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
*/
how &= ~AUTOFS_EXP_LEAVES;
found = should_expire(expired, mnt, timeout, how);
- if (!found || found != expired)
- /* Something has changed, continue */
+ if (found != expired) { // something has changed, continue
+ dput(found);
goto next;
+ }
if (expired != dentry)
dput(dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c5642bcb6b46..ecd8d2698515 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -404,6 +404,17 @@ static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
ELF_PAGESTART(cmds[first_idx].p_vaddr);
}
+static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
+{
+ ssize_t rv;
+
+ rv = kernel_read(file, buf, len, &pos);
+ if (unlikely(rv != len)) {
+ return (rv < 0) ? rv : -EIO;
+ }
+ return 0;
+}
+
/**
* load_elf_phdrs() - load ELF program headers
* @elf_ex: ELF header of the binary whose program headers should be loaded
@@ -418,7 +429,6 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
{
struct elf_phdr *elf_phdata = NULL;
int retval, err = -1;
- loff_t pos = elf_ex->e_phoff;
unsigned int size;
/*
@@ -439,9 +449,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
goto out;
/* Read in the program headers */
- retval = kernel_read(elf_file, elf_phdata, size, &pos);
- if (retval != size) {
- err = (retval < 0) ? retval : -EIO;
+ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff);
+ if (retval < 0) {
+ err = retval;
goto out;
}
@@ -544,7 +554,7 @@ static inline int make_prot(u32 p_flags)
an ELF header */
static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
- struct file *interpreter, unsigned long *interp_map_addr,
+ struct file *interpreter,
unsigned long no_base, struct elf_phdr *interp_elf_phdata)
{
struct elf_phdr *eppnt;
@@ -590,8 +600,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
map_addr = elf_map(interpreter, load_addr + vaddr,
eppnt, elf_prot, elf_type, total_size);
total_size = 0;
- if (!*interp_map_addr)
- *interp_map_addr = map_addr;
error = map_addr;
if (BAD_ADDR(map_addr))
goto out;
@@ -722,7 +730,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
elf_ppnt = elf_phdata;
for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
- loff_t pos;
if (elf_ppnt->p_type != PT_INTERP)
continue;
@@ -740,14 +747,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (!elf_interpreter)
goto out_free_ph;
- pos = elf_ppnt->p_offset;
- retval = kernel_read(bprm->file, elf_interpreter,
- elf_ppnt->p_filesz, &pos);
- if (retval != elf_ppnt->p_filesz) {
- if (retval >= 0)
- retval = -EIO;
+ retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
+ elf_ppnt->p_offset);
+ if (retval < 0)
goto out_free_interp;
- }
/* make sure path is NULL terminated */
retval = -ENOEXEC;
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
@@ -766,14 +769,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
would_dump(bprm, interpreter);
/* Get the exec headers */
- pos = 0;
- retval = kernel_read(interpreter, &loc->interp_elf_ex,
- sizeof(loc->interp_elf_ex), &pos);
- if (retval != sizeof(loc->interp_elf_ex)) {
- if (retval >= 0)
- retval = -EIO;
+ retval = elf_read(interpreter, &loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex), 0);
+ if (retval < 0)
goto out_free_dentry;
- }
break;
@@ -1054,11 +1053,8 @@ out_free_interp:
}
if (interpreter) {
- unsigned long interp_map_addr = 0;
-
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
- &interp_map_addr,
load_bias, interp_elf_phdata);
if (!IS_ERR((void *)elf_entry)) {
/*
@@ -1179,11 +1175,10 @@ static int load_elf_library(struct file *file)
unsigned long elf_bss, bss, len;
int retval, error, i, j;
struct elfhdr elf_ex;
- loff_t pos = 0;
error = -ENOEXEC;
- retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos);
- if (retval != sizeof(elf_ex))
+ retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
+ if (retval < 0)
goto out;
if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -1208,9 +1203,8 @@ static int load_elf_library(struct file *file)
eppnt = elf_phdata;
error = -ENOEXEC;
- pos = elf_ex.e_phoff;
- retval = kernel_read(file, eppnt, j, &pos);
- if (retval != j)
+ retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
+ if (retval < 0)
goto out_free_ph;
for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
@@ -1489,18 +1483,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
* group-wide total, not its individual thread total.
*/
thread_group_cputime(p, &cputime);
- prstatus->pr_utime = ns_to_timeval(cputime.utime);
- prstatus->pr_stime = ns_to_timeval(cputime.stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
} else {
u64 utime, stime;
task_cputime(p, &utime, &stime);
- prstatus->pr_utime = ns_to_timeval(utime);
- prstatus->pr_stime = ns_to_timeval(stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
}
- prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
- prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+ prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+ prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
}
static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d86ebd0dcc3d..240f66663543 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1359,17 +1359,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
* group-wide total, not its individual thread total.
*/
thread_group_cputime(p, &cputime);
- prstatus->pr_utime = ns_to_timeval(cputime.utime);
- prstatus->pr_stime = ns_to_timeval(cputime.stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime);
} else {
u64 utime, stime;
task_cputime(p, &utime, &stime);
- prstatus->pr_utime = ns_to_timeval(utime);
- prstatus->pr_stime = ns_to_timeval(stime);
+ prstatus->pr_utime = ns_to_kernel_old_timeval(utime);
+ prstatus->pr_stime = ns_to_kernel_old_timeval(stime);
}
- prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
- prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
+ prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime);
+ prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime);
prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9c073dbdc1b0..ee63c2732fa2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1403,11 +1403,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
"resized disk %s\n",
bdev->bd_disk ? bdev->bd_disk->disk_name : "");
}
-
- if (!bdev->bd_disk)
- return;
- if (disk_part_scan_enabled(bdev->bd_disk))
- bdev->bd_invalidated = 1;
+ bdev->bd_invalidated = 1;
}
/**
@@ -1420,8 +1416,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
* and adjusts it if it differs. When shrinking the bdev size, its all caches
* are freed.
*/
-void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
- bool verbose)
+static void check_disk_size_change(struct gendisk *disk,
+ struct block_device *bdev, bool verbose)
{
loff_t disk_size, bdev_size;
@@ -1437,6 +1433,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev,
if (bdev_size > disk_size)
flush_disk(bdev, false);
}
+ bdev->bd_invalidated = 0;
}
/**
@@ -1466,7 +1463,6 @@ int revalidate_disk(struct gendisk *disk)
mutex_lock(&bdev->bd_mutex);
check_disk_size_change(disk, bdev, ret == 0);
- bdev->bd_invalidated = 0;
mutex_unlock(&bdev->bd_mutex);
bdput(bdev);
}
@@ -1512,6 +1508,45 @@ EXPORT_SYMBOL(bd_set_size);
static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
+int bdev_disk_changed(struct block_device *bdev, bool invalidate)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ int ret;
+
+ lockdep_assert_held(&bdev->bd_mutex);
+
+rescan:
+ ret = blk_drop_partitions(disk, bdev);
+ if (ret)
+ return ret;
+
+ if (invalidate)
+ set_capacity(disk, 0);
+ else if (disk->fops->revalidate_disk)
+ disk->fops->revalidate_disk(disk);
+
+ check_disk_size_change(disk, bdev, !invalidate);
+
+ if (get_capacity(disk)) {
+ ret = blk_add_partitions(disk, bdev);
+ if (ret == -EAGAIN)
+ goto rescan;
+ } else {
+ /*
+ * Tell userspace that the media / partition table may have
+ * changed.
+ */
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ }
+
+ return ret;
+}
+/*
+ * Only exported for for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
+
/*
* bd_mutex locking:
*
@@ -1594,12 +1629,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
* The latter is necessary to prevent ghost
* partitions on a removed medium.
*/
- if (bdev->bd_invalidated) {
- if (!ret)
- rescan_partitions(disk, bdev);
- else if (ret == -ENOMEDIUM)
- invalidate_partitions(disk, bdev);
- }
+ if (bdev->bd_invalidated &&
+ (!ret || ret == -ENOMEDIUM))
+ bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
goto out_clear;
@@ -1632,12 +1664,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (bdev->bd_disk->fops->open)
ret = bdev->bd_disk->fops->open(bdev, mode);
/* the same as first opener case, read comment there */
- if (bdev->bd_invalidated) {
- if (!ret)
- rescan_partitions(bdev->bd_disk, bdev);
- else if (ret == -ENOMEDIUM)
- invalidate_partitions(bdev->bd_disk, bdev);
- }
+ if (bdev->bd_invalidated &&
+ (!ret || ret == -ENOMEDIUM))
+ bdev_disk_changed(bdev, ret == -ENOMEDIUM);
if (ret)
goto out_unlock_bdev;
}
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 38651fae7f21..75b6d10c9845 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,8 @@ config BTRFS_FS
select CRYPTO
select CRYPTO_CRC32C
select LIBCRC32C
+ select CRYPTO_XXHASH
+ select CRYPTO_SHA256
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2e9e13ffbd08..1d32a07bb2d1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -53,24 +53,12 @@ struct btrfs_workqueue {
struct __btrfs_workqueue *high;
};
-static void normal_work_helper(struct btrfs_work *work);
-
-#define BTRFS_WORK_HELPER(name) \
-noinline_for_stack void btrfs_##name(struct work_struct *arg) \
-{ \
- struct btrfs_work *work = container_of(arg, struct btrfs_work, \
- normal_work); \
- normal_work_helper(work); \
-}
-
-struct btrfs_fs_info *
-btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
{
return wq->fs_info;
}
-struct btrfs_fs_info *
-btrfs_work_owner(const struct btrfs_work *work)
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
{
return work->wq->fs_info;
}
@@ -89,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
}
-BTRFS_WORK_HELPER(worker_helper);
-BTRFS_WORK_HELPER(delalloc_helper);
-BTRFS_WORK_HELPER(flush_delalloc_helper);
-BTRFS_WORK_HELPER(cache_helper);
-BTRFS_WORK_HELPER(submit_helper);
-BTRFS_WORK_HELPER(fixup_helper);
-BTRFS_WORK_HELPER(endio_helper);
-BTRFS_WORK_HELPER(endio_meta_helper);
-BTRFS_WORK_HELPER(endio_meta_write_helper);
-BTRFS_WORK_HELPER(endio_raid56_helper);
-BTRFS_WORK_HELPER(endio_repair_helper);
-BTRFS_WORK_HELPER(rmw_helper);
-BTRFS_WORK_HELPER(endio_write_helper);
-BTRFS_WORK_HELPER(freespace_write_helper);
-BTRFS_WORK_HELPER(delayed_meta_helper);
-BTRFS_WORK_HELPER(readahead_helper);
-BTRFS_WORK_HELPER(qgroup_rescan_helper);
-BTRFS_WORK_HELPER(extent_refs_helper);
-BTRFS_WORK_HELPER(scrub_helper);
-BTRFS_WORK_HELPER(scrubwrc_helper);
-BTRFS_WORK_HELPER(scrubnc_helper);
-BTRFS_WORK_HELPER(scrubparity_helper);
-
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags, int limit_active, int thresh)
@@ -252,16 +217,16 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
+ bool free_self = false;
while (1) {
- void *wtag;
-
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
@@ -287,22 +252,53 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
list_del(&work->ordered_list);
spin_unlock_irqrestore(lock, flags);
- /*
- * We don't want to call the ordered free functions with the
- * lock held though. Save the work as tag for the trace event,
- * because the callback could free the structure.
- */
- wtag = work;
- work->ordered_free(work);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ if (work == self) {
+ /*
+ * This is the work item that the worker is currently
+ * executing.
+ *
+ * The kernel workqueue code guarantees non-reentrancy
+ * of work items. I.e., if a work item with the same
+ * address and work function is queued twice, the second
+ * execution is blocked until the first one finishes. A
+ * work item may be freed and recycled with the same
+ * work function; the workqueue code assumes that the
+ * original work item cannot depend on the recycled work
+ * item in that case (see find_worker_executing_work()).
+ *
+ * Note that different types of Btrfs work can depend on
+ * each other, and one type of work on one Btrfs
+ * filesystem may even depend on the same type of work
+ * on another Btrfs filesystem via, e.g., a loop device.
+ * Therefore, we must not allow the current work item to
+ * be recycled until we are really done, otherwise we
+ * break the above assumption and can deadlock.
+ */
+ free_self = true;
+ } else {
+ /*
+ * We don't want to call the ordered free functions with
+ * the lock held.
+ */
+ work->ordered_free(work);
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
+ }
}
spin_unlock_irqrestore(lock, flags);
+
+ if (free_self) {
+ self->ordered_free(self);
+ /* NB: self must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, self);
+ }
}
-static void normal_work_helper(struct btrfs_work *work)
+static void btrfs_work_helper(struct work_struct *normal_work)
{
+ struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
+ normal_work);
struct __btrfs_workqueue *wq;
- void *wtag;
int need_order = 0;
/*
@@ -316,29 +312,26 @@ static void normal_work_helper(struct btrfs_work *work)
if (work->ordered_func)
need_order = 1;
wq = work->wq;
- /* Safe for tracepoints in case work gets freed by the callback */
- wtag = work;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
set_bit(WORK_DONE_BIT, &work->flags);
- run_ordered_work(wq);
+ run_ordered_work(wq, work);
+ } else {
+ /* NB: work must not be dereferenced past this point. */
+ trace_btrfs_all_work_done(wq->fs_info, work);
}
- if (!need_order)
- trace_btrfs_all_work_done(wq->fs_info, wtag);
}
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free)
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free)
{
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
- INIT_WORK(&work->normal_work, uniq_func);
+ INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 7861c9feba5f..a4434301d84d 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -29,49 +29,20 @@ struct btrfs_work {
unsigned long flags;
};
-#define BTRFS_WORK_HELPER_PROTO(name) \
-void btrfs_##name(struct work_struct *arg)
-
-BTRFS_WORK_HELPER_PROTO(worker_helper);
-BTRFS_WORK_HELPER_PROTO(delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(cache_helper);
-BTRFS_WORK_HELPER_PROTO(submit_helper);
-BTRFS_WORK_HELPER_PROTO(fixup_helper);
-BTRFS_WORK_HELPER_PROTO(endio_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
-BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
-BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
-BTRFS_WORK_HELPER_PROTO(rmw_helper);
-BTRFS_WORK_HELPER_PROTO(endio_write_helper);
-BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
-BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
-BTRFS_WORK_HELPER_PROTO(readahead_helper);
-BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
-BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
-BTRFS_WORK_HELPER_PROTO(scrub_helper);
-BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-
-
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name,
unsigned int flags,
int limit_active,
int thresh);
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free);
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
-struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 670700cb1110..6934a5b8708f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -120,12 +120,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return get_alloc_profile(fs_info, orig_flags);
}
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->count);
}
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
@@ -149,22 +149,21 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
* This adds the block group to the fs_info rb tree for the block group cache
*/
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct rb_node **p;
struct rb_node *parent = NULL;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
spin_lock(&info->block_group_cache_lock);
p = &info->block_group_cache_tree.rb_node;
while (*p) {
parent = *p;
- cache = rb_entry(parent, struct btrfs_block_group_cache,
- cache_node);
- if (block_group->key.objectid < cache->key.objectid) {
+ cache = rb_entry(parent, struct btrfs_block_group, cache_node);
+ if (block_group->start < cache->start) {
p = &(*p)->rb_left;
- } else if (block_group->key.objectid > cache->key.objectid) {
+ } else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
} else {
spin_unlock(&info->block_group_cache_lock);
@@ -176,8 +175,8 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
- if (info->first_logical_byte > block_group->key.objectid)
- info->first_logical_byte = block_group->key.objectid;
+ if (info->first_logical_byte > block_group->start)
+ info->first_logical_byte = block_group->start;
spin_unlock(&info->block_group_cache_lock);
@@ -188,10 +187,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
* This will return the block group at or after bytenr if contains is 0, else
* it will return the block group that contains the bytenr
*/
-static struct btrfs_block_group_cache *block_group_cache_tree_search(
+static struct btrfs_block_group *block_group_cache_tree_search(
struct btrfs_fs_info *info, u64 bytenr, int contains)
{
- struct btrfs_block_group_cache *cache, *ret = NULL;
+ struct btrfs_block_group *cache, *ret = NULL;
struct rb_node *n;
u64 end, start;
@@ -199,13 +198,12 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search(
n = info->block_group_cache_tree.rb_node;
while (n) {
- cache = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- end = cache->key.objectid + cache->key.offset - 1;
- start = cache->key.objectid;
+ cache = rb_entry(n, struct btrfs_block_group, cache_node);
+ end = cache->start + cache->length - 1;
+ start = cache->start;
if (bytenr < start) {
- if (!contains && (!ret || start < ret->key.objectid))
+ if (!contains && (!ret || start < ret->start))
ret = cache;
n = n->rb_left;
} else if (bytenr > start) {
@@ -221,8 +219,8 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search(
}
if (ret) {
btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
- info->first_logical_byte = ret->key.objectid;
+ if (bytenr == 0 && info->first_logical_byte > ret->start)
+ info->first_logical_byte = ret->start;
}
spin_unlock(&info->block_group_cache_lock);
@@ -232,7 +230,7 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search(
/*
* Return the block group that starts at or after bytenr
*/
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 0);
@@ -241,14 +239,14 @@ struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
/*
* Return the block group that contains the given bytenr
*/
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
+struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr)
{
return block_group_cache_tree_search(info, bytenr, 1);
}
-struct btrfs_block_group_cache *btrfs_next_block_group(
- struct btrfs_block_group_cache *cache)
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
@@ -257,7 +255,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group(
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
- const u64 next_bytenr = cache->key.objectid + cache->key.offset;
+ const u64 next_bytenr = cache->start + cache->length;
spin_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
@@ -266,8 +264,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group(
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
if (node) {
- cache = rb_entry(node, struct btrfs_block_group_cache,
- cache_node);
+ cache = rb_entry(node, struct btrfs_block_group, cache_node);
btrfs_get_block_group(cache);
} else
cache = NULL;
@@ -277,7 +274,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group(
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bool ret = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
@@ -300,7 +297,7 @@ bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
@@ -314,7 +311,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
btrfs_put_block_group(bg);
}
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}
@@ -322,7 +319,7 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, start);
ASSERT(bg);
@@ -331,7 +328,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(bg);
}
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
struct btrfs_space_info *space_info = bg->space_info;
@@ -357,7 +354,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
}
struct btrfs_caching_control *btrfs_get_caching_control(
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_caching_control *ctl;
@@ -392,7 +389,7 @@ void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
* any of the information in this block group.
*/
-void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
@@ -401,13 +398,13 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache
if (!caching_ctl)
return;
- wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) ||
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
(cache->free_space_ctl->free_space >= num_bytes));
btrfs_put_caching_control(caching_ctl);
}
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
struct btrfs_caching_control *caching_ctl;
int ret = 0;
@@ -416,7 +413,7 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
if (!caching_ctl)
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
- wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache));
+ wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
if (cache->cached == BTRFS_CACHE_ERROR)
ret = -EIO;
btrfs_put_caching_control(caching_ctl);
@@ -424,11 +421,11 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
}
#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_block_group_cache *block_group)
+static void fragment_free_space(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 start = block_group->key.objectid;
- u64 len = block_group->key.offset;
+ u64 start = block_group->start;
+ u64 len = block_group->length;
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
fs_info->nodesize : fs_info->sectorsize;
u64 step = chunk << 1;
@@ -450,8 +447,7 @@ static void fragment_free_space(struct btrfs_block_group_cache *block_group)
* used yet since their free space will be released as soon as the transaction
* commits.
*/
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
struct btrfs_fs_info *info = block_group->fs_info;
u64 extent_start, extent_end, size, total_added = 0;
@@ -491,7 +487,7 @@ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+ struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
@@ -507,7 +503,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
if (!path)
return -ENOMEM;
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
#ifdef CONFIG_BTRFS_DEBUG
/*
@@ -587,13 +583,12 @@ next:
goto next;
}
- if (key.objectid < block_group->key.objectid) {
+ if (key.objectid < block_group->start) {
path->slots[0]++;
continue;
}
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
+ if (key.objectid >= block_group->start + block_group->length)
break;
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -617,8 +612,7 @@ next:
ret = 0;
total_found += add_new_free_space(block_group, last,
- block_group->key.objectid +
- block_group->key.offset);
+ block_group->start + block_group->length);
caching_ctl->progress = (u64)-1;
out:
@@ -628,7 +622,7 @@ out:
static noinline void caching_thread(struct btrfs_work *work)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
int ret;
@@ -656,8 +650,7 @@ static noinline void caching_thread(struct btrfs_work *work)
spin_lock(&block_group->space_info->lock);
spin_lock(&block_group->lock);
- bytes_used = block_group->key.offset -
- btrfs_block_group_used(&block_group->item);
+ bytes_used = block_group->length - block_group->used;
block_group->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&block_group->lock);
spin_unlock(&block_group->space_info->lock);
@@ -677,8 +670,7 @@ static noinline void caching_thread(struct btrfs_work *work)
btrfs_put_block_group(block_group);
}
-int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
- int load_cache_only)
+int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
DEFINE_WAIT(wait);
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -693,10 +685,9 @@ int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
- caching_ctl->progress = cache->key.objectid;
+ caching_ctl->progress = cache->start;
refcount_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
- caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -763,8 +754,7 @@ int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- bytes_used = cache->key.offset -
- btrfs_block_group_used(&cache->item);
+ bytes_used = cache->length - cache->used;
cache->space_info->bytes_used += bytes_used >> 1;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -833,27 +823,36 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
*
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
* in the whole filesystem
+ *
+ * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
*/
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ bool found_raid56 = false;
+ bool found_raid1c34 = false;
+
+ if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
+ (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *sinfo;
list_for_each_entry_rcu(sinfo, head, list) {
- bool found = false;
-
down_read(&sinfo->groups_sem);
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
- found = true;
+ found_raid56 = true;
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
- found = true;
+ found_raid56 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
+ found_raid1c34 = true;
+ if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
+ found_raid1c34 = true;
up_read(&sinfo->groups_sem);
-
- if (found)
- return;
}
- btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (!found_raid56)
+ btrfs_clear_fs_incompat(fs_info, RAID56);
+ if (!found_raid1c34)
+ btrfs_clear_fs_incompat(fs_info, RAID1C34);
}
}
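A standalone sketch of the scan-then-clear pattern used by clear_incompat_bg_bits() above, with invented names: a feature bit is dropped only when no block group of the matching profile remains anywhere in the filesystem.

#include <stdbool.h>

struct space_info { bool has_raid56; bool has_raid1c34; };

void maybe_clear_incompat(const struct space_info *infos, int nr,
			  bool *incompat_raid56, bool *incompat_raid1c34)
{
	bool found56 = false, found1c34 = false;

	for (int i = 0; i < nr; i++) {
		if (infos[i].has_raid56)
			found56 = true;
		if (infos[i].has_raid1c34)
			found1c34 = true;
	}
	/* clear a bit only when nothing of that profile is left */
	if (!found56)
		*incompat_raid56 = false;
	if (!found1c34)
		*incompat_raid1c34 = false;
}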
@@ -863,7 +862,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_path *path;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_free_cluster *cluster;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_key key;
@@ -886,10 +885,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* remove it.
*/
btrfs_free_excluded_extents(block_group);
- btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
- block_group->key.offset);
+ btrfs_free_ref_tree_range(fs_info, block_group->start,
+ block_group->length);
- memcpy(&key, &block_group->key, sizeof(key));
index = btrfs_bg_flags_to_raid_index(block_group->flags);
factor = btrfs_bg_type_to_factor(block_group->flags);
@@ -967,8 +965,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
key.type = 0;
+ key.offset = block_group->start;
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
@@ -987,7 +985,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
- if (fs_info->first_logical_byte == block_group->key.objectid)
+ if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1048,19 +1046,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
- < block_group->key.offset);
+ < block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->key.offset);
+ < block_group->length);
WARN_ON(block_group->space_info->disk_total
- < block_group->key.offset * factor);
+ < block_group->length * factor);
}
- block_group->space_info->total_bytes -= block_group->key.offset;
- block_group->space_info->bytes_readonly -= block_group->key.offset;
- block_group->space_info->disk_total -= block_group->key.offset * factor;
+ block_group->space_info->total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
- memcpy(&key, &block_group->key, sizeof(key));
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&block_group->lock);
@@ -1180,7 +1180,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* data in this block group. That check should be done by relocation routine,
* not this function.
*/
-static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -1209,8 +1209,8 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
goto out;
}
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
+ num_bytes = cache->length - cache->reserved - cache->pinned -
+ cache->bytes_super - cache->used;
sinfo_used = btrfs_space_info_used(sinfo, true);
/*
@@ -1231,8 +1231,7 @@ out:
spin_unlock(&sinfo->lock);
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
- "unable to make block group %llu ro",
- cache->key.objectid);
+ "unable to make block group %llu ro", cache->start);
btrfs_info(cache->fs_info,
"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
sinfo_used, num_bytes, min_allocable_bytes);
@@ -1247,7 +1246,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
int ret = 0;
@@ -1261,7 +1260,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
@@ -1279,8 +1278,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
- btrfs_block_group_used(&block_group->item) ||
- block_group->ro ||
+ block_group->used || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1308,7 +1306,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* properly if we fail to join the transaction.
*/
trans = btrfs_start_trans_remove_block_group(fs_info,
- block_group->key.objectid);
+ block_group->start);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans);
@@ -1319,8 +1317,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
- start = block_group->key.objectid;
- end = start + block_group->key.offset - 1;
+ start = block_group->start;
+ end = start + block_group->length - 1;
/*
* Hold the unused_bg_unpin_mutex lock to avoid racing with
* btrfs_finish_extent_commit(). If we are at transaction N,
@@ -1375,7 +1373,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
- ret = btrfs_remove_chunk(trans, block_group->key.objectid);
+ ret = btrfs_remove_chunk(trans, block_group->start);
if (ret) {
if (trimming)
@@ -1410,7 +1408,7 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
}
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1478,7 +1476,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
read_extent_buffer(leaf, &bg,
btrfs_item_ptr_offset(leaf, slot),
sizeof(bg));
- flags = btrfs_block_group_flags(&bg) &
+ flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
if (flags != (em->map_lookup->type &
@@ -1518,7 +1516,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
-static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
+static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 bytenr;
@@ -1526,10 +1524,10 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
int stripe_len;
int i, nr, ret;
- if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
- stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+ if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
+ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = btrfs_add_excluded_extent(fs_info, cache->key.objectid,
+ ret = btrfs_add_excluded_extent(fs_info, cache->start,
stripe_len);
if (ret)
return ret;
@@ -1537,7 +1535,7 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->key.objectid,
+ ret = btrfs_rmap_block(fs_info, cache->start,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -1545,21 +1543,19 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
while (nr--) {
u64 start, len;
- if (logical[nr] > cache->key.objectid +
- cache->key.offset)
+ if (logical[nr] > cache->start + cache->length)
continue;
- if (logical[nr] + stripe_len <= cache->key.objectid)
+ if (logical[nr] + stripe_len <= cache->start)
continue;
start = logical[nr];
- if (start < cache->key.objectid) {
- start = cache->key.objectid;
+ if (start < cache->start) {
+ start = cache->start;
len = (logical[nr] + stripe_len) - start;
} else {
len = min_t(u64, stripe_len,
- cache->key.objectid +
- cache->key.offset - start);
+ cache->start + cache->length - start);
}
cache->bytes_super += len;
@@ -1575,7 +1571,7 @@ static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
return 0;
}
-static void link_block_group(struct btrfs_block_group_cache *cache)
+static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
@@ -1591,10 +1587,10 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
btrfs_sysfs_add_block_group_type(cache);
}
-static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
+static struct btrfs_block_group *btrfs_create_block_group_cache(
struct btrfs_fs_info *fs_info, u64 start, u64 size)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
@@ -1607,9 +1603,8 @@ static struct btrfs_block_group_cache *btrfs_create_block_group_cache(
return NULL;
}
- cache->key.objectid = start;
- cache->key.offset = size;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->start = start;
+ cache->length = size;
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
@@ -1640,7 +1635,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
@@ -1665,15 +1660,14 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
free_extent_map(em);
break;
}
- if (bg->key.objectid != em->start ||
- bg->key.offset != em->len ||
+ if (bg->start != em->start || bg->length != em->len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
em->start, em->len,
em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
- bg->key.objectid, bg->key.offset,
+ bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
free_extent_map(em);
@@ -1687,22 +1681,117 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
+static int read_one_block_group(struct btrfs_fs_info *info,
+ struct btrfs_path *path,
+ const struct btrfs_key *key,
+ int need_clear)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_block_group *cache;
+ struct btrfs_space_info *space_info;
+ struct btrfs_block_group_item bgi;
+ const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
+ int slot = path->slots[0];
+ int ret;
+
+ ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+ cache = btrfs_create_block_group_cache(info, key->objectid, key->offset);
+ if (!cache)
+ return -ENOMEM;
+
+ if (need_clear) {
+ /*
+ * When we mount with old space cache, we need to
+ * set BTRFS_DC_CLEAR and set dirty flag.
+ *
+ * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+ * truncate the old free space cache inode and
+ * setup a new one.
+ * b) Setting 'dirty flag' makes sure that we flush
+ * the new space cache info onto disk.
+ */
+ if (btrfs_test_opt(info, SPACE_CACHE))
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
+ }
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+ cache->used = btrfs_stack_block_group_used(&bgi);
+ cache->flags = btrfs_stack_block_group_flags(&bgi);
+ if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
+ btrfs_err(info,
+"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
+ cache->start);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /*
+ * We need to exclude the super stripes now so that the space info has
+ * super bytes accounted for, otherwise we'll think we have more space
+ * than we actually do.
+ */
+ ret = exclude_super_stripes(cache);
+ if (ret) {
+ /* We may have excluded something, so call this just in case. */
+ btrfs_free_excluded_extents(cache);
+ goto error;
+ }
+
+ /*
+ * Check for two cases, either we are full, and therefore don't need
+ * to bother with the caching work since we won't find any space, or we
+ * are empty, and we can just add all the space in and be done with it.
+ * This saves us _a_lot_ of time, particularly in the full case.
+ */
+ if (key->offset == cache->used) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ btrfs_free_excluded_extents(cache);
+ } else if (cache->used == 0) {
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, key->objectid,
+ key->objectid + key->offset);
+ btrfs_free_excluded_extents(cache);
+ }
+
+ ret = btrfs_add_block_group_cache(info, cache);
+ if (ret) {
+ btrfs_remove_free_space_cache(cache);
+ goto error;
+ }
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_update_space_info(info, cache->flags, key->offset,
+ cache->used, cache->bytes_super, &space_info);
+
+ cache->space_info = space_info;
+
+ link_block_group(cache);
+
+ set_avail_alloc_bits(info, cache->flags);
+ if (btrfs_chunk_readonly(info, cache->start)) {
+ inc_block_group_ro(cache, 1);
+ } else if (cache->used == 0) {
+ ASSERT(list_empty(&cache->bg_list));
+ btrfs_mark_bg_unused(cache);
+ }
+ return 0;
+error:
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
int btrfs_read_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_path *path;
int ret;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_space_info *space_info;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct extent_buffer *leaf;
int need_clear = 0;
u64 cache_gen;
- u64 feature;
- int mixed;
-
- feature = btrfs_super_incompat_flags(info->super_copy);
- mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
key.objectid = 0;
key.offset = 0;
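read_one_block_group() above skips the caching thread for block groups that are completely full or completely empty; a small standalone sketch of that shortcut, with invented names (a full group has nothing left to cache, an empty one can have all of its space added up front).

enum cache_need { CACHE_NOT_NEEDED, CACHE_NEEDED };

enum cache_need caching_needed(unsigned long long length,
			       unsigned long long used)
{
	if (used == length)
		return CACHE_NOT_NEEDED;	/* full: nothing to find */
	if (used == 0)
		return CACHE_NOT_NEEDED;	/* empty: add all space immediately */
	return CACHE_NEEDED;			/* partially used: scan the extent tree */
}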
@@ -1726,108 +1815,13 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
if (ret != 0)
goto error;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- cache = btrfs_create_block_group_cache(info, found_key.objectid,
- found_key.offset);
- if (!cache) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (need_clear) {
- /*
- * When we mount with old space cache, we need to
- * set BTRFS_DC_CLEAR and set dirty flag.
- *
- * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
- * truncate the old free space cache inode and
- * setup a new one.
- * b) Setting 'dirty flag' makes sure that we flush
- * the new space cache info onto disk.
- */
- if (btrfs_test_opt(info, SPACE_CACHE))
- cache->disk_cache_state = BTRFS_DC_CLEAR;
- }
-
- read_extent_buffer(leaf, &cache->item,
- btrfs_item_ptr_offset(leaf, path->slots[0]),
- sizeof(cache->item));
- cache->flags = btrfs_block_group_flags(&cache->item);
- if (!mixed &&
- ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
- (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
- btrfs_err(info,
-"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
- cache->key.objectid);
- btrfs_put_block_group(cache);
- ret = -EINVAL;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ ret = read_one_block_group(info, path, &key, need_clear);
+ if (ret < 0)
goto error;
- }
-
- key.objectid = found_key.objectid + found_key.offset;
+ key.objectid += key.offset;
+ key.offset = 0;
btrfs_release_path(path);
-
- /*
- * We need to exclude the super stripes now so that the space
- * info has super bytes accounted for, otherwise we'll think
- * we have more space than we actually do.
- */
- ret = exclude_super_stripes(cache);
- if (ret) {
- /*
- * We may have excluded something, so call this just in
- * case.
- */
- btrfs_free_excluded_extents(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- /*
- * Check for two cases, either we are full, and therefore
- * don't need to bother with the caching work since we won't
- * find any space, or we are empty, and we can just add all
- * the space in and be done with it. This saves us _a_lot_ of
- * time, particularly in the full case.
- */
- if (found_key.offset == btrfs_block_group_used(&cache->item)) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- btrfs_free_excluded_extents(cache);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- cache->last_byte_to_unpin = (u64)-1;
- cache->cached = BTRFS_CACHE_FINISHED;
- add_new_free_space(cache, found_key.objectid,
- found_key.objectid +
- found_key.offset);
- btrfs_free_excluded_extents(cache);
- }
-
- ret = btrfs_add_block_group_cache(info, cache);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- goto error;
- }
-
- trace_btrfs_add_block_group(info, cache, 0);
- btrfs_update_space_info(info, cache->flags, found_key.offset,
- btrfs_block_group_used(&cache->item),
- cache->bytes_super, &space_info);
-
- cache->space_info = space_info;
-
- link_block_group(cache);
-
- set_avail_alloc_bits(info, cache->flags);
- if (btrfs_chunk_readonly(info, cache->key.objectid)) {
- inc_block_group_ro(cache, 1);
- } else if (btrfs_block_group_used(&cache->item) == 0) {
- ASSERT(list_empty(&cache->bg_list));
- btrfs_mark_bg_unused(cache);
- }
}
list_for_each_entry_rcu(space_info, &info->space_info, list) {
@@ -1861,7 +1855,7 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
@@ -1872,14 +1866,19 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
while (!list_empty(&trans->new_bgs)) {
block_group = list_first_entry(&trans->new_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
if (ret)
goto next;
spin_lock(&block_group->lock);
- memcpy(&item, &block_group->item, sizeof(item));
- memcpy(&key, &block_group->key, sizeof(key));
+ btrfs_set_stack_block_group_used(&item, block_group->used);
+ btrfs_set_stack_block_group_chunk_objectid(&item,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&item, block_group->flags);
+ key.objectid = block_group->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = block_group->length;
spin_unlock(&block_group->lock);
ret = btrfs_insert_item(trans, extent_root, &key, &item,
@@ -1902,7 +1901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret;
btrfs_set_log_full_commit(trans);
@@ -1911,11 +1910,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
if (!cache)
return -ENOMEM;
- btrfs_set_block_group_used(&cache->item, bytes_used);
- btrfs_set_block_group_chunk_objectid(&cache->item,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
- btrfs_set_block_group_flags(&cache->item, type);
-
+ cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
@@ -2022,8 +2017,17 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
return flags;
}
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
-
+/*
+ * Mark one block group RO; can be called several times for the same block
+ * group.
+ *
+ * @cache: the destination block group
+ * @do_chunk_alloc: whether we need to do chunk pre-allocation; this is to
+ * ensure we still have some free space after marking this
+ * block group RO.
+ */
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_trans_handle *trans;
@@ -2053,25 +2057,29 @@ again:
goto again;
}
- /*
- * if we are changing raid levels, try to allocate a corresponding
- * block group with the new raid level.
- */
- alloc_flags = update_block_group_flags(fs_info, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (do_chunk_alloc) {
/*
- * ENOSPC is allowed here, we may have enough space
- * already allocated at the new raid level to
- * carry on
+ * If we are changing raid levels, try to allocate a
+ * corresponding block group with the new raid level.
*/
- if (ret == -ENOSPC)
- ret = 0;
- if (ret < 0)
- goto out;
+ alloc_flags = update_block_group_flags(fs_info, cache->flags);
+ if (alloc_flags != cache->flags) {
+ ret = btrfs_chunk_alloc(trans, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+ /*
+ * ENOSPC is allowed here, we may have enough space
+ * already allocated at the new raid level to carry on
+ */
+ if (ret == -ENOSPC)
+ ret = 0;
+ if (ret < 0)
+ goto out;
+ }
}
- ret = inc_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, !do_chunk_alloc);
+ if (!do_chunk_alloc)
+ goto unlock_out;
if (!ret)
goto out;
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
@@ -2086,13 +2094,14 @@ out:
check_system_chunk(trans, alloc_flags);
mutex_unlock(&fs_info->chunk_mutex);
}
+unlock_out:
mutex_unlock(&fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans);
return ret;
}
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
@@ -2102,9 +2111,8 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
if (!--cache->ro) {
- num_bytes = cache->key.offset - cache->reserved -
- cache->pinned - cache->bytes_super -
- btrfs_block_group_used(&cache->item);
+ num_bytes = cache->length - cache->reserved -
+ cache->pinned - cache->bytes_super - cache->used;
sinfo->bytes_readonly -= num_bytes;
list_del_init(&cache->ro_list);
}
@@ -2114,15 +2122,21 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *extent_root = fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
+ struct btrfs_block_group_item bgi;
+ struct btrfs_key key;
+
+ key.objectid = cache->start;
+ key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = cache->length;
- ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+ ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2131,7 +2145,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+ btrfs_set_stack_block_group_used(&bgi, cache->used);
+ btrfs_set_stack_block_group_chunk_objectid(&bgi,
+ BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ btrfs_set_stack_block_group_flags(&bgi, cache->flags);
+ write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
@@ -2139,7 +2157,7 @@ fail:
}
-static int cache_save_setup(struct btrfs_block_group_cache *block_group,
+static int cache_save_setup(struct btrfs_block_group *block_group,
struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
@@ -2157,7 +2175,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * SZ_1M)) {
+ if (block_group->length < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -2258,7 +2276,7 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, SZ_256M);
+ num_pages = div_u64(block_group->length, SZ_256M);
if (!num_pages)
num_pages = 1;
@@ -2303,7 +2321,7 @@ out:
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache, *tmp;
+ struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_path *path;
@@ -2341,7 +2359,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
@@ -2378,8 +2396,7 @@ again:
while (!list_empty(&dirty)) {
bool drop_reserve = true;
- cache = list_first_entry(&dirty,
- struct btrfs_block_group_cache,
+ cache = list_first_entry(&dirty, struct btrfs_block_group,
dirty_list);
/*
* This can happen if something re-dirties a block group that
@@ -2504,7 +2521,7 @@ again:
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
@@ -2534,7 +2551,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
dirty_list);
/*
@@ -2616,7 +2633,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* to use it without any locking
*/
while (!list_empty(io)) {
- cache = list_first_entry(io, struct btrfs_block_group_cache,
+ cache = list_first_entry(io, struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(trans, cache, path);
@@ -2631,7 +2648,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -2662,11 +2679,11 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* is because we need the unpinning stage to actually add the
* space back to the block group, otherwise we will leak space.
*/
- if (!alloc && cache->cached == BTRFS_CACHE_NO)
+ if (!alloc && !btrfs_block_group_done(cache))
btrfs_cache_block_group(cache, 1);
- byte_in_group = bytenr - cache->key.objectid;
- WARN_ON(byte_in_group > cache->key.offset);
+ byte_in_group = bytenr - cache->start;
+ WARN_ON(byte_in_group > cache->length);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
@@ -2675,11 +2692,11 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- old_val = btrfs_block_group_used(&cache->item);
- num_bytes = min(total, cache->key.offset - byte_in_group);
+ old_val = cache->used;
+ num_bytes = min(total, cache->length - byte_in_group);
if (alloc) {
old_val += num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
+ cache->used = old_val;
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
@@ -2688,7 +2705,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
+ cache->used = old_val;
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(info,
cache->space_info, num_bytes);
@@ -2746,7 +2763,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* reservation and the block group has become read only we cannot make the
* reservation and return -EAGAIN, otherwise this function always succeeds.
*/
-int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -2782,7 +2799,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -2988,9 +3005,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
}
/*
- * If @is_allocation is true, reserve space in the system space info necessary
- * for allocating a chunk, otherwise if it's false, reserve space necessary for
- * removing a chunk.
+ * Reserve space in the system space for allocating or removing a chunk
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
@@ -3047,7 +3062,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 last = 0;
while (1) {
@@ -3075,7 +3090,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
spin_unlock(&block_group->lock);
ASSERT(block_group->io_ctl.inode == NULL);
iput(inode);
- last = block_group->key.objectid + block_group->key.offset;
+ last = block_group->start + block_group->length;
btrfs_put_block_group(block_group);
}
}
@@ -3087,7 +3102,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
*/
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
@@ -3104,7 +3119,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
block_group = list_first_entry(&info->unused_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
@@ -3113,7 +3128,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
- block_group = rb_entry(n, struct btrfs_block_group_cache,
+ block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
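With the rename, a block group carries explicit start and length fields instead of deriving them from key.objectid and key.offset; a standalone sketch, with invented names, of the containment test those fields make direct.

#include <stdint.h>

struct bg { uint64_t start; uint64_t length; };

static int bg_contains(const struct bg *bg, uint64_t bytenr)
{
	return bytenr >= bg->start && bytenr - bg->start < bg->length;
}

int find_group(const struct bg *groups, int nr, uint64_t bytenr)
{
	for (int i = 0; i < nr; i++)
		if (bg_contains(&groups[i], bytenr))
			return i;	/* the real lookup walks an rbtree instead */
	return -1;
}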
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c391800388dd..9b409676c4b2 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -34,7 +34,7 @@ struct btrfs_caching_control {
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 progress;
refcount_t count;
};
@@ -42,14 +42,15 @@ struct btrfs_caching_control {
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
-struct btrfs_block_group_cache {
- struct btrfs_key key;
- struct btrfs_block_group_item item;
+struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
spinlock_t lock;
+ u64 start;
+ u64 length;
u64 pinned;
u64 reserved;
+ u64 used;
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
@@ -159,7 +160,7 @@ struct btrfs_block_group_cache {
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -170,29 +171,29 @@ static inline int btrfs_should_fragment_free_space(
}
#endif
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(
+struct btrfs_block_group *btrfs_lookup_first_block_group(
struct btrfs_fs_info *info, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(
+struct btrfs_block_group *btrfs_lookup_block_group(
struct btrfs_fs_info *info, u64 bytenr);
-struct btrfs_block_group_cache *btrfs_next_block_group(
- struct btrfs_block_group_cache *cache);
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+struct btrfs_block_group *btrfs_next_block_group(
+ struct btrfs_block_group *cache);
+void btrfs_get_block_group(struct btrfs_block_group *cache);
+void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
-void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
+void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
-void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
+void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
-int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache);
-int btrfs_cache_block_group(struct btrfs_block_group_cache *cache,
+int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache);
+int btrfs_cache_block_group(struct btrfs_block_group *cache,
int load_cache_only);
void btrfs_put_caching_control(struct btrfs_caching_control *ctl);
struct btrfs_caching_control *btrfs_get_caching_control(
- struct btrfs_block_group_cache *cache);
-u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *cache);
+u64 add_new_free_space(struct btrfs_block_group *block_group,
u64 start, u64 end);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
@@ -200,21 +201,22 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
-void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg);
+void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
-int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
-void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
+int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ bool do_chunk_alloc);
+void btrfs_dec_block_group_ro(struct btrfs_block_group *cache);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans);
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, int alloc);
-int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc);
-void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
u64 num_bytes, int delalloc);
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
@@ -239,8 +241,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}
-static inline int btrfs_block_group_cache_done(
- struct btrfs_block_group_cache *cache)
+static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
{
smp_mb();
return cache->cached == BTRFS_CACHE_FINISHED ||
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f853835c409c..4e12a477d32e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -63,9 +63,6 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
- /* held while doing delalloc reservations */
- struct mutex delalloc_mutex;
-
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b05b361e2062..ee834ef7beb4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,6 +29,41 @@
#include "extent_io.h"
#include "extent_map.h"
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
@@ -39,6 +74,8 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
case BTRFS_COMPRESS_ZSTD:
case BTRFS_COMPRESS_NONE:
return btrfs_compress_types[type];
+ default:
+ break;
}
return NULL;
@@ -60,6 +97,70 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len)
return false;
}
+static int compression_compress_pages(int type, struct list_head *ws,
+ struct address_space *mapping, u64 start, struct page **pages,
+ unsigned long *out_pages, unsigned long *total_in,
+ unsigned long *total_out)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ return zlib_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_LZO:
+ return lzo_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_ZSTD:
+ return zstd_compress_pages(ws, mapping, start, pages,
+ out_pages, total_in, total_out);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here. As a sane fallback, return what the
+ * callers will understand as 'no compression happened'.
+ */
+ return -E2BIG;
+ }
+}
+
+static int compression_decompress_bio(int type, struct list_head *ws,
+ struct compressed_bio *cb)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static int compression_decompress(int type, struct list_head *ws,
+ unsigned char *data_in, struct page *dest_page,
+ unsigned long start_byte, size_t srclen, size_t destlen)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
+ start_byte, srclen, destlen);
+ case BTRFS_COMPRESS_NONE:
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
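The compression_*() helpers above replace indirect calls through per-algorithm ops with a direct switch on the compression type; a tiny standalone sketch of the same dispatch-by-type pattern, with invented names and no-op stand-in handlers.

enum ctype { C_NONE, C_ZLIB, C_LZO, C_ZSTD };

/* stand-ins for the real zlib/lzo/zstd decompressors */
static int zlib_handle(const void *in, unsigned long len) { (void)in; (void)len; return 0; }
static int lzo_handle(const void *in, unsigned long len)  { (void)in; (void)len; return 0; }
static int zstd_handle(const void *in, unsigned long len) { (void)in; (void)len; return 0; }

int decompress(enum ctype type, const void *in, unsigned long len)
{
	switch (type) {
	case C_ZLIB: return zlib_handle(in, len);
	case C_LZO:  return lzo_handle(in, len);
	case C_ZSTD: return zstd_handle(in, len);
	case C_NONE:
	default:     return -1;	/* callers validate the type well before this point */
	}
}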
@@ -311,7 +412,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags)
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio *bio = NULL;
@@ -320,7 +422,6 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
- struct block_device *bdev;
blk_status_t ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
@@ -339,13 +440,15 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
- bdev = fs_info->fs_devices->latest_bdev;
-
bio = btrfs_bio_alloc(first_byte);
- bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+
+ if (blkcg_css) {
+ bio->bi_opf |= REQ_CGROUP_PUNT;
+ bio_associate_blkg_from_css(bio, blkcg_css);
+ }
refcount_set(&cb->pending_bios, 1);
/* create and submit bios for the compressed pages */
@@ -378,14 +481,13 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
bio = btrfs_bio_alloc(first_byte);
- bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@@ -409,7 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, bio, 0, 1);
+ ret = btrfs_map_bio(fs_info, bio, 0);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
@@ -553,7 +655,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long nr_pages;
unsigned long pg_index;
struct page *page;
- struct block_device *bdev;
struct bio *comp_bio;
u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
u64 em_len;
@@ -604,8 +705,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!cb->compressed_pages)
goto fail1;
- bdev = fs_info->fs_devices->latest_bdev;
-
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
@@ -624,7 +723,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->len = bio->bi_iter.bi_size;
comp_bio = btrfs_bio_alloc(cur_disk_byte);
- bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -668,14 +766,13 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += csum_size * nr_sectors;
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
}
comp_bio = btrfs_bio_alloc(cur_disk_byte);
- bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@@ -693,7 +790,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
}
- ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
@@ -764,26 +861,6 @@ struct heuristic_ws {
static struct workspace_manager heuristic_wsm;
-static void heuristic_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress);
-}
-
-static void heuristic_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&heuristic_wsm);
-}
-
-static struct list_head *heuristic_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&heuristic_wsm, level);
-}
-
-static void heuristic_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&heuristic_wsm, ws);
-}
-
static void free_heuristic_ws(struct list_head *ws)
{
struct heuristic_ws *workspace;
@@ -824,12 +901,7 @@ fail:
}
const struct btrfs_compress_op btrfs_heuristic_compress = {
- .init_workspace_manager = heuristic_init_workspace_manager,
- .cleanup_workspace_manager = heuristic_cleanup_workspace_manager,
- .get_workspace = heuristic_get_workspace,
- .put_workspace = heuristic_put_workspace,
- .alloc_workspace = alloc_heuristic_ws,
- .free_workspace = free_heuristic_ws,
+ .workspace_manager = &heuristic_wsm,
};
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
@@ -840,13 +912,44 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops)
+static struct list_head *alloc_workspace(int type, unsigned int level)
{
- struct list_head *workspace;
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
- wsm->ops = ops;
+static void free_workspace(int type, struct list_head *ws)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
+ case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
+ case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
+}
+
+static void btrfs_init_workspace_manager(int type)
+{
+ struct workspace_manager *wsm;
+ struct list_head *workspace;
+ wsm = btrfs_compress_op[type]->workspace_manager;
INIT_LIST_HEAD(&wsm->idle_ws);
spin_lock_init(&wsm->ws_lock);
atomic_set(&wsm->total_ws, 0);
@@ -856,7 +959,7 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm,
* Preallocate one workspace for each compression type so we can
* guarantee forward progress in the worst case
*/
- workspace = wsm->ops->alloc_workspace(0);
+ workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
pr_warn(
"BTRFS: cannot preallocate compression workspace, will try later\n");
@@ -867,14 +970,16 @@ void btrfs_init_workspace_manager(struct workspace_manager *wsm,
}
}
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
+static void btrfs_cleanup_workspace_manager(int type)
{
+ struct workspace_manager *wsman;
struct list_head *ws;
+ wsman = btrfs_compress_op[type]->workspace_manager;
while (!list_empty(&wsman->idle_ws)) {
ws = wsman->idle_ws.next;
list_del(ws);
- wsman->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(&wsman->total_ws);
}
}
@@ -885,9 +990,9 @@ void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman)
 * Preallocation makes forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level)
+struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
+ struct workspace_manager *wsm;
struct list_head *workspace;
int cpus = num_online_cpus();
unsigned nofs_flag;
@@ -897,6 +1002,7 @@ struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -932,7 +1038,7 @@ again:
* context of btrfs_compress_bio/btrfs_compress_pages
*/
nofs_flag = memalloc_nofs_save();
- workspace = wsm->ops->alloc_workspace(level);
+ workspace = alloc_workspace(type, level);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(workspace)) {
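The pool logic in this hunk hands out idle workspaces first and only allocates a new one (under NOFS protection) when none are free; the workspace preallocated in btrfs_init_workspace_manager() is what guarantees forward progress. A minimal userspace model of that get/put pool, using illustrative names and plain calloc() in place of the kernel allocators:

#include <stdio.h>
#include <stdlib.h>

struct ws { struct ws *next; };

struct ws_pool {
    struct ws *idle;    /* idle workspaces, reused first */
    int total;          /* all workspaces ever allocated */
};

/* One workspace is preallocated so a user can always make progress. */
static void pool_init(struct ws_pool *p)
{
    p->idle = calloc(1, sizeof(struct ws));
    p->total = p->idle ? 1 : 0;
}

static struct ws *pool_get(struct ws_pool *p)
{
    struct ws *w = p->idle;

    if (w) {                        /* fast path: reuse an idle one */
        p->idle = w->next;
        return w;
    }
    w = calloc(1, sizeof(*w));      /* slow path: allocate a fresh one */
    if (w)
        p->total++;
    return w;                       /* NULL here means "wait and retry" */
}

static void pool_put(struct ws_pool *p, struct ws *w)
{
    w->next = p->idle;              /* keep it idle for the next caller */
    p->idle = w;
}

int main(void)
{
    struct ws_pool pool;
    struct ws *w;

    pool_init(&pool);
    w = pool_get(&pool);
    pool_put(&pool, w);
    printf("total workspaces: %d\n", pool.total);
    while (pool.idle) {             /* tear down, like cleanup_workspace_manager */
        w = pool.idle;
        pool.idle = w->next;
        free(w);
    }
    return 0;
}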
@@ -965,21 +1071,34 @@ again:
static struct list_head *get_workspace(int type, int level)
{
- return btrfs_compress_op[type]->get_workspace(level);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
+ case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
+ case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
}
/*
* put a workspace struct back on the list or free it if we have enough
* idle ones sitting around
*/
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
+void btrfs_put_workspace(int type, struct list_head *ws)
{
+ struct workspace_manager *wsm;
struct list_head *idle_ws;
spinlock_t *ws_lock;
atomic_t *total_ws;
wait_queue_head_t *ws_wait;
int *free_ws;
+ wsm = btrfs_compress_op[type]->workspace_manager;
idle_ws = &wsm->idle_ws;
ws_lock = &wsm->ws_lock;
total_ws = &wsm->total_ws;
@@ -995,7 +1114,7 @@ void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws)
}
spin_unlock(ws_lock);
- wsm->ops->free_workspace(ws);
+ free_workspace(type, ws);
atomic_dec(total_ws);
wake:
cond_wake_up(ws_wait);
@@ -1003,7 +1122,18 @@ wake:
static void put_workspace(int type, struct list_head *ws)
{
- return btrfs_compress_op[type]->put_workspace(ws);
+ switch (type) {
+ case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
+ case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
+ default:
+ /*
+ * This can't happen, the type is validated several times
+ * before we get here.
+ */
+ BUG();
+ }
}
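get_workspace()/put_workspace() above now dispatch on the compression type with a switch instead of calling through per-type function pointers. A compact userspace sketch of that dispatch shape, with hypothetical handler names:

#include <assert.h>
#include <stdio.h>

enum comp_type { COMP_NONE, COMP_ZLIB, COMP_LZO, COMP_ZSTD, NR_COMP_TYPES };

/* Hypothetical handlers: most types share the generic path, zstd has its own. */
static const char *generic_get_ws(enum comp_type type) { (void)type; return "generic"; }
static const char *zstd_get_ws(void)                   { return "zstd"; }

static const char *get_ws(enum comp_type type)
{
    switch (type) {
    case COMP_NONE:
    case COMP_ZLIB:
    case COMP_LZO:  return generic_get_ws(type);
    case COMP_ZSTD: return zstd_get_ws();
    default:
        /* The caller validates the type, so this is unreachable. */
        assert(0);
        return NULL;
    }
}

int main(void)
{
    printf("%s %s\n", get_ws(COMP_LZO), get_ws(COMP_ZSTD));
    return 0;
}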
/*
@@ -1042,10 +1172,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
level = btrfs_compress_set_level(type, level);
workspace = get_workspace(type, level);
- ret = btrfs_compress_op[type]->compress_pages(workspace, mapping,
- start, pages,
- out_pages,
- total_in, total_out);
+ ret = compression_compress_pages(type, workspace, mapping, start, pages,
+ out_pages, total_in, total_out);
put_workspace(type, workspace);
return ret;
}
@@ -1071,7 +1199,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
int type = cb->compress_type;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress_bio(workspace, cb);
+ ret = compression_decompress_bio(type, workspace, cb);
put_workspace(type, workspace);
return ret;
@@ -1089,9 +1217,8 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int ret;
workspace = get_workspace(type, 0);
- ret = btrfs_compress_op[type]->decompress(workspace, data_in,
- dest_page, start_byte,
- srclen, destlen);
+ ret = compression_decompress(type, workspace, data_in, dest_page,
+ start_byte, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -1099,18 +1226,18 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
void __init btrfs_init_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->init_workspace_manager();
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_init_workspace_manager();
}
void __cold btrfs_exit_compress(void)
{
- int i;
-
- for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++)
- btrfs_compress_op[i]->cleanup_workspace_manager();
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
+ btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
+ zstd_cleanup_workspace_manager();
}
/*
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 4cb8be9ff88b..d253f7aa8ed5 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -93,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
unsigned long compressed_len,
struct page **compressed_pages,
unsigned long nr_pages,
- unsigned int write_flags);
+ unsigned int write_flags,
+ struct cgroup_subsys_state *blkcg_css);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
@@ -104,11 +105,10 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_ZLIB = 1,
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
- BTRFS_COMPRESS_TYPES = 3,
+ BTRFS_NR_COMPRESS_TYPES = 4,
};
struct workspace_manager {
- const struct btrfs_compress_op *ops;
struct list_head idle_ws;
spinlock_t ws_lock;
/* Number of free workspaces */
@@ -119,50 +119,18 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-void btrfs_init_workspace_manager(struct workspace_manager *wsm,
- const struct btrfs_compress_op *ops);
-struct list_head *btrfs_get_workspace(struct workspace_manager *wsm,
- unsigned int level);
-void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws);
-void btrfs_cleanup_workspace_manager(struct workspace_manager *wsm);
+struct list_head *btrfs_get_workspace(int type, unsigned int level);
+void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
- void (*init_workspace_manager)(void);
-
- void (*cleanup_workspace_manager)(void);
-
- struct list_head *(*get_workspace)(unsigned int level);
-
- void (*put_workspace)(struct list_head *ws);
-
- struct list_head *(*alloc_workspace)(unsigned int level);
-
- void (*free_workspace)(struct list_head *workspace);
-
- int (*compress_pages)(struct list_head *workspace,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out);
-
- int (*decompress_bio)(struct list_head *workspace,
- struct compressed_bio *cb);
-
- int (*decompress)(struct list_head *workspace,
- unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen);
-
+ struct workspace_manager *workspace_manager;
/* Maximum level supported by the compression algorithm */
unsigned int max_level;
unsigned int default_level;
};
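With the per-type callbacks removed, a compression backend now exposes only its workspace manager and level limits through this structure. A hedged sketch of what a backend descriptor could look like; the backend name and level values are invented for illustration and are not taken from this patch:

/* Hypothetical backend descriptor, for illustration only. */
static struct workspace_manager example_wsm;

const struct btrfs_compress_op btrfs_example_compress = {
    .workspace_manager = &example_wsm,
    .max_level         = 9,
    .default_level     = 3,
};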
/* The heuristic workspaces are managed via the 0th workspace manager */
-#define BTRFS_NR_WORKSPACE_MANAGERS (BTRFS_COMPRESS_TYPES + 1)
+#define BTRFS_NR_WORKSPACE_MANAGERS BTRFS_NR_COMPRESS_TYPES
extern const struct btrfs_compress_op btrfs_heuristic_compress;
extern const struct btrfs_compress_op btrfs_zlib_compress;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e59cde204b2f..5b6e86aaf2e1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -32,8 +32,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
static const struct btrfs_csums {
u16 size;
const char *name;
+ const char *driver;
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
+ [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
+ [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
+ [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
+ .driver = "blake2b-256" },
};
int btrfs_super_csum_size(const struct btrfs_super_block *s)
@@ -51,34 +56,25 @@ const char *btrfs_super_csum_name(u16 csum_type)
return btrfs_csums[csum_type].name;
}
-struct btrfs_path *btrfs_alloc_path(void)
+/*
+ * Return driver name if defined, otherwise the name that's also a valid driver
+ * name
+ */
+const char *btrfs_super_csum_driver(u16 csum_type)
{
- return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+ /* csum type is validated at mount time */
+ return btrfs_csums[csum_type].driver ?:
+ btrfs_csums[csum_type].name;
}
-/*
- * set all locked nodes in the path to blocking locks. This should
- * be done before scheduling
- */
-noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+size_t __const btrfs_get_num_csums(void)
{
- int i;
- for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
- if (!p->nodes[i] || !p->locks[i])
- continue;
- /*
- * If we currently have a spinning reader or writer lock this
- * will bump the count of blocking holders and drop the
- * spinlock.
- */
- if (p->locks[i] == BTRFS_READ_LOCK) {
- btrfs_set_lock_blocking_read(p->nodes[i]);
- p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
- } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
- btrfs_set_lock_blocking_write(p->nodes[i]);
- p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
- }
- }
+ return ARRAY_SIZE(btrfs_csums);
+}
+
+struct btrfs_path *btrfs_alloc_path(void)
+{
+ return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
/* this also releases the path */
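btrfs_super_csum_driver() above falls back to the checksum's display name whenever no separate crypto driver name is set, using the GNU `?:` shorthand. A standalone model of the same table lookup over the entries added in this hunk:

#include <stdio.h>

struct csum_desc {
    unsigned int size;
    const char *name;
    const char *driver; /* optional: only set when it differs from the name */
};

static const struct csum_desc csums[] = {
    { .size = 4,  .name = "crc32c" },
    { .size = 8,  .name = "xxhash64" },
    { .size = 32, .name = "sha256" },
    { .size = 32, .name = "blake2b", .driver = "blake2b-256" },
};

/* Return the crypto driver name if defined, otherwise the plain name. */
static const char *csum_driver(unsigned int i)
{
    return csums[i].driver ?: csums[i].name;
}

int main(void)
{
    for (unsigned int i = 0; i < sizeof(csums) / sizeof(csums[0]); i++)
        printf("%-8s -> %s (%u bytes)\n", csums[i].name, csum_driver(i), csums[i].size);
    return 0;
}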
@@ -1125,7 +1121,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
parent_start = buf->start;
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
ret = tree_mod_log_insert_root(root->node, cow, 1);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1563,7 +1559,7 @@ static int comp_keys(const struct btrfs_disk_key *disk,
/*
* same as comp_keys only with two btrfs_key's
*/
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
if (k1->objectid > k2->objectid)
return 1;
@@ -2036,7 +2032,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- extent_buffer_get(left);
+ atomic_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -2379,32 +2375,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
- * This releases any locks held in the path starting at level and
- * going all the way up to the root.
- *
- * btrfs_search_slot will keep the lock held on higher nodes in a few
- * corner cases, such as COW of the block at slot zero in the node. This
- * ignores those rules, and it should only be called when there are no
- * more updates to be done higher up in the tree.
- */
-noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
-{
- int i;
-
- if (path->keep_locks)
- return;
-
- for (i = level; i < BTRFS_MAX_LEVEL; i++) {
- if (!path->nodes[i])
- continue;
- if (!path->locks[i])
- continue;
- btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
- path->locks[i] = 0;
- }
-}
-
-/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
@@ -2652,7 +2622,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
} else {
b = root->commit_root;
- extent_buffer_get(b);
+ atomic_inc(&b->refs);
}
level = btrfs_header_level(b);
/*
@@ -2785,12 +2755,10 @@ again:
}
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
- /*
- * setup the path here so we can release it under lock
- * contention with the cow code
- */
if (cow) {
bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
@@ -2861,73 +2829,7 @@ cow_done:
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
- p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level,
- ins_len, &write_lock_level);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
- b = p->nodes[level];
- slot = p->slots[level];
-
- /*
- * slot 0 is special, if we change the key
- * we have to update the parent pointer
- * which means we must have a write lock
- * on the parent
- */
- if (slot == 0 && ins_len &&
- write_lock_level < level + 1) {
- write_lock_level = level + 1;
- btrfs_release_path(p);
- goto again;
- }
-
- unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
-
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
-
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
-
- if (!p->skip_locking) {
- level = btrfs_header_level(b);
- if (level <= write_lock_level) {
- if (!btrfs_try_tree_write_lock(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_lock(b);
- }
- p->locks[level] = BTRFS_WRITE_LOCK;
- } else {
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- p->locks[level] = BTRFS_READ_LOCK;
- }
- p->nodes[level] = b;
- }
- } else {
+ if (level == 0) {
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(b) < ins_len) {
@@ -2952,6 +2854,67 @@ cow_done:
min_write_lock_level, NULL);
goto done;
}
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+ b = p->nodes[level];
+ slot = p->slots[level];
+
+ /*
+ * Slot 0 is special, if we change the key we have to update
+ * the parent pointer which means we must have a write lock on
+ * the parent
+ */
+ if (slot == 0 && ins_len && write_lock_level < level + 1) {
+ write_lock_level = level + 1;
+ btrfs_release_path(p);
+ goto again;
+ }
+
+ unlock_up(p, level, lowest_unlock, min_write_lock_level,
+ &write_lock_level);
+
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
+
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ if (!p->skip_locking) {
+ level = btrfs_header_level(b);
+ if (level <= write_lock_level) {
+ if (!btrfs_try_tree_write_lock(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_lock(b);
+ }
+ p->locks[level] = BTRFS_WRITE_LOCK;
+ } else {
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ p->locks[level] = BTRFS_READ_LOCK;
+ }
+ p->nodes[level] = b;
+ }
}
ret = 1;
done:
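The restructured loop above handles the leaf level first and then, for interior nodes, steps the slot back by one when the binary search did not hit an exact key, remembering that in dec. A small standalone model of that slot adjustment (simplified to a single node of keys, with no locking):

#include <stdio.h>

/* Returns the insertion slot; *exact is set when the key was found. */
static int node_search(const int *keys, int nr, int key, int *exact)
{
    int slot = 0;

    *exact = 0;
    while (slot < nr && keys[slot] <= key) {
        if (keys[slot] == key) {
            *exact = 1;
            return slot;
        }
        slot++;
    }
    return slot;    /* first key strictly greater than the target */
}

int main(void)
{
    const int keys[] = { 10, 20, 30 };
    int exact, dec = 0;
    int slot = node_search(keys, 3, 25, &exact);

    if (!exact && slot > 0) {   /* mirrors "if (ret && slot > 0)" above */
        dec = 1;
        slot--;                 /* descend via the preceding child */
    }
    printf("descend via slot %d, dec=%d\n", slot, dec);
    return 0;
}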
@@ -3008,6 +2971,8 @@ again:
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
+ int dec = 0;
+
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -3028,47 +2993,45 @@ again:
if (ret < 0)
goto done;
- if (level != 0) {
- int dec = 0;
- if (ret && slot > 0) {
- dec = 1;
- slot -= 1;
- }
+ if (level == 0) {
p->slots[level] = slot;
unlock_up(p, level, lowest_unlock, 0, NULL);
+ goto done;
+ }
- if (level == lowest_level) {
- if (dec)
- p->slots[level]++;
- goto done;
- }
+ if (ret && slot > 0) {
+ dec = 1;
+ slot--;
+ }
+ p->slots[level] = slot;
+ unlock_up(p, level, lowest_unlock, 0, NULL);
- err = read_block_for_search(root, p, &b, level,
- slot, key);
- if (err == -EAGAIN)
- goto again;
- if (err) {
- ret = err;
- goto done;
- }
+ if (level == lowest_level) {
+ if (dec)
+ p->slots[level]++;
+ goto done;
+ }
- level = btrfs_header_level(b);
- if (!btrfs_tree_read_lock_atomic(b)) {
- btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
- }
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
- if (!b) {
- ret = -ENOMEM;
- goto done;
- }
- p->locks[level] = BTRFS_READ_LOCK;
- p->nodes[level] = b;
- } else {
- p->slots[level] = slot;
- unlock_up(p, level, lowest_unlock, 0, NULL);
+ err = read_block_for_search(root, p, &b, level, slot, key);
+ if (err == -EAGAIN)
+ goto again;
+ if (err) {
+ ret = err;
+ goto done;
+ }
+
+ level = btrfs_header_level(b);
+ if (!btrfs_tree_read_lock_atomic(b)) {
+ btrfs_set_path_blocking(p);
+ btrfs_tree_read_lock(b);
+ }
+ b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ if (!b) {
+ ret = -ENOMEM;
goto done;
}
+ p->locks[level] = BTRFS_READ_LOCK;
+ p->nodes[level] = b;
}
ret = 1;
done:
@@ -3433,7 +3396,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- extent_buffer_get(c);
+ atomic_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
@@ -4966,7 +4929,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer_stale(leaf);
}
@@ -5047,7 +5010,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to del_ptr below
*/
slot = path->slots[1];
- extent_buffer_get(leaf);
+ atomic_inc(&leaf->refs);
btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe2b8765d9e6..b2e8fd8a8e59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
#include <linux/crc32c.h>
+#include "extent-io-tree.h"
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -38,7 +39,7 @@ struct btrfs_transaction;
struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
-struct btrfs_block_group_cache;
+struct btrfs_block_group;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@@ -56,9 +57,9 @@ struct btrfs_ref;
* filesystem data as well that can be used to read data in order to repair
* read errors on other disks.
*
- * Current value is derived from RAID1 with 2 copies.
+ * Current value is derived from RAID1C4 with 4 copies.
*/
-#define BTRFS_MAX_MIRRORS (2 + 1)
+#define BTRFS_MAX_MIRRORS (4 + 1)
#define BTRFS_MAX_LEVEL 8
@@ -291,7 +292,8 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
- BTRFS_FEATURE_INCOMPAT_METADATA_UUID)
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34)
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -413,7 +415,7 @@ struct btrfs_free_cluster {
/* We did a full search and couldn't create a cluster */
bool fragmented;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/*
* when a cluster is allocated from a block group, we put the
* cluster onto a list in the block group so that it can
@@ -476,8 +478,8 @@ struct btrfs_swapfile_pin {
void *ptr;
struct inode *inode;
/*
- * If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
- * ptr points to a struct btrfs_device.
+ * If true, ptr points to a struct btrfs_block_group. Otherwise, ptr
+ * points to a struct btrfs_device.
*/
bool is_block_group;
};
@@ -722,7 +724,6 @@ struct btrfs_fs_info {
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
- struct btrfs_workqueue *submit_workers;
struct btrfs_workqueue *caching_workers;
struct btrfs_workqueue *readahead_workers;
@@ -1519,18 +1520,18 @@ static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
}
/* struct btrfs_block_group_item */
-BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item,
used, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+BTRFS_SETGET_FUNCS(block_group_chunk_objectid,
struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags,
+BTRFS_SETGET_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
struct btrfs_block_group_item, flags, 64);
/* struct btrfs_free_space_info */
@@ -2163,6 +2164,9 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
+const char *btrfs_super_csum_driver(u16 csum_type);
+size_t __const btrfs_get_num_csums(void);
+
/*
* The leaf data grows from end-to-front in the node.
@@ -2397,7 +2401,7 @@ static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
-void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache);
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
unsigned long count);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
@@ -2453,8 +2457,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_get_block_group_trimming(struct btrfs_block_group *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group *cache);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
enum btrfs_reserve_flush_enum {
@@ -2507,7 +2511,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
int level, int *slot);
-int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
+int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
@@ -2567,8 +2571,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
-void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int slot, int nr);
@@ -2870,10 +2872,9 @@ int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path);
+ struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *was_new);
+ struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end, int create);
@@ -2909,7 +2910,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
-int btrfs_is_empty_uuid(u8 *uuid);
+int __pure btrfs_is_empty_uuid(u8 *uuid);
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_pages);
@@ -3143,7 +3144,7 @@ __cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
-const char *btrfs_decode_error(int errno);
+const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index db9f2c58eb4a..4cdac4d834f5 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -307,7 +307,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
- bool delalloc_lock = true;
/*
* If we are a free space inode we need to not flush since we will be in
@@ -320,7 +319,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
- delalloc_lock = false;
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
@@ -329,9 +327,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
schedule_timeout(1);
}
- if (delalloc_lock)
- mutex_lock(&inode->delalloc_mutex);
-
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
/*
@@ -348,10 +343,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
&qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
- goto out_fail;
+ return ret;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
- if (ret)
- goto out_qgroup;
+ if (ret) {
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
+ return ret;
+ }
/*
* Now we need to update our outstanding extents and csum bytes _first_
@@ -375,15 +372,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
block_rsv->qgroup_rsv_reserved += qgroup_reserve;
spin_unlock(&block_rsv->lock);
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
return 0;
-out_qgroup:
- btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
-out_fail:
- if (delalloc_lock)
- mutex_unlock(&inode->delalloc_mutex);
- return ret;
}
/**
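With the goto labels removed in this hunk, the qgroup reservation is taken first and released inline if the later metadata reservation fails. A tiny standalone model of that take-second-or-roll-back-first pattern (pool names and budgets are illustrative):

#include <stdio.h>

static int budget_a = 1, budget_b = 0; /* pretend the second pool is exhausted */

static int reserve_a(void)  { if (!budget_a) return -1; budget_a--; return 0; }
static void release_a(void) { budget_a++; }
static int reserve_b(void)  { if (!budget_b) return -1; budget_b--; return 0; }

static int reserve_both(void)
{
    int ret = reserve_a();

    if (ret)
        return ret;
    ret = reserve_b();
    if (ret) {
        release_a();    /* undo the first reservation before bailing out */
        return ret;
    }
    return 0;
}

int main(void)
{
    printf("reserve_both: %d (budget_a back to %d)\n", reserve_both(), budget_a);
    return 0;
}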
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 1f7f39b10bd0..d3e15e1d4a91 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -12,6 +12,7 @@
#include "transaction.h"
#include "ctree.h"
#include "qgroup.h"
+#include "locking.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -1367,8 +1368,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
- btrfs_async_run_delayed_root, NULL, NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
+ NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1949,12 +1950,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
-
- for (i = 0; i < n; i++)
- refcount_inc(&delayed_nodes[i]->refs);
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
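The loop above only takes a reference on a delayed node when its refcount is still non-zero, so a node already on its way to being freed is skipped instead of being resurrected. A standalone C11 model of that take-a-reference-only-if-alive check:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Increment the refcount unless it has already dropped to zero. */
static bool ref_get_not_zero(atomic_int *refs)
{
    int old = atomic_load(refs);

    while (old != 0) {
        if (atomic_compare_exchange_weak(refs, &old, old + 1))
            return true;    /* we now hold a reference */
        /* old was reloaded by the failed CAS; retry */
    }
    return false;           /* object is dying, leave it alone */
}

int main(void)
{
    atomic_int live = 1, dying = 0;

    printf("live:  %s\n", ref_get_not_zero(&live) ? "got ref" : "skipped");
    printf("dying: %s\n", ref_get_not_zero(&dying) ? "got ref" : "skipped");
    return 0;
}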
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 48890826b5e6..f639dde2a679 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -986,7 +986,7 @@ static int btrfs_dev_replace_kthread(void *data)
return 0;
}
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
return 0;
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 78c5d8f1adda..60b70dacc299 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -17,6 +17,6 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 402b61bf345c..e0edfdc9c82b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -205,7 +205,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
int ret;
@@ -213,7 +212,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
goto out;
}
@@ -228,7 +226,6 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
em->block_start = 0;
- em->bdev = fs_info->fs_devices->latest_bdev;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
@@ -352,6 +349,9 @@ static bool btrfs_supported_super_csum(u16 csum_type)
{
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
+ case BTRFS_CSUM_TYPE_XXHASH:
+ case BTRFS_CSUM_TYPE_SHA256:
+ case BTRFS_CSUM_TYPE_BLAKE2:
return true;
default:
return false;
@@ -545,9 +545,11 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
ret = btrfs_check_leaf_full(eb);
if (ret < 0) {
+ btrfs_print_tree(eb, 0);
btrfs_err(fs_info,
"block=%llu write time tree block corruption detected",
eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
write_extent_buffer(eb, result, 0, csum_size);
@@ -608,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
/* the pending IO might have been the only thing that kept this buffer
* in memory. Make sure we have a ref for all this other checks
*/
- extent_buffer_get(eb);
+ atomic_inc(&eb->refs);
reads_done = atomic_dec_and_test(&eb->io_pages);
if (!reads_done)
@@ -706,43 +708,31 @@ static void end_workqueue_bio(struct bio *bio)
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
if (bio_op(bio) == REQ_OP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
- func = btrfs_endio_meta_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
} else {
- if (unlikely(end_io_wq->metadata ==
- BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
wq = fs_info->endio_repair_workers;
- func = btrfs_endio_repair_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else if (end_io_wq->metadata) {
+ else if (end_io_wq->metadata)
wq = fs_info->endio_meta_workers;
- func = btrfs_endio_meta_helper;
- } else {
+ else
wq = fs_info->endio_workers;
- func = btrfs_endio_helper;
- }
}
- btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
btrfs_queue_work(wq, &end_io_wq->work);
}
@@ -803,8 +793,13 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio,
- async->mirror_num, 1);
+ /*
+ * All of the bios that pass through here are from async helpers.
+ * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
+ * This changes nothing when cgroups aren't in use.
+ */
+ async->bio->bi_opf |= REQ_CGROUP_PUNT;
+ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
if (ret) {
async->bio->bi_status = ret;
bio_endio(async->bio);
@@ -835,8 +830,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
- btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
- run_one_async_done, run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+ run_one_async_free);
async->bio_offset = bio_offset;
@@ -904,12 +899,12 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!async) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
@@ -1657,8 +1652,8 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio(bio);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}
static int cleaner_kthread(void *arg)
@@ -1753,7 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = ktime_get_seconds();
- if (cur->state < TRANS_STATE_BLOCKED &&
+ if (cur->state < TRANS_STATE_COMMIT_START &&
!test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
@@ -1792,18 +1787,18 @@ sleep:
}
/*
- * this will find the highest generation in the array of
- * root backups. The index of the highest array is returned,
- * or -1 if we can't find anything.
+ * This will find the highest generation in the array of root backups. The
+ * index of the highest entry is returned, or -EINVAL if we can't find
+ * anything.
*
* We check to make sure the array is valid by comparing the
* generation of the latest root in the array with the generation
* in the super block. If they don't match we pitch it.
*/
-static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+static int find_newest_super_backup(struct btrfs_fs_info *info)
{
+ const u64 newest_gen = btrfs_super_generation(info->super_copy);
u64 cur;
- int newest_index = -1;
struct btrfs_root_backup *root_backup;
int i;
@@ -1811,37 +1806,10 @@ static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
root_backup = info->super_copy->super_roots + i;
cur = btrfs_backup_tree_root_gen(root_backup);
if (cur == newest_gen)
- newest_index = i;
+ return i;
}
- /* check to see if we actually wrapped around */
- if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
- root_backup = info->super_copy->super_roots;
- cur = btrfs_backup_tree_root_gen(root_backup);
- if (cur == newest_gen)
- newest_index = 0;
- }
- return newest_index;
-}
-
-
-/*
- * find the oldest backup so we know where to store new entries
- * in the backup array. This will set the backup_root_index
- * field in the fs_info struct
- */
-static void find_oldest_super_backup(struct btrfs_fs_info *info,
- u64 newest_gen)
-{
- int newest_index = -1;
-
- newest_index = find_newest_super_backup(info, newest_gen);
- /* if there was garbage in there, just move along */
- if (newest_index == -1) {
- info->backup_root_index = 0;
- } else {
- info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
- }
+ return -EINVAL;
}
/*
@@ -1851,22 +1819,8 @@ static void find_oldest_super_backup(struct btrfs_fs_info *info,
*/
static void backup_super_roots(struct btrfs_fs_info *info)
{
- int next_backup;
+ const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
- int last_backup;
-
- next_backup = info->backup_root_index;
- last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
-
- /*
- * just overwrite the last backup if we're at the same generation
- * this happens only at umount
- */
- root_backup = info->super_for_commit->super_roots + last_backup;
- if (btrfs_backup_tree_root_gen(root_backup) ==
- btrfs_header_generation(info->tree_root->node))
- next_backup = last_backup;
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -1939,40 +1893,31 @@ static void backup_super_roots(struct btrfs_fs_info *info)
}
/*
- * this copies info out of the root backup array and back into
- * the in-memory super block. It is meant to help iterate through
- * the array, so you send it the number of backups you've already
- * tried and the last backup index you used.
+ * read_backup_root - Reads a backup root based on the passed priority. Prio 0
+ * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
*
- * this returns -1 when it has tried all the backups
+ * fs_info - filesystem whose backup roots need to be read
+ * priority - priority of backup root required
+ *
+ * Returns backup root index on success and -EINVAL otherwise.
*/
-static noinline int next_root_backup(struct btrfs_fs_info *info,
- struct btrfs_super_block *super,
- int *num_backups_tried, int *backup_index)
+static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *super = fs_info->super_copy;
struct btrfs_root_backup *root_backup;
- int newest = *backup_index;
-
- if (*num_backups_tried == 0) {
- u64 gen = btrfs_super_generation(super);
- newest = find_newest_super_backup(info, gen);
- if (newest == -1)
- return -1;
+ if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
+ if (priority == 0)
+ return backup_index;
- *backup_index = newest;
- *num_backups_tried = 1;
- } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
- /* we've tried all the backups, all done */
- return -1;
+ backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
+ backup_index %= BTRFS_NUM_BACKUP_ROOTS;
} else {
- /* jump to the next oldest backup */
- newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
- BTRFS_NUM_BACKUP_ROOTS;
- *backup_index = newest;
- *num_backups_tried += 1;
+ return -EINVAL;
}
- root_backup = super->super_roots + newest;
+
+ root_backup = super->super_roots + backup_index;
btrfs_set_super_generation(super,
btrfs_backup_tree_root_gen(root_backup));
@@ -1982,12 +1927,13 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
/*
- * fixme: the total bytes and num_devices need to match or we should
+ * Fixme: the total bytes and num_devices need to match or we should
* need a fsck
*/
btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
- return 0;
+
+ return backup_index;
}
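read_backup_root() above turns a priority (0 = newest) into a slot of the circular backup array by walking backwards from the newest index modulo the array size. The same arithmetic in standalone form, assuming the usual four-slot ring:

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4

/* Priority 0 is the newest backup, 1..3 are progressively older ones. */
static int backup_slot(int newest_index, int priority)
{
    if (priority == 0)
        return newest_index;
    return (newest_index + NUM_BACKUP_ROOTS - priority) % NUM_BACKUP_ROOTS;
}

int main(void)
{
    for (int prio = 0; prio < NUM_BACKUP_ROOTS; prio++)
        printf("newest=1 priority=%d -> slot %d\n", prio, backup_slot(1, prio));
    return 0;
}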
/* helper to cleanup workers */
@@ -2002,7 +1948,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
- btrfs_destroy_workqueue(fs_info->submit_workers);
btrfs_destroy_workqueue(fs_info->delayed_workers);
btrfs_destroy_workqueue(fs_info->caching_workers);
btrfs_destroy_workqueue(fs_info->readahead_workers);
@@ -2028,7 +1973,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
/* helper to cleanup tree roots */
-static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
@@ -2037,7 +1982,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
- if (chunk_root)
+ if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
@@ -2167,16 +2112,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
fs_info->caching_workers =
btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
- /*
- * a higher idle thresh on the submit workers makes it much more
- * likely that bios will be send down in a sane order to the
- * devices
- */
- fs_info->submit_workers =
- btrfs_alloc_workqueue(fs_info, "submit", flags,
- min_t(u64, fs_devices->num_devices,
- max_active), 64);
-
fs_info->fixup_workers =
btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
@@ -2215,7 +2150,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
if (!(fs_info->workers && fs_info->delalloc_workers &&
- fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_repair_workers &&
@@ -2233,13 +2168,13 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
struct crypto_shash *csum_shash;
- const char *csum_name = btrfs_super_csum_name(csum_type);
+ const char *csum_driver = btrfs_super_csum_driver(csum_type);
- csum_shash = crypto_alloc_shash(csum_name, 0, 0);
+ csum_shash = crypto_alloc_shash(csum_driver, 0, 0);
if (IS_ERR(csum_shash)) {
btrfs_err(fs_info, "error allocating %s hash for checksum",
- csum_name);
+ csum_driver);
return PTR_ERR(csum_shash);
}
@@ -2589,7 +2524,101 @@ out:
return ret;
}
-int open_ctree(struct super_block *sb,
+static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
+{
+ int backup_index = find_newest_super_backup(fs_info);
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ bool handle_error = false;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ u64 generation;
+ int level;
+
+ if (handle_error) {
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
+ tree_root->node = NULL;
+
+ if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
+ break;
+
+ free_root_pointers(fs_info, 0);
+
+ /*
+ * Don't use the log in recovery mode, it won't be
+ * valid
+ */
+ btrfs_set_super_log_root(sb, 0);
+
+ /* We can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = read_backup_root(fs_info, i);
+ backup_index = ret;
+ if (ret < 0)
+ return ret;
+ }
+ generation = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
+ generation, level, NULL);
+ if (IS_ERR(tree_root->node) ||
+ !extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+
+ if (IS_ERR(tree_root->node))
+ ret = PTR_ERR(tree_root->node);
+ else if (!extent_buffer_uptodate(tree_root->node))
+ ret = -EUCLEAN;
+
+ btrfs_warn(fs_info, "failed to read tree root");
+ continue;
+ }
+
+ btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+ tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
+
+ /*
+ * No need to hold btrfs_root::objectid_mutex since the fs
+ * hasn't been fully initialised and we are the only user
+ */
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ ret = btrfs_read_roots(fs_info);
+ if (ret < 0) {
+ handle_error = true;
+ continue;
+ }
+
+ /* All successful */
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+
+ /* Always begin writing backup roots after the one being used */
+ if (backup_index < 0) {
+ fs_info->backup_root_index = 0;
+ } else {
+ fs_info->backup_root_index = backup_index + 1;
+ fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
+ }
+ break;
+ }
+
+ return ret;
+}
+
+int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
{
@@ -2607,8 +2636,6 @@ int open_ctree(struct super_block *sb,
struct btrfs_root *chunk_root;
int ret;
int err = -EINVAL;
- int num_backups_tried = 0;
- int backup_index = 0;
int clear_free_space_tree = 0;
int level;
@@ -2879,13 +2906,6 @@ int open_ctree(struct super_block *sb,
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/*
- * run through our array of backup supers and setup
- * our ring pointer to the oldest one
- */
- generation = btrfs_super_generation(disk_super);
- find_oldest_super_backup(fs_info, generation);
-
- /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -3031,44 +3051,9 @@ int open_ctree(struct super_block *sb,
goto fail_tree_roots;
}
-retry_root_backup:
- generation = btrfs_super_generation(disk_super);
- level = btrfs_super_root_level(disk_super);
-
- tree_root->node = read_tree_block(fs_info,
- btrfs_super_root(disk_super),
- generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
- btrfs_warn(fs_info, "failed to read tree root");
- if (!IS_ERR(tree_root->node))
- free_extent_buffer(tree_root->node);
- tree_root->node = NULL;
- goto recovery_tree_root;
- }
-
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
- mutex_lock(&tree_root->objectid_mutex);
- ret = btrfs_find_highest_objectid(tree_root,
- &tree_root->highest_objectid);
- if (ret) {
- mutex_unlock(&tree_root->objectid_mutex);
- goto recovery_tree_root;
- }
-
- ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
-
- mutex_unlock(&tree_root->objectid_mutex);
-
- ret = btrfs_read_roots(fs_info);
+ ret = init_tree_roots(fs_info);
if (ret)
- goto recovery_tree_root;
-
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ goto fail_tree_roots;
ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
@@ -3336,7 +3321,7 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
@@ -3363,24 +3348,6 @@ fail:
btrfs_free_stripe_hash_table(fs_info);
btrfs_close_devices(fs_info->fs_devices);
return err;
-
-recovery_tree_root:
- if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
- goto fail_tree_roots;
-
- free_root_pointers(fs_info, 0);
-
- /* don't use the log in recovery mode, it won't be valid */
- btrfs_set_super_log_root(disk_super, 0);
-
- /* we can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
- ret = next_root_backup(fs_info, fs_info->super_copy,
- &num_backups_tried, &backup_index);
- if (ret == -1)
- goto fail_block_groups;
- goto retry_root_backup;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
@@ -3974,7 +3941,7 @@ int btrfs_commit_super(struct btrfs_fs_info *fs_info)
return btrfs_commit_transaction(trans);
}
-void close_ctree(struct btrfs_fs_info *fs_info)
+void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
int ret;
@@ -4062,7 +4029,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
btrfs_free_block_groups(fs_info);
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
iput(fs_info->btree_inode);
@@ -4439,7 +4406,7 @@ again:
return 0;
}
-static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
+static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
struct inode *inode;
@@ -4456,12 +4423,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
struct btrfs_fs_info *fs_info)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
dirty_list);
if (!list_empty(&cache->io_list)) {
@@ -4489,7 +4456,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
*/
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
io_list);
list_del_init(&cache->io_list);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a6958103d87e..76f123ebb292 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -49,10 +49,10 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr);
void btrfs_clean_tree_block(struct extent_buffer *buf);
-int open_ctree(struct super_block *sb,
+int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
-void close_ctree(struct btrfs_fs_info *fs_info);
+void __cold close_ctree(struct btrfs_fs_info *fs_info);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ddf28ecf17f9..72e312cae69d 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -87,7 +87,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, root, NULL);
+ inode = btrfs_iget(sb, &key, root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail;
@@ -214,7 +214,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL));
+ return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root));
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
new file mode 100644
index 000000000000..a3febe746c79
--- /dev/null
+++ b/fs/btrfs/extent-io-tree.h
@@ -0,0 +1,248 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_EXTENT_IO_TREE_H
+#define BTRFS_EXTENT_IO_TREE_H
+
+struct extent_changeset;
+struct io_failure_record;
+
+/* Bits for the extent state */
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_UPTODATE (1U << 1)
+#define EXTENT_LOCKED (1U << 2)
+#define EXTENT_NEW (1U << 3)
+#define EXTENT_DELALLOC (1U << 4)
+#define EXTENT_DEFRAG (1U << 5)
+#define EXTENT_BOUNDARY (1U << 6)
+#define EXTENT_NODATASUM (1U << 7)
+#define EXTENT_CLEAR_META_RESV (1U << 8)
+#define EXTENT_NEED_WAIT (1U << 9)
+#define EXTENT_DAMAGED (1U << 10)
+#define EXTENT_NORESERVE (1U << 11)
+#define EXTENT_QGROUP_RESERVED (1U << 12)
+#define EXTENT_CLEAR_DATA_RESV (1U << 13)
+#define EXTENT_DELALLOC_NEW (1U << 14)
+#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
+ EXTENT_CLEAR_DATA_RESV)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
+
+/*
+ * Redefined bits above which are used only in the device allocation tree,
+ * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
+ * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
+ * manipulation functions
+ */
+#define CHUNK_ALLOCATED EXTENT_DIRTY
+#define CHUNK_TRIMMED EXTENT_DEFRAG
+
+enum {
+ IO_TREE_FS_INFO_FREED_EXTENTS0,
+ IO_TREE_FS_INFO_FREED_EXTENTS1,
+ IO_TREE_INODE_IO,
+ IO_TREE_INODE_IO_FAILURE,
+ IO_TREE_RELOC_BLOCKS,
+ IO_TREE_TRANS_DIRTY_PAGES,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES,
+ IO_TREE_SELFTEST,
+};
+
+struct extent_io_tree {
+ struct rb_root state;
+ struct btrfs_fs_info *fs_info;
+ void *private_data;
+ u64 dirty_bytes;
+ bool track_uptodate;
+
+ /* Who owns this io tree, should be one of IO_TREE_* */
+ u8 owner;
+
+ spinlock_t lock;
+ const struct extent_io_ops *ops;
+};
+
+struct extent_state {
+ u64 start;
+ u64 end; /* inclusive */
+ struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
+ wait_queue_head_t wq;
+ refcount_t refs;
+ unsigned state;
+
+ struct io_failure_record *failrec;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct list_head leak_list;
+#endif
+};
+
+int __init extent_state_cache_init(void);
+void __cold extent_state_cache_exit(void);
+
+void extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner,
+ void *private_data);
+void extent_io_tree_release(struct extent_io_tree *tree);
+
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
+
+int __init extent_io_init(void);
+void __cold extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, unsigned bits, int contig);
+
+void free_extent_state(struct extent_state *state);
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int filled,
+ struct extent_state *cached_state);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached);
+int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask,
+ struct extent_changeset *changeset);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_NOFS, NULL);
+}
+
+static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
+ u64 start, u64 end, struct extent_state **cached)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ GFP_ATOMIC, NULL);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
+}
+
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, struct extent_changeset *changeset);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
+int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, GFP_NOFS, NULL);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, cached);
+}
+
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, unsigned clear_bits,
+ struct extent_state **cached_state);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned int extra_bits,
+ struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, GFP_NOFS);
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
+ GFP_NOFS);
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
+ struct extent_state **cached_state);
+void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, unsigned bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+ struct page *page, unsigned long offset);
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state);
+
+/* This should be reworked in the future and put elsewhere. */
+int get_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record **failrec);
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec);
+void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
+ u64 end);
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+ struct io_failure_record **failrec_ret);
+int free_io_failure(struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree,
+ struct io_failure_record *rec);
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *failure_tree,
+ struct extent_io_tree *io_tree, u64 start,
+ struct page *page, u64 ino, unsigned int pg_offset);
+
+#endif /* BTRFS_EXTENT_IO_TREE_H */
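
The declarations above follow one consistent calling pattern: lock a byte range in an io tree, query or modify its bits, then unlock the range, optionally threading a cached extent_state through the calls so the second lookup can skip the rbtree walk. The sketch below illustrates that pattern with a hypothetical helper; the function name and the EXTENT_UPTODATE check are only an example of the API usage and are not part of this patch.

/*
 * Hypothetical example of the lock/test/unlock pattern built from the
 * helpers declared in extent-io-tree.h (assumes the usual btrfs includes).
 */
static bool range_is_uptodate(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct extent_state *cached = NULL;
	bool uptodate;

	/* Lock [start, end] and remember the matching extent_state. */
	lock_extent_bits(tree, start, end, &cached);

	/* filled == 1: every byte of the range must carry the bit. */
	uptodate = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, cached);

	/* Unlock the range, handing the cached state back to the tree code. */
	unlock_extent_cached(tree, start, end, &cached);

	return uptodate;
}
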
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49cb26fa7c63..153f71a5bba9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -54,7 +54,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
@@ -70,13 +70,13 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
return 0;
}
-void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
+void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 start, end;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
clear_extent_bits(&fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE);
@@ -1306,8 +1306,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
- int ret;
+ int ret = 0;
u64 discarded_bytes = 0;
+ u64 end = bytenr + num_bytes;
+ u64 cur = bytenr;
struct btrfs_bio *bbio = NULL;
@@ -1316,15 +1318,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
- &bbio, 0);
- /* Error condition is -ENOMEM */
- if (!ret) {
- struct btrfs_bio_stripe *stripe = bbio->stripes;
+ while (cur < end) {
+ struct btrfs_bio_stripe *stripe;
int i;
+ num_bytes = end - cur;
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
+ &num_bytes, &bbio, 0);
+ /*
+ * The error can be -ENOMEM, -ENOENT (no such chunk mapping) or
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
+ * so we can't continue anyway.
+ */
+ if (ret < 0)
+ goto out;
+ stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
struct request_queue *req_q;
@@ -1341,10 +1351,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe->physical,
stripe->length,
&bytes);
- if (!ret)
+ if (!ret) {
discarded_bytes += bytes;
- else if (ret != -EOPNOTSUPP)
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+ } else if (ret != -EOPNOTSUPP) {
+ /*
+ * Logic errors, -ENOMEM or -EIO, all of which are
+ * unlikely to happen.
+ *
+ * Since there are two nested loops, jump to out
+ * explicitly to avoid confusion.
+ */
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -1354,7 +1373,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
ret = 0;
}
btrfs_put_bbio(bbio);
+ cur += num_bytes;
}
+out:
btrfs_bio_counter_dec(fs_info);
if (actual_bytes)
@@ -2516,7 +2537,7 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int readonly = 0;
block_group = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2546,7 +2567,7 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytenr;
spin_lock(&fs_info->block_group_cache_lock);
@@ -2560,13 +2581,13 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
if (!cache)
return 0;
- bytenr = cache->key.objectid;
+ bytenr = cache->start;
btrfs_put_block_group(cache);
return bytenr;
}
-static int pin_down_extent(struct btrfs_block_group_cache *cache,
+static int pin_down_extent(struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -2590,13 +2611,12 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache,
return 0;
}
-/*
- * this function must be called within transaction
- */
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
+
+ ASSERT(fs_info->running_transaction);
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache); /* Logic error */
@@ -2613,7 +2633,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret;
cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2640,7 +2660,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes)
{
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_caching_control *caching_ctl;
block_group = btrfs_lookup_block_group(fs_info, start);
@@ -2652,7 +2672,7 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
if (!caching_ctl) {
/* Logic error */
- BUG_ON(!btrfs_block_group_cache_done(block_group));
+ BUG_ON(!btrfs_block_group_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
} else {
mutex_lock(&caching_ctl->mutex);
@@ -2717,7 +2737,7 @@ int btrfs_exclude_logged_extents(struct extent_buffer *eb)
}
static void
-btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
+btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
{
atomic_inc(&bg->reservations);
}
@@ -2726,14 +2746,14 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
cache = caching_ctl->block_group;
- if (btrfs_block_group_cache_done(cache)) {
+ if (btrfs_block_group_done(cache)) {
cache->last_byte_to_unpin = (u64)-1;
list_del_init(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
@@ -2785,7 +2805,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end,
const bool return_free_space)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
@@ -2797,7 +2817,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
while (start <= end) {
readonly = false;
if (!cache ||
- start >= cache->key.objectid + cache->key.offset) {
+ start >= cache->start + cache->length) {
if (cache)
btrfs_put_block_group(cache);
total_unpinned = 0;
@@ -2810,7 +2830,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
empty_cluster <<= 1;
}
- len = cache->key.objectid + cache->key.offset - start;
+ len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
if (start < cache->last_byte_to_unpin) {
@@ -2880,7 +2900,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
@@ -2926,8 +2946,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = -EROFS;
if (!trans->aborted)
ret = btrfs_discard_extent(fs_info,
- block_group->key.objectid,
- block_group->key.offset,
+ block_group->start,
+ block_group->length,
&trimmed);
list_del_init(&block_group->bg_list);
@@ -3262,7 +3282,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
}
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
@@ -3349,15 +3369,14 @@ enum btrfs_loop_type {
};
static inline void
-btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+btrfs_lock_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
down_read(&cache->data_rwsem);
}
-static inline void
-btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
int delalloc)
{
btrfs_get_block_group(cache);
@@ -3365,12 +3384,12 @@ btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
down_read(&cache->data_rwsem);
}
-static struct btrfs_block_group_cache *
-btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+static struct btrfs_block_group *btrfs_lock_cluster(
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
{
- struct btrfs_block_group_cache *used_bg = NULL;
+ struct btrfs_block_group *used_bg = NULL;
spin_lock(&cluster->refill_lock);
while (1) {
@@ -3404,7 +3423,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
}
static inline void
-btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+btrfs_release_block_group(struct btrfs_block_group *cache,
int delalloc)
{
if (delalloc)
@@ -3475,12 +3494,12 @@ struct find_free_extent_ctl {
* Return >0 to inform the caller that we found nothing
* Return 0 means we have found a location and set ffe_ctl->found_offset.
*/
-static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
+static int find_free_extent_clustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group_cache **cluster_bg_ret)
+ struct btrfs_block_group **cluster_bg_ret)
{
- struct btrfs_block_group_cache *cluster_bg;
+ struct btrfs_block_group *cluster_bg;
u64 aligned_cluster;
u64 offset;
int ret;
@@ -3493,7 +3512,7 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
- ffe_ctl->num_bytes, cluster_bg->key.objectid,
+ ffe_ctl->num_bytes, cluster_bg->start,
&ffe_ctl->max_extent_size);
if (offset) {
/* We have a block, we're done */
@@ -3579,7 +3598,7 @@ refill_cluster:
* Return 0 when we found a free extent and set ffe_ctl->found_offset
* Return -EAGAIN to inform caller that we need to re-search this block group
*/
-static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
+static int find_free_extent_unclustered(struct btrfs_block_group *bg,
struct btrfs_free_cluster *last_ptr,
struct find_free_extent_ctl *ffe_ctl)
{
@@ -3781,7 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
{
int ret = 0;
struct btrfs_free_cluster *last_ptr = NULL;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
struct btrfs_space_info *space_info;
bool use_cluster = true;
@@ -3904,7 +3923,7 @@ search:
continue;
btrfs_grab_block_group(block_group, delalloc);
- ffe_ctl.search_start = block_group->key.objectid;
+ ffe_ctl.search_start = block_group->start;
/*
* this can happen if we end up cycling through all the
@@ -3935,7 +3954,7 @@ search:
}
have_block_group:
- ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
+ ffe_ctl.cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
ret = btrfs_cache_block_group(block_group, 0);
@@ -3951,7 +3970,7 @@ have_block_group:
* let's look there
*/
if (last_ptr && use_cluster) {
- struct btrfs_block_group_cache *cluster_bg = NULL;
+ struct btrfs_block_group *cluster_bg = NULL;
ret = find_free_extent_clustered(block_group, last_ptr,
&ffe_ctl, &cluster_bg);
@@ -3984,7 +4003,7 @@ checks:
/* move on to the next group */
if (ffe_ctl.search_start + num_bytes >
- block_group->key.objectid + block_group->key.offset) {
+ block_group->start + block_group->length) {
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
num_bytes);
goto loop;
@@ -4133,7 +4152,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len,
int pin, int delalloc)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int ret = 0;
cache = btrfs_lookup_block_group(fs_info, start);
@@ -4366,7 +4385,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
/*
@@ -5436,7 +5455,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
- extent_buffer_get(parent);
+ atomic_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -5480,7 +5499,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
@@ -5498,9 +5517,8 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
}
factor = btrfs_bg_type_to_factor(block_group->flags);
- free_bytes += (block_group->key.offset -
- btrfs_block_group_used(&block_group->item)) *
- factor;
+ free_bytes += (block_group->length -
+ block_group->used) * factor;
spin_unlock(&block_group->lock);
}
@@ -5623,7 +5641,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
*/
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_device *device;
struct list_head *devices;
u64 group_trimmed;
@@ -5647,16 +5665,16 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
cache = btrfs_lookup_first_block_group(fs_info, range->start);
for (; cache; cache = btrfs_next_block_group(cache)) {
- if (cache->key.objectid >= range_end) {
+ if (cache->start >= range_end) {
btrfs_put_block_group(cache);
break;
}
- start = max(range->start, cache->key.objectid);
- end = min(range_end, cache->key.objectid + cache->key.offset);
+ start = max(range->start, cache->start);
+ end = min(range_end, cache->start + cache->length);
if (end - start >= range->minlen) {
- if (!btrfs_block_group_cache_done(cache)) {
+ if (!btrfs_block_group_done(cache)) {
ret = btrfs_cache_block_group(cache, 0);
if (ret) {
bg_failed++;
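
Throughout extent-tree.c the patch replaces the embedded key of a block group (key.objectid / key.offset) with the explicit start and length fields of the renamed struct btrfs_block_group. Range arithmetic against a block group therefore takes the shape sketched below; the helper name is hypothetical and only demonstrates the new field usage, it is not code from this patch.

/* Hypothetical helper: does [bytenr, bytenr + num_bytes) lie inside @bg? */
static bool range_in_block_group(const struct btrfs_block_group *bg,
				 u64 bytenr, u64 num_bytes)
{
	/* bg->start replaces key.objectid, bg->length replaces key.offset. */
	return bytenr >= bg->start &&
	       bytenr + num_bytes <= bg->start + bg->length;
}
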
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cceaf05aada2..eb8bd0258360 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
+#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
@@ -59,12 +60,23 @@ void btrfs_leak_debug_del(struct list_head *entry)
spin_unlock_irqrestore(&leak_lock, flags);
}
-static inline
-void btrfs_leak_debug_check(void)
+static inline void btrfs_extent_buffer_leak_debug_check(void)
{
- struct extent_state *state;
struct extent_buffer *eb;
+ while (!list_empty(&buffers)) {
+ eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+ pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
+ list_del(&eb->leak_list);
+ kmem_cache_free(extent_buffer_cache, eb);
+ }
+}
+
+static inline void btrfs_extent_state_leak_debug_check(void)
+{
+ struct extent_state *state;
+
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
@@ -74,14 +86,6 @@ void btrfs_leak_debug_check(void)
list_del(&state->leak_list);
kmem_cache_free(extent_state_cache, state);
}
-
- while (!list_empty(&buffers)) {
- eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
- list_del(&eb->leak_list);
- kmem_cache_free(extent_buffer_cache, eb);
- }
}
#define btrfs_debug_check_extent_io_range(tree, start, end) \
@@ -105,7 +109,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#else
#define btrfs_leak_debug_add(new, head) do {} while (0)
#define btrfs_leak_debug_del(entry) do {} while (0)
-#define btrfs_leak_debug_check() do {} while (0)
+#define btrfs_extent_buffer_leak_debug_check() do {} while (0)
+#define btrfs_extent_state_leak_debug_check() do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
@@ -196,19 +201,23 @@ static int __must_check flush_write_bio(struct extent_page_data *epd)
return ret;
}
-int __init extent_io_init(void)
+int __init extent_state_cache_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
+ return 0;
+}
+int __init extent_io_init(void)
+{
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
- goto free_state_cache;
+ return -ENOMEM;
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_io_bio, bio),
@@ -226,23 +235,24 @@ free_bioset:
free_buffer_cache:
kmem_cache_destroy(extent_buffer_cache);
extent_buffer_cache = NULL;
+ return -ENOMEM;
+}
-free_state_cache:
+void __cold extent_state_cache_exit(void)
+{
+ btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
- extent_state_cache = NULL;
- return -ENOMEM;
}
void __cold extent_io_exit(void)
{
- btrfs_leak_debug_check();
+ btrfs_extent_buffer_leak_debug_check();
/*
* Make sure all delayed RCU frees are flushed before we
* destroy caches.
*/
rcu_barrier();
- kmem_cache_destroy(extent_state_cache);
kmem_cache_destroy(extent_buffer_cache);
bioset_exit(&btrfs_bioset);
}
@@ -1676,9 +1686,9 @@ out:
*
* true is returned if we find something, false if nothing was in the tree
*/
-static noinline bool find_delalloc_range(struct extent_io_tree *tree,
- u64 *start, u64 *end, u64 max_bytes,
- struct extent_state **cached_state)
+bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
+ u64 *end, u64 max_bytes,
+ struct extent_state **cached_state)
{
struct rb_node *node;
struct extent_state *state;
@@ -1796,8 +1806,8 @@ again:
/* step one, find a bunch of delalloc bytes starting at start */
delalloc_start = *start;
delalloc_end = 0;
- found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
- max_bytes, &cached_state);
+ found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ max_bytes, &cached_state);
if (!found || delalloc_end <= *start) {
*start = delalloc_start;
*end = delalloc_end;
@@ -1899,7 +1909,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
- if (pages[i] == locked_page) {
+ if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
pages_locked++;
continue;
@@ -2014,8 +2024,8 @@ out:
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record *failrec)
+int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -2042,8 +2052,8 @@ out:
return ret;
}
-static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
- struct io_failure_record **failrec)
+int get_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record **failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -2534,7 +2544,6 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
bio = btrfs_io_bio_alloc(1);
bio->bi_end_io = endio_func;
bio->bi_iter.bi_sector = failrec->logical >> 9;
- bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
bio->bi_iter.bi_size = 0;
bio->bi_private = data;
@@ -2920,7 +2929,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
* a contiguous page to the previous one
* @size: portion of page that we want to write
* @offset: starting offset in the page
- * @bdev: attach newly created bios to this bdev
* @bio_ret: must be valid pointer, newly allocated bio will be stored there
* @end_io_func: end_io callback for new bio
* @mirror_num: desired mirror to read/write
@@ -2931,7 +2939,6 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct writeback_control *wbc,
struct page *page, u64 offset,
size_t size, unsigned long pg_offset,
- struct block_device *bdev,
struct bio **bio_ret,
bio_end_io_t end_io_func,
int mirror_num,
@@ -2977,13 +2984,16 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
}
bio = btrfs_bio_alloc(offset);
- bio_set_dev(bio, bdev);
bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
bio->bi_write_hint = page->mapping->host->i_write_hint;
bio->bi_opf = opf;
if (wbc) {
+ struct block_device *bdev;
+
+ bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
+ bio_set_dev(bio, bdev);
wbc_init_bio(wbc, bio);
wbc_account_cgroup_owner(wbc, page, page_size);
}
@@ -3065,7 +3075,6 @@ static int __do_readpage(struct extent_io_tree *tree,
u64 block_start;
u64 cur_end;
struct extent_map *em;
- struct block_device *bdev;
int ret = 0;
int nr = 0;
size_t pg_offset = 0;
@@ -3142,7 +3151,6 @@ static int __do_readpage(struct extent_io_tree *tree,
offset = em->block_start + extent_offset;
disk_io_size = iosize;
}
- bdev = em->bdev;
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
@@ -3232,7 +3240,7 @@ static int __do_readpage(struct extent_io_tree *tree,
ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
page, offset, disk_io_size,
- pg_offset, bdev, bio,
+ pg_offset, bio,
end_bio_extent_readpage, mirror_num,
*bio_flags,
this_bio_flag,
@@ -3409,7 +3417,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
struct extent_page_data *epd,
loff_t i_size,
unsigned long nr_written,
- unsigned int write_flags, int *nr_ret)
+ int *nr_ret)
{
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
@@ -3420,11 +3428,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
u64 block_start;
u64 iosize;
struct extent_map *em;
- struct block_device *bdev;
size_t pg_offset = 0;
size_t blocksize;
int ret = 0;
int nr = 0;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
bool compressed;
ret = btrfs_writepage_cow_fixup(page, start, page_end);
@@ -3478,7 +3486,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
offset = em->block_start + extent_offset;
- bdev = em->bdev;
block_start = em->block_start;
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
free_extent_map(em);
@@ -3520,7 +3527,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
page, offset, iosize, pg_offset,
- bdev, &epd->bio,
+ &epd->bio,
end_bio_extent_writepage,
0, 0, 0, false);
if (ret) {
@@ -3558,11 +3565,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
size_t pg_offset = 0;
loff_t i_size = i_size_read(inode);
unsigned long end_index = i_size >> PAGE_SHIFT;
- unsigned int write_flags = 0;
unsigned long nr_written = 0;
- write_flags = wbc_to_write_flags(wbc);
-
trace___extent_writepage(page, inode, wbc);
WARN_ON(!PageLocked(page));
@@ -3600,7 +3604,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
ret = __extent_writepage_io(inode, page, wbc, epd,
- i_size, nr_written, write_flags, &nr);
+ i_size, nr_written, &nr);
if (ret == 1)
goto done_unlocked;
@@ -3849,7 +3853,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct block_device *bdev = fs_info->fs_devices->latest_bdev;
struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
u32 nritems;
@@ -3884,7 +3887,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
- p, offset, PAGE_SIZE, 0, bdev,
+ p, offset, PAGE_SIZE, 0,
&epd->bio,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
@@ -4121,7 +4124,7 @@ retry:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- done_index = page->index;
+ done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -4156,16 +4159,6 @@ retry:
ret = __extent_writepage(page, wbc, epd);
if (ret < 0) {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
done = 1;
break;
}
@@ -4240,8 +4233,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
.nr_to_write = nr_pages * 2,
.range_start = start,
.range_end = end + 1,
+ /* We're called from an async helper function */
+ .punt_to_cgroup = 1,
+ .no_cgroup_owner = 1,
};
+ wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
while (start <= end) {
page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
@@ -4256,11 +4253,12 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
}
ASSERT(ret <= 0);
- if (ret < 0) {
+ if (ret == 0)
+ ret = flush_write_bio(&epd);
+ else
end_write_bio(&epd, ret);
- return ret;
- }
- ret = flush_write_bio(&epd);
+
+ wbc_detach_inode(&wbc_writepages);
return ret;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index cf3424d58fec..a8551a1f56e2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,35 +7,6 @@
#include <linux/refcount.h>
#include "ulist.h"
-/* bits for the extent state */
-#define EXTENT_DIRTY (1U << 0)
-#define EXTENT_UPTODATE (1U << 1)
-#define EXTENT_LOCKED (1U << 2)
-#define EXTENT_NEW (1U << 3)
-#define EXTENT_DELALLOC (1U << 4)
-#define EXTENT_DEFRAG (1U << 5)
-#define EXTENT_BOUNDARY (1U << 6)
-#define EXTENT_NODATASUM (1U << 7)
-#define EXTENT_CLEAR_META_RESV (1U << 8)
-#define EXTENT_NEED_WAIT (1U << 9)
-#define EXTENT_DAMAGED (1U << 10)
-#define EXTENT_NORESERVE (1U << 11)
-#define EXTENT_QGROUP_RESERVED (1U << 12)
-#define EXTENT_CLEAR_DATA_RESV (1U << 13)
-#define EXTENT_DELALLOC_NEW (1U << 14)
-#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
- EXTENT_CLEAR_DATA_RESV)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING)
-
-/*
- * Redefined bits above which are used only in the device allocation tree,
- * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
- * / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
- * manipulation functions
- */
-#define CHUNK_ALLOCATED EXTENT_DIRTY
-#define CHUNK_TRIMMED EXTENT_DEFRAG
-
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
@@ -89,12 +60,11 @@ enum {
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
-struct extent_state;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
-
+struct extent_io_tree;
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
@@ -111,47 +81,6 @@ struct extent_io_ops {
int mirror);
};
-enum {
- IO_TREE_FS_INFO_FREED_EXTENTS0,
- IO_TREE_FS_INFO_FREED_EXTENTS1,
- IO_TREE_INODE_IO,
- IO_TREE_INODE_IO_FAILURE,
- IO_TREE_RELOC_BLOCKS,
- IO_TREE_TRANS_DIRTY_PAGES,
- IO_TREE_ROOT_DIRTY_LOG_PAGES,
- IO_TREE_SELFTEST,
-};
-
-struct extent_io_tree {
- struct rb_root state;
- struct btrfs_fs_info *fs_info;
- void *private_data;
- u64 dirty_bytes;
- bool track_uptodate;
-
- /* Who owns this io tree, should be one of IO_TREE_* */
- u8 owner;
-
- spinlock_t lock;
- const struct extent_io_ops *ops;
-};
-
-struct extent_state {
- u64 start;
- u64 end; /* inclusive */
- struct rb_node rb_node;
-
- /* ADD NEW ELEMENTS AFTER THIS */
- wait_queue_head_t wq;
- refcount_t refs;
- unsigned state;
-
- struct io_failure_record *failrec;
-
-#ifdef CONFIG_BTRFS_DEBUG
- struct list_head leak_list;
-#endif
-};
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
@@ -259,152 +188,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
u64 start, u64 len,
int create);
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner,
- void *private_data);
-void extent_io_tree_release(struct extent_io_tree *tree);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, NULL);
-}
-
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
-int __init extent_io_init(void);
-void __cold extent_io_exit(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, unsigned bits, int contig);
-
-void free_extent_state(struct extent_state *state);
-int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int filled,
- struct extent_state *cached_state);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached, gfp_t mask,
- struct extent_changeset *changeset);
-
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
-}
-
-static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_NOFS, NULL);
-}
-
-static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
- u64 start, u64 end, struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- GFP_ATOMIC, NULL);
-}
-
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
-}
-
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, u64 *failed_start,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits);
-
-static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned bits)
-{
- return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS);
-}
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, GFP_NOFS, NULL);
-}
-
-static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, cached);
-}
-
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, unsigned clear_bits,
- struct extent_state **cached_state);
-
-static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
- u64 end, unsigned int extra_bits,
- struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, GFP_NOFS);
-}
-
-static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
- u64 end)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL,
- GFP_NOFS);
-}
-
-static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
-
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits,
- struct extent_state **cached_state);
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -442,11 +230,6 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
(eb->start >> PAGE_SHIFT);
}
-static inline void extent_buffer_get(struct extent_buffer *eb)
-{
- atomic_inc(&eb->refs);
-}
-
static inline int extent_buffer_uptodate(struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -508,10 +291,6 @@ struct btrfs_inode;
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
-int clean_io_failure(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree, u64 start,
- struct page *page, u64 ino, unsigned int pg_offset);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
@@ -535,19 +314,12 @@ struct io_failure_record {
};
-void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
- u64 end);
-int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret);
bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
struct io_failure_record *failrec, int fail_mirror);
struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
struct io_failure_record *failrec,
struct page *page, int pg_offset, int icsum,
bio_end_io_t *endio_func, void *data);
-int free_io_failure(struct extent_io_tree *failure_tree,
- struct extent_io_tree *io_tree,
- struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
@@ -555,5 +327,4 @@ bool find_lock_delalloc_range(struct inode *inode,
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
-
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 9d30acca55e1..6f417ff68980 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -214,9 +214,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
prev->block_start != EXTENT_MAP_DELALLOC);
+ if (prev->map_lookup || next->map_lookup)
+ ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
+ test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
- prev->bdev == next->bdev &&
+ prev->map_lookup == next->map_lookup &&
((next->block_start == EXTENT_MAP_HOLE &&
prev->block_start == EXTENT_MAP_HOLE) ||
(next->block_start == EXTENT_MAP_INLINE &&
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 473f039fcd7c..8e217337dff9 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -42,15 +42,8 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- union {
- struct block_device *bdev;
-
- /*
- * used for chunk mappings
- * flags & EXTENT_FLAG_FS_MAPPING must be set
- */
- struct map_lookup *map_lookup;
- };
+ /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
+ struct map_lookup *map_lookup;
refcount_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1a599f50837b..3270a40b0777 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -945,7 +945,6 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
u8 type = btrfs_file_extent_type(leaf, fi);
int compress_type = btrfs_file_extent_compression(leaf, fi);
- em->bdev = fs_info->fs_devices->latest_bdev;
btrfs_item_key_to_cpu(leaf, &key, slot);
extent_start = key.offset;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 435a502a3226..0cb43b682789 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -296,7 +296,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.objectid = defrag->ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
@@ -667,7 +667,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
}
split->generation = gen;
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
@@ -680,7 +679,6 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
split->start = start + len;
split->len = em->start + em->len - (start + len);
- split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
split->generation = gen;
@@ -1636,6 +1634,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
@@ -1791,7 +1790,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -1903,9 +1901,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
(iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
return -EAGAIN;
+ } else {
inode_lock(inode);
}
@@ -2359,7 +2358,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
@@ -3350,29 +3348,30 @@ out:
return ret;
}
-static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
+static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+ int whence)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
+ loff_t i_size = inode->i_size;
u64 lockstart;
u64 lockend;
u64 start;
u64 len;
int ret = 0;
- if (inode->i_size == 0)
+ if (i_size == 0 || offset >= i_size)
return -ENXIO;
/*
- * *offset can be negative, in this case we start finding DATA/HOLE from
+ * offset can be negative; in this case we start finding DATA/HOLE from
* the very start of the file.
*/
- start = max_t(loff_t, 0, *offset);
+ start = max_t(loff_t, 0, offset);
lockstart = round_down(start, fs_info->sectorsize);
- lockend = round_up(i_size_read(inode),
- fs_info->sectorsize);
+ lockend = round_up(i_size, fs_info->sectorsize);
if (lockend <= lockstart)
lockend = lockstart + fs_info->sectorsize;
lockend--;
@@ -3381,7 +3380,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- while (start < inode->i_size) {
+ while (start < i_size) {
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
@@ -3404,46 +3403,39 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
cond_resched();
}
free_extent_map(em);
- if (!ret) {
- if (whence == SEEK_DATA && start >= inode->i_size)
- ret = -ENXIO;
- else
- *offset = min_t(loff_t, start, inode->i_size);
- }
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
- return ret;
+ if (ret) {
+ offset = ret;
+ } else {
+ if (whence == SEEK_DATA && start >= i_size)
+ offset = -ENXIO;
+ else
+ offset = min_t(loff_t, start, i_size);
+ }
+
+ return offset;
}
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- int ret;
- inode_lock(inode);
switch (whence) {
- case SEEK_END:
- case SEEK_CUR:
- offset = generic_file_llseek(file, offset, whence);
- goto out;
+ default:
+ return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
- if (offset >= i_size_read(inode)) {
- inode_unlock(inode);
- return -ENXIO;
- }
-
- ret = find_desired_extent(inode, &offset, whence);
- if (ret) {
- inode_unlock(inode);
- return ret;
- }
+ inode_lock_shared(inode);
+ offset = find_desired_extent(inode, offset, whence);
+ inode_unlock_shared(inode);
+ break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-out:
- inode_unlock(inode);
- return offset;
+ if (offset < 0)
+ return offset;
+
+ return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
static int btrfs_file_open(struct inode *inode, struct file *filp)
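
The llseek rework above folds error reporting into the loff_t return value: find_desired_extent() now hands back either the resolved offset or a negative errno, and the caller only checks the sign before passing the result to vfs_setpos(). A compact sketch of the same convention, with hypothetical function names and the extent-map walk elided, looks like this:

/* Sketch of the "offset or negative errno" convention; names are made up. */
static loff_t resolve_data_hole(struct inode *inode, loff_t offset, int whence)
{
	loff_t i_size = inode->i_size;

	if (i_size == 0 || offset >= i_size)
		return -ENXIO;			/* errno travels in the loff_t */

	/* ... walk the extent maps and advance offset as needed ... */
	return min_t(loff_t, offset, i_size);
}

static loff_t example_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	inode_lock_shared(inode);		/* lookups only need the shared lock */
	offset = resolve_data_hole(inode, offset, whence);
	inode_unlock_shared(inode);

	if (offset < 0)				/* negative means an error code */
		return offset;

	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
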
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d54dcd0ab230..3283da419200 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -78,7 +78,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
* sure NOFS is set to keep us from deadlocking.
*/
nofs_flag = memalloc_nofs_save();
- inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+ inode = btrfs_iget_path(fs_info->sb, &location, root, path);
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
@@ -91,8 +91,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
return inode;
}
-struct inode *lookup_free_space_inode(
- struct btrfs_block_group_cache *block_group,
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -107,7 +106,7 @@ struct inode *lookup_free_space_inode(
return inode;
inode = __lookup_free_space_inode(fs_info->tree_root, path,
- block_group->key.objectid);
+ block_group->start);
if (IS_ERR(inode))
return inode;
@@ -190,7 +189,7 @@ static int __create_free_space_inode(struct btrfs_root *root,
}
int create_free_space_inode(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
@@ -201,7 +200,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans,
return ret;
return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
- ino, block_group->key.objectid);
+ ino, block_group->start);
}
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
@@ -224,7 +223,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -385,6 +384,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
@@ -814,7 +819,7 @@ free_cache:
goto out;
}
-int load_free_space_cache(struct btrfs_block_group_cache *block_group)
+int load_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -822,7 +827,7 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group)
struct btrfs_path *path;
int ret = 0;
bool matched;
- u64 used = btrfs_block_group_used(&block_group->item);
+ u64 used = block_group->used;
/*
* If this block group has been marked to be cleared for one reason or
@@ -876,13 +881,13 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group)
spin_unlock(&block_group->lock);
ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
btrfs_free_path(path);
if (ret <= 0)
goto out;
spin_lock(&ctl->tree_lock);
- matched = (ctl->free_space == (block_group->key.offset - used -
+ matched = (ctl->free_space == (block_group->length - used -
block_group->bytes_super));
spin_unlock(&ctl->tree_lock);
@@ -890,7 +895,7 @@ int load_free_space_cache(struct btrfs_block_group_cache *block_group)
__btrfs_remove_free_space_cache(ctl);
btrfs_warn(fs_info,
"block group %llu has wrong amount of free space",
- block_group->key.objectid);
+ block_group->start);
ret = -1;
}
out:
@@ -903,7 +908,7 @@ out:
btrfs_warn(fs_info,
"failed to load free space cache for block group %llu, rebuilding it now",
- block_group->key.objectid);
+ block_group->start);
}
iput(inode);
@@ -913,7 +918,7 @@ out:
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
int *entries, int *bitmaps,
struct list_head *bitmap_list)
{
@@ -1041,7 +1046,7 @@ fail:
}
static noinline_for_stack int write_pinned_extent_entries(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
int *entries)
{
@@ -1061,9 +1066,9 @@ static noinline_for_stack int write_pinned_extent_entries(
*/
unpin = block_group->fs_info->pinned_extents;
- start = block_group->key.objectid;
+ start = block_group->start;
- while (start < block_group->key.objectid + block_group->key.offset) {
+ while (start < block_group->start + block_group->length) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
@@ -1071,13 +1076,12 @@ static noinline_for_stack int write_pinned_extent_entries(
return 0;
/* This pinned extent is out of our range */
- if (extent_start >= block_group->key.objectid +
- block_group->key.offset)
+ if (extent_start >= block_group->start + block_group->length)
return 0;
extent_start = max(extent_start, start);
- extent_end = min(block_group->key.objectid +
- block_group->key.offset, extent_end + 1);
+ extent_end = min(block_group->start + block_group->length,
+ extent_end + 1);
len = extent_end - extent_start;
*entries += 1;
@@ -1141,7 +1145,7 @@ cleanup_write_cache_enospc(struct inode *inode,
static int __btrfs_wait_cache_io(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_path *path, u64 offset)
{
@@ -1168,7 +1172,7 @@ out:
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ block_group->start);
#endif
}
}
@@ -1210,12 +1214,12 @@ static int btrfs_wait_cache_io_root(struct btrfs_root *root,
}
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
block_group, &block_group->io_ctl,
- path, block_group->key.objectid);
+ path, block_group->start);
}
/**
@@ -1231,7 +1235,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans)
{
@@ -1369,7 +1373,7 @@ out_unlock:
}
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1394,7 +1398,7 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
#ifdef DEBUG
btrfs_err(fs_info,
"failed to write free space cache for block group %llu",
- block_group->key.objectid);
+ block_group->start);
#endif
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
@@ -1647,11 +1651,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
- u64 size = block_group->key.offset;
+ u64 size = block_group->length;
u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
@@ -1991,7 +1995,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- struct btrfs_block_group_cache *block_group = ctl->private;
+ struct btrfs_block_group *block_group = ctl->private;
struct btrfs_fs_info *fs_info = block_group->fs_info;
bool forced = false;
@@ -2028,7 +2032,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
* so allow those block groups to still be allowed to have a bitmap
* entry.
*/
- if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
+ if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
return false;
return true;
@@ -2043,7 +2047,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_free_space *bitmap_info;
- struct btrfs_block_group_cache *block_group = NULL;
+ struct btrfs_block_group *block_group = NULL;
int added = 0;
u64 bytes, offset, bytes_added;
int ret;
@@ -2380,7 +2384,7 @@ out:
return ret;
}
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size)
{
return __btrfs_add_free_space(block_group->fs_info,
@@ -2388,7 +2392,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
bytenr, size);
}
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2478,7 +2482,7 @@ out:
return ret;
}
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2503,14 +2507,14 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
"%d blocks of free space at or bigger than bytes is", count);
}
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
spin_lock_init(&ctl->tree_lock);
ctl->unit = fs_info->sectorsize;
- ctl->start = block_group->key.objectid;
+ ctl->start = block_group->start;
ctl->private = block_group;
ctl->op = &free_space_op;
INIT_LIST_HEAD(&ctl->trimming_ranges);
@@ -2532,7 +2536,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
*/
static int
__btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -2598,7 +2602,7 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
spin_unlock(&ctl->tree_lock);
}
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
@@ -2620,7 +2624,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
}
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size)
{
@@ -2674,7 +2678,7 @@ out:
* cluster and remove the cluster from it.
*/
int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster)
{
struct btrfs_free_space_ctl *ctl;
@@ -2708,7 +2712,7 @@ int btrfs_return_cluster_to_free_space(
return ret;
}
-static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct btrfs_free_space *entry,
u64 bytes, u64 min_start,
@@ -2741,7 +2745,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
* if it couldn't find anything suitably large, or a logical disk offset
* if things worked out
*/
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size)
{
@@ -2827,7 +2831,7 @@ out:
return ret;
}
-static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes,
@@ -2909,7 +2913,7 @@ again:
* extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
-setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -3000,7 +3004,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* that we have already failed to find extents that will work.
*/
static noinline int
-setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+setup_cluster_bitmap(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
u64 cont1_bytes, u64 min_bytes)
@@ -3050,7 +3054,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* returns zero and sets up cluster if things worked out, otherwise
* it returns -enospc
*/
-int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size)
{
@@ -3141,7 +3145,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
cluster->block_group = NULL;
}
-static int do_trimming(struct btrfs_block_group_cache *block_group,
+static int do_trimming(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 bytes,
u64 reserved_start, u64 reserved_bytes,
struct btrfs_trim_range *trim_entry)
@@ -3186,7 +3190,7 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
return ret;
}
-static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+static int trim_no_bitmap(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3271,7 +3275,7 @@ out:
return ret;
}
-static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+static int trim_bitmaps(struct btrfs_block_group *block_group,
u64 *total_trimmed, u64 start, u64 end, u64 minlen)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
@@ -3352,12 +3356,12 @@ next:
return ret;
}
-void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group_trimming(struct btrfs_block_group *cache)
{
atomic_inc(&cache->trimming);
}
-void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_map_tree *em_tree;
@@ -3373,7 +3377,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
mutex_lock(&fs_info->chunk_mutex);
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->key.objectid,
+ em = lookup_extent_mapping(em_tree, block_group->start,
1);
BUG_ON(!em); /* logic error, can't happen */
remove_extent_mapping(em_tree, em);
@@ -3392,7 +3396,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
}
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen)
{
int ret;
@@ -3590,7 +3594,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
* how the free space cache loading stuff works, so you can get really weird
* configurations.
*/
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
@@ -3658,7 +3662,7 @@ again:
* just used to check the absence of space, so if there is free space in the
* range at all we will return 1.
*/
-int test_check_exists(struct btrfs_block_group_cache *cache,
+int test_check_exists(struct btrfs_block_group *cache,
u64 offset, u64 bytes)
{
struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 39c32c8fc24f..ba9a23241101 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -50,24 +50,23 @@ struct btrfs_io_ctl {
unsigned check_crcs:1;
};
-struct inode *lookup_free_space_inode(
- struct btrfs_block_group_cache *block_group,
+struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path);
int create_free_space_inode(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode);
-int load_free_space_cache(struct btrfs_block_group_cache *block_group);
+int load_free_space_cache(struct btrfs_block_group *block_group);
int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
struct inode *lookup_free_ino_inode(struct btrfs_root *root,
struct btrfs_path *path);
@@ -81,42 +80,40 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_path *path,
struct inode *inode);
-void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group);
int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_free_space_ctl *ctl,
u64 bytenr, u64 size);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_add_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
-u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
+void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
+u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
u64 offset, u64 bytes, u64 empty_size,
u64 *max_extent_size);
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
-void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+void btrfs_dump_free_space(struct btrfs_block_group *block_group,
u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
+int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 empty_size);
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
-u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
+u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start, u64 *max_extent_size);
int btrfs_return_cluster_to_free_space(
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_free_cluster *cluster);
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+int btrfs_trim_block_group(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen);
/* Support functions for running our sanity tests */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+int test_add_free_space_entry(struct btrfs_block_group *cache,
u64 offset, u64 bytes, bool bitmap);
-int test_check_exists(struct btrfs_block_group_cache *cache,
- u64 offset, u64 bytes);
+int test_check_exists(struct btrfs_block_group *cache, u64 offset, u64 bytes);
#endif
#endif
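The free-space-cache changes above are mechanical: every helper that used to take a struct btrfs_block_group_cache now takes struct btrfs_block_group, and a group's range is read from dedicated members instead of the embedded key. Condensed into a before/after sketch (illustrative only; the identifiers are the ones used by the hunks above):

	/* before: range encoded in the cached block group item key */
	u64 start = block_group->key.objectid;
	u64 len   = block_group->key.offset;

	/* after: dedicated members of struct btrfs_block_group */
	u64 start = block_group->start;
	u64 len   = block_group->length;
	u64 end   = block_group->start + block_group->length;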
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 48a03f5240f5..258cb3fae17a 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -13,10 +13,10 @@
#include "block-group.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -27,8 +27,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
* exceeds that required for using bitmaps.
*/
bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
- num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
- bitmap_range);
+ num_bitmaps = div_u64(cache->length + bitmap_range - 1, bitmap_range);
bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
total_bitmap_size = num_bitmaps * bitmap_size;
cache->bitmap_high_thresh = div_u64(total_bitmap_size,
@@ -45,7 +44,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
}
static int add_new_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
@@ -54,9 +53,9 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
@@ -78,7 +77,7 @@ out:
EXPORT_FOR_TESTS
struct btrfs_free_space_info *search_free_space_info(
struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -86,16 +85,16 @@ struct btrfs_free_space_info *search_free_space_info(
struct btrfs_key key;
int ret;
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_FREE_SPACE_INFO_KEY;
- key.offset = block_group->key.offset;
+ key.offset = block_group->length;
ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
if (ret < 0)
return ERR_PTR(ret);
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
- block_group->key.objectid);
+ block_group->start);
ASSERT(0);
return ERR_PTR(-ENOENT);
}
@@ -180,7 +179,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
EXPORT_FOR_TESTS
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -197,7 +196,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -205,8 +204,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -224,8 +223,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
@@ -271,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -320,7 +319,7 @@ out:
EXPORT_FOR_TESTS
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -336,7 +335,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
int done = 0, nr;
int ret;
- bitmap_size = free_space_bitmap_size(block_group->key.offset,
+ bitmap_size = free_space_bitmap_size(block_group->length,
fs_info->sectorsize);
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
@@ -344,8 +343,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -363,8 +362,8 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
break;
} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
@@ -413,7 +412,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- nrbits = div_u64(block_group->key.offset, block_group->fs_info->sectorsize);
+ nrbits = div_u64(block_group->length, block_group->fs_info->sectorsize);
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
@@ -437,7 +436,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -453,7 +452,7 @@ out:
}
static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
int new_extents)
{
@@ -491,7 +490,7 @@ out:
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
@@ -513,7 +512,7 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group,
return !!extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+static void free_space_set_bits(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 *start, u64 *size,
int bit)
{
@@ -581,7 +580,7 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
* the bitmap.
*/
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size, int remove)
{
@@ -597,7 +596,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* Read the bit for the block immediately before the extent of space if
* that block is within the block group.
*/
- if (start > block_group->key.objectid) {
+ if (start > block_group->start) {
u64 prev_block = start - block_group->fs_info->sectorsize;
key.objectid = prev_block;
@@ -649,7 +648,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
* Read the bit for the block immediately after the extent of space if
* that block is within the block group.
*/
- if (end < block_group->key.objectid + block_group->key.offset) {
+ if (end < block_group->start + block_group->length) {
/* The next block may be in the next bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (end >= key.objectid + key.offset) {
@@ -694,7 +693,7 @@ out:
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -781,7 +780,7 @@ out:
EXPORT_FOR_TESTS
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
struct btrfs_free_space_info *info;
@@ -812,7 +811,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -846,7 +845,7 @@ out:
}
static int add_free_space_extent(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path,
u64 start, u64 size)
{
@@ -880,7 +879,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
new_key.offset = size;
/* Search for a neighbor on the left. */
- if (start == block_group->key.objectid)
+ if (start == block_group->start)
goto right;
key.objectid = start - 1;
key.type = (u8)-1;
@@ -900,8 +899,8 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT(found_start < start && found_end <= start);
/*
@@ -920,7 +919,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
right:
/* Search for a neighbor on the right. */
- if (end == block_group->key.objectid + block_group->key.offset)
+ if (end == block_group->start + block_group->length)
goto insert;
key.objectid = end;
key.type = (u8)-1;
@@ -940,8 +939,8 @@ right:
found_start = key.objectid;
found_end = key.objectid + key.offset;
- ASSERT(found_start >= block_group->key.objectid &&
- found_end > block_group->key.objectid);
+ ASSERT(found_start >= block_group->start &&
+ found_end > block_group->start);
ASSERT((found_start < start && found_end <= start) ||
(found_start >= end && found_end > end));
@@ -974,7 +973,7 @@ out:
EXPORT_FOR_TESTS
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size)
{
struct btrfs_free_space_info *info;
@@ -1005,7 +1004,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_path *path;
int ret;
@@ -1043,7 +1042,7 @@ out:
* through the normal add/remove hooks.
*/
static int populate_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root = trans->fs_info->extent_root;
struct btrfs_path *path, *path2;
@@ -1075,7 +1074,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
* BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
* contained in.
*/
- key.objectid = block_group->key.objectid;
+ key.objectid = block_group->start;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = 0;
@@ -1084,8 +1083,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
goto out_locked;
ASSERT(ret == 0);
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
while (1) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -1109,7 +1108,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
else
start += key.offset;
} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- if (key.objectid != block_group->key.objectid)
+ if (key.objectid != block_group->start)
break;
}
@@ -1140,7 +1139,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *free_space_root;
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct rb_node *node;
int ret;
@@ -1159,7 +1158,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
node = rb_first(&fs_info->block_group_cache_tree);
while (node) {
- block_group = rb_entry(node, struct btrfs_block_group_cache,
+ block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
ret = populate_free_space_tree(trans, block_group);
if (ret)
@@ -1265,7 +1264,7 @@ abort:
}
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
int ret;
@@ -1277,12 +1276,12 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
return ret;
return __add_to_free_space_tree(trans, block_group, path,
- block_group->key.objectid,
- block_group->key.offset);
+ block_group->start,
+ block_group->length);
}
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path = NULL;
@@ -1312,7 +1311,7 @@ out:
}
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = trans->fs_info->free_space_root;
struct btrfs_path *path;
@@ -1336,8 +1335,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
goto out;
}
- start = block_group->key.objectid;
- end = block_group->key.objectid + block_group->key.offset;
+ start = block_group->start;
+ end = block_group->start + block_group->length;
key.objectid = end - 1;
key.type = (u8)-1;
@@ -1355,8 +1354,8 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
- ASSERT(found_key.objectid == block_group->key.objectid);
- ASSERT(found_key.offset == block_group->key.offset);
+ ASSERT(found_key.objectid == block_group->start);
+ ASSERT(found_key.offset == block_group->length);
done = 1;
nr++;
path->slots[0]--;
@@ -1391,7 +1390,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1407,7 +1406,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1454,7 +1453,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1472,7 +1471,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
struct btrfs_path *path,
u32 expected_extent_count)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
@@ -1485,7 +1484,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
fs_info = block_group->fs_info;
root = fs_info->free_space_root;
- end = block_group->key.objectid + block_group->key.offset;
+ end = block_group->start + block_group->length;
while (1) {
ret = btrfs_next_item(root, path);
@@ -1516,7 +1515,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
if (extent_count != expected_extent_count) {
btrfs_err(fs_info,
"incorrect extent count for %llu; counted %u, expected %u",
- block_group->key.objectid, extent_count,
+ block_group->start, extent_count,
expected_extent_count);
ASSERT(0);
ret = -EIO;
@@ -1532,7 +1531,7 @@ out:
int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
struct btrfs_path *path;
u32 extent_count, flags;
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 360d50e1cdea..dc2463e4cfe3 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -16,14 +16,14 @@ struct btrfs_caching_control;
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group);
+ struct btrfs_block_group *block_group);
int add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
@@ -32,21 +32,21 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow);
int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+int free_space_test_bit(struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 offset);
#endif
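In the free-space-tree code the on-disk keying is unchanged: the FREE_SPACE_INFO item for a group is still keyed by its start and sized by its length, only the in-memory source of those values moves. A minimal sketch of the key as built in add_new_free_space_info() and search_free_space_info() in the hunks above:

	struct btrfs_key key;

	key.objectid = block_group->start;         /* logical start of the group */
	key.type     = BTRFS_FREE_SPACE_INFO_KEY;
	key.offset   = block_group->length;        /* size of the group in bytes */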
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c6dc4dd16cf7..56032c518b26 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -368,6 +368,7 @@ struct async_chunk {
u64 end;
unsigned int write_flags;
struct list_head extents;
+ struct cgroup_subsys_state *blkcg_css;
struct btrfs_work work;
atomic_t *pending;
};
@@ -712,10 +713,12 @@ cleanup_and_bail_uncompressed:
* to our extent and set things up for the async work queue to run
* cow_file_range to do the normal delalloc dance.
*/
- if (page_offset(async_chunk->locked_page) >= start &&
- page_offset(async_chunk->locked_page) <= end)
+ if (async_chunk->locked_page &&
+ (page_offset(async_chunk->locked_page) >= start &&
+ page_offset(async_chunk->locked_page) <= end)) {

__set_page_dirty_nobuffers(async_chunk->locked_page);
/* unlocked later on in the async handlers */
+ }
if (redirty)
extent_range_redirty_for_io(inode, start, end);
@@ -795,7 +798,7 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
WB_SYNC_ALL);
- else if (ret)
+ else if (ret && async_chunk->locked_page)
unlock_page(async_chunk->locked_page);
kfree(async_extent);
cond_resched();
@@ -878,7 +881,8 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages,
- async_chunk->write_flags)) {
+ async_chunk->write_flags,
+ async_chunk->blkcg_css)) {
struct page *p = async_extent->pages[0];
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -1196,6 +1200,8 @@ static noinline void async_cow_free(struct btrfs_work *work)
async_chunk = container_of(work, struct async_chunk, work);
if (async_chunk->inode)
btrfs_add_delayed_iput(async_chunk->inode);
+ if (async_chunk->blkcg_css)
+ css_put(async_chunk->blkcg_css);
/*
* Since the pointer to 'pending' is at the beginning of the array of
* async_chunk's, freeing it ensures the whole array has been freed.
@@ -1204,12 +1210,14 @@ static noinline void async_cow_free(struct btrfs_work *work)
kvfree(async_chunk->pending);
}
-static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+static int cow_file_range_async(struct inode *inode,
+ struct writeback_control *wbc,
+ struct page *locked_page,
u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- unsigned int write_flags)
+ unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
struct async_cow *ctx;
struct async_chunk *async_chunk;
unsigned long nr_pages;
@@ -1218,6 +1226,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
int i;
bool should_compress;
unsigned nofs_flag;
+ const unsigned int write_flags = wbc_to_write_flags(wbc);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -1264,14 +1273,45 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
- async_chunk[i].locked_page = locked_page;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
- btrfs_init_work(&async_chunk[i].work,
- btrfs_delalloc_helper,
- async_cow_start, async_cow_submit,
- async_cow_free);
+ /*
+ * The locked_page comes all the way from writepage and it's
+ * the original page we were actually given. As we spread
+ * this large delalloc region across multiple async_chunk
+ * structs, only the first struct needs a pointer to locked_page.
+ *
+ * This way we don't need racy decisions about who is supposed
+ * to unlock it.
+ */
+ if (locked_page) {
+ /*
+ * Depending on the compressibility, the pages might or
+ * might not go through async. We want all of them to
+ * be accounted against wbc once. Let's do it here
+ * before the paths diverge. wbc accounting is used
+ * only for foreign writeback detection and doesn't
+ * need full accuracy. Just account the whole thing
+ * against the first page.
+ */
+ wbc_account_cgroup_owner(wbc, locked_page,
+ cur_end - start);
+ async_chunk[i].locked_page = locked_page;
+ locked_page = NULL;
+ } else {
+ async_chunk[i].locked_page = NULL;
+ }
+
+ if (blkcg_css != blkcg_root_css) {
+ css_get(blkcg_css);
+ async_chunk[i].blkcg_css = blkcg_css;
+ } else {
+ async_chunk[i].blkcg_css = NULL;
+ }
+
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -1697,7 +1737,6 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
{
int ret;
int force_cow = need_force_cow(inode, start, end);
- unsigned int write_flags = wbc_to_write_flags(wbc);
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1712,9 +1751,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
- ret = cow_file_range_async(inode, locked_page, start, end,
- page_started, nr_written,
- write_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+ page_started, nr_written);
}
if (ret)
btrfs_cleanup_ordered_extents(inode, locked_page, start,
@@ -2110,7 +2148,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
}
mapit:
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
out:
if (ret) {
@@ -2214,12 +2252,16 @@ again:
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
ClearPageChecked(page);
- goto out;
+ goto out_reserved;
}
ClearPageChecked(page);
set_page_dirty(page);
+out_reserved:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ if (ret)
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ PAGE_SIZE, true);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2260,8 +2302,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
SetPageChecked(page);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
- btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
return -EBUSY;
@@ -2675,7 +2716,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
return 0;
@@ -2999,7 +3040,7 @@ out_kfree:
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
u64 start, u64 len)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
@@ -3027,7 +3068,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
int compress_type = 0;
int ret = 0;
u64 logical_len = ordered_extent->len;
- bool nolock;
+ bool freespace_inode;
bool truncated = false;
bool range_locked = false;
bool clear_new_delalloc_bytes = false;
@@ -3038,7 +3079,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
!test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
clear_new_delalloc_bytes = true;
- nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
+ freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
ret = -EIO;
@@ -3069,8 +3110,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3104,8 +3145,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
EXTENT_DEFRAG, 0, 0, &cached_state);
}
- if (nolock)
- trans = btrfs_join_transaction_nolock(root);
+ if (freespace_inode)
+ trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
@@ -3254,7 +3295,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -3263,16 +3303,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
- NULL);
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &ordered_extent->work);
}
@@ -3531,7 +3567,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.objectid = found_key.offset;
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &found_key, root);
ret = PTR_ERR_OR_ZERO(inode);
if (ret && ret != -ENOENT)
goto out;
@@ -5153,7 +5189,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->bdev = fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = fs_info->generation;
@@ -5750,12 +5785,14 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
return inode;
}
-/* Get an inode object given its location and corresponding root.
- * Returns in *is_new if the inode was read from disk
+/*
+ * Get an inode object given its location and corresponding root.
+ * Path can be preallocated to prevent recursing back to iget through the
+ * allocator. NULL is also valid but may require an additional allocation
+ * later.
*/
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new,
- struct btrfs_path *path)
+ struct btrfs_root *root, struct btrfs_path *path)
{
struct inode *inode;
@@ -5770,8 +5807,6 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
if (!ret) {
inode_tree_add(inode);
unlock_new_inode(inode);
- if (new)
- *new = 1;
} else {
iget_failed(inode);
/*
@@ -5789,9 +5824,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
}
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- struct btrfs_root *root, int *new)
+ struct btrfs_root *root)
{
- return btrfs_iget_path(s, location, root, new, NULL);
+ return btrfs_iget_path(s, location, root, NULL);
}
static struct inode *new_simple_dir(struct super_block *s,
@@ -5857,7 +5892,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.type == BTRFS_INODE_ITEM_KEY) {
- inode = btrfs_iget(dir->i_sb, &location, root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, root);
if (IS_ERR(inode))
return inode;
@@ -5882,7 +5917,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
- inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
+ inode = btrfs_iget(dir->i_sb, &location, sub_root);
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -6931,8 +6966,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
- if (em)
- em->bdev = fs_info->fs_devices->latest_bdev;
read_unlock(&em_tree->lock);
if (em) {
@@ -6948,7 +6981,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = fs_info->fs_devices->latest_bdev;
em->start = EXTENT_MAP_HOLE;
em->orig_start = EXTENT_MAP_HOLE;
em->len = (u64)-1;
@@ -7207,7 +7239,6 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
err = -ENOMEM;
goto out;
}
- em->bdev = NULL;
ASSERT(hole_em);
/*
@@ -7567,7 +7598,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
{
struct extent_map_tree *em_tree;
struct extent_map *em;
- struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
ASSERT(type == BTRFS_ORDERED_PREALLOC ||
@@ -7585,7 +7615,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
em->len = len;
em->block_len = block_len;
em->block_start = block_start;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
@@ -7624,6 +7653,8 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
struct inode *inode,
u64 start, u64 len)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
if (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return -ENOENT;
@@ -7633,7 +7664,7 @@ static int btrfs_get_blocks_direct_read(struct extent_map *em,
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
return 0;
@@ -7716,7 +7747,7 @@ skip_cow:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
- bh_result->b_bdev = em->bdev;
+ bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
set_buffer_mapped(bh_result);
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7858,7 +7889,7 @@ static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
if (ret)
return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
return ret;
}
@@ -8211,18 +8242,14 @@ static void __endio_write_update_ordered(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
@@ -8230,9 +8257,8 @@ static void __endio_write_update_ordered(struct inode *inode,
&ordered_offset,
ordered_bytes,
uptodate)) {
- btrfs_init_work(&ordered->work, func,
- finish_ordered_fn,
- NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
+ NULL);
btrfs_queue_work(wq, &ordered->work);
}
/*
@@ -8389,7 +8415,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
goto err;
}
map:
- ret = btrfs_map_bio(fs_info, bio, 0, 0);
+ ret = btrfs_map_bio(fs_info, bio, 0);
err:
return ret;
}
@@ -9321,7 +9347,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->io_failure_tree.track_uptodate = true;
atomic_set(&ei->sync_writers, 0);
mutex_init(&ei->log_mutex);
- mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
@@ -9550,6 +9575,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
@@ -9744,6 +9772,18 @@ out_fail:
commit_transaction = true;
}
if (commit_transaction) {
+ /*
+ * We may have set commit_transaction when logging the new name
+ * in the destination root, in which case we left the source
+ * root context in the list of log contexts. So make sure we
+ * remove it to avoid invalid memory accesses, since the context
+ * was allocated in our stack frame.
+ */
+ if (sync_log_root) {
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx_root.list);
+ mutex_unlock(&root->log_mutex);
+ }
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
@@ -9757,6 +9797,9 @@ out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ ASSERT(list_empty(&ctx_root.list));
+ ASSERT(list_empty(&ctx_dest.list));
+
return ret;
}
@@ -10101,8 +10144,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
- btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -10435,7 +10477,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
em->generation = trans->transid;
@@ -10791,7 +10832,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
u64 len = isize - start;
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
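The inode.c hunks thread cgroup writeback context through async compression: cow_file_range_async() now receives the writeback_control, derives the write flags and blkcg from it, and hands locked_page to only the first async_chunk. A condensed sketch of the new call shape, drawn from the hunks above rather than being an additional change:

	/* caller: btrfs_run_delalloc_range() passes the wbc straight through */
	ret = cow_file_range_async(inode, wbc, locked_page, start, end,
				   page_started, nr_written);

	/* callee: both pieces of per-writeback state come from the wbc */
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	const unsigned int write_flags = wbc_to_write_flags(wbc);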
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 23272d9154f3..a1ee0b775e65 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -479,10 +479,9 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
return put_user(inode->i_generation, arg);
}
-static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
@@ -541,7 +540,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return 0;
}
-int btrfs_is_empty_uuid(u8 *uuid)
+int __pure btrfs_is_empty_uuid(u8 *uuid)
{
int i;
@@ -1409,7 +1408,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
return -EINVAL;
if (do_compress) {
- if (range->compress_type > BTRFS_COMPRESS_TYPES)
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
return -EINVAL;
if (range->compress_type)
compress_type = range->compress_type;
@@ -2462,7 +2461,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode,
goto out;
}
- temp_inode = btrfs_iget(sb, &key2, root, NULL);
+ temp_inode = btrfs_iget(sb, &key2, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out;
@@ -4032,16 +4031,15 @@ out:
static void get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space)
{
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
space->total_bytes = 0;
space->used_bytes = 0;
space->flags = 0;
list_for_each_entry(block_group, groups_list, list) {
space->flags = block_group->flags;
- space->total_bytes += block_group->key.offset;
- space->used_bytes +=
- btrfs_block_group_used(&block_group->item);
+ space->total_bytes += block_group->length;
+ space->used_bytes += block_group->used;
}
}
@@ -4952,10 +4950,9 @@ drop_write:
return ret;
}
-static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret = 0;
@@ -4978,11 +4975,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
return ret;
}
-static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -5154,10 +5149,9 @@ out:
return ret;
}
-static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
size_t len;
int ret;
char label[BTRFS_LABEL_SIZE];
@@ -5241,10 +5235,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg)
return 0;
}
-static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
+ void __user *arg)
{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags features;
@@ -5445,11 +5438,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case FS_IOC_GETFSLABEL:
- return btrfs_ioctl_get_fslabel(file, argp);
+ return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
case FITRIM:
- return btrfs_ioctl_fitrim(file, argp);
+ return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
@@ -5554,15 +5547,15 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN:
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
- return btrfs_ioctl_quota_rescan_status(file, argp);
+ return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(file, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(argp);
case BTRFS_IOC_GET_FEATURES:
- return btrfs_ioctl_get_features(file, argp);
+ return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case FS_IOC_FSGETXATTR:
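Several read-only ioctl handlers above stop re-deriving the filesystem info from the struct file; btrfs_ioctl() already has it and passes it down. The pattern, reduced to one case (a sketch based on the hunks above):

	/* before: every handler repeated this */
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* after: the dispatcher hands fs_info over directly */
	case FITRIM:
		return btrfs_ioctl_fitrim(fs_info, argp);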
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 7f9a578a1a20..571c4826c428 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -13,65 +13,164 @@
#include "extent_io.h"
#include "locking.h"
+/*
+ * Extent buffer locking
+ * =====================
+ *
+ * The locks use a custom scheme that allows more operations than are
+ * available from current locking primitives. The building blocks are still
+ * rwlock and wait queues.
+ *
+ * Required semantics:
+ *
+ * - reader/writer exclusion
+ * - writer/writer exclusion
+ * - reader/reader sharing
+ * - spinning lock semantics
+ * - blocking lock semantics
+ * - try-lock semantics for readers and writers
+ * - one level nesting, allowing read lock to be taken by the same thread that
+ * already has write lock
+ *
+ * The extent buffer locks (also called tree locks) manage access to eb data
+ * related to the storage in the b-tree (keys, items, but not the individual
+ * members of eb).
+ * We want concurrency of many readers and safe updates. The underlying locking
+ * is done by read-write spinlock and the blocking part is implemented using
+ * counters and wait queues.
+ *
+ * spinning semantics - the low-level rwlock is held so all other threads that
+ * want to take it are spinning on it.
+ *
+ * blocking semantics - the low-level rwlock is not held but the counter
+ * denotes how many times the blocking lock was held;
+ * sleeping is possible
+ *
+ * Write lock always allows only one thread to access the data.
+ *
+ *
+ * Debugging
+ * ---------
+ *
+ * There are additional state counters that are asserted in various contexts,
+ * removed from non-debug build to reduce extent_buffer size and for
+ * performance reasons.
+ *
+ *
+ * Lock nesting
+ * ------------
+ *
+ * A write operation on a tree might indirectly start a lookup on the same
+ * tree. This can happen when btrfs_cow_block locks the tree and needs to
+ * look up free extents.
+ *
+ * btrfs_cow_block
+ * ..
+ * alloc_tree_block_no_bg_flush
+ * btrfs_alloc_tree_block
+ * btrfs_reserve_extent
+ * ..
+ * load_free_space_cache
+ * ..
+ * btrfs_lookup_file_extent
+ * btrfs_search_slot
+ *
+ *
+ * Locking pattern - spinning
+ * --------------------------
+ *
+ * The simple locking scenario, the +--+ denotes the spinning section.
+ *
+ * +- btrfs_tree_lock
+ * | - extent_buffer::rwlock is held
+ * | - no heavy operations should happen, eg. IO, memory allocations, large
+ * | structure traversals
+ * +- btrfs_tree_unlock
+ *
+ *
+ * Locking pattern - blocking
+ * --------------------------
+ *
+ * The blocking write uses the following scheme. The +--+ denotes the spinning
+ * section.
+ *
+ * +- btrfs_tree_lock
+ * |
+ * +- btrfs_set_lock_blocking_write
+ *
+ * - allowed: IO, memory allocations, etc.
+ *
+ * -- btrfs_tree_unlock - note, no explicit unblocking necessary
+ *
+ *
+ * Blocking read is similar.
+ *
+ * +- btrfs_tree_read_lock
+ * |
+ * +- btrfs_set_lock_blocking_read
+ *
+ * - heavy operations allowed
+ *
+ * +- btrfs_tree_read_unlock_blocking
+ * |
+ * +- btrfs_tree_read_unlock
+ *
+ */
+
#ifdef CONFIG_BTRFS_DEBUG
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers);
eb->spinning_writers++;
}
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers != 1);
eb->spinning_writers--;
}
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
+static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
WARN_ON(eb->spinning_writers);
}
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
{
atomic_inc(&eb->spinning_readers);
}
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
+static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
}
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
{
atomic_inc(&eb->read_locks);
}
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
{
atomic_dec(&eb->read_locks);
}
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
BUG_ON(!atomic_read(&eb->read_locks));
}
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
eb->write_locks++;
}
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
+static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
eb->write_locks--;
}
-void btrfs_assert_tree_locked(struct extent_buffer *eb)
-{
- BUG_ON(!eb->write_locks);
-}
-
#else
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
@@ -81,11 +180,19 @@ static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
#endif
+/*
+ * Mark already held read lock as blocking. Can be nested in write lock by the
+ * same thread.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and the blocking reader counter is increased.
+ */
void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
{
trace_btrfs_set_lock_blocking_read(eb);
@@ -102,6 +209,14 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
read_unlock(&eb->lock);
}
+/*
+ * Mark already held write lock as blocking.
+ *
+ * Use when there are potentially long operations ahead so other threads
+ * waiting on the lock will not actively spin but sleep instead.
+ *
+ * The rwlock is released and the blocking writers counter is set.
+ */
void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
{
trace_btrfs_set_lock_blocking_write(eb);
@@ -115,14 +230,19 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
btrfs_assert_tree_locked(eb);
- eb->blocking_writers++;
+ WRITE_ONCE(eb->blocking_writers, 1);
write_unlock(&eb->lock);
}
}
/*
- * take a spinning read lock. This will wait for any blocking
- * writers
+ * Lock the extent buffer for read. Wait for any writers (spinning or blocking).
+ * Can be nested in write lock by the same thread.
+ *
+ * Use when the locked section does only lightweight actions and busy waiting
+ * would be cheaper than making other threads do the wait/wake loop.
+ *
+ * The rwlock is held upon exit.
*/
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
@@ -134,23 +254,24 @@ again:
read_lock(&eb->lock);
BUG_ON(eb->blocking_writers == 0 &&
current->pid == eb->lock_owner);
- if (eb->blocking_writers && current->pid == eb->lock_owner) {
- /*
- * This extent is already write-locked by our thread. We allow
- * an additional read lock to be added because it's for the same
- * thread. btrfs_find_all_roots() depends on this as it may be
- * called on a partly (write-)locked tree.
- */
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
- read_unlock(&eb->lock);
- trace_btrfs_tree_read_lock(eb, start_ns);
- return;
- }
if (eb->blocking_writers) {
+ if (current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread.
+ * We allow an additional read lock to be added because
+ * it's for the same thread. btrfs_find_all_roots()
+ * depends on this as it may be called on a partly
+ * (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = true;
+ read_unlock(&eb->lock);
+ trace_btrfs_tree_read_lock(eb, start_ns);
+ return;
+ }
read_unlock(&eb->lock);
wait_event(eb->write_lock_wq,
- eb->blocking_writers == 0);
+ READ_ONCE(eb->blocking_writers) == 0);
goto again;
}
btrfs_assert_tree_read_locks_get(eb);
@@ -159,17 +280,19 @@ again:
}
/*
- * take a spinning read lock.
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Lock extent buffer for read, optimistically expecting that there are no
+ * contending blocking writers. If there are, don't wait.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
- if (eb->blocking_writers)
+ if (READ_ONCE(eb->blocking_writers))
return 0;
read_lock(&eb->lock);
- if (eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
@@ -180,18 +303,20 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers
+ * Try-lock for read. Don't block or wait for contending writers.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
- if (eb->blocking_writers)
+ if (READ_ONCE(eb->blocking_writers))
return 0;
if (!read_trylock(&eb->lock))
return 0;
- if (eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
}
@@ -202,16 +327,19 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
}
/*
- * returns 1 if we get the read lock and 0 if we don't
- * this won't wait for blocking writers or readers
+ * Try-lock for write. May block until the lock is uncontended, but does not
+ * wait until it is free.
+ *
+ * Return 1 if the rwlock has been taken, 0 otherwise
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
- if (eb->blocking_writers || atomic_read(&eb->blocking_readers))
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers))
return 0;
write_lock(&eb->lock);
- if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) {
+ /* Refetch value after lock */
+ if (READ_ONCE(eb->blocking_writers) || atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
return 0;
}
@@ -223,7 +351,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
}
/*
- * drop a spinning read lock
+ * Release read lock. Must be used only if the lock is in spinning mode. If
+ * the read lock is nested, must pair with read lock before the write unlock.
+ *
+ * The rwlock is not held upon exit.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
@@ -245,7 +376,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
}
/*
- * drop a blocking read lock
+ * Release read lock, previously set to blocking by a pairing call to
+ * btrfs_set_lock_blocking_read(). Can be nested in write lock by the same
+ * thread.
+ *
+ * State of rwlock is unchanged, last reader wakes waiting threads.
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
@@ -269,8 +404,10 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
/*
- * take a spinning write lock. This will wait for both
- * blocking readers or writers
+ * Lock for write. Wait for all blocking and spinning readers and writers. This
+ * starts context where reader lock could be nested by the same thread.
+ *
+ * The rwlock is held for write upon exit.
*/
void btrfs_tree_lock(struct extent_buffer *eb)
{
@@ -282,9 +419,11 @@ void btrfs_tree_lock(struct extent_buffer *eb)
WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
- wait_event(eb->write_lock_wq, eb->blocking_writers == 0);
+ wait_event(eb->write_lock_wq, READ_ONCE(eb->blocking_writers) == 0);
write_lock(&eb->lock);
- if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) {
+ /* Refetch value after lock */
+ if (atomic_read(&eb->blocking_readers) ||
+ READ_ONCE(eb->blocking_writers)) {
write_unlock(&eb->lock);
goto again;
}
@@ -295,10 +434,19 @@ again:
}
/*
- * drop a spinning or a blocking write lock.
+ * Release the write lock, either blocking or spinning (ie. there's no need
+ * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
+ * This also ends the context for nesting, the read lock must have been
+ * released already.
+ *
+ * Tasks blocked and waiting are woken, rwlock is not held upon exit.
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
+ /*
+ * This is read both locked and unlocked but always by the same thread
+ * that already owns the lock so we don't need to use READ_ONCE
+ */
int blockers = eb->blocking_writers;
BUG_ON(blockers > 1);
@@ -310,7 +458,8 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
btrfs_assert_no_spinning_writers(eb);
- eb->blocking_writers--;
+ /* Unlocked write */
+ WRITE_ONCE(eb->blocking_writers, 0);
/*
* We need to order modifying blocking_writers above with
* actually waking up the sleepers to ensure they see the
@@ -322,3 +471,55 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
write_unlock(&eb->lock);
}
}
+
+/*
+ * Set all locked nodes in the path to blocking locks. This should be done
+ * before scheduling
+ */
+void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+ if (!p->nodes[i] || !p->locks[i])
+ continue;
+ /*
+ * If we currently have a spinning reader or writer lock this
+ * will bump the count of blocking holders and drop the
+ * spinlock.
+ */
+ if (p->locks[i] == BTRFS_READ_LOCK) {
+ btrfs_set_lock_blocking_read(p->nodes[i]);
+ p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+ } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
+ btrfs_set_lock_blocking_write(p->nodes[i]);
+ p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
+ }
+ }
+}
+
+/*
+ * This releases any locks held in the path starting at level and going all the
+ * way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few corner
+ * cases, such as COW of the block at slot zero in the node. This ignores
+ * those rules, and it should only be called when there are no more updates to
+ * be done higher up in the tree.
+ */
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+ int i;
+
+ if (path->keep_locks)
+ return;
+
+ for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+ if (!path->nodes[i])
+ continue;
+ if (!path->locks[i])
+ continue;
+ btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+ path->locks[i] = 0;
+ }
+}
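For reference, a minimal sketch of the blocking write-lock pattern documented in the new locking.c comment, assuming only the helpers declared in locking.h; do_heavy_update() is hypothetical and stands in for any operation that may sleep (IO, memory allocation):

/* Sketch only; compiles only inside the btrfs source tree. */
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

static void update_eb_may_sleep(struct extent_buffer *eb)
{
	/* Spinning section: other threads busy-wait, keep it short. */
	btrfs_tree_lock(eb);

	/*
	 * Heavy work ahead (IO, allocations): switch to blocking mode so
	 * waiters sleep on the wait queue instead of spinning.
	 */
	btrfs_set_lock_blocking_write(eb);

	do_heavy_update(eb);		/* hypothetical, may sleep */

	/* No explicit "unblock" step; unlock handles both modes. */
	btrfs_tree_unlock(eb);
}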
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b775a4207ed9..21a285883e89 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_LOCKING_H
#define BTRFS_LOCKING_H
+#include "extent_io.h"
+
#define BTRFS_WRITE_LOCK 1
#define BTRFS_READ_LOCK 2
#define BTRFS_WRITE_LOCK_BLOCKING 3
@@ -19,11 +21,20 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
void btrfs_set_lock_blocking_read(struct extent_buffer *eb);
void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
-void btrfs_assert_tree_locked(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+#ifdef CONFIG_BTRFS_DEBUG
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
+ BUG_ON(!eb->write_locks);
+}
+#else
+static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { }
+#endif
+
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
{
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index acad4174f68d..aa9cd11f4b78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -63,27 +63,7 @@ struct workspace {
static struct workspace_manager wsm;
-static void lzo_init_workspace_manager(void)
-{
- btrfs_init_workspace_manager(&wsm, &btrfs_lzo_compress);
-}
-
-static void lzo_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *lzo_get_workspace(unsigned int level)
-{
- return btrfs_get_workspace(&wsm, level);
-}
-
-static void lzo_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void lzo_free_workspace(struct list_head *ws)
+void lzo_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -93,7 +73,7 @@ static void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -131,13 +111,9 @@ static inline size_t read_compress_length(const char *buf)
return le32_to_cpu(dlen);
}
-static int lzo_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -303,7 +279,7 @@ out:
return ret;
}
-static int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -444,10 +420,9 @@ done:
return ret;
}
-static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
@@ -508,15 +483,7 @@ out:
}
const struct btrfs_compress_op btrfs_lzo_compress = {
- .init_workspace_manager = lzo_init_workspace_manager,
- .cleanup_workspace_manager = lzo_cleanup_workspace_manager,
- .get_workspace = lzo_get_workspace,
- .put_workspace = lzo_put_workspace,
- .alloc_workspace = lzo_alloc_workspace,
- .free_workspace = lzo_free_workspace,
- .compress_pages = lzo_compress_pages,
- .decompress_bio = lzo_decompress_bio,
- .decompress = lzo_decompress,
+ .workspace_manager = &wsm,
.max_level = 1,
.default_level = 1,
};
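With the per-algorithm callbacks removed from btrfs_compress_op, the lzo_* entry points above become visible to compression.c, which presumably dispatches on the compression type directly. A rough sketch of that shape, assuming the existing BTRFS_COMPRESS_LZO type and the prototypes shown in this hunk (the actual wiring lives in compression.c and may differ):

/* Sketch only; not the patch's compression.c code. */
static int compress_pages_by_type(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
					  out_pages, total_in, total_out);
	/* zlib and zstd would get analogous cases */
	default:
		return -EINVAL;
	}
}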
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 7d564924dfeb..72bab64ecf60 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -47,4 +47,15 @@ static inline u64 div_factor_fine(u64 num, int factor)
return div_u64(num, 100);
}
+/* Copy of is_power_of_two that is 64bit safe */
+static inline bool is_power_of_two_u64(u64 n)
+{
+ return n != 0 && (n & (n - 1)) == 0;
+}
+
+static inline bool has_single_bit_set(u64 n)
+{
+ return is_power_of_two_u64(n);
+}
+
#endif
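The n & (n - 1) expression clears the lowest set bit, so it is zero exactly when at most one bit was set; together with the n != 0 check this detects a single set bit regardless of word size. A standalone userspace check of the same expression (plain C, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Same test as the new helper, usable outside the kernel. */
static bool has_single_bit_set_u64(uint64_t n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	const uint64_t samples[] = {
		0, 1, 2, 3, 64, 65536, (uint64_t)1 << 63, ((uint64_t)1 << 63) + 1
	};

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%llu -> %d\n", (unsigned long long)samples[i],
		       has_single_bit_set_u64(samples[i]));
	return 0;
}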
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 24b6c72b9a59..fb09bc2f8e4d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -547,7 +547,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -573,12 +572,11 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
return count;
}
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len)
{
struct btrfs_root *root;
struct list_head splice;
- u64 total_done = 0;
u64 done;
INIT_LIST_HEAD(&splice);
@@ -598,7 +596,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
done = btrfs_wait_ordered_extents(root, nr,
range_start, range_len);
btrfs_put_fs_root(root);
- total_done += done;
spin_lock(&fs_info->ordered_root_lock);
if (nr != U64_MAX) {
@@ -608,8 +605,6 @@ u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
mutex_unlock(&fs_info->ordered_operations_mutex);
-
- return total_done;
}
/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5204171ea962..4eb0319a86d7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -186,7 +186,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
-u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
struct btrfs_inode *inode, u64 start,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9cb50577d982..873b6b694107 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -266,9 +266,9 @@ void btrfs_print_leaf(struct extent_buffer *l)
struct btrfs_block_group_item);
pr_info(
"\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
- btrfs_disk_block_group_used(l, bi),
- btrfs_disk_block_group_chunk_objectid(l, bi),
- btrfs_disk_block_group_flags(l, bi));
+ btrfs_block_group_used(l, bi),
+ btrfs_block_group_chunk_objectid(l, bi),
+ btrfs_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 1e664e0b59b8..deb59e7cfcac 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -416,11 +416,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
+ parent_inode = btrfs_iget(sb, &key, parent_root);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
- child_inode = btrfs_iget(sb, &key, root, NULL);
+ child_inode = btrfs_iget(sb, &key, root);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
@@ -437,8 +437,6 @@ void __init btrfs_props_init(void)
{
int i;
- hash_init(prop_handlers_ht);
-
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3ad151655eb8..93aeb2e539a4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1811,7 +1811,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- extent_buffer_get(src_eb);
+ atomic_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2067,7 +2067,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- extent_buffer_get(dst_eb);
+ atomic_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2126,7 +2126,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- extent_buffer_get(root_eb); /* For path */
+ atomic_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -3277,10 +3277,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- memset(&fs_info->qgroup_rescan_work, 0,
- sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
return 0;
}
@@ -3826,7 +3823,7 @@ out:
*/
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 46ba7bd2961c..236f12224d52 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -408,7 +408,7 @@ void btrfs_qgroup_init_swapped_blocks(
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *subvol_root,
- struct btrfs_block_group_cache *bg,
+ struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
u64 last_snapshot);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 57a2ac721985..a8e53c8e7b01 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -190,7 +190,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
+ btrfs_init_work(&rbio->work, work_func, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
@@ -671,8 +671,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
*/
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
- int bucket = rbio_bucket(rbio);
- struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+ struct btrfs_stripe_hash *h;
struct btrfs_raid_bio *cur;
struct btrfs_raid_bio *pending;
unsigned long flags;
@@ -680,64 +679,63 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
struct btrfs_raid_bio *cache_drop = NULL;
int ret = 0;
+ h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
+
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
- if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
- spin_lock(&cur->bio_list_lock);
-
- /* can we steal this cached rbio's pages? */
- if (bio_list_empty(&cur->bio_list) &&
- list_empty(&cur->plug_list) &&
- test_bit(RBIO_CACHE_BIT, &cur->flags) &&
- !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
- list_del_init(&cur->hash_list);
- refcount_dec(&cur->refs);
-
- steal_rbio(cur, rbio);
- cache_drop = cur;
- spin_unlock(&cur->bio_list_lock);
+ if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
+ continue;
- goto lockit;
- }
+ spin_lock(&cur->bio_list_lock);
- /* can we merge into the lock owner? */
- if (rbio_can_merge(cur, rbio)) {
- merge_rbio(cur, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
+ /* Can we steal this cached rbio's pages? */
+ if (bio_list_empty(&cur->bio_list) &&
+ list_empty(&cur->plug_list) &&
+ test_bit(RBIO_CACHE_BIT, &cur->flags) &&
+ !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
+ list_del_init(&cur->hash_list);
+ refcount_dec(&cur->refs);
+ steal_rbio(cur, rbio);
+ cache_drop = cur;
+ spin_unlock(&cur->bio_list_lock);
- /*
- * we couldn't merge with the running
- * rbio, see if we can merge with the
- * pending ones. We don't have to
- * check for rmw_locked because there
- * is no way they are inside finish_rmw
- * right now
- */
- list_for_each_entry(pending, &cur->plug_list,
- plug_list) {
- if (rbio_can_merge(pending, rbio)) {
- merge_rbio(pending, rbio);
- spin_unlock(&cur->bio_list_lock);
- freeit = rbio;
- ret = 1;
- goto out;
- }
- }
+ goto lockit;
+ }
- /* no merging, put us on the tail of the plug list,
- * our rbio will be started with the currently
- * running rbio unlocks
- */
- list_add_tail(&rbio->plug_list, &cur->plug_list);
+ /* Can we merge into the lock owner? */
+ if (rbio_can_merge(cur, rbio)) {
+ merge_rbio(cur, rbio);
spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
ret = 1;
goto out;
}
+
+
+ /*
+ * We couldn't merge with the running rbio, see if we can merge
+ * with the pending ones. We don't have to check for rmw_locked
+ * because there is no way they are inside finish_rmw right now
+ */
+ list_for_each_entry(pending, &cur->plug_list, plug_list) {
+ if (rbio_can_merge(pending, rbio)) {
+ merge_rbio(pending, rbio);
+ spin_unlock(&cur->bio_list_lock);
+ freeit = rbio;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /*
+ * No merging, put us on the tail of the plug list, our rbio
+ * will be started with the currently running rbio unlocks
+ */
+ list_add_tail(&rbio->plug_list, &cur->plug_list);
+ spin_unlock(&cur->bio_list_lock);
+ ret = 1;
+ goto out;
}
lockit:
refcount_inc(&rbio->refs);
@@ -1743,8 +1741,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, btrfs_rmw_helper,
- unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ee6f60547a8d..243a2e44526e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -227,7 +227,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
struct btrfs_fs_info *fs_info = dev->fs_info;
int ret;
struct reada_zone *zone;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
u64 start;
u64 end;
int i;
@@ -248,8 +248,8 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
if (!cache)
return NULL;
- start = cache->key.objectid;
- end = start + cache->key.offset - 1;
+ start = cache->start;
+ end = start + cache->length - 1;
btrfs_put_block_group(cache);
zone = kzalloc(sizeof(*zone), GFP_KERNEL);
@@ -752,21 +752,19 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
- struct btrfs_fs_info *fs_info;
int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
- fs_info = rmw->fs_info;
-
- kfree(rmw);
old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
task_nice_ioprio(current));
set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(fs_info);
+ __reada_start_machine(rmw->fs_info);
set_task_ioprio(current, old_ioprio);
- atomic_dec(&fs_info->reada_works_cnt);
+ atomic_dec(&rmw->fs_info->reada_works_cnt);
+
+ kfree(rmw);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -821,8 +819,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- btrfs_init_work(&rmw->work, btrfs_readahead_helper,
- reada_start_machine_worker, NULL, NULL);
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5cd42b66818c..d897a8e5e430 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -147,7 +147,7 @@ struct file_extent_cluster {
struct reloc_control {
/* block group to relocate */
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
/* extent tree */
struct btrfs_root *extent_root;
/* inode for moving data */
@@ -1560,11 +1560,10 @@ again:
return NULL;
}
-static int in_block_group(u64 bytenr,
- struct btrfs_block_group_cache *block_group)
+static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group)
{
- if (bytenr >= block_group->key.objectid &&
- bytenr < block_group->key.objectid + block_group->key.offset)
+ if (bytenr >= block_group->start &&
+ bytenr < block_group->start + block_group->length)
return 1;
return 0;
}
@@ -2246,7 +2245,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- extent_buffer_get(reloc_root->node);
+ atomic_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -3195,7 +3194,6 @@ static noinline_for_stack
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
@@ -3208,7 +3206,6 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- em->bdev = fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
@@ -3544,7 +3541,7 @@ static int block_use_full_backref(struct reloc_control *rc,
}
static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group,
+ struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
@@ -3560,7 +3557,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode))
return -ENOENT;
@@ -3863,7 +3860,7 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
u64 start, end, last;
int ret;
- last = rc->block_group->key.objectid + rc->block_group->key.offset;
+ last = rc->block_group->start + rc->block_group->length;
while (1) {
cond_resched();
if (rc->search_start >= last) {
@@ -3980,7 +3977,7 @@ int prepare_to_relocate(struct reloc_control *rc)
return -ENOMEM;
memset(&rc->cluster, 0, sizeof(rc->cluster));
- rc->search_start = rc->block_group->key.objectid;
+ rc->search_start = rc->block_group->start;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
@@ -4219,7 +4216,7 @@ out:
*/
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+ struct btrfs_block_group *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -4246,9 +4243,9 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
BUG_ON(IS_ERR(inode));
- BTRFS_I(inode)->index_cnt = group->key.objectid;
+ BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
@@ -4283,7 +4280,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
* Print the block group being relocated
*/
static void describe_relocation(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+ struct btrfs_block_group *block_group)
{
char buf[128] = {'\0'};
@@ -4291,7 +4288,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info,
"relocating block group %llu flags %s",
- block_group->key.objectid, buf);
+ block_group->start, buf);
}
/*
@@ -4299,7 +4296,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info,
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
- struct btrfs_block_group_cache *bg;
+ struct btrfs_block_group *bg;
struct btrfs_root *extent_root = fs_info->extent_root;
struct reloc_control *rc;
struct inode *inode;
@@ -4326,7 +4323,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
rc->extent_root = extent_root;
rc->block_group = bg;
- ret = btrfs_inc_block_group_ro(rc->block_group);
+ ret = btrfs_inc_block_group_ro(rc->block_group, true);
if (ret) {
err = ret;
goto out;
@@ -4364,8 +4361,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
btrfs_wait_ordered_roots(fs_info, U64_MAX,
- rc->block_group->key.objectid,
- rc->block_group->key.offset);
+ rc->block_group->start,
+ rc->block_group->length);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4405,7 +4402,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+ WARN_ON(rc->block_group->used > 0);
out:
if (err && rw)
btrfs_dec_block_group_ro(rc->block_group);
@@ -4688,7 +4685,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
node->new_bytenr != buf->start);
drop_node_buffer(node);
- extent_buffer_get(cow);
+ atomic_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f7d4e03f4c5d..21de630b0730 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -389,8 +389,7 @@ static struct full_stripe_lock *search_full_stripe_lock(
*
* Caller must ensure @cache is a RAID56 block group.
*/
-static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
- u64 bytenr)
+static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
u64 ret;
@@ -404,8 +403,8 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
*/
- ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
- cache->full_stripe_len + cache->key.objectid;
+ ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
+ cache->full_stripe_len + cache->start;
return ret;
}
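Because the RAID56 full stripe length is a multiple of 64KiB rather than a power of two, the helper above rounds down with a division instead of round_down(). The same arithmetic in standalone form (plain C with made-up numbers, not kernel code):

#include <stdint.h>
#include <stdio.h>

/*
 * Round addr down to a full-stripe boundary relative to base. The alignment
 * need not be a power of two, so bit masking is not an option.
 */
static uint64_t full_stripe_start(uint64_t addr, uint64_t base, uint64_t align)
{
	return (addr - base) / align * align + base;
}

int main(void)
{
	/* e.g. a RAID5 chunk with 3 data stripes: 3 * 64 KiB full stripes */
	const uint64_t base = 1024 * 1024;
	const uint64_t full_stripe = 3 * 64 * 1024;

	printf("%llu\n", (unsigned long long)
	       full_stripe_start(base + 400000, base, full_stripe));
	return 0;
}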
@@ -423,7 +422,7 @@ static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
@@ -470,7 +469,7 @@ out:
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
- struct btrfs_block_group_cache *bg_cache;
+ struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
@@ -598,8 +597,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- btrfs_init_work(&sbio->work, btrfs_scrub_helper,
- scrub_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
+ NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -1720,8 +1719,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
- scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
@@ -2149,14 +2147,13 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
scrub_write_block_to_dev_replace(sblock);
}
- scrub_block_put(sblock);
-
if (sctx->is_dev_replace && sctx->flush_all_writes) {
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
}
+ scrub_block_put(sblock);
scrub_pending_bio_dec(sctx);
}
@@ -2204,8 +2201,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
raid56_add_scrub_pages(rbio, spage->page, spage->logical);
}
- btrfs_init_work(&sblock->work, btrfs_scrub_helper,
- scrub_missing_raid56_worker, NULL, NULL);
+ btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
@@ -2743,8 +2739,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
bio_put(bio);
- btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
- scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
+ NULL);
btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
@@ -3420,7 +3416,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache)
+ struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
@@ -3484,7 +3480,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
path = btrfs_alloc_path();
@@ -3563,46 +3559,26 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* -> btrfs_scrub_pause()
*/
scrub_pause_on(fs_info);
- ret = btrfs_inc_block_group_ro(cache);
- if (!ret && sctx->is_dev_replace) {
- /*
- * If we are doing a device replace wait for any tasks
- * that started delalloc right before we set the block
- * group to RO mode, as they might have just allocated
- * an extent from it or decided they could do a nocow
- * write. And if any such tasks did that, wait for their
- * ordered extents to complete and then commit the
- * current transaction, so that we can later see the new
- * extent items in the extent tree - the ordered extents
- * create delayed data references (for cow writes) when
- * they complete, which will be run and insert the
- * corresponding extent items into the extent tree when
- * we commit the transaction they used when running
- * inode.c:btrfs_finish_ordered_io(). We later use
- * the commit root of the extent tree to find extents
- * to copy from the srcdev into the tgtdev, and we don't
- * want to miss any new extents.
- */
- btrfs_wait_block_group_reservations(cache);
- btrfs_wait_nocow_writers(cache);
- ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
- cache->key.objectid,
- cache->key.offset);
- if (ret > 0) {
- struct btrfs_trans_handle *trans;
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- ret = PTR_ERR(trans);
- else
- ret = btrfs_commit_transaction(trans);
- if (ret) {
- scrub_pause_off(fs_info);
- btrfs_put_block_group(cache);
- break;
- }
- }
- }
+
+ /*
+ * Don't do chunk preallocation for scrub.
+ *
+ * This is especially important for SYSTEM bgs, or we can hit
+ * -EFBIG from btrfs_finish_chunk_alloc() like:
+ * 1. The only SYSTEM bg is marked RO.
+ * Since SYSTEM bg is small, that's pretty common.
+		 * 2. New SYSTEM bg will be allocated
+		 *    Because the regular allocation path will create a new chunk.
+		 * 3. New SYSTEM bg is empty and will get cleaned up
+		 *    Before cleanup really happens, it's marked RO again.
+		 * 4. Empty SYSTEM bg gets scrubbed
+		 *    We go back to 2.
+ *
+		 * This can easily boost the number of SYSTEM chunks if the cleaner
+		 * thread can't be triggered fast enough, and use up all space of
+		 * btrfs_super_block::sys_chunk_array
+ */
+ ret = btrfs_inc_block_group_ro(cache, false);
scrub_pause_off(fs_info);
if (ret == 0) {
@@ -3623,7 +3599,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
- down_write(&fs_info->dev_replace.rwsem);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -3664,10 +3640,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- down_write(&fs_info->dev_replace.rwsem);
+ down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- up_write(&fs_info->dev_replace.rwsem);
+ up_write(&dev_replace->rwsem);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -3681,7 +3657,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
spin_lock(&cache->lock);
if (!cache->removed && !cache->ro && cache->reserved == 0 &&
- btrfs_block_group_used(&cache->item) == 0) {
+ cache->used == 0) {
spin_unlock(&cache->lock);
btrfs_mark_bg_unused(cache);
} else {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 123ac54af071..ae2db5eb1549 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -25,6 +25,14 @@
#include "compression.h"
/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
* A fs_path is a helper to dynamically build path names with unknown size.
* It reallocates the internal buffer on demand.
* It allows fast adding of path elements on the right side (normal path) and
@@ -1248,12 +1256,20 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
*/
if (found->root == bctx->sctx->send_root) {
/*
- * TODO for the moment we don't accept clones from the inode
- * that is currently send. We may change this when
- * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
- * file.
+ * If the source inode was not yet processed we can't issue a
+ * clone operation, as the source extent does not exist yet at
+ * the destination of the stream.
*/
- if (ino >= bctx->cur_objectid)
+ if (ino > bctx->cur_objectid)
+ return 0;
+ /*
+ * We clone from the inode currently being sent as long as the
+ * source extent is already processed, otherwise we could try
+ * to clone from an extent that does not exist yet at the
+ * destination of the stream.
+ */
+ if (ino == bctx->cur_objectid &&
+ offset >= bctx->sctx->cur_inode_next_write_offset)
return 0;
}
@@ -1302,6 +1318,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1349,7 +1366,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1358,6 +1374,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+	 * in time spent and used memory. So for now just fall back to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -4779,7 +4810,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index e8a4b0ebe97f..f09aa6ee9113 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -10,7 +10,7 @@
#include "transaction.h"
#include "block-group.h"
-u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included)
{
ASSERT(s_info);
@@ -58,7 +58,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
spin_lock_init(&space_info->lock);
space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
- init_waitqueue_head(&space_info->wait);
INIT_LIST_HEAD(&space_info->ro_bgs);
INIT_LIST_HEAD(&space_info->tickets);
INIT_LIST_HEAD(&space_info->priority_tickets);
@@ -285,7 +284,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
int index = 0;
spin_lock(&info->lock);
@@ -301,8 +300,7 @@ again:
spin_lock(&cache->lock);
btrfs_info(fs_info,
"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
- cache->key.objectid, cache->key.offset,
- btrfs_block_group_used(&cache->item), cache->pinned,
+ cache->start, cache->length, cache->used, cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 8867e84aa33d..1a349e3f9cc1 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -63,7 +63,6 @@ struct btrfs_space_info {
struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
- wait_queue_head_t wait;
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
@@ -116,7 +115,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
-u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
+u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1b151af25772..f452a94abdc3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,7 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-const char *btrfs_decode_error(int errno)
+const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -187,7 +187,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
-void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@@ -1219,7 +1219,7 @@ static int btrfs_fill_super(struct super_block *sb,
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -1669,7 +1669,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
- btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
@@ -1936,6 +1935,10 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
num_stripes = nr_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
+ num_stripes = 3;
+ else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
+ num_stripes = 4;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
@@ -2022,7 +2025,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
struct btrfs_super_block *disk_super = fs_info->super_copy;
- struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
@@ -2036,7 +2038,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
int mixed = 0;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry_rcu(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2297,7 +2299,7 @@ static const struct super_operations btrfs_super_ops = {
static const struct file_operations btrfs_ctl_fops = {
.open = btrfs_control_open,
.unlocked_ioctl = btrfs_control_ioctl,
- .compat_ioctl = btrfs_control_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = noop_llseek,
};
@@ -2360,10 +2362,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_cachep;
- err = extent_map_init();
+ err = extent_state_cache_init();
if (err)
goto free_extent_io;
+ err = extent_map_init();
+ if (err)
+ goto free_extent_state_cache;
+
err = ordered_data_init();
if (err)
goto free_extent_map;
@@ -2422,6 +2428,8 @@ free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
+free_extent_state_cache:
+ extent_state_cache_exit();
free_extent_io:
extent_io_exit();
free_cachep:
@@ -2442,6 +2450,7 @@ static void __exit exit_btrfs_fs(void)
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
+ extent_state_cache_exit();
extent_io_exit();
btrfs_interface_exit();
btrfs_end_io_wq_exit();
@@ -2456,3 +2465,6 @@ module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");
+MODULE_SOFTDEP("pre: xxhash64");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: blake2b-256");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f6d3c80f2e28..5ebbe8a5ee76 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/bug.h>
+#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
@@ -258,6 +259,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
+BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -272,6 +274,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(no_holes),
BTRFS_FEAT_ATTR_PTR(metadata_uuid),
BTRFS_FEAT_ATTR_PTR(free_space_tree),
+ BTRFS_FEAT_ATTR_PTR(raid1c34),
NULL
};
@@ -295,8 +298,30 @@ static ssize_t rmdir_subvol_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show);
+static ssize_t supported_checksums_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ ssize_t ret = 0;
+ int i;
+
+ for (i = 0; i < btrfs_get_num_csums(); i++) {
+ /*
+ * This "trick" only works as long as 'enum btrfs_csum_type' has
+ * no holes in it
+ */
+ ret += snprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+ (i == 0 ? "" : " "), btrfs_super_csum_name(i));
+
+ }
+
+ ret += snprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
+ BTRFS_ATTR_PTR(static_feature, supported_checksums),
NULL
};
@@ -372,16 +397,16 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
- struct btrfs_block_group_cache *block_group;
+ struct btrfs_block_group *block_group;
int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
u64 val = 0;
down_read(&sinfo->groups_sem);
list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes))
- val += block_group->key.offset;
+ val += block_group->length;
else
- val += btrfs_block_group_used(&block_group->item);
+ val += block_group->used;
}
up_read(&sinfo->groups_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", val);
@@ -604,6 +629,19 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj,
BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show);
+static ssize_t btrfs_checksum_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u16 csum_type = btrfs_super_csum_type(fs_info->super_copy);
+
+ return snprintf(buf, PAGE_SIZE, "%s (%s)\n",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(fs_info->csum_shash));
+}
+
+BTRFS_ATTR(, checksum, btrfs_checksum_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -611,6 +649,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, clone_alignment),
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
+ BTRFS_ATTR_PTR(, checksum),
NULL,
};
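Usage note: unlike the static attribute above, this one lives under the per-filesystem directory (/sys/fs/btrfs/<UUID>/checksum) and reports both the checksum type read from the superblock and the crypto driver that was actually selected, so a read typically returns something like "crc32c (crc32c-intel)"; the driver half of the string depends on hardware and on which crypto modules are loaded.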
@@ -822,7 +861,7 @@ static void init_feature_attrs(void)
* Create a sysfs entry for a given block group type at path
* /sys/fs/btrfs/UUID/allocation/data/TYPE
*/
-void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache)
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_space_info *space_info = cache->space_info;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 610e9c36a94c..e10c3adfc30f 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -32,7 +32,7 @@ int __init btrfs_init_sysfs(void);
void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_add_block_group_type(struct btrfs_block_group_cache *cache);
+void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache);
int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 99fe9bf3fdac..a7aca4141788 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -202,11 +202,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
unsigned long length)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (!cache)
@@ -218,9 +218,8 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
return NULL;
}
- cache->key.objectid = 0;
- cache->key.offset = length;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->start = 0;
+ cache->length = length;
cache->full_stripe_len = fs_info->sectorsize;
cache->fs_info = fs_info;
@@ -233,7 +232,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
return cache;
}
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index ee277bbd939b..9e52527357d8 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -41,9 +41,9 @@ struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
-struct btrfs_block_group_cache *
+struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length);
-void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
#else
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 43ec7060fcd2..aebdf23f0cdd 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -17,7 +17,7 @@
* entry and remove space from either end and the middle, and make sure we can
* remove space that covers adjacent extent entries.
*/
-static int test_extents(struct btrfs_block_group_cache *cache)
+static int test_extents(struct btrfs_block_group *cache)
{
int ret = 0;
@@ -87,8 +87,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
return 0;
}
-static int test_bitmaps(struct btrfs_block_group_cache *cache,
- u32 sectorsize)
+static int test_bitmaps(struct btrfs_block_group *cache, u32 sectorsize)
{
u64 next_bitmap_offset;
int ret;
@@ -156,7 +155,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache,
}
/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache,
+static int test_bitmaps_and_extents(struct btrfs_block_group *cache,
u32 sectorsize)
{
u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize);
@@ -331,7 +330,7 @@ static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
/* Used by test_steal_space_from_bitmap_to_extent(). */
static int
-check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
+check_num_extents_and_bitmaps(const struct btrfs_block_group *cache,
const int num_extents,
const int num_bitmaps)
{
@@ -351,7 +350,7 @@ check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
}
/* Used by test_steal_space_from_bitmap_to_extent(). */
-static int check_cache_empty(struct btrfs_block_group_cache *cache)
+static int check_cache_empty(struct btrfs_block_group *cache)
{
u64 offset;
u64 max_extent_size;
@@ -393,7 +392,7 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache)
* requests.
*/
static int
-test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
+test_steal_space_from_bitmap_to_extent(struct btrfs_block_group *cache,
u32 sectorsize)
{
int ret;
@@ -829,7 +828,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
struct btrfs_fs_info *fs_info;
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index bc92df977630..1a846bf6e197 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -18,7 +18,7 @@ struct free_space_extent {
static int __check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -48,7 +48,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
if (path->slots[0] != 0)
goto invalid;
- end = cache->key.objectid + cache->key.offset;
+ end = cache->start + cache->length;
i = 0;
while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -107,7 +107,7 @@ invalid:
static int check_free_space_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct free_space_extent * const extents,
unsigned int num_extents)
@@ -150,12 +150,12 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
static int test_empty_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset},
+ {cache->start, cache->length},
};
return check_free_space_extents(trans, fs_info, cache, path,
@@ -164,7 +164,7 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans,
static int test_remove_all(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
@@ -172,8 +172,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start,
+ cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -185,18 +185,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
static int test_remove_beginning(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment,
- cache->key.offset - alignment},
+ {cache->start + alignment, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid, alignment);
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -209,19 +208,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
static int test_remove_end(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, cache->key.offset - alignment},
+ {cache->start, cache->length - alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid +
- cache->key.offset - alignment,
- alignment);
+ cache->start + cache->length - alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -233,19 +231,18 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
static int test_remove_middle(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment,
- cache->key.offset - 2 * alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, cache->length - 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not remove free space");
@@ -258,24 +255,23 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
static int test_merge_left(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 2 * alignment},
+ {cache->start, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -283,7 +279,7 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -296,25 +292,24 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
static int test_merge_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid + alignment, 2 * alignment},
+ {cache->start + alignment, 2 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
+ cache->start + 2 * alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -322,7 +317,7 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
+ cache->start + alignment,
alignment);
if (ret) {
test_err("could not add free space");
@@ -335,24 +330,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
static int test_merge_both(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, 3 * alignment},
+ {cache->start, 3 * alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -360,16 +354,14 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + alignment,
- alignment);
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -381,26 +373,25 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
static int test_merge_none(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *cache,
+ struct btrfs_block_group *cache,
struct btrfs_path *path,
u32 alignment)
{
const struct free_space_extent extents[] = {
- {cache->key.objectid, alignment},
- {cache->key.objectid + 2 * alignment, alignment},
- {cache->key.objectid + 4 * alignment, alignment},
+ {cache->start, alignment},
+ {cache->start + 2 * alignment, alignment},
+ {cache->start + 4 * alignment, alignment},
};
int ret;
ret = __remove_from_free_space_tree(trans, cache, path,
- cache->key.objectid,
- cache->key.offset);
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->key.objectid,
+ ret = __add_to_free_space_tree(trans, cache, path, cache->start,
alignment);
if (ret) {
test_err("could not add free space");
@@ -408,16 +399,14 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 4 * alignment,
- alignment);
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
ret = __add_to_free_space_tree(trans, cache, path,
- cache->key.objectid + 2 * alignment,
- alignment);
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -429,7 +418,7 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
typedef int (*test_func_t)(struct btrfs_trans_handle *,
struct btrfs_fs_info *,
- struct btrfs_block_group_cache *,
+ struct btrfs_block_group *,
struct btrfs_path *,
u32 alignment);
@@ -438,7 +427,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
{
struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
- struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_block_group *cache = NULL;
struct btrfs_trans_handle trans;
struct btrfs_path *path = NULL;
int ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8624bdee8c5b..cfc08ef9b876 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -24,9 +24,79 @@
#define BTRFS_ROOT_TRANS_TAG 0
+/*
+ * Transaction states and transitions
+ *
+ * No running transaction (fs tree blocks are not modified)
+ * |
+ * | To next stage:
+ * | Call start_transaction() variants. Except btrfs_join_transaction_nostart().
+ * V
+ * Transaction N [[TRANS_STATE_RUNNING]]
+ * |
+ * | New trans handles can be attached to transaction N by calling all
+ * | start_transaction() variants.
+ * |
+ * | To next stage:
+ * | Call btrfs_commit_transaction() on any trans handle attached to
+ * | transaction N
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_START]]
+ * |
+ * | Will wait for previous running transaction to completely finish if there
+ * | is one
+ * |
+ * | Then one of the following happens:
+ * | - Wait for all other trans handle holders to release.
+ * | The btrfs_commit_transaction() caller will do the commit work.
+ * | - Wait for current transaction to be committed by others.
+ * | Other btrfs_commit_transaction() caller will do the commit work.
+ * |
+ * | At this stage, only btrfs_join_transaction*() variants can attach
+ * | to this running transaction.
+ * | All other variants will wait for current one to finish and attach to
+ * | transaction N+1.
+ * |
+ * | To next stage:
+ * | Caller is chosen to commit transaction N, and all other trans handles
+ * | have been released.
+ * V
+ * Transaction N [[TRANS_STATE_COMMIT_DOING]]
+ * |
+ * | The heavy lifting transaction work is started.
+ * | From running delayed refs (modifying extent tree) to creating pending
+ * | snapshots, running qgroups.
+ * | In short, modify supporting trees to reflect modifications of subvolume
+ * | trees.
+ * |
+ * | At this stage, all start_transaction() calls will wait for this
+ * | transaction to finish and attach to transaction N+1.
+ * |
+ * | To next stage:
+ * | Until all supporting trees are updated.
+ * V
+ * Transaction N [[TRANS_STATE_UNBLOCKED]]
+ * | Transaction N+1
+ * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]]
+ * | need to write them back to disk and update |
+ * | super blocks. |
+ * | |
+ * | At this stage, a new transaction is allowed to |
+ * | start. |
+ * | All new start_transaction() calls will be |
+ * | attached to transid N+1. |
+ * | |
+ * | To next stage: |
+ * | Until all tree blocks and super blocks are
+ * | written to block devices |
+ * V |
+ * Transaction N [[TRANS_STATE_COMPLETED]] V
+ * All tree blocks and super blocks are written. Transaction N+1
+ * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]]
+ * data structures will be cleaned up. | Life goes on
+ */
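As a rough illustration of how a caller drives the states described above, here is a minimal sketch (not part of this patch; it assumes the normal transaction.h API and omits everything except the handle lifecycle):

/*
 * Illustrative sketch only: a typical operation attaches a handle to the
 * RUNNING transaction and later either releases it or triggers the
 * COMMIT_START -> COMMIT_DOING -> UNBLOCKED -> COMPLETED sequence above.
 */
static int example_modify_and_commit(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	/* Attach to transaction N, or start it if none is running. */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/* ... modify subvolume trees under this handle ... */

	/*
	 * Committing moves transaction N to COMMIT_START once all other
	 * handles are released; btrfs_end_transaction() would instead just
	 * drop the handle and leave the commit to another caller or the
	 * transaction kthread.
	 */
	return btrfs_commit_transaction(trans);
}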
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = __TRANS_START,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
@@ -63,10 +133,10 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
* discard the physical locations of the block groups.
*/
while (!list_empty(&transaction->deleted_bgs)) {
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
cache = list_first_entry(&transaction->deleted_bgs,
- struct btrfs_block_group_cache,
+ struct btrfs_block_group,
bg_list);
list_del_init(&cache->bg_list);
btrfs_put_block_group_trimming(cache);
@@ -383,7 +453,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
- return (trans->state >= TRANS_STATE_BLOCKED &&
+ return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!trans->aborted);
}
@@ -570,7 +640,7 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
@@ -659,7 +729,7 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
true);
}
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
BTRFS_RESERVE_NO_FLUSH, true);
@@ -798,7 +868,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
cur_trans->delayed_refs.flushing)
return 1;
@@ -831,7 +901,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
if (refcount_read(&trans->use_count) > 1) {
@@ -847,13 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle)
- return btrfs_commit_transaction(trans);
- else
- wake_up_process(info->transaction_kthread);
- }
-
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(info->sb);
@@ -990,7 +1052,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
return werr;
}
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
+static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
@@ -1875,7 +1937,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group *block_group, *tmp;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1949,6 +2011,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ ASSERT(refcount_read(&trans->use_count) == 1);
+
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2c5a6f6e5bb0..49f7196368f5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -13,7 +13,6 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
- TRANS_STATE_BLOCKED,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
@@ -184,7 +183,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
unsigned int num_items,
int min_factor);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
@@ -218,8 +217,6 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages, int mark);
-int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages);
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 076d5b8014fb..493d4d9e0f79 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -23,6 +23,7 @@
#include "disk-io.h"
#include "compression.h"
#include "volumes.h"
+#include "misc.h"
/*
* Error message should follow the following format:
@@ -124,6 +125,74 @@ static u64 file_extent_end(struct extent_buffer *leaf,
return end;
}
+/*
+ * Customized report for dir_item, the only new important information is
+ * key->objectid, which represents inode number
+ */
+__printf(3, 4)
+__cold
+static void dir_item_err(const struct extent_buffer *eb, int slot,
+ const char *fmt, ...)
+{
+ const struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct btrfs_key key;
+ struct va_format vaf;
+ va_list args;
+
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_crit(fs_info,
+ "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
+ va_end(args);
+}
+
+/*
+ * This function checks prev_key->objectid to ensure the current key and
+ * prev_key share the same objectid as the inode number.
+ *
+ * This is to detect missing INODE_ITEM in subvolume trees.
+ *
+ * Return true if everything is OK or we don't need to check.
+ * Return false if anything is wrong.
+ */
+static bool check_prev_ino(struct extent_buffer *leaf,
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
+{
+ /* No prev key, skip check */
+ if (slot == 0)
+ return true;
+
+ /* Only these key->types need to be checked */
+ ASSERT(key->type == BTRFS_XATTR_ITEM_KEY ||
+ key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_DIR_INDEX_KEY ||
+ key->type == BTRFS_DIR_ITEM_KEY ||
+ key->type == BTRFS_EXTENT_DATA_KEY);
+
+ /*
+ * Only subvolume trees along with their reloc trees need this check.
+ * Things like the log tree don't follow this ino requirement.
+ */
+ if (!is_fstree(btrfs_header_owner(leaf)))
+ return true;
+
+ if (key->objectid == prev_key->objectid)
+ return true;
+
+ /* Error found */
+ dir_item_err(leaf, slot,
+ "invalid previous key objectid, have %llu expect %llu",
+ prev_key->objectid, key->objectid);
+ return false;
+}
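A concrete case this catches: inside one leaf, the slot sequence (257 EXTENT_DATA 0) followed by (258 DIR_INDEX 2) is invalid, because any DIR_ITEM/DIR_INDEX/XATTR/INODE_REF/EXTENT_DATA key for inode 258 must be preceded by (258 INODE_ITEM 0), which sorts between those two keys; if the previous slot already belongs to a different objectid, the INODE_ITEM has gone missing.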
static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot,
struct btrfs_key *prev_key)
@@ -141,13 +210,33 @@ static int check_extent_data_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
+ /*
+ * Previous key must have the same key->objectid (ino).
+ * It can be XATTR_ITEM, INODE_ITEM or just another EXTENT_DATA.
+ * But if objectids mismatch, it means we have a missing
+ * INODE_ITEM.
+ */
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
+ /*
+ * Make sure the item contains at least inline header, so the file
+ * extent type is not some garbage.
+ */
+ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) {
+ file_extent_err(leaf, slot,
+ "invalid item size, have %u expect [%lu, %u)",
+ item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START,
+ SZ_4K);
+ return -EUCLEAN;
+ }
+ if (btrfs_file_extent_type(leaf, fi) >= BTRFS_NR_FILE_EXTENT_TYPES) {
file_extent_err(leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
- BTRFS_FILE_EXTENT_TYPES);
+ BTRFS_NR_FILE_EXTENT_TYPES - 1);
return -EUCLEAN;
}
@@ -155,11 +244,11 @@ static int check_extent_data_item(struct extent_buffer *leaf,
* Support for new compression/encryption must introduce incompat flag,
* and must be caught in open_ctree().
*/
- if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
+ if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) {
file_extent_err(leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
- BTRFS_COMPRESS_TYPES);
+ BTRFS_NR_COMPRESS_TYPES - 1);
return -EUCLEAN;
}
if (btrfs_file_extent_encryption(leaf, fi)) {
@@ -270,42 +359,17 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
return 0;
}
-/*
- * Customized reported for dir_item, only important new info is key->objectid,
- * which represents inode number
- */
-__printf(3, 4)
-__cold
-static void dir_item_err(const struct extent_buffer *eb, int slot,
- const char *fmt, ...)
-{
- const struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_key key;
- struct va_format vaf;
- va_list args;
-
- btrfs_item_key_to_cpu(eb, &key, slot);
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- btrfs_crit(fs_info,
- "corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node",
- btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
- key.objectid, &vaf);
- va_end(args);
-}
-
static int check_dir_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dir_item *di;
u32 item_size = btrfs_item_size_nr(leaf, slot);
u32 cur = 0;
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
while (cur < item_size) {
u32 name_len;
@@ -459,23 +523,23 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (btrfs_block_group_chunk_objectid(&bgi) !=
+ if (btrfs_stack_block_group_chunk_objectid(&bgi) !=
BTRFS_FIRST_CHUNK_TREE_OBJECTID) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
- btrfs_block_group_chunk_objectid(&bgi),
+ btrfs_stack_block_group_chunk_objectid(&bgi),
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
return -EUCLEAN;
}
- if (btrfs_block_group_used(&bgi) > key->offset) {
+ if (btrfs_stack_block_group_used(&bgi) > key->offset) {
block_group_err(leaf, slot,
"invalid block group used, have %llu expect [0, %llu)",
- btrfs_block_group_used(&bgi), key->offset);
+ btrfs_stack_block_group_used(&bgi), key->offset);
return -EUCLEAN;
}
- flags = btrfs_block_group_flags(&bgi);
+ flags = btrfs_stack_block_group_flags(&bgi);
if (hweight64(flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) > 1) {
block_group_err(leaf, slot,
"invalid profile flags, have 0x%llx (%lu bits set) expect no more than 1 bit set",
@@ -609,7 +673,7 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ if (!has_single_bit_set(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) {
chunk_err(leaf, chunk, logical,
"invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set",
@@ -785,11 +849,11 @@ static int check_inode_item(struct extent_buffer *leaf,
}
/*
- * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2,
- * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG.
- * Only needs to check BLK, LNK and SOCKS
+ * S_IFMT is not bit mapped so we can't completely rely on
+ * is_power_of_2/has_single_bit_set, but it can save us from checking
+ * FIFO/CHR/DIR/REG. Only needs to check BLK, LNK and SOCKS
*/
- if (!is_power_of_2(mode & S_IFMT)) {
+ if (!has_single_bit_set(mode & S_IFMT)) {
if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) {
inode_item_err(fs_info, leaf, slot,
"invalid mode: has 0%o expect valid S_IF* bit(s)",
@@ -1010,8 +1074,8 @@ static int check_extent_item(struct extent_buffer *leaf,
btrfs_super_generation(fs_info->super_copy) + 1);
return -EUCLEAN;
}
- if (!is_power_of_2(flags & (BTRFS_EXTENT_FLAG_DATA |
- BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
+ if (!has_single_bit_set(flags & (BTRFS_EXTENT_FLAG_DATA |
+ BTRFS_EXTENT_FLAG_TREE_BLOCK))) {
extent_err(leaf, slot,
"invalid extent flag, have 0x%llx expect 1 bit set in 0x%llx",
flags, BTRFS_EXTENT_FLAG_DATA |
@@ -1224,6 +1288,58 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
return 0;
}
+#define inode_ref_err(fs_info, eb, slot, fmt, args...) \
+ inode_item_err(fs_info, eb, slot, fmt, ##args)
+static int check_inode_ref(struct extent_buffer *leaf,
+ struct btrfs_key *key, struct btrfs_key *prev_key,
+ int slot)
+{
+ struct btrfs_inode_ref *iref;
+ unsigned long ptr;
+ unsigned long end;
+
+ if (!check_prev_ino(leaf, key, slot, prev_key))
+ return -EUCLEAN;
+ /* namelen can't be 0, so item_size == sizeof() is also invalid */
+ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) {
+ inode_ref_err(fs_info, leaf, slot,
+ "invalid item size, have %u expect (%zu, %u)",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ end = ptr + btrfs_item_size_nr(leaf, slot);
+ while (ptr < end) {
+ u16 namelen;
+
+ if (ptr + sizeof(iref) > end) {
+ inode_ref_err(fs_info, leaf, slot,
+ "inode ref overflow, ptr %lu end %lu inode_ref_size %zu",
+ ptr, end, sizeof(iref));
+ return -EUCLEAN;
+ }
+
+ iref = (struct btrfs_inode_ref *)ptr;
+ namelen = btrfs_inode_ref_name_len(leaf, iref);
+ if (ptr + sizeof(*iref) + namelen > end) {
+ inode_ref_err(fs_info, leaf, slot,
+ "inode ref overflow, ptr %lu end %lu namelen %u",
+ ptr, end, namelen);
+ return -EUCLEAN;
+ }
+
+ /*
+ * NOTE: In theory we should record all found index numbers
+ * to find any duplicated indexes, but that will be too time
+ * consuming for inodes with too many hard links.
+ */
+ ptr += sizeof(*iref) + namelen;
+ }
+ return 0;
+}
+
/*
* Common point to switch the item-specific validation.
*/
@@ -1244,7 +1360,10 @@ static int check_leaf_item(struct extent_buffer *leaf,
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
case BTRFS_XATTR_ITEM_KEY:
- ret = check_dir_item(leaf, key, slot);
+ ret = check_dir_item(leaf, key, prev_key, slot);
+ break;
+ case BTRFS_INODE_REF_KEY:
+ ret = check_inode_ref(leaf, key, prev_key, slot);
break;
case BTRFS_BLOCK_GROUP_ITEM_KEY:
ret = check_block_group_item(leaf, key, slot);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8a6cc600bf18..6f757361db53 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -559,7 +559,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(root->fs_info->sb, &key, root);
if (IS_ERR(inode))
inode = NULL;
return inode;
@@ -945,54 +945,32 @@ static noinline int backref_in_log(struct btrfs_root *log,
const char *name, int namelen)
{
struct btrfs_path *path;
- struct btrfs_inode_ref *ref;
- unsigned long ptr;
- unsigned long ptr_end;
- unsigned long name_ptr;
- int found_name_len;
- int item_size;
int ret;
- int match = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
- if (ret != 0)
+ if (ret < 0) {
goto out;
-
- ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
-
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
- if (btrfs_find_name_in_ext_backref(path->nodes[0],
- path->slots[0],
- ref_objectid,
- name, namelen))
- match = 1;
-
+ } else if (ret == 1) {
+ ret = 0;
goto out;
}
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- ptr_end = ptr + item_size;
- while (ptr < ptr_end) {
- ref = (struct btrfs_inode_ref *)ptr;
- found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
- if (found_name_len == namelen) {
- name_ptr = (unsigned long)(ref + 1);
- ret = memcmp_extent_buffer(path->nodes[0], name,
- name_ptr, namelen);
- if (ret == 0) {
- match = 1;
- goto out;
- }
- }
- ptr = (unsigned long)(ref + 1) + found_name_len;
- }
+ if (key->type == BTRFS_INODE_EXTREF_KEY)
+ ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
+ path->slots[0],
+ ref_objectid,
+ name, namelen);
+ else
+ ret = !!btrfs_find_name_in_backref(path->nodes[0],
+ path->slots[0],
+ name, namelen);
out:
btrfs_free_path(path);
- return match;
+ return ret;
}
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
@@ -1050,10 +1028,13 @@ again:
(unsigned long)(victim_ref + 1),
victim_name_len);
- if (!backref_in_log(log_root, &search_key,
- parent_objectid,
- victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ kfree(victim_name);
+ return ret;
+ } else if (!ret) {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
@@ -1115,10 +1096,12 @@ again:
search_key.offset = btrfs_extref_hash(parent_objectid,
victim_name,
victim_name_len);
- ret = 0;
- if (!backref_in_log(log_root, &search_key,
- parent_objectid, victim_name,
- victim_name_len)) {
+ ret = backref_in_log(log_root, &search_key,
+ parent_objectid, victim_name,
+ victim_name_len);
+ if (ret < 0) {
+ return ret;
+ } else if (!ret) {
ret = -ENOENT;
victim_parent = read_one_inode(root,
parent_objectid);
@@ -1885,30 +1868,6 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
- * Return true if an inode reference exists in the log for the given name,
- * inode and parent inode.
- */
-static bool name_in_log_ref(struct btrfs_root *log_root,
- const char *name, const int name_len,
- const u64 dirid, const u64 ino)
-{
- struct btrfs_key search_key;
-
- search_key.objectid = ino;
- search_key.type = BTRFS_INODE_REF_KEY;
- search_key.offset = dirid;
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(dirid, name, name_len);
- if (backref_in_log(log_root, &search_key, dirid, name, name_len))
- return true;
-
- return false;
-}
-
-/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -2024,8 +1983,31 @@ out:
return ret;
insert:
- if (name_in_log_ref(root->log_root, name, name_len,
- key->objectid, log_key.objectid)) {
+ /*
+ * Check if the inode reference exists in the log for the given name,
+ * inode and parent inode
+ */
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_REF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
+
+ found_key.objectid = log_key.objectid;
+ found_key.type = BTRFS_INODE_EXTREF_KEY;
+ found_key.offset = key->objectid;
+ ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
+ name_len);
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
/* The dentry will be added later. */
ret = 0;
update_size = false;
@@ -2869,7 +2851,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- extent_buffer_get(log->node);
+ atomic_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
@@ -4983,7 +4965,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
/*
* If the other inode that had a conflicting dir entry was
* deleted in the current transaction, we need to log its parent
@@ -4993,8 +4975,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = PTR_ERR(inode);
if (ret == -ENOENT) {
key.objectid = parent;
- inode = btrfs_iget(fs_info->sb, &key, root,
- NULL);
+ inode = btrfs_iget(fs_info->sb, &key, root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
} else {
@@ -5699,7 +5680,7 @@ process_leaf:
continue;
btrfs_release_path(path);
- di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
+ di_inode = btrfs_iget(fs_info->sb, &di_key, root);
if (IS_ERR(di_inode)) {
ret = PTR_ERR(di_inode);
goto next_dir_inode;
@@ -5825,8 +5806,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
cur_offset = item_size;
}
- dir_inode = btrfs_iget(fs_info->sb, &inode_key,
- root, NULL);
+ dir_inode = btrfs_iget(fs_info->sb, &inode_key, root);
/*
* If the parent inode was deleted, return an error to
* fallback to a transaction commit. This is to prevent
@@ -5900,7 +5880,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
search_key.objectid = found_key.offset;
search_key.type = BTRFS_INODE_ITEM_KEY;
search_key.offset = 0;
- inode = btrfs_iget(fs_info->sb, &search_key, root, NULL);
+ inode = btrfs_iget(fs_info->sb, &search_key, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e04409f85063..d8e5560db285 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -58,6 +58,30 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
},
+ [BTRFS_RAID_RAID1C3] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 3,
+ .ncopies = 3,
+ .raid_name = "raid1c3",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
+ },
+ [BTRFS_RAID_RAID1C4] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 4,
+ .tolerated_failures = 3,
+ .devs_increment = 4,
+ .ncopies = 4,
+ .raid_name = "raid1c4",
+ .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4,
+ .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
+ },
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
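Side note on how the raid1c3/raid1c4 entries above are expected to be reached from userspace (an assumption about btrfs-progs, not something this kernel diff defines): with a new enough progs build, converting metadata would look roughly like 'btrfs balance start -mconvert=raid1c3 <mount>', which is what ends up setting the RAID1C34 incompat flag added further down in this file.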
@@ -297,7 +321,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-struct list_head *btrfs_get_fs_uuids(void)
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
@@ -397,8 +421,6 @@ static struct btrfs_device *__alloc_device(void)
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->post_commit_list);
- spin_lock_init(&dev->io_lock);
-
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
@@ -501,212 +523,6 @@ error:
return ret;
}
-static void requeue_list(struct btrfs_pending_bios *pending_bios,
- struct bio *head, struct bio *tail)
-{
-
- struct bio *old_head;
-
- old_head = pending_bios->head;
- pending_bios->head = head;
- if (pending_bios->tail)
- tail->bi_next = old_head;
- else
- pending_bios->tail = tail;
-}
-
-/*
- * we try to collect pending bios for a device so we don't get a large
- * number of procs sending bios down to the same device. This greatly
- * improves the schedulers ability to collect and merge the bios.
- *
- * But, it also turns into a long list of bios to process and that is sure
- * to eventually make the worker thread block. The solution here is to
- * make some progress and then put this work struct back at the end of
- * the list if the block device is congested. This way, multiple devices
- * can make progress from a single worker thread.
- */
-static noinline void run_scheduled_bios(struct btrfs_device *device)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- struct bio *pending;
- struct backing_dev_info *bdi;
- struct btrfs_pending_bios *pending_bios;
- struct bio *tail;
- struct bio *cur;
- int again = 0;
- unsigned long num_run;
- unsigned long batch_run = 0;
- unsigned long last_waited = 0;
- int force_reg = 0;
- int sync_pending = 0;
- struct blk_plug plug;
-
- /*
- * this function runs all the bios we've collected for
- * a particular device. We don't want to wander off to
- * another device without first sending all of these down.
- * So, setup a plug here and finish it off before we return
- */
- blk_start_plug(&plug);
-
- bdi = device->bdev->bd_bdi;
-
-loop:
- spin_lock(&device->io_lock);
-
-loop_lock:
- num_run = 0;
-
- /* take all the bios off the list at once and process them
- * later on (without the lock held). But, remember the
- * tail and other pointers so the bios can be properly reinserted
- * into the list if we hit congestion
- */
- if (!force_reg && device->pending_sync_bios.head) {
- pending_bios = &device->pending_sync_bios;
- force_reg = 1;
- } else {
- pending_bios = &device->pending_bios;
- force_reg = 0;
- }
-
- pending = pending_bios->head;
- tail = pending_bios->tail;
- WARN_ON(pending && !tail);
-
- /*
- * if pending was null this time around, no bios need processing
- * at all and we can stop. Otherwise it'll loop back up again
- * and do an additional check so no bios are missed.
- *
- * device->running_pending is used to synchronize with the
- * schedule_bio code.
- */
- if (device->pending_sync_bios.head == NULL &&
- device->pending_bios.head == NULL) {
- again = 0;
- device->running_pending = 0;
- } else {
- again = 1;
- device->running_pending = 1;
- }
-
- pending_bios->head = NULL;
- pending_bios->tail = NULL;
-
- spin_unlock(&device->io_lock);
-
- while (pending) {
-
- rmb();
- /* we want to work on both lists, but do more bios on the
- * sync list than the regular list
- */
- if ((num_run > 32 &&
- pending_bios != &device->pending_sync_bios &&
- device->pending_sync_bios.head) ||
- (num_run > 64 && pending_bios == &device->pending_sync_bios &&
- device->pending_bios.head)) {
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- goto loop_lock;
- }
-
- cur = pending;
- pending = pending->bi_next;
- cur->bi_next = NULL;
-
- BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
-
- /*
- * if we're doing the sync list, record that our
- * plug has some sync requests on it
- *
- * If we're doing the regular list and there are
- * sync requests sitting around, unplug before
- * we add more
- */
- if (pending_bios == &device->pending_sync_bios) {
- sync_pending = 1;
- } else if (sync_pending) {
- blk_finish_plug(&plug);
- blk_start_plug(&plug);
- sync_pending = 0;
- }
-
- btrfsic_submit_bio(cur);
- num_run++;
- batch_run++;
-
- cond_resched();
-
- /*
- * we made progress, there is more work to do and the bdi
- * is now congested. Back off and let other work structs
- * run instead
- */
- if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
- fs_info->fs_devices->open_devices > 1) {
- struct io_context *ioc;
-
- ioc = current->io_context;
-
- /*
- * the main goal here is that we don't want to
- * block if we're going to be able to submit
- * more requests without blocking.
- *
- * This code does two great things, it pokes into
- * the elevator code from a filesystem _and_
- * it makes assumptions about how batching works.
- */
- if (ioc && ioc->nr_batch_requests > 0 &&
- time_before(jiffies, ioc->last_waited + HZ/50UL) &&
- (last_waited == 0 ||
- ioc->last_waited == last_waited)) {
- /*
- * we want to go through our batch of
- * requests and stop. So, we copy out
- * the ioc->last_waited time and test
- * against it before looping
- */
- last_waited = ioc->last_waited;
- cond_resched();
- continue;
- }
- spin_lock(&device->io_lock);
- requeue_list(pending_bios, pending, tail);
- device->running_pending = 1;
-
- spin_unlock(&device->io_lock);
- btrfs_queue_work(fs_info->submit_workers,
- &device->work);
- goto done;
- }
- }
-
- cond_resched();
- if (again)
- goto loop;
-
- spin_lock(&device->io_lock);
- if (device->pending_bios.head || device->pending_sync_bios.head)
- goto loop_lock;
- spin_unlock(&device->io_lock);
-
-done:
- blk_finish_plug(&plug);
-}
-
-static void pending_bios_fn(struct btrfs_work *work)
-{
- struct btrfs_device *device;
-
- device = container_of(work, struct btrfs_device, work);
- run_scheduled_bios(device);
-}
-
static bool device_path_matched(const char *path, struct btrfs_device *device)
{
int found;
@@ -818,7 +634,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
} else {
if (bdev_read_only(bdev))
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
@@ -828,7 +644,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
q = bdev_get_queue(bdev);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
device->bdev = bdev;
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
@@ -1005,11 +821,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
*new_device_added = true;
if (disk_super->label[0])
- pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
- disk_super->label, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->label, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
else
- pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
- disk_super->fsid, devid, found_transid, path);
+ pr_info(
+ "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
+ disk_super->fsid, devid, found_transid, path,
+ current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) {
/*
@@ -1295,7 +1115,7 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
- fs_devices->seeding = 0;
+ fs_devices->seeding = false;
return 0;
}
@@ -2048,7 +1868,7 @@ static struct btrfs_device * btrfs_find_next_active_device(
* where this function called, there should be always be another device (or
* this_dev) which is active.
*/
-void btrfs_assign_next_active_device(struct btrfs_device *device,
+void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev)
{
struct btrfs_fs_info *fs_info = device->fs_info;
@@ -2450,11 +2270,11 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
mutex_unlock(&fs_info->chunk_mutex);
- fs_devices->seeding = 0;
+ fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
- fs_devices->rotating = 0;
+ fs_devices->rotating = false;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2649,7 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!blk_queue_nonrot(q))
- fs_devices->rotating = 1;
+ fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
@@ -3177,7 +2997,7 @@ error:
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
u64 chunk_offset)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 bytes_used;
u64 chunk_type;
@@ -3186,27 +3006,28 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
chunk_type = cache->flags;
btrfs_put_block_group(cache);
- if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
- spin_lock(&fs_info->data_sinfo->lock);
- bytes_used = fs_info->data_sinfo->bytes_used;
- spin_unlock(&fs_info->data_sinfo->lock);
+ if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
+ return 0;
+
+ spin_lock(&fs_info->data_sinfo->lock);
+ bytes_used = fs_info->data_sinfo->bytes_used;
+ spin_unlock(&fs_info->data_sinfo->lock);
- if (!bytes_used) {
- struct btrfs_trans_handle *trans;
- int ret;
+ if (!bytes_used) {
+ struct btrfs_trans_handle *trans;
+ int ret;
- trans = btrfs_join_transaction(fs_info->tree_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- ret = btrfs_force_chunk_alloc(trans,
- BTRFS_BLOCK_GROUP_DATA);
- btrfs_end_transaction(trans);
- if (ret < 0)
- return ret;
- return 1;
- }
+ ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans);
+ if (ret < 0)
+ return ret;
+ return 1;
}
+
return 0;
}
@@ -3385,28 +3206,28 @@ static int chunk_profiles_filter(u64 chunk_type,
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh_min = 0;
else
- user_thresh_min = div_factor_fine(cache->key.offset,
- bargs->usage_min);
+ user_thresh_min = div_factor_fine(cache->length,
+ bargs->usage_min);
if (bargs->usage_max == 0)
user_thresh_max = 1;
else if (bargs->usage_max > 100)
- user_thresh_max = cache->key.offset;
+ user_thresh_max = cache->length;
else
- user_thresh_max = div_factor_fine(cache->key.offset,
- bargs->usage_max);
+ user_thresh_max = div_factor_fine(cache->length,
+ bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
ret = 0;
@@ -3418,20 +3239,19 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
- struct btrfs_block_group_cache *cache;
+ struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- chunk_used = btrfs_block_group_used(&cache->item);
+ chunk_used = cache->used;
if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
- user_thresh = cache->key.offset;
+ user_thresh = cache->length;
else
- user_thresh = div_factor_fine(cache->key.offset,
- bargs->usage);
+ user_thresh = div_factor_fine(cache->length, bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
@@ -3844,12 +3664,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
- /* true if exactly one bit set */
- /*
- * Don't use is_power_of_2(unsigned long) because it won't work
- * for the single profile (1ULL << 48) on 32-bit CPUs.
- */
- return flags != 0 && (flags & (flags - 1)) == 0;
+ return has_single_bit_set(flags);
}
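The has_single_bit_set() helper used here is provided by the new misc.h header; a sketch of what it presumably looks like (names and exact form assumed, the point being that it handles profile bits above bit 31 that is_power_of_2(unsigned long) would lose on 32-bit architectures):

/* Assumed shape of the misc.h helpers referenced above. */
static inline bool is_power_of_two_u64(u64 n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static inline bool has_single_bit_set(u64 n)
{
	return is_power_of_two_u64(n);
}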
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -4036,7 +3851,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
int ret;
u64 num_devices;
unsigned seq;
- bool reducing_integrity;
+ bool reducing_redundancy;
int i;
if (btrfs_fs_closing(fs_info) ||
@@ -4119,9 +3934,9 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed)))
- reducing_integrity = true;
+ reducing_redundancy = true;
else
- reducing_integrity = false;
+ reducing_redundancy = false;
/* if we're not converting, the target field is uninitialized */
meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
@@ -4130,13 +3945,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
bctl->data.target : fs_info->avail_data_alloc_bits;
} while (read_seqretry(&fs_info->profiles_lock, seq));
- if (reducing_integrity) {
+ if (reducing_redundancy) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info,
- "balance: force reducing metadata integrity");
+ "balance: force reducing metadata redundancy");
} else {
btrfs_err(fs_info,
- "balance: reduces metadata integrity, use --force if you want this");
+ "balance: reduces metadata redundancy, use --force if you want this");
ret = -EINVAL;
goto out;
}
@@ -4902,6 +4717,14 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
+static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+ if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
+ return;
+
+ btrfs_set_fs_incompat(info, RAID1C34);
+}
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
u64 start, u64 type)
{
@@ -5048,8 +4871,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
- /* round down to number of usable stripes */
- ndevs = round_down(ndevs, devs_increment);
+ /*
+ * Round down to the number of usable stripes; devs_increment can be
+ * any number, so we can't use round_down()
+ */
+ ndevs -= ndevs % devs_increment;
if (ndevs < devs_min) {
ret = -ENOSPC;
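A quick worked example of the comment above: with 8 usable devices and a raid1c3 target (devs_increment == 3 in the table earlier), ndevs -= 8 % 3 leaves 6 devices, i.e. two complete groups of three. The generic round_down() macro only masks with (y - 1), so round_down(8, 3) would evaluate to 8 & ~2 == 8 and silently keep an extra device that cannot form a full group.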
@@ -5165,6 +4991,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
check_raid56_incompat_flag(info, type);
+ check_raid1c34_incompat_flag(info, type);
kfree(devices_info);
return 0;
@@ -5583,12 +5410,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
* replace.
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length,
+ u64 logical, u64 *length_ret,
struct btrfs_bio **bbio_ret)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_bio *bbio;
+ u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
u64 stripe_nr_end;
@@ -5621,7 +5449,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
offset = logical - em->start;
- length = min_t(u64, em->len - offset, length);
+ length = min_t(u64, em->start + em->len - logical, length);
+ *length_ret = length;
stripe_len = map->stripe_len;
/*
@@ -6036,7 +5865,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- *length, bbio_ret);
+ length, bbio_ret);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
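The discard path now clamps the request to the extent it maps and reports the trimmed length back through *length_ret instead of taking the length by value. A hedged standalone C sketch of that clamp-and-write-back shape (not the btrfs types):

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start; uint64_t len; };

/* Illustrative only: clamp [logical, logical + *length_ret) to the extent. */
static int map_for_discard(const struct extent *em, uint64_t logical,
                           uint64_t *length_ret)
{
        if (logical < em->start || logical >= em->start + em->len)
                return -1;              /* outside the extent */
        if (*length_ret > em->start + em->len - logical)
                *length_ret = em->start + em->len - logical;
        return 0;
}

int main(void)
{
        struct extent em = { .start = 4096, .len = 8192 };
        uint64_t len = 16384;

        if (!map_for_discard(&em, 8192, &len))
                printf("trimmed to %llu\n", (unsigned long long)len);  /* 4096 */
        return 0;
}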
@@ -6416,52 +6245,8 @@ static void btrfs_end_bio(struct bio *bio)
}
}
-/*
- * see run_scheduled_bios for a description of why bios are collected for
- * async submit.
- *
- * This will add one bio to the pending list for a device and make sure
- * the work struct is scheduled.
- */
-static noinline void btrfs_schedule_bio(struct btrfs_device *device,
- struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = device->fs_info;
- int should_queue = 1;
- struct btrfs_pending_bios *pending_bios;
-
- /* don't bother with additional async steps for reads, right now */
- if (bio_op(bio) == REQ_OP_READ) {
- btrfsic_submit_bio(bio);
- return;
- }
-
- WARN_ON(bio->bi_next);
- bio->bi_next = NULL;
-
- spin_lock(&device->io_lock);
- if (op_is_sync(bio->bi_opf))
- pending_bios = &device->pending_sync_bios;
- else
- pending_bios = &device->pending_bios;
-
- if (pending_bios->tail)
- pending_bios->tail->bi_next = bio;
-
- pending_bios->tail = bio;
- if (!pending_bios->head)
- pending_bios->head = bio;
- if (device->running_pending)
- should_queue = 0;
-
- spin_unlock(&device->io_lock);
-
- if (should_queue)
- btrfs_queue_work(fs_info->submit_workers, &device->work);
-}
-
static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
- u64 physical, int dev_nr, int async)
+ u64 physical, int dev_nr)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
struct btrfs_fs_info *fs_info = bbio->fs_info;
@@ -6479,10 +6264,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
btrfs_bio_counter_inc_noblocked(fs_info);
- if (async)
- btrfs_schedule_bio(dev, bio);
- else
- btrfsic_submit_bio(bio);
+ btrfsic_submit_bio(bio);
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
@@ -6503,7 +6285,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
}
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit)
+ int mirror_num)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
@@ -6572,7 +6354,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
bio = first_bio;
submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
- dev_nr, async_submit);
+ dev_nr);
}
btrfs_bio_counter_dec(fs_info);
return BLK_STS_OK;
@@ -6676,9 +6458,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, btrfs_submit_helper,
- pending_bios_fn, NULL, NULL);
-
return dev;
}
@@ -6875,7 +6654,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- fs_devices->seeding = 1;
+ fs_devices->seeding = true;
fs_devices->opened = 1;
return fs_devices;
}
@@ -7064,48 +6843,49 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
- if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_array_offset;
- /*
- * At least one btrfs_chunk with one stripe must be
- * present, exact stripe count check comes afterwards
- */
- len = btrfs_chunk_item_size(1);
- if (cur_offset + len > array_size)
- goto out_short_read;
-
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- if (!num_stripes) {
- btrfs_err(fs_info,
- "invalid number of stripes %u in sys_array at offset %u",
- num_stripes, cur_offset);
- ret = -EIO;
- break;
- }
+ if (key.type != BTRFS_CHUNK_ITEM_KEY) {
+ btrfs_err(fs_info,
+ "unexpected item type %u in sys_array at offset %u",
+ (u32)key.type, cur_offset);
+ ret = -EIO;
+ break;
+ }
- type = btrfs_chunk_type(sb, chunk);
- if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
- btrfs_err(fs_info,
- "invalid chunk type %llu in sys_array at offset %u",
- type, cur_offset);
- ret = -EIO;
- break;
- }
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be present,
+ * exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
- len = btrfs_chunk_item_size(num_stripes);
- if (cur_offset + len > array_size)
- goto out_short_read;
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ btrfs_err(fs_info,
+ "invalid number of stripes %u in sys_array at offset %u",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
- ret = read_one_chunk(&key, sb, chunk);
- if (ret)
- break;
- } else {
+ type = btrfs_chunk_type(sb, chunk);
+ if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
- "unexpected item type %u in sys_array at offset %u",
- (u32)key.type, cur_offset);
+ "invalid chunk type %llu in sys_array at offset %u",
+ type, cur_offset);
ret = -EIO;
break;
}
+
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ ret = read_one_chunk(&key, sb, chunk);
+ if (ret)
+ break;
+
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
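The sys_array walk is restructured into guard clauses: the unexpected-item case bails out first, and the chunk-parsing path runs one indentation level shallower. A standalone C sketch of the same shape (the numbers are stand-ins, not the btrfs key values):

#include <stdio.h>

struct item { unsigned int type; unsigned int nr; };

/* Illustrative only: validate-and-bail-early instead of nesting the happy path. */
static int parse(const struct item *items, int count, unsigned int want)
{
        for (int i = 0; i < count; i++) {
                if (items[i].type != want) {
                        fprintf(stderr, "unexpected item type %u\n", items[i].type);
                        return -1;
                }
                if (items[i].nr == 0) {
                        fprintf(stderr, "invalid count at %d\n", i);
                        return -1;
                }
                /* happy path continues here, un-nested */
        }
        return 0;
}

int main(void)
{
        struct item good[] = { { 228, 1 }, { 228, 2 } };

        return parse(good, 2, 228);
}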
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a7da1f3e3627..fc1b564b9cfe 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -18,10 +18,6 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
-struct btrfs_pending_bios {
- struct bio *head;
- struct bio *tail;
-};
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
@@ -68,13 +64,6 @@ struct btrfs_device {
u64 generation;
- spinlock_t io_lock ____cacheline_aligned;
- int running_pending;
- /* regular prio bios */
- struct btrfs_pending_bios pending_bios;
- /* sync bios */
- struct btrfs_pending_bios pending_sync_bios;
-
struct block_device *bdev;
/* the mode sent to blkdev_get */
@@ -254,14 +243,14 @@ struct btrfs_fs_devices {
struct list_head alloc_list;
struct btrfs_fs_devices *seed;
- int seeding;
+ bool seeding;
int opened;
/* set when we find or add a device that doesn't have the
* nonrot flag set
*/
- int rotating;
+ bool rotating;
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
@@ -330,7 +319,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;
@@ -436,7 +424,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
- int mirror_num, int async_submit);
+ int mirror_num);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
@@ -557,6 +545,10 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
+ return BTRFS_RAID_RAID1C3;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
+ return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
@@ -571,7 +563,7 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
-struct list_head *btrfs_get_fs_uuids(void);
+struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index df1aace5df50..a6c90a003c12 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -29,19 +29,9 @@ struct workspace {
static struct workspace_manager wsm;
-static void zlib_init_workspace_manager(void)
+struct list_head *zlib_get_workspace(unsigned int level)
{
- btrfs_init_workspace_manager(&wsm, &btrfs_zlib_compress);
-}
-
-static void zlib_cleanup_workspace_manager(void)
-{
- btrfs_cleanup_workspace_manager(&wsm);
-}
-
-static struct list_head *zlib_get_workspace(unsigned int level)
-{
- struct list_head *ws = btrfs_get_workspace(&wsm, level);
+ struct list_head *ws = btrfs_get_workspace(BTRFS_COMPRESS_ZLIB, level);
struct workspace *workspace = list_entry(ws, struct workspace, list);
workspace->level = level;
@@ -49,12 +39,7 @@ static struct list_head *zlib_get_workspace(unsigned int level)
return ws;
}
-static void zlib_put_workspace(struct list_head *ws)
-{
- btrfs_put_workspace(&wsm, ws);
-}
-
-static void zlib_free_workspace(struct list_head *ws)
+void zlib_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -63,7 +48,7 @@ static void zlib_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zlib_alloc_workspace(unsigned int level)
+struct list_head *zlib_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
int workspacesize;
@@ -88,13 +73,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-static int zlib_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret;
@@ -228,7 +209,7 @@ out:
return ret;
}
-static int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0, ret2;
@@ -319,10 +300,9 @@ done:
return ret;
}
-static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
@@ -419,15 +399,7 @@ next:
}
const struct btrfs_compress_op btrfs_zlib_compress = {
- .init_workspace_manager = zlib_init_workspace_manager,
- .cleanup_workspace_manager = zlib_cleanup_workspace_manager,
- .get_workspace = zlib_get_workspace,
- .put_workspace = zlib_put_workspace,
- .alloc_workspace = zlib_alloc_workspace,
- .free_workspace = zlib_free_workspace,
- .compress_pages = zlib_compress_pages,
- .decompress_bio = zlib_decompress_bio,
- .decompress = zlib_decompress,
+ .workspace_manager = &wsm,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
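After this change the zlib table (and the zstd one below) keeps only data: a workspace_manager pointer plus level limits, with zstd passing NULL because it runs its own manager. A hedged standalone C sketch of the descriptor-plus-dispatch shape this implies (not the btrfs structures; the levels are placeholders):

#include <stdio.h>

struct workspace_manager { int placeholder; };

enum ctype { ZLIB, ZSTD };

/* Illustrative only: per-algorithm descriptor reduced to plain data. */
struct compress_op {
        struct workspace_manager *workspace_manager;    /* NULL: backend runs its own */
        unsigned int max_level;
        unsigned int default_level;
};

static struct workspace_manager generic_wsm;

static const struct compress_op ops[] = {
        [ZLIB] = { &generic_wsm,  9, 3 },
        [ZSTD] = { NULL,         15, 3 },
};

int main(void)
{
        /* callers dispatch on the type instead of calling through the table */
        for (int i = ZLIB; i <= ZSTD; i++)
                printf("type %d: %s workspace manager, max level %u\n", i,
                       ops[i].workspace_manager ? "generic" : "private",
                       ops[i].max_level);
        return 0;
}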
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 764d47b107e5..9a4871636c6c 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -91,9 +91,8 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
-static void zstd_free_workspace(struct list_head *ws);
-static struct list_head *zstd_alloc_workspace(unsigned int level);
-
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_alloc_workspace(unsigned int level);
/*
* zstd_reclaim_timer_fn - reclaim timer
* @t: timer
@@ -168,7 +167,7 @@ static void zstd_calc_ws_mem_sizes(void)
}
}
-static void zstd_init_workspace_manager(void)
+void zstd_init_workspace_manager(void)
{
struct list_head *ws;
int i;
@@ -194,7 +193,7 @@ static void zstd_init_workspace_manager(void)
}
}
-static void zstd_cleanup_workspace_manager(void)
+void zstd_cleanup_workspace_manager(void)
{
struct workspace *workspace;
int i;
@@ -261,7 +260,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-static struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(unsigned int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -302,7 +301,7 @@ again:
* isn't set, it is also set here. Only the max level workspace tries and wakes
* up waiting workspaces.
*/
-static void zstd_put_workspace(struct list_head *ws)
+void zstd_put_workspace(struct list_head *ws)
{
struct workspace *workspace = list_to_workspace(ws);
@@ -332,7 +331,7 @@ static void zstd_put_workspace(struct list_head *ws)
cond_wake_up(&wsm.wait);
}
-static void zstd_free_workspace(struct list_head *ws)
+void zstd_free_workspace(struct list_head *ws)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -341,7 +340,7 @@ static void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(unsigned int level)
{
struct workspace *workspace;
@@ -367,13 +366,9 @@ fail:
return ERR_PTR(-ENOMEM);
}
-static int zstd_compress_pages(struct list_head *ws,
- struct address_space *mapping,
- u64 start,
- struct page **pages,
- unsigned long *out_pages,
- unsigned long *total_in,
- unsigned long *total_out)
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_CStream *stream;
@@ -548,7 +543,7 @@ out:
return ret;
}
-static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
struct page **pages_in = cb->compressed_pages;
@@ -626,10 +621,9 @@ done:
return ret;
}
-static int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page,
- unsigned long start_byte,
- size_t srclen, size_t destlen)
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
ZSTD_DStream *stream;
@@ -712,15 +706,8 @@ finish:
}
const struct btrfs_compress_op btrfs_zstd_compress = {
- .init_workspace_manager = zstd_init_workspace_manager,
- .cleanup_workspace_manager = zstd_cleanup_workspace_manager,
- .get_workspace = zstd_get_workspace,
- .put_workspace = zstd_put_workspace,
- .alloc_workspace = zstd_alloc_workspace,
- .free_workspace = zstd_free_workspace,
- .compress_pages = zstd_compress_pages,
- .decompress_bio = zstd_decompress_bio,
- .decompress = zstd_decompress,
+ /* ZSTD uses own workspace manager */
+ .workspace_manager = NULL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};
diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b979323..d8c7242426bb 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -47,6 +47,9 @@
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
+#include <linux/fscrypt.h>
+
+#include "internal.h"
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -246,10 +249,6 @@ out:
return ret;
}
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
@@ -307,6 +306,47 @@ still_busy:
return;
}
+struct decrypt_bh_ctx {
+ struct work_struct work;
+ struct buffer_head *bh;
+};
+
+static void decrypt_bh(struct work_struct *work)
+{
+ struct decrypt_bh_ctx *ctx =
+ container_of(work, struct decrypt_bh_ctx, work);
+ struct buffer_head *bh = ctx->bh;
+ int err;
+
+ err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
+ bh_offset(bh));
+ end_buffer_async_read(bh, err == 0);
+ kfree(ctx);
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
+{
+ /* Decrypt if needed */
+ if (uptodate && IS_ENABLED(CONFIG_FS_ENCRYPTION) &&
+ IS_ENCRYPTED(bh->b_page->mapping->host) &&
+ S_ISREG(bh->b_page->mapping->host->i_mode)) {
+ struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+
+ if (ctx) {
+ INIT_WORK(&ctx->work, decrypt_bh);
+ ctx->bh = bh;
+ fscrypt_enqueue_decrypt_work(&ctx->work);
+ return;
+ }
+ uptodate = 0;
+ }
+ end_buffer_async_read(bh, uptodate);
+}
+
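Decryption is pushed out of the I/O completion handler into a small context object that fscrypt's work queue completes later, falling back to reporting the read as failed if the allocation does not succeed. A hedged, single-threaded standalone C sketch of that defer-to-worker shape (not the kernel workqueue API):

#include <stdio.h>
#include <stdlib.h>

struct work {
        void (*fn)(struct work *w);
        struct work *next;
};

struct decrypt_ctx {
        struct work work;       /* kept first so the cast below is valid */
        const char *block;      /* stands in for the buffer_head */
};

static struct work *queue_head;

static void queue_work_item(struct work *w)
{
        w->next = queue_head;
        queue_head = w;
}

static void decrypt_work(struct work *w)
{
        struct decrypt_ctx *ctx = (struct decrypt_ctx *)w;

        printf("decrypting %s outside the completion path\n", ctx->block);
        free(ctx);
}

int main(void)
{
        struct decrypt_ctx *ctx = malloc(sizeof(*ctx));

        if (!ctx)
                return 1;       /* the real code instead fails the read */
        ctx->work.fn = decrypt_work;
        ctx->block = "block 0";
        queue_work_item(&ctx->work);

        /* a worker thread would drain this queue; done inline for the sketch */
        while (queue_head) {
                struct work *w = queue_head;

                queue_head = w->next;
                w->fn(w);
        }
        return 0;
}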
/*
* Completion handler for block_write_full_page() - pages which are unlocked
* during I/O, and which have PageWriteback cleared upon I/O completion.
@@ -379,7 +419,7 @@ EXPORT_SYMBOL(end_buffer_async_write);
*/
static void mark_buffer_async_read(struct buffer_head *bh)
{
- bh->b_end_io = end_buffer_async_read;
+ bh->b_end_io = end_buffer_async_read_io;
set_buffer_async_read(bh);
}
@@ -1385,10 +1425,10 @@ static bool has_bh_in_lru(int cpu, void *dummy)
for (i = 0; i < BH_LRU_SIZE; i++) {
if (b->bhs[i])
- return 1;
+ return true;
}
- return 0;
+ return false;
}
void invalidate_bh_lrus(void)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d17a789fd856..2e4764fd1872 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1809,6 +1809,7 @@ const struct file_operations ceph_dir_fops = {
.open = ceph_open,
.release = ceph_release,
.unlocked_ioctl = ceph_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bd77adb64bfd..11929d2bb594 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -753,6 +753,9 @@ static void ceph_aio_complete(struct inode *inode,
if (!atomic_dec_and_test(&aio_req->pending_reqs))
return;
+ if (aio_req->iocb->ki_flags & IOCB_DIRECT)
+ inode_dio_end(inode);
+
ret = aio_req->error;
if (!ret)
ret = aio_req->total_len;
@@ -1091,6 +1094,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
CEPH_CAP_FILE_RD);
list_splice(&aio_req->osd_reqs, &osd_reqs);
+ inode_dio_begin(inode);
while (!list_empty(&osd_reqs)) {
req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
@@ -1264,14 +1268,24 @@ again:
dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_start_io_direct(inode);
+ else
+ ceph_start_io_read(inode);
+
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
&got, &pinned_page);
- if (ret < 0)
+ if (ret < 0) {
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
return ret;
+ }
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) ||
@@ -1283,16 +1297,12 @@ again:
if (ci->i_inline_version == CEPH_INLINE_NONE) {
if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
- ceph_start_io_direct(inode);
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
- ceph_end_io_direct(inode);
if (ret >= 0 && ret < len)
retry_op = CHECK_EOF;
} else {
- ceph_start_io_read(inode);
ret = ceph_sync_read(iocb, to, &retry_op);
- ceph_end_io_read(inode);
}
} else {
retry_op = READ_INLINE;
@@ -1303,11 +1313,10 @@ again:
inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
ceph_cap_string(got));
ceph_add_rw_context(fi, &rw_ctx);
- ceph_start_io_read(inode);
ret = generic_file_read_iter(iocb, to);
- ceph_end_io_read(inode);
ceph_del_rw_context(fi, &rw_ctx);
}
+
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
if (pinned_page) {
@@ -1315,6 +1324,12 @@ again:
pinned_page = NULL;
}
ceph_put_cap_refs(ci, got);
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ ceph_end_io_direct(inode);
+ else
+ ceph_end_io_read(inode);
+
if (retry_op > HAVE_RETRIED && ret >= 0) {
int statret;
struct page *page = NULL;
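The ceph read path now brackets the whole operation: the direct or buffered I/O exclusion is taken before the cap request, paired with inode_dio_begin/end around async direct writes, and dropped on every exit, including the early error return that previously skipped it. A standalone C sketch of that acquire-early, release-on-all-paths ordering:

#include <stdio.h>

/* Illustrative only: the exclusion outlives the cap request on every path. */
static void start_io(int direct) { printf("start %s io\n", direct ? "direct" : "buffered"); }
static void end_io(int direct)   { printf("end %s io\n",   direct ? "direct" : "buffered"); }
static int  get_caps(void)       { return -1; /* pretend the cap request failed */ }

static int do_read(int direct)
{
        int ret;

        start_io(direct);
        ret = get_caps();
        if (ret < 0)
                goto out;       /* error path still drops the exclusion */

        /* ... issue the read while holding caps ... */
        ret = 0;
out:
        end_io(direct);
        return ret;
}

int main(void)
{
        do_read(1);
        return 0;
}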
@@ -2173,7 +2188,7 @@ const struct file_operations ceph_file_fops = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
- .compat_ioctl = ceph_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.fallocate = ceph_fallocate,
.copy_file_range = ceph_copy_file_range,
};
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 0b4eee3bed66..19f6e592b941 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -122,6 +122,27 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
}
static void
+cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan)
+{
+ struct TCP_Server_Info *server = chan->server;
+
+ seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x "
+ "TCP status: %d Instance: %d Local Users To Server: %d "
+ "SecMode: 0x%x Req On Wire: %d In Send: %d "
+ "In MaxReq Wait: %d\n",
+ i+1,
+ server->credits,
+ server->dialect,
+ server->tcpStatus,
+ server->reconnect_instance,
+ server->srv_count,
+ server->sec_mode,
+ in_flight(server),
+ atomic_read(&server->in_send),
+ atomic_read(&server->num_waiters));
+}
+
+static void
cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
{
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
@@ -256,6 +277,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
if (!server->rdma)
goto skip_rdma;
+ if (!server->smbd_conn) {
+ seq_printf(m, "\nSMBDirect transport not available");
+ goto skip_rdma;
+ }
+
seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
"transport status: %x",
server->smbd_conn->protocol,
@@ -360,11 +386,10 @@ skip_rdma:
server->srv_count,
server->sec_mode, in_flight(server));
-#ifdef CONFIG_CIFS_STATS2
seq_printf(m, " In Send: %d In MaxReq Wait: %d",
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
-#endif
+
/* dump session id helpful for use with network trace */
seq_printf(m, " SessionId: 0x%llx", ses->Suid);
if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
@@ -372,6 +397,13 @@ skip_rdma:
if (ses->sign)
seq_puts(m, " signed");
+ if (ses->chan_count > 1) {
+ seq_printf(m, "\n\n\tExtra Channels: %zu\n",
+ ses->chan_count-1);
+ for (j = 1; j < ses->chan_count; j++)
+ cifs_dump_channel(m, j, &ses->chans[j]);
+ }
+
seq_puts(m, "\n\tShares:");
j = 0;
@@ -410,8 +442,13 @@ skip_rdma:
seq_printf(m, "\n\tServer interfaces: %zu\n",
ses->iface_count);
for (j = 0; j < ses->iface_count; j++) {
+ struct cifs_server_iface *iface;
+
+ iface = &ses->iface_list[j];
seq_printf(m, "\t%d)", j);
- cifs_dump_iface(m, &ses->iface_list[j]);
+ cifs_dump_iface(m, iface);
+ if (is_ses_using_iface(ses, iface))
+ seq_puts(m, "\t\t[CONNECTED]\n");
}
spin_unlock(&ses->iface_lock);
}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7f01c6e60791..7b9b876b513b 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -98,7 +98,7 @@ struct key_type cifs_spnego_key_type = {
struct key *
cifs_get_spnego_key(struct cifs_ses *sesInfo)
{
- struct TCP_Server_Info *server = sesInfo->server;
+ struct TCP_Server_Info *server = cifs_ses_server(sesInfo);
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
char *description, *dp;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index f842944a5c76..06ffe52bdcfa 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -39,8 +39,6 @@ static const struct cifs_sid sid_everyone = {
/* security id for Authenticated Users system group */
static const struct cifs_sid sid_authusers = {
1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
-/* group users */
-static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
/* S-1-22-1 Unmapped Unix users */
static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1a135d1b85bd..1d1051d31513 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -119,6 +119,7 @@ extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
struct workqueue_struct *decrypt_wq;
+struct workqueue_struct *fileinfo_put_wq;
struct workqueue_struct *cifsoplockd_wq;
__u32 cifs_lock_secret;
@@ -613,6 +614,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
/* convert actimeo and display it in seconds */
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
+ if (tcon->ses->chan_max > 1)
+ seq_printf(s, ",multichannel,max_channel=%zu",
+ tcon->ses->chan_max);
+
return 0;
}
@@ -1219,6 +1224,7 @@ const struct file_operations cifs_file_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
@@ -1238,6 +1244,7 @@ const struct file_operations cifs_file_strict_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
.mmap = cifs_file_strict_mmap,
@@ -1257,6 +1264,7 @@ const struct file_operations cifs_file_direct_ops = {
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
+ .flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
.mmap = cifs_file_mmap,
@@ -1543,7 +1551,7 @@ init_cifs(void)
/*
* Consider in future setting limit!=0 maybe to min(num_of_cores - 1, 3)
* so that we don't launch too many worker threads but
- * Documentation/workqueue.txt recommends setting it to 0
+ * Documentation/core-api/workqueue.rst recommends setting it to 0
*/
/* WQ_UNBOUND allows decrypt tasks to run on any CPU */
@@ -1554,11 +1562,18 @@ init_cifs(void)
goto out_destroy_cifsiod_wq;
}
+ fileinfo_put_wq = alloc_workqueue("cifsfileinfoput",
+ WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ if (!fileinfo_put_wq) {
+ rc = -ENOMEM;
+ goto out_destroy_decrypt_wq;
+ }
+
cifsoplockd_wq = alloc_workqueue("cifsoplockd",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsoplockd_wq) {
rc = -ENOMEM;
- goto out_destroy_decrypt_wq;
+ goto out_destroy_fileinfo_put_wq;
}
rc = cifs_fscache_register();
@@ -1624,6 +1639,8 @@ out_unreg_fscache:
cifs_fscache_unregister();
out_destroy_cifsoplockd_wq:
destroy_workqueue(cifsoplockd_wq);
+out_destroy_fileinfo_put_wq:
+ destroy_workqueue(fileinfo_put_wq);
out_destroy_decrypt_wq:
destroy_workqueue(decrypt_wq);
out_destroy_cifsiod_wq:
@@ -1653,6 +1670,7 @@ exit_cifs(void)
cifs_fscache_unregister();
destroy_workqueue(cifsoplockd_wq);
destroy_workqueue(decrypt_wq);
+ destroy_workqueue(fileinfo_put_wq);
destroy_workqueue(cifsiod_wq);
cifs_proc_clean();
}
@@ -1663,17 +1681,17 @@ MODULE_DESCRIPTION
("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and "
"also older servers complying with the SNIA CIFS Specification)");
MODULE_VERSION(CIFS_VERSION);
-MODULE_SOFTDEP("pre: ecb");
-MODULE_SOFTDEP("pre: hmac");
-MODULE_SOFTDEP("pre: md4");
-MODULE_SOFTDEP("pre: md5");
-MODULE_SOFTDEP("pre: nls");
-MODULE_SOFTDEP("pre: aes");
-MODULE_SOFTDEP("pre: cmac");
-MODULE_SOFTDEP("pre: sha256");
-MODULE_SOFTDEP("pre: sha512");
-MODULE_SOFTDEP("pre: aead2");
-MODULE_SOFTDEP("pre: ccm");
-MODULE_SOFTDEP("pre: gcm");
+MODULE_SOFTDEP("ecb");
+MODULE_SOFTDEP("hmac");
+MODULE_SOFTDEP("md4");
+MODULE_SOFTDEP("md5");
+MODULE_SOFTDEP("nls");
+MODULE_SOFTDEP("aes");
+MODULE_SOFTDEP("cmac");
+MODULE_SOFTDEP("sha256");
+MODULE_SOFTDEP("sha512");
+MODULE_SOFTDEP("aead2");
+MODULE_SOFTDEP("ccm");
+MODULE_SOFTDEP("gcm");
module_init(init_cifs)
module_exit(exit_cifs)
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bc4ca94137f2..b59dc7478130 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -108,6 +108,7 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
+extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
@@ -152,5 +153,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.23"
+#define CIFS_VERSION "2.24"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d78bfcc19156..d34a4ed8c57d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -230,7 +230,8 @@ struct smb_version_operations {
bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
/* setup request: allocate mid, sign message */
struct mid_q_entry *(*setup_request)(struct cifs_ses *,
- struct smb_rqst *);
+ struct TCP_Server_Info *,
+ struct smb_rqst *);
/* setup async request: allocate mid, sign message */
struct mid_q_entry *(*setup_async_request)(struct TCP_Server_Info *,
struct smb_rqst *);
@@ -268,8 +269,9 @@ struct smb_version_operations {
int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
int (*handle_cancelled_mid)(char *, struct TCP_Server_Info *);
- void (*downgrade_oplock)(struct TCP_Server_Info *,
- struct cifsInodeInfo *, bool);
+ void (*downgrade_oplock)(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -591,6 +593,10 @@ struct smb_vol {
bool resilient:1; /* noresilient not required since not forced for CA */
bool domainauto:1;
bool rdma:1;
+ bool multichannel:1;
+ bool use_client_guid:1;
+ /* reuse existing guid for multichannel */
+ u8 client_guid[SMB2_CLIENT_GUID_SIZE];
unsigned int bsize;
unsigned int rsize;
unsigned int wsize;
@@ -607,6 +613,7 @@ struct smb_vol {
__u64 snapshot_time; /* needed for timewarp tokens */
__u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
+ unsigned int max_channels;
__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
bool rootfs:1; /* if it's a SMB root file system */
};
@@ -736,12 +743,12 @@ struct TCP_Server_Info {
/* Total size of this PDU. Only valid from cifs_demultiplex_thread */
unsigned int pdu_size;
unsigned int total_read; /* total amount of data read in this pass */
+ atomic_t in_send; /* requests trying to send */
+ atomic_t num_waiters; /* blocked waiting to get in sendrecv */
#ifdef CONFIG_CIFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
#ifdef CONFIG_CIFS_STATS2
- atomic_t in_send; /* requests trying to send */
- atomic_t num_waiters; /* blocked waiting to get in sendrecv */
atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */
atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */
__u64 time_per_cmd[NUMBER_OF_SMB2_COMMANDS]; /* total time per cmd */
@@ -953,6 +960,11 @@ struct cifs_server_iface {
struct sockaddr_storage sockaddr;
};
+struct cifs_chan {
+ struct TCP_Server_Info *server;
+ __u8 signkey[SMB3_SIGN_KEY_SIZE];
+};
+
/*
* Session structure. One of these for each uid session with a particular host
*/
@@ -983,12 +995,15 @@ struct cifs_ses {
bool sign; /* is signing required? */
bool need_reconnect:1; /* connection reset, uid now invalid */
bool domainAuto:1;
+ bool binding:1; /* are we binding the session? */
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+ __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE];
+
/*
* Network interfaces available on the server this session is
* connected to.
@@ -1002,8 +1017,37 @@ struct cifs_ses {
struct cifs_server_iface *iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
+
+#define CIFS_MAX_CHANNELS 16
+ struct cifs_chan chans[CIFS_MAX_CHANNELS];
+ size_t chan_count;
+ size_t chan_max;
+ atomic_t chan_seq; /* round robin state */
};
+/*
+ * When binding a new channel, we need to access the channel which isn't fully
+ * established yet (one past the established count)
+ */
+
+static inline
+struct cifs_chan *cifs_ses_binding_channel(struct cifs_ses *ses)
+{
+ if (ses->binding)
+ return &ses->chans[ses->chan_count];
+ else
+ return NULL;
+}
+
+static inline
+struct TCP_Server_Info *cifs_ses_server(struct cifs_ses *ses)
+{
+ if (ses->binding)
+ return ses->chans[ses->chan_count].server;
+ else
+ return ses->server;
+}
+
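While an extra channel is being bound, the session-setup traffic has to go out on the channel one past the established count, which is what cifs_ses_binding_channel() and cifs_ses_server() select. A hedged standalone C sketch of that selection (not the cifs structures):

#include <stdio.h>

struct server { int id; };

#define MAX_CHANNELS 16

struct session {
        struct server *primary;
        struct server *chans[MAX_CHANNELS];
        int chan_count;         /* fully established channels */
        int binding;            /* session setup running on a new channel? */
};

/* Illustrative only: pick the server a request should be sent on. */
static struct server *session_server(struct session *ses)
{
        return ses->binding ? ses->chans[ses->chan_count] : ses->primary;
}

int main(void)
{
        struct server s0 = { 0 }, s1 = { 1 };
        struct session ses = {
                .primary = &s0,
                .chans = { &s0, &s1 },
                .chan_count = 1,
                .binding = 1,
        };

        printf("sending on server %d\n", session_server(&ses)->id);    /* 1 */
        return 0;
}

On the mount side this is driven by the new multichannel and max_channels= options (limited to CIFS_MAX_CHANNELS), which also stop a session from being reused if it was mounted with fewer channels than now requested.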
static inline bool
cap_unix(struct cifs_ses *ses)
{
@@ -1260,11 +1304,14 @@ struct cifsFileInfo {
unsigned int f_flags;
bool invalidHandle:1; /* file closed via session abend */
bool oplock_break_cancelled:1;
+ unsigned int oplock_epoch; /* epoch from the lease break */
+ __u32 oplock_level; /* oplock/lease level from the lease break */
int count;
spinlock_t file_info_lock; /* protects four flag/count fields above */
struct mutex fh_mutex; /* prevents reopen race after dead ses*/
struct cifs_search_info srch_inf;
struct work_struct oplock_break; /* work for oplock breaks */
+ struct work_struct put; /* work for the final part of _put */
};
struct cifs_io_parms {
@@ -1370,7 +1417,8 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
}
struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr,
+ bool offload);
void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_READ_FLG 1
@@ -1405,7 +1453,7 @@ struct cifsInodeInfo {
unsigned int epoch; /* used to track lease state changes */
#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
-#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */
#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
@@ -1524,6 +1572,7 @@ struct mid_q_entry {
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
__u16 credits; /* number of credits consumed by this mid */
+ __u16 credits_received; /* number of credits from the response */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1551,12 +1600,12 @@ struct close_cancelled_open {
struct cifs_fid fid;
struct cifs_tcon *tcon;
struct work_struct work;
+ __u64 mid;
+ __u16 cmd;
};
/* Make code in transport.c a little cleaner by moving
update of optional stats into function below */
-#ifdef CONFIG_CIFS_STATS2
-
static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
{
atomic_inc(&server->in_send);
@@ -1577,26 +1626,12 @@ static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
atomic_dec(&server->num_waiters);
}
+#ifdef CONFIG_CIFS_STATS2
static inline void cifs_save_when_sent(struct mid_q_entry *mid)
{
mid->when_sent = jiffies;
}
#else
-static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
-{
-}
-static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
-{
-}
-
-static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
-{
-}
-
-static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
-{
-}
-
static inline void cifs_save_when_sent(struct mid_q_entry *mid)
{
}
@@ -1907,6 +1942,7 @@ void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
extern const struct slow_work_ops cifs_oplock_break_ops;
extern struct workqueue_struct *cifsiod_wq;
extern struct workqueue_struct *decrypt_wq;
+extern struct workqueue_struct *fileinfo_put_wq;
extern struct workqueue_struct *cifsoplockd_wq;
extern __u32 cifs_lock_secret;
@@ -1937,4 +1973,10 @@ extern struct smb_version_values smb302_values;
#define ALT_SMB311_VERSION_STRING "3.11"
extern struct smb_version_operations smb311_operations;
extern struct smb_version_values smb311_values;
+
+static inline bool is_smb1_server(struct TCP_Server_Info *server)
+{
+ return strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fe597d3d5208..1ed695336f62 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -109,6 +109,7 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
char *in_buf, int flags);
extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
+ struct TCP_Server_Info *,
struct smb_rqst *);
extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
struct smb_rqst *);
@@ -242,6 +243,7 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid,
struct tcon_link *tlink,
struct cifs_pending_open *open);
extern void cifs_del_pending_open(struct cifs_pending_open *open);
+extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb_vol *vol);
extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
extern void cifs_put_tcon(struct cifs_tcon *tcon);
@@ -584,6 +586,12 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
unsigned int *len, unsigned int *offset);
+int cifs_try_adding_channels(struct cifs_ses *ses);
+int cifs_ses_add_channel(struct cifs_ses *ses,
+ struct cifs_server_iface *iface);
+bool is_server_using_iface(struct TCP_Server_Info *server,
+ struct cifs_server_iface *iface);
+bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index ccaa8bad336f..86d1baedf21c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -97,6 +97,7 @@ enum {
Opt_persistent, Opt_nopersistent,
Opt_resilient, Opt_noresilient,
Opt_domainauto, Opt_rdma, Opt_modesid, Opt_rootfs,
+ Opt_multichannel, Opt_nomultichannel,
Opt_compress,
/* Mount options which take numeric value */
@@ -106,7 +107,7 @@ enum {
Opt_min_enc_offload,
Opt_blocksize, Opt_rsize, Opt_wsize, Opt_actimeo,
Opt_echo_interval, Opt_max_credits, Opt_handletimeout,
- Opt_snapshot,
+ Opt_snapshot, Opt_max_channels,
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
@@ -199,6 +200,8 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_noresilient, "noresilienthandles"},
{ Opt_domainauto, "domainauto"},
{ Opt_rdma, "rdma"},
+ { Opt_multichannel, "multichannel" },
+ { Opt_nomultichannel, "nomultichannel" },
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -218,6 +221,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_echo_interval, "echo_interval=%s" },
{ Opt_max_credits, "max_credits=%s" },
{ Opt_snapshot, "snapshot=%s" },
+ { Opt_max_channels, "max_channels=%s" },
{ Opt_compress, "compress=%s" },
{ Opt_blank_user, "user=" },
@@ -387,7 +391,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DFS_UPCALL
struct super_cb_data {
struct TCP_Server_Info *server;
- struct cifs_sb_info *cifs_sb;
+ struct super_block *sb;
};
/* These functions must be called with server->srv_mutex held */
@@ -398,25 +402,39 @@ static void super_cb(struct super_block *sb, void *arg)
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- if (d->cifs_sb)
+ if (d->sb)
return;
cifs_sb = CIFS_SB(sb);
tcon = cifs_sb_master_tcon(cifs_sb);
if (tcon->ses->server == d->server)
- d->cifs_sb = cifs_sb;
+ d->sb = sb;
}
-static inline struct cifs_sb_info *
-find_super_by_tcp(struct TCP_Server_Info *server)
+static struct super_block *get_tcp_super(struct TCP_Server_Info *server)
{
struct super_cb_data d = {
.server = server,
- .cifs_sb = NULL,
+ .sb = NULL,
};
iterate_supers_type(&cifs_fs_type, super_cb, &d);
- return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+
+ if (unlikely(!d.sb))
+ return ERR_PTR(-ENOENT);
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * from expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(d.sb);
+ return d.sb;
+}
+
+static inline void put_tcp_super(struct super_block *sb)
+{
+ if (!IS_ERR_OR_NULL(sb))
+ cifs_sb_deactive(sb);
}
static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
@@ -480,6 +498,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
struct mid_q_entry *mid_entry;
struct list_head retry_list;
#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
struct dfs_cache_tgt_list tgt_list = {0};
struct dfs_cache_tgt_iterator *tgt_it = NULL;
@@ -489,13 +508,15 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->nr_targets = 1;
#ifdef CONFIG_CIFS_DFS_UPCALL
spin_unlock(&GlobalMid_Lock);
- cifs_sb = find_super_by_tcp(server);
- if (IS_ERR(cifs_sb)) {
- rc = PTR_ERR(cifs_sb);
+ sb = get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
__func__, rc);
- cifs_sb = NULL;
+ sb = NULL;
} else {
+ cifs_sb = CIFS_SB(sb);
+
rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
if (rc && (rc != -EOPNOTSUPP)) {
cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
@@ -512,6 +533,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* the demux thread will exit normally
next time through the loop */
spin_unlock(&GlobalMid_Lock);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_free_tgts(&tgt_list);
+ put_tcp_super(sb);
+#endif
return rc;
} else
server->tcpStatus = CifsNeedReconnect;
@@ -638,7 +663,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
__func__, rc);
}
dfs_cache_free_tgts(&tgt_list);
+
}
+
+ put_tcp_super(sb);
#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
@@ -905,6 +933,20 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
spin_unlock(&GlobalMid_Lock);
}
+static unsigned int
+smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return 0;
+
+ return le16_to_cpu(shdr->CreditRequest);
+}
+
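Credits come back in the SMB2 header's CreditRequest field, which is little-endian on the wire; SMB1 responses (recognized by their preamble) carry no credits, so the helper returns 0 there. A tiny standalone C sketch of the byte extraction:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: read a 16-bit little-endian field from a raw buffer. */
static uint16_t get_le16(const unsigned char *p)
{
        return (uint16_t)(p[0] | ((uint16_t)p[1] << 8));
}

int main(void)
{
        /* pretend these are the two CreditRequest bytes of a response */
        unsigned char hdr[2] = { 0x10, 0x00 };

        printf("credits granted: %u\n", get_le16(hdr));         /* 16 */
        return 0;
}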
static void
handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -912,6 +954,7 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
if (server->ops->check_trans2 &&
server->ops->check_trans2(mid, server, buf, malformed))
return;
+ mid->credits_received = smb2_get_credits_from_hdr(buf, server);
mid->resp_buf = buf;
mid->large_buf = server->large_buf;
/* Was previous buf put in mpx struct for multi-rsp? */
@@ -1222,12 +1265,6 @@ next_pdu:
for (i = 0; i < num_mids; i++) {
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
- if ((mids[i]->mid_flags & MID_WAIT_CANCELLED) &&
- mids[i]->mid_state == MID_RESPONSE_RECEIVED &&
- server->ops->handle_cancelled_mid)
- server->ops->handle_cancelled_mid(
- mids[i]->resp_buf,
- server);
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -1672,6 +1709,10 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT;
+ /* default to no multichannel (single server connection) */
+ vol->multichannel = false;
+ vol->max_channels = 1;
+
if (!mountdata)
goto cifs_parse_mount_err;
@@ -1965,6 +2006,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_rdma:
vol->rdma = true;
break;
+ case Opt_multichannel:
+ vol->multichannel = true;
+ break;
+ case Opt_nomultichannel:
+ vol->multichannel = false;
+ break;
case Opt_compress:
vol->compression = UNKNOWN_TYPE;
cifs_dbg(VFS,
@@ -2128,6 +2175,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
vol->max_credits = option;
break;
+ case Opt_max_channels:
+ if (get_option_ul(args, &option) || option < 1 ||
+ option > CIFS_MAX_CHANNELS) {
+ cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n",
+ __func__, CIFS_MAX_CHANNELS);
+ goto cifs_parse_mount_err;
+ }
+ vol->max_channels = option;
+ break;
/* String Arguments */
@@ -2713,7 +2769,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
send_sig(SIGKILL, task, 1);
}
-static struct TCP_Server_Info *
+struct TCP_Server_Info *
cifs_get_tcp_session(struct smb_vol *volume_info)
{
struct TCP_Server_Info *tcp_ses = NULL;
@@ -2772,7 +2828,11 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
sizeof(tcp_ses->dstaddr));
- generate_random_uuid(tcp_ses->client_guid);
+ if (volume_info->use_client_guid)
+ memcpy(tcp_ses->client_guid, volume_info->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+ else
+ generate_random_uuid(tcp_ses->client_guid);
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2861,6 +2921,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
vol->sectype != ses->sectype)
return 0;
+ /*
+ * If an existing session is limited to fewer channels than
+ * requested, it should not be reused
+ */
+ if (ses->chan_max < vol->max_channels)
+ return 0;
+
switch (ses->sectype) {
case Kerberos:
if (!uid_eq(vol->cred_uid, ses->cred_uid))
@@ -3031,6 +3098,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* close any extra channels */
+ if (ses->chan_count > 1) {
+ int i;
+
+ for (i = 1; i < ses->chan_count; i++)
+ cifs_put_tcp_session(ses->chans[i].server, 0);
+ }
+
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
}
@@ -3277,14 +3352,25 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->sectype = volume_info->sectype;
ses->sign = volume_info->sign;
mutex_lock(&ses->session_mutex);
+
+ /* add server as first channel */
+ ses->chans[0].server = server;
+ ses->chan_count = 1;
+ ses->chan_max = volume_info->multichannel ? volume_info->max_channels:1;
+
rc = cifs_negotiate_protocol(xid, ses);
if (!rc)
rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+
+ /* each channel uses a different signing key */
+ memcpy(ses->chans[0].signkey, ses->smb3signingkey,
+ sizeof(ses->smb3signingkey));
+
mutex_unlock(&ses->session_mutex);
if (rc)
goto get_ses_fail;
- /* success, put it on the list */
+ /* success, put it on the list and add it as first channel */
spin_lock(&cifs_tcp_ses_lock);
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -4700,6 +4786,17 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
}
#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline void set_root_tcon(struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon,
+ struct cifs_tcon **root)
+{
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ tcon->remap = cifs_remap(cifs_sb);
+ spin_unlock(&cifs_tcp_ses_lock);
+ *root = tcon;
+}
+
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
{
int rc = 0;
@@ -4801,18 +4898,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
/* Cache out resolved root server */
(void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
root_path + 1, NULL, NULL);
- /*
- * Save root tcon for additional DFS requests to update or create a new
- * DFS cache entry, or even perform DFS failover.
- */
- spin_lock(&cifs_tcp_ses_lock);
- tcon->tc_count++;
- tcon->dfs_path = root_path;
+ kfree(root_path);
root_path = NULL;
- tcon->remap = cifs_remap(cifs_sb);
- spin_unlock(&cifs_tcp_ses_lock);
- root_tcon = tcon;
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
for (count = 1; ;) {
if (!rc && tcon) {
@@ -4849,6 +4938,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
mount_put_conns(cifs_sb, xid, server, ses, tcon);
rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
&tcon);
+ /*
+ * Ensure that DFS referrals go through new root server.
+ */
+ if (!rc && tcon &&
+ (tcon->share_flags & (SHI1005_FLAGS_DFS |
+ SHI1005_FLAGS_DFS_ROOT))) {
+ cifs_put_tcon(root_tcon);
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
+ }
}
if (rc) {
if (rc == -EACCES || rc == -EOPNOTSUPP)
@@ -4897,6 +4995,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
cifs_autodisable_serverino(cifs_sb);
out:
free_xid(xid);
+ cifs_try_adding_channels(ses);
return mount_setup_tlink(cifs_sb, ses, tcon);
error:
@@ -5142,7 +5241,7 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses)
{
int rc = 0;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
@@ -5169,23 +5268,25 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
- struct TCP_Server_Info *server = ses->server;
-
- ses->capabilities = server->capabilities;
- if (linuxExtEnabled == 0)
- ses->capabilities &= (~server->vals->cap_unix);
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
+
+ if (!ses->binding) {
+ ses->capabilities = server->capabilities;
+ if (linuxExtEnabled == 0)
+ ses->capabilities &= (~server->vals->cap_unix);
+
+ if (ses->auth_key.response) {
+ cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
+ ses->auth_key.response);
+ kfree(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ ses->auth_key.len = 0;
+ }
+ }
cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
server->sec_mode, server->capabilities, server->timeAdj);
- if (ses->auth_key.response) {
- cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
- ses->auth_key.response);
- kfree(ses->auth_key.response);
- ses->auth_key.response = NULL;
- ses->auth_key.len = 0;
- }
-
if (server->ops->sess_setup)
rc = server->ops->sess_setup(xid, ses, nls_info);
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index 1692c0c6c23a..2faa05860a48 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1317,7 +1317,6 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *devname = NULL;
- bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct smb_vol vol;
@@ -1344,7 +1343,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3);
+ rc = cifs_setup_volume_info(&vol, mdata, devname, false);
kfree(devname);
if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7ce689d31aa2..f3b79012ff29 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -244,10 +244,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
*oplock = REQ_OPLOCK;
full_path = build_path_from_dentry(direntry);
- if (full_path == NULL) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!full_path)
+ return -ENOMEM;
if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7b0fa72bb3..f1fe9c44d298 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -288,6 +288,8 @@ cifs_down_write(struct rw_semaphore *sem)
msleep(10);
}
+static void cifsFileInfo_put_work(struct work_struct *work);
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -325,6 +327,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
cfile->invalidHandle = false;
cfile->tlink = cifs_get_tlink(tlink);
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
+ INIT_WORK(&cfile->put, cifsFileInfo_put_work);
mutex_init(&cfile->fh_mutex);
spin_lock_init(&cfile->file_info_lock);
@@ -375,6 +378,41 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
return cifs_file;
}
+static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file)
+{
+ struct inode *inode = d_inode(cifs_file->dentry);
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct cifsLockInfo *li, *tmp;
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Delete any outstanding lock records. We'll lose them when the file
+ * is closed anyway.
+ */
+ cifs_down_write(&cifsi->lock_sem);
+ list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
+ list_del(&li->llist);
+ cifs_del_lock_waiters(li);
+ kfree(li);
+ }
+ list_del(&cifs_file->llist->llist);
+ kfree(cifs_file->llist);
+ up_write(&cifsi->lock_sem);
+
+ cifs_put_tlink(cifs_file->tlink);
+ dput(cifs_file->dentry);
+ cifs_sb_deactive(sb);
+ kfree(cifs_file);
+}
+
+static void cifsFileInfo_put_work(struct work_struct *work)
+{
+ struct cifsFileInfo *cifs_file = container_of(work,
+ struct cifsFileInfo, put);
+
+ cifsFileInfo_put_final(cifs_file);
+}
+
/**
* cifsFileInfo_put - release a reference of file priv data
*
@@ -382,15 +420,15 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
*/
void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
{
- _cifsFileInfo_put(cifs_file, true);
+ _cifsFileInfo_put(cifs_file, true, true);
}
/**
* _cifsFileInfo_put - release a reference of file priv data
*
* This may involve closing the filehandle @cifs_file out on the
- * server. Must be called without holding tcon->open_file_lock and
- * cifs_file->file_info_lock.
+ * server. Must be called without holding tcon->open_file_lock,
+ * cinode->open_file_lock and cifs_file->file_info_lock.
*
* If @wait_for_oplock_handler is true and we are releasing the last
* reference, wait for any running oplock break handler of the file
@@ -398,7 +436,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
* oplock break handler, you need to pass false.
*
*/
-void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
+ bool wait_oplock_handler, bool offload)
{
struct inode *inode = d_inode(cifs_file->dentry);
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
@@ -406,7 +445,6 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
struct cifsInodeInfo *cifsi = CIFS_I(inode);
struct super_block *sb = inode->i_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
bool oplock_break_cancelled;
@@ -467,24 +505,10 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
cifs_del_pending_open(&open);
- /*
- * Delete any outstanding lock records. We'll lose them when the file
- * is closed anyway.
- */
- cifs_down_write(&cifsi->lock_sem);
- list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
- list_del(&li->llist);
- cifs_del_lock_waiters(li);
- kfree(li);
- }
- list_del(&cifs_file->llist->llist);
- kfree(cifs_file->llist);
- up_write(&cifsi->lock_sem);
-
- cifs_put_tlink(cifs_file->tlink);
- dput(cifs_file->dentry);
- cifs_sb_deactive(sb);
- kfree(cifs_file);
+ if (offload)
+ queue_work(fileinfo_put_wq, &cifs_file->put);
+ else
+ cifsFileInfo_put_final(cifs_file);
}
int cifs_open(struct inode *inode, struct file *file)
@@ -728,6 +752,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
+ /* O_SYNC also has bit for O_DSYNC so following check picks up either */
+ if (cfile->f_flags & O_SYNC)
+ create_options |= CREATE_WRITE_THROUGH;
+
+ if (cfile->f_flags & O_DIRECT)
+ create_options |= CREATE_NO_BUFFER;
+
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
@@ -808,7 +839,7 @@ reopen_error_exit:
int cifs_close(struct inode *inode, struct file *file)
{
if (file->private_data != NULL) {
- cifsFileInfo_put(file->private_data);
+ _cifsFileInfo_put(file->private_data, true, false);
file->private_data = NULL;
}
@@ -1681,7 +1712,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
rc = server->ops->mand_unlock_range(cfile, flock, xid);
out:
- if (flock->fl_flags & FL_POSIX) {
+ if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) {
/*
* If this is a request to remove all locks because we
* are closing the file, it doesn't matter if the
@@ -1698,6 +1729,52 @@ out:
return rc;
}
+int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ int rc, xid;
+ int lock = 0, unlock = 0;
+ bool wait_flag = false;
+ bool posix_lck = false;
+ struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
+ struct cifsFileInfo *cfile;
+ __u32 type;
+
+ rc = -EACCES;
+ xid = get_xid();
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+
+ cfile = (struct cifsFileInfo *)file->private_data;
+ tcon = tlink_tcon(cfile->tlink);
+
+ cifs_read_flock(fl, &type, &lock, &unlock, &wait_flag,
+ tcon->ses->server);
+ cifs_sb = CIFS_FILE_SB(file);
+
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ posix_lck = true;
+
+ if (!lock && !unlock) {
+ /*
+		 * if this is neither a lock nor an unlock request, there is
+		 * nothing to do since we do not know what it is
+ */
+ free_xid(xid);
+ return -EOPNOTSUPP;
+ }
+
+ rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock,
+ xid);
+ free_xid(xid);
+ return rc;
+}
+
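/*
 * A minimal sketch of how the new flock handler above would be exposed to
 * the VFS.  Which file_operations instances in cifsfs.c gain the .flock
 * member is an assumption here; only the handler itself appears in this
 * hunk.
 */
static const struct file_operations example_cifs_file_ops = {
	.lock	= cifs_lock,	/* fcntl()/POSIX byte-range locks */
	.flock	= cifs_flock,	/* flock() whole-file advisory locks */
	/* remaining members (read/write/open/release/...) as before */
};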
int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
{
int rc, xid;
@@ -2757,9 +2834,17 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
if (!rc) {
if (wdata->cfile->invalidHandle)
rc = -EAGAIN;
- else
+ else {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (wdata->mr) {
+ wdata->mr->need_invalidate = true;
+ smbd_deregister_mr(wdata->mr);
+ wdata->mr = NULL;
+ }
+#endif
rc = server->ops->async_writev(wdata,
cifs_uncached_writedata_release);
+ }
}
/* If the write was successfully sent, we are done */
@@ -3482,8 +3567,16 @@ static int cifs_resend_rdata(struct cifs_readdata *rdata,
if (!rc) {
if (rdata->cfile->invalidHandle)
rc = -EAGAIN;
- else
+ else {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ if (rdata->mr) {
+ rdata->mr->need_invalidate = true;
+ smbd_deregister_mr(rdata->mr);
+ rdata->mr = NULL;
+ }
+#endif
rc = server->ops->async_readv(rdata);
+ }
}
/* If the read was successfully sent, we are done */
@@ -4637,12 +4730,13 @@ void cifs_oplock_break(struct work_struct *work)
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ bool purge_cache = false;
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
- server->ops->downgrade_oplock(server, cinode,
- test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+ server->ops->downgrade_oplock(server, cinode, cfile->oplock_level,
+ cfile->oplock_epoch, &purge_cache);
if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
cifs_has_mand_locks(cinode)) {
@@ -4657,18 +4751,21 @@ void cifs_oplock_break(struct work_struct *work)
else
break_lease(inode, O_WRONLY);
rc = filemap_fdatawrite(inode->i_mapping);
- if (!CIFS_CACHE_READ(cinode)) {
+ if (!CIFS_CACHE_READ(cinode) || purge_cache) {
rc = filemap_fdatawait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
cifs_zap_mapping(inode);
}
cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
+ if (CIFS_CACHE_WRITE(cinode))
+ goto oplock_break_ack;
}
rc = cifs_push_locks(cfile);
if (rc)
cifs_dbg(VFS, "Push locks rc = %d\n", rc);
+oplock_break_ack:
/*
* releasing stale oplock after recent reconnect of smb session using
* a now incorrect file handle is not a data integrity issue but do
@@ -4680,7 +4777,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
- _cifsFileInfo_put(cfile, false /* do not wait for ourself */);
+ _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false);
cifs_done_oplock_break(cinode);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index df9377828e2f..8a76195e8a69 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -727,22 +727,138 @@ static __u64 simple_hashstr(const char *str)
return hash;
}
+/**
+ * cifs_backup_query_path_info - SMB1 fallback code to get ino
+ *
+ * Fallback code to get file metadata when we don't have access to
+ * @full_path (EACCES) and have backup creds.
+ *
+ * @data will be set to the search info result buffer.
+ * @resp_buf will be set to the cifs response buffer and needs to be freed
+ * with cifs_buf_release() when done with @data.
+ */
+static int
+cifs_backup_query_path_info(int xid,
+ struct cifs_tcon *tcon,
+ struct super_block *sb,
+ const char *full_path,
+ void **resp_buf,
+ FILE_ALL_INFO **data)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_search_info info = {0};
+ u16 flags;
+ int rc;
+
+ *resp_buf = NULL;
+ info.endOfSearch = false;
+ if (tcon->unix_ext)
+ info.info_level = SMB_FIND_FILE_UNIX;
+ else if ((tcon->ses->capabilities &
+ tcon->ses->server->vals->cap_nt_find) == 0)
+ info.info_level = SMB_FIND_FILE_INFO_STANDARD;
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ info.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+ else /* no srvino useful for fallback to some netapp */
+	else /* no srvino; useful fallback for some NetApp servers */
+
+ flags = CIFS_SEARCH_CLOSE_ALWAYS |
+ CIFS_SEARCH_CLOSE_AT_END |
+ CIFS_SEARCH_BACKUP_SEARCH;
+
+ rc = CIFSFindFirst(xid, tcon, full_path,
+ cifs_sb, NULL, flags, &info, false);
+ if (rc)
+ return rc;
+
+ *resp_buf = (void *)info.ntwrk_buf_start;
+ *data = (FILE_ALL_INFO *)info.srch_entries_start;
+ return 0;
+}
+
+static void
+cifs_set_fattr_ino(int xid,
+ struct cifs_tcon *tcon,
+ struct super_block *sb,
+ struct inode **inode,
+ const char *full_path,
+ FILE_ALL_INFO *data,
+ struct cifs_fattr *fattr)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+ int rc;
+
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+ if (*inode)
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ else
+ fattr->cf_uniqueid = iunique(sb, ROOT_I);
+ return;
+ }
+
+ /*
+	 * If we have an inode, pass a NULL tcon to ensure we don't
+ * make a round trip to the server. This only works for SMB2+.
+ */
+ rc = server->ops->get_srv_inum(xid,
+ *inode ? NULL : tcon,
+ cifs_sb, full_path,
+ &fattr->cf_uniqueid,
+ data);
+ if (rc) {
+ /*
+		 * If that fails, reuse the existing ino or generate one
+		 * and disable server inode numbers
+ */
+ if (*inode)
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ else {
+ fattr->cf_uniqueid = iunique(sb, ROOT_I);
+ cifs_autodisable_serverino(cifs_sb);
+ }
+ return;
+ }
+
+ /* If no errors, check for zero root inode (invalid) */
+ if (fattr->cf_uniqueid == 0 && strlen(full_path) == 0) {
+ cifs_dbg(FYI, "Invalid (0) inodenum\n");
+ if (*inode) {
+ /* reuse */
+ fattr->cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ } else {
+ /* make an ino by hashing the UNC */
+ fattr->cf_flags |= CIFS_FATTR_FAKE_ROOT_INO;
+ fattr->cf_uniqueid = simple_hashstr(tcon->treeName);
+ }
+ }
+}
+
+static inline bool is_inode_cache_good(struct inode *ino)
+{
+ return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0;
+}
+
int
-cifs_get_inode_info(struct inode **inode, const char *full_path,
- FILE_ALL_INFO *data, struct super_block *sb, int xid,
+cifs_get_inode_info(struct inode **inode,
+ const char *full_path,
+ FILE_ALL_INFO *in_data,
+ struct super_block *sb, int xid,
const struct cifs_fid *fid)
{
- __u16 srchflgs;
- int rc = 0, tmprc = ENOSYS;
+
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- char *buf = NULL;
bool adjust_tz = false;
- struct cifs_fattr fattr;
- struct cifs_search_info *srchinf = NULL;
+ struct cifs_fattr fattr = {0};
bool symlink = false;
+ FILE_ALL_INFO *data = in_data;
+ FILE_ALL_INFO *tmp_data = NULL;
+ void *smb1_backup_rsp_buf = NULL;
+ int rc = 0;
+ int tmprc = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -750,142 +866,88 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
tcon = tlink_tcon(tlink);
server = tcon->ses->server;
- cifs_dbg(FYI, "Getting info on %s\n", full_path);
+ /*
+ * 1. Fetch file metadata if not provided (data)
+ */
- if ((data == NULL) && (*inode != NULL)) {
- if (CIFS_CACHE_READ(CIFS_I(*inode)) &&
- CIFS_I(*inode)->time != 0) {
+ if (!data) {
+ if (is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
- goto cgii_exit;
- }
- }
-
- /* if inode info is not passed, get it from server */
- if (data == NULL) {
- if (!server->ops->query_path_info) {
- rc = -ENOSYS;
- goto cgii_exit;
+ goto out;
}
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
+ tmp_data = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+ if (!tmp_data) {
rc = -ENOMEM;
- goto cgii_exit;
+ goto out;
}
- data = (FILE_ALL_INFO *)buf;
- rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
- data, &adjust_tz, &symlink);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, tmp_data,
+ &adjust_tz, &symlink);
+ data = tmp_data;
}
- if (!rc) {
- cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz,
- symlink);
- } else if (rc == -EREMOTE) {
+ /*
+ * 2. Convert it to internal cifs metadata (fattr)
+ */
+
+ switch (rc) {
+ case 0:
+ cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink);
+ break;
+ case -EREMOTE:
+ /* DFS link, no metadata available on this server */
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
- } else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
- (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
- == 0)) {
+ break;
+ case -EACCES:
/*
- * For SMB2 and later the backup intent flag is already
- * sent if needed on open and there is no path based
- * FindFirst operation to use to retry with
+		 * permission error: try again with backup flags if possible
+		 *
+		 * For SMB2 and later the backup intent flag
+		 * is already sent if needed on open and there
+		 * is no path-based FindFirst operation to
+		 * retry with
*/
+ if (backup_cred(cifs_sb) && is_smb1_server(server)) {
+ /* for easier reading */
+ FILE_DIRECTORY_INFO *fdi;
+ SEARCH_ID_FULL_DIR_INFO *si;
+
+ rc = cifs_backup_query_path_info(xid, tcon, sb,
+ full_path,
+ &smb1_backup_rsp_buf,
+ &data);
+ if (rc)
+ goto out;
- srchinf = kzalloc(sizeof(struct cifs_search_info),
- GFP_KERNEL);
- if (srchinf == NULL) {
- rc = -ENOMEM;
- goto cgii_exit;
- }
+ fdi = (FILE_DIRECTORY_INFO *)data;
+ si = (SEARCH_ID_FULL_DIR_INFO *)data;
- srchinf->endOfSearch = false;
- if (tcon->unix_ext)
- srchinf->info_level = SMB_FIND_FILE_UNIX;
- else if ((tcon->ses->capabilities &
- tcon->ses->server->vals->cap_nt_find) == 0)
- srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
- else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
- else /* no srvino useful for fallback to some netapp */
- srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
-
- srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
- CIFS_SEARCH_CLOSE_AT_END |
- CIFS_SEARCH_BACKUP_SEARCH;
-
- rc = CIFSFindFirst(xid, tcon, full_path,
- cifs_sb, NULL, srchflgs, srchinf, false);
- if (!rc) {
- data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
+ cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb);
+ fattr.cf_uniqueid = le64_to_cpu(si->UniqueId);
+ /* uniqueid set, skip get inum step */
+ goto handle_mnt_opt;
+ } else {
+ /* nothing we can do, bail out */
+ goto out;
+ }
+ break;
+ default:
+ cifs_dbg(FYI, "%s: unhandled err rc %d\n", __func__, rc);
+ goto out;
+ }
- cifs_dir_info_to_fattr(&fattr,
- (FILE_DIRECTORY_INFO *)data, cifs_sb);
- fattr.cf_uniqueid = le64_to_cpu(
- ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+ /*
+ * 3. Get or update inode number (fattr.cf_uniqueid)
+ */
- cifs_buf_release(srchinf->ntwrk_buf_start);
- }
- kfree(srchinf);
- if (rc)
- goto cgii_exit;
- } else
- goto cgii_exit;
+ cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr);
/*
- * If an inode wasn't passed in, then get the inode number
- *
- * Is an i_ino of zero legal? Can we use that to check if the server
- * supports returning inode numbers? Are there other sanity checks we
- * can use to ensure that the server is really filling in that field?
+ * 4. Tweak fattr based on mount options
*/
- if (*inode == NULL) {
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- if (server->ops->get_srv_inum)
- tmprc = server->ops->get_srv_inum(xid,
- tcon, cifs_sb, full_path,
- &fattr.cf_uniqueid, data);
- if (tmprc) {
- cifs_dbg(FYI, "GetSrvInodeNum rc %d\n",
- tmprc);
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- cifs_autodisable_serverino(cifs_sb);
- } else if ((fattr.cf_uniqueid == 0) &&
- strlen(full_path) == 0) {
- /* some servers ret bad root ino ie 0 */
- cifs_dbg(FYI, "Invalid (0) inodenum\n");
- fattr.cf_flags |=
- CIFS_FATTR_FAKE_ROOT_INO;
- fattr.cf_uniqueid =
- simple_hashstr(tcon->treeName);
- }
- } else
- fattr.cf_uniqueid = iunique(sb, ROOT_I);
- } else {
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- && server->ops->get_srv_inum) {
- /*
- * Pass a NULL tcon to ensure we don't make a round
- * trip to the server. This only works for SMB2+.
- */
- tmprc = server->ops->get_srv_inum(xid,
- NULL, cifs_sb, full_path,
- &fattr.cf_uniqueid, data);
- if (tmprc)
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- else if ((fattr.cf_uniqueid == 0) &&
- strlen(full_path) == 0) {
- /*
- * Reuse existing root inode num since
- * inum zero for root causes ls of . and .. to
- * not be returned
- */
- cifs_dbg(FYI, "Srv ret 0 inode num for root\n");
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- }
- } else
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
- }
+handle_mnt_opt:
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -900,8 +962,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
full_path, fid);
if (rc) {
cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
- __func__, rc);
- goto cgii_exit;
+ __func__, rc);
+ goto out;
}
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
@@ -909,7 +971,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
if (rc) {
cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
__func__, rc);
- goto cgii_exit;
+ goto out;
}
}
@@ -925,6 +987,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
+ /*
+ * 5. Update inode with final fattr data
+ */
+
if (!*inode) {
*inode = cifs_iget(sb, &fattr);
if (!*inode)
@@ -937,7 +1003,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
- goto cgii_exit;
+ goto out;
}
/* if filetype is different, return error */
@@ -945,18 +1011,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
(fattr.cf_mode & S_IFMT))) {
CIFS_I(*inode)->time = 0; /* force reval */
rc = -ESTALE;
- goto cgii_exit;
+ goto out;
}
cifs_fattr_to_inode(*inode, &fattr);
}
-
-cgii_exit:
- if ((*inode) && ((*inode)->i_ino == 0))
- cifs_dbg(FYI, "inode number of zero returned\n");
-
- kfree(buf);
+out:
+ cifs_buf_release(smb1_backup_rsp_buf);
cifs_put_tlink(tlink);
+ kfree(tmp_data);
return rc;
}
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 5ad83bdb9bea..40ca394fd5de 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -488,21 +488,10 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&pCifsInode->flags);
- /*
- * Set flag if the server downgrades the oplock
- * to L2 else clear.
- */
- if (pSMB->OplockLevel)
- set_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &pCifsInode->flags);
- else
- clear_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &pCifsInode->flags);
-
- cifs_queue_oplock_break(netfile);
+ netfile->oplock_epoch = 0;
+ netfile->oplock_level = pSMB->OplockLevel;
netfile->oplock_break_cancelled = false;
+ cifs_queue_oplock_break(netfile);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 85bd644f9773..fb3bdc44775c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -31,6 +31,231 @@
#include <linux/utsname.h>
#include <linux/slab.h>
#include "cifs_spnego.h"
+#include "smb2proto.h"
+
+bool
+is_server_using_iface(struct TCP_Server_Info *server,
+ struct cifs_server_iface *iface)
+{
+ struct sockaddr_in *i4 = (struct sockaddr_in *)&iface->sockaddr;
+ struct sockaddr_in6 *i6 = (struct sockaddr_in6 *)&iface->sockaddr;
+ struct sockaddr_in *s4 = (struct sockaddr_in *)&server->dstaddr;
+ struct sockaddr_in6 *s6 = (struct sockaddr_in6 *)&server->dstaddr;
+
+ if (server->dstaddr.ss_family != iface->sockaddr.ss_family)
+ return false;
+ if (server->dstaddr.ss_family == AF_INET) {
+ if (s4->sin_addr.s_addr != i4->sin_addr.s_addr)
+ return false;
+ } else if (server->dstaddr.ss_family == AF_INET6) {
+ if (memcmp(&s6->sin6_addr, &i6->sin6_addr,
+ sizeof(i6->sin6_addr)) != 0)
+ return false;
+ } else {
+ /* unknown family.. */
+ return false;
+ }
+ return true;
+}
+
+bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
+{
+ int i;
+
+ for (i = 0; i < ses->chan_count; i++) {
+ if (is_server_using_iface(ses->chans[i].server, iface))
+ return true;
+ }
+ return false;
+}
+
+/* returns number of channels added */
+int cifs_try_adding_channels(struct cifs_ses *ses)
+{
+ int old_chan_count = ses->chan_count;
+ int left = ses->chan_max - ses->chan_count;
+ int i = 0;
+ int rc = 0;
+ int tries = 0;
+
+ if (left <= 0) {
+ cifs_dbg(FYI,
+ "ses already at max_channels (%zu), nothing to open\n",
+ ses->chan_max);
+ return 0;
+ }
+
+ if (ses->server->dialect < SMB30_PROT_ID) {
+ cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
+ return 0;
+ }
+
+ /*
+	 * Keep connecting to the same, fastest iface for all channels as
+	 * long as it is RSS-capable. Try the next fastest one if it is not
+	 * RSS-capable or if channel creation fails.
+ */
+ while (left > 0) {
+ struct cifs_server_iface *iface;
+
+ tries++;
+ if (tries > 3*ses->chan_max) {
+			cifs_dbg(FYI, "too many attempts at opening channels (%d channels left to open)\n",
+ left);
+ break;
+ }
+
+ iface = &ses->iface_list[i];
+ if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
+ i = (i+1) % ses->iface_count;
+ continue;
+ }
+
+ rc = cifs_ses_add_channel(ses, iface);
+ if (rc) {
+ cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
+ i, rc);
+ i = (i+1) % ses->iface_count;
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
+ i);
+ left--;
+ }
+
+ return ses->chan_count - old_chan_count;
+}
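/*
 * A rough usage sketch; where cifs_try_adding_channels() is actually called
 * from is not shown in this section, so the mount-time call site below is
 * an assumption.  Once the master session is established, the caller can
 * simply fan out extra channels and log how many were opened.
 */
static void example_open_extra_channels(struct cifs_ses *ses)
{
	int added;

	if (ses->chan_max <= 1)
		return;

	added = cifs_try_adding_channels(ses);
	cifs_dbg(FYI, "%s: opened %d extra channel(s)\n", __func__, added);
}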
+
+int
+cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface)
+{
+ struct cifs_chan *chan;
+ struct smb_vol vol = {NULL};
+ static const char unc_fmt[] = "\\%s\\foo";
+ char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0};
+ struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
+ struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
+ int rc;
+ unsigned int xid = get_xid();
+
+ cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ",
+ ses, iface->speed, iface->rdma_capable ? "yes" : "no");
+ if (iface->sockaddr.ss_family == AF_INET)
+ cifs_dbg(FYI, "ip:%pI4)\n", &ipv4->sin_addr);
+ else
+ cifs_dbg(FYI, "ip:%pI6)\n", &ipv6->sin6_addr);
+
+ /*
+	 * Set up a smb_vol with mostly the same info as the existing
+	 * session and overwrite it with the requested iface data.
+	 *
+	 * We need to set up at least the fields used for negprot and
+	 * session setup.
+ *
+ * We only need the volume here, so we can reuse memory from
+ * the session and server without caring about memory
+ * management.
+ */
+
+ /* Always make new connection for now (TODO?) */
+ vol.nosharesock = true;
+
+ /* Auth */
+ vol.domainauto = ses->domainAuto;
+ vol.domainname = ses->domainName;
+ vol.username = ses->user_name;
+ vol.password = ses->password;
+ vol.sectype = ses->sectype;
+ vol.sign = ses->sign;
+
+ /* UNC and paths */
+ /* XXX: Use ses->server->hostname? */
+ sprintf(unc, unc_fmt, ses->serverName);
+ vol.UNC = unc;
+ vol.prepath = "";
+
+ /* Re-use same version as master connection */
+ vol.vals = ses->server->vals;
+ vol.ops = ses->server->ops;
+
+ vol.noblocksnd = ses->server->noblocksnd;
+ vol.noautotune = ses->server->noautotune;
+ vol.sockopt_tcp_nodelay = ses->server->tcp_nodelay;
+ vol.echo_interval = ses->server->echo_interval / HZ;
+
+ /*
+ * This will be used for encoding/decoding user/domain/pw
+ * during sess setup auth.
+ *
+ * XXX: We use the default for simplicity but the proper way
+ * would be to use the one that ses used, which is not
+	 * stored. This might break when dealing with non-ASCII
+	 * strings.
+ */
+ vol.local_nls = load_nls_default();
+
+ /* Use RDMA if possible */
+ vol.rdma = iface->rdma_capable;
+ memcpy(&vol.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage));
+
+ /* reuse master con client guid */
+ memcpy(&vol.client_guid, ses->server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+ vol.use_client_guid = true;
+
+ mutex_lock(&ses->session_mutex);
+
+ chan = &ses->chans[ses->chan_count];
+ chan->server = cifs_get_tcp_session(&vol);
+ if (IS_ERR(chan->server)) {
+ rc = PTR_ERR(chan->server);
+ chan->server = NULL;
+ goto out;
+ }
+
+ /*
+ * We need to allocate the server crypto now as we will need
+ * to sign packets before we generate the channel signing key
+ * (we sign with the session key)
+ */
+ rc = smb311_crypto_shash_allocate(chan->server);
+ if (rc) {
+ cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+ goto out;
+ }
+
+ ses->binding = true;
+ rc = cifs_negotiate_protocol(xid, ses);
+ if (rc)
+ goto out;
+
+ rc = cifs_setup_session(xid, ses, vol.local_nls);
+ if (rc)
+ goto out;
+
+	/* success, put it on the list
+	 * XXX: sharing a ses between 2 tcp servers is not possible, the
+	 * way "internal" linked lists work in linux makes an element
+	 * only able to belong to one list
+	 *
+	 * the binding session is already established so the rest of
+	 * the code should be able to look it up, no need to add the
+	 * ses to the new server.
+ */
+
+ ses->chan_count++;
+ atomic_set(&ses->chan_seq, 0);
+out:
+ ses->binding = false;
+ mutex_unlock(&ses->session_mutex);
+
+ if (rc && chan->server)
+ cifs_put_tcp_session(chan->server, 0);
+ unload_nls(vol.local_nls);
+
+ return rc;
+}
static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
{
@@ -342,6 +567,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
struct cifs_ses *ses)
{
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
NEGOTIATE_MESSAGE *sec_blob = (NEGOTIATE_MESSAGE *)pbuffer;
__u32 flags;
@@ -354,9 +580,9 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
NTLMSSP_NEGOTIATE_SEAL;
- if (ses->server->sign)
+ if (server->sign)
flags |= NTLMSSP_NEGOTIATE_SIGN;
- if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
+ if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
sec_blob->NegotiateFlags = cpu_to_le32(flags);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 514810694c0f..d70a2bb062df 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -369,12 +369,10 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
static void
cifs_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- if (set_level2)
- cifs_set_oplock_level(cinode, OPLOCK_READ);
- else
- cifs_set_oplock_level(cinode, 0);
+ cifs_set_oplock_level(cinode, oplock);
}
static bool
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index e311f58dc1c8..0516fc482d43 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -29,6 +29,7 @@
#include "cifs_unicode.h"
#include "smb2status.h"
#include "smb2glob.h"
+#include "nterr.h"
static int
check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
@@ -249,16 +250,10 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
* Some windows servers also pad up to 8 bytes when compounding.
- * If pad is longer than eight bytes, log the server behavior
- * (once), since may indicate a problem but allow it and continue
- * since the frame is parseable.
*/
- if (clc_len < len) {
- pr_warn_once(
- "srv rsp padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
- len, clc_len, command, mid);
+ if (clc_len < len)
return 0;
- }
+
pr_warn_once(
"srv rsp too short, len %d not %d. cmd:%d mid:%llu\n",
len, clc_len, command, mid);
@@ -534,7 +529,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "found in the open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
- le32_to_cpu(rsp->NewLeaseState));
+ lease_state);
if (ack_req)
cfile->oplock_break_cancelled = false;
@@ -543,17 +538,8 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
- /*
- * Set or clear flags depending on the lease state being READ.
- * HANDLE caching flag should be added when the client starts
- * to defer closing remote file handles with HANDLE leases.
- */
- if (lease_state & SMB2_LEASE_READ_CACHING_HE)
- set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
- else
- clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
+ cfile->oplock_epoch = le16_to_cpu(rsp->Epoch);
+ cfile->oplock_level = lease_state;
cifs_queue_oplock_break(cfile);
kfree(lw);
@@ -576,7 +562,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
cifs_dbg(FYI, "found in the pending open list\n");
cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
- le32_to_cpu(rsp->NewLeaseState));
+ lease_state);
open->oplock = lease_state;
}
@@ -673,10 +659,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
@@ -688,6 +674,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
@@ -699,18 +687,9 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
&cinode->flags);
- /*
- * Set flag if the server downgrades the oplock
- * to L2 else clear.
- */
- if (rsp->OplockLevel)
- set_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
- else
- clear_bit(
- CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
- &cinode->flags);
+ cfile->oplock_epoch = 0;
+ cfile->oplock_level = rsp->OplockLevel;
+
spin_unlock(&cfile->file_info_lock);
cifs_queue_oplock_break(cfile);
@@ -720,9 +699,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return true;
}
spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "No matching file for oplock break\n");
- return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -735,45 +711,98 @@ smb2_cancelled_close_fid(struct work_struct *work)
{
struct close_cancelled_open *cancelled = container_of(work,
struct close_cancelled_open, work);
+ struct cifs_tcon *tcon = cancelled->tcon;
+ int rc;
- cifs_dbg(VFS, "Close unmatched open\n");
+ if (cancelled->mid)
+ cifs_tcon_dbg(VFS, "Close unmatched open for MID:%llx\n",
+ cancelled->mid);
+ else
+ cifs_tcon_dbg(VFS, "Close interrupted close\n");
- SMB2_close(0, cancelled->tcon, cancelled->fid.persistent_fid,
- cancelled->fid.volatile_fid);
- cifs_put_tcon(cancelled->tcon);
+ rc = SMB2_close(0, tcon, cancelled->fid.persistent_fid,
+ cancelled->fid.volatile_fid);
+ if (rc)
+ cifs_tcon_dbg(VFS, "Close cancelled mid failed rc:%d\n", rc);
+
+ cifs_put_tcon(tcon);
kfree(cancelled);
}
+/*
+ * Caller should already have an extra reference to @tcon.
+ * This function is used to queue work to close a handle to prevent leaks
+ * on the server.
+ * We handle two cases: first, an open was interrupted after we sent the
+ * SMB2_CREATE to the server but before we processed the reply; and second,
+ * a close was interrupted before we sent the SMB2_CLOSE to the server.
+ */
+static int
+__smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid,
+ __u64 persistent_fid, __u64 volatile_fid)
+{
+ struct close_cancelled_open *cancelled;
+
+ cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
+ if (!cancelled)
+ return -ENOMEM;
+
+ cancelled->fid.persistent_fid = persistent_fid;
+ cancelled->fid.volatile_fid = volatile_fid;
+ cancelled->tcon = tcon;
+ cancelled->cmd = cmd;
+ cancelled->mid = mid;
+ INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
+ WARN_ON(queue_work(cifsiod_wq, &cancelled->work) == false);
+
+ return 0;
+}
+
+int
+smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
+ __u64 volatile_fid)
+{
+ int rc;
+
+ cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ rc = __smb2_handle_cancelled_cmd(tcon, SMB2_CLOSE_HE, 0,
+ persistent_fid, volatile_fid);
+ if (rc)
+ cifs_put_tcon(tcon);
+
+ return rc;
+}
+
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
- struct close_cancelled_open *cancelled;
+ int rc;
if (sync_hdr->Command != SMB2_CREATE ||
sync_hdr->Status != STATUS_SUCCESS)
return 0;
- cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
- if (!cancelled)
- return -ENOMEM;
-
tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
sync_hdr->TreeId);
- if (!tcon) {
- kfree(cancelled);
+ if (!tcon)
return -ENOENT;
- }
- cancelled->fid.persistent_fid = rsp->PersistentFileId;
- cancelled->fid.volatile_fid = rsp->VolatileFileId;
- cancelled->tcon = tcon;
- INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
- queue_work(cifsiod_wq, &cancelled->work);
+ rc = __smb2_handle_cancelled_cmd(tcon,
+ le16_to_cpu(sync_hdr->Command),
+ le64_to_cpu(sync_hdr->MessageId),
+ rsp->PersistentFileId,
+ rsp->VolatileFileId);
+ if (rc)
+ cifs_put_tcon(tcon);
- return 0;
+ return rc;
}
/**
@@ -788,23 +817,37 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec)
int i, rc;
struct sdesc *d;
struct smb2_sync_hdr *hdr;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
- if (ses->server->tcpStatus == CifsGood) {
- /* skip non smb311 connections */
- if (ses->server->dialect != SMB311_PROT_ID)
- return 0;
+ hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
+	/* negprot requests and responses are always included in the hash */
+ if (hdr->Command == SMB2_NEGOTIATE)
+ goto ok;
- /* skip last sess setup response */
- hdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- if (hdr->Flags & SMB2_FLAGS_SIGNED)
- return 0;
- }
+ /*
+	 * If we process a command that wasn't a negprot, it means the
+	 * negprot was already done, so the server dialect was set
+	 * and we can test it. Preauth requires 3.1.1 for now.
+ */
+ if (server->dialect != SMB311_PROT_ID)
+ return 0;
+
+ if (hdr->Command != SMB2_SESSION_SETUP)
+ return 0;
+
+ /* skip last sess setup response */
+ if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+ && (hdr->Status == NT_STATUS_OK
+ || (hdr->Status !=
+ cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))))
+ return 0;
- rc = smb311_crypto_shash_allocate(ses->server);
+ok:
+ rc = smb311_crypto_shash_allocate(server);
if (rc)
return rc;
- d = ses->server->secmech.sdescsha512;
+ d = server->secmech.sdescsha512;
rc = crypto_shash_init(&d->shash);
if (rc) {
cifs_dbg(VFS, "%s: could not init sha512 shash\n", __func__);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cd55af9b7cc5..a7f328f79c6f 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -10,6 +10,7 @@
#include <linux/falloc.h>
#include <linux/scatterlist.h>
#include <linux/uuid.h>
+#include <linux/sort.h>
#include <crypto/aead.h>
#include "cifsglob.h"
#include "smb2pdu.h"
@@ -151,13 +152,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
static unsigned int
smb2_get_credits(struct mid_q_entry *mid)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
-
- if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- return le16_to_cpu(shdr->CreditRequest);
-
- return 0;
+ return mid->credits_received;
}
static int
@@ -315,7 +310,7 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses)
{
int rc;
- ses->server->CurrentMid = 0;
+ cifs_ses_server(ses)->CurrentMid = 0;
rc = SMB2_negotiate(xid, ses);
/* BB we probably don't need to retry with modern servers */
if (rc == -EAGAIN)
@@ -558,6 +553,13 @@ out:
return rc;
}
+static int compare_iface(const void *ia, const void *ib)
+{
+ const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
+ const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
+
+ return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+}
static int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
@@ -587,6 +589,9 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
if (rc)
goto out;
+ /* sort interfaces from fastest to slowest */
+ sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
+
spin_lock(&ses->iface_lock);
kfree(ses->iface_list);
ses->iface_list = iface_list;
@@ -1402,15 +1407,10 @@ smb2_ioctl_query_info(const unsigned int xid,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL);
- if (buffer == NULL)
- return -ENOMEM;
-
- if (copy_from_user(buffer, arg + sizeof(struct smb_query_info),
- qi.output_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ buffer = memdup_user(arg + sizeof(struct smb_query_info),
+ qi.output_buffer_length);
+ if (IS_ERR(buffer))
+ return PTR_ERR(buffer);
/* Open */
memset(&open_iov, 0, sizeof(open_iov));
@@ -1529,35 +1529,32 @@ smb2_ioctl_query_info(const unsigned int xid,
if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length)
qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
if (qi.input_buffer_length > 0 &&
- le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length > rsp_iov[1].iov_len) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length
+ > rsp_iov[1].iov_len)
+ goto e_fault;
+
+ if (copy_to_user(&pqi->input_buffer_length,
+ &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length)))
+ goto e_fault;
+
if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
(const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
- qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ qi.input_buffer_length))
+ goto e_fault;
} else {
pqi = (struct smb_query_info __user *)arg;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length)
qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
- if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length,
- sizeof(qi.input_buffer_length))) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
- if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) {
- rc = -EFAULT;
- goto iqinf_exit;
- }
+ if (copy_to_user(&pqi->input_buffer_length,
+ &qi.input_buffer_length,
+ sizeof(qi.input_buffer_length)))
+ goto e_fault;
+
+ if (copy_to_user(pqi + 1, qi_rsp->Buffer,
+ qi.input_buffer_length))
+ goto e_fault;
}
iqinf_exit:
@@ -1573,6 +1570,10 @@ smb2_ioctl_query_info(const unsigned int xid,
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
return rc;
+
+e_fault:
+ rc = -EFAULT;
+ goto iqinf_exit;
}
static ssize_t
@@ -3281,22 +3282,38 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
static void
smb2_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- if (set_level2)
- server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
- 0, NULL);
- else
- server->ops->set_oplock_level(cinode, 0, 0, NULL);
+ server->ops->set_oplock_level(cinode, oplock, 0, NULL);
}
static void
-smb21_downgrade_oplock(struct TCP_Server_Info *server,
- struct cifsInodeInfo *cinode, bool set_level2)
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache);
+
+static void
+smb3_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, __u32 oplock,
+ unsigned int epoch, bool *purge_cache)
{
- server->ops->set_oplock_level(cinode,
- set_level2 ? SMB2_LEASE_READ_CACHING_HE :
- 0, 0, NULL);
+ unsigned int old_state = cinode->oplock;
+ unsigned int old_epoch = cinode->epoch;
+ unsigned int new_state;
+
+ if (epoch > old_epoch) {
+ smb21_set_oplock_level(cinode, oplock, 0, NULL);
+ cinode->epoch = epoch;
+ }
+
+ new_state = cinode->oplock;
+ *purge_cache = false;
+
+ if ((old_state & CIFS_CACHE_READ_FLG) != 0 &&
+ (new_state & CIFS_CACHE_READ_FLG) == 0)
+ *purge_cache = true;
+ else if (old_state == new_state && (epoch - old_epoch > 1))
+ *purge_cache = true;
}
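/*
 * A worked example of the decision above (illustrative values): if the
 * inode currently has read caching at epoch 3 and the break carries epoch 4
 * with a level that no longer grants read caching, new_state loses
 * CIFS_CACHE_READ_FLG and purge_cache is set. If the state is unchanged but
 * the epoch jumps from 3 to 6, at least one intermediate change was missed,
 * so the cache is purged as well. A break whose epoch does not advance past
 * the cached one leaves the oplock state untouched.
 */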
static void
@@ -3598,14 +3615,16 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
u8 *ses_enc_key;
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
- if (ses->Suid != ses_id)
- continue;
- ses_enc_key = enc ? ses->smb3encryptionkey :
- ses->smb3decryptionkey;
- memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
- spin_unlock(&cifs_tcp_ses_lock);
- return 0;
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id) {
+ ses_enc_key = enc ? ses->smb3encryptionkey :
+ ses->smb3decryptionkey;
+ memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return 0;
+ }
+ }
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -4556,7 +4575,7 @@ struct smb_version_operations smb21_operations = {
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -4656,7 +4675,7 @@ struct smb_version_operations smb30_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb3_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
@@ -4764,7 +4783,7 @@ struct smb_version_operations smb311_operations = {
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
.handle_cancelled_mid = smb2_handle_cancelled_mid,
- .downgrade_oplock = smb21_downgrade_oplock,
+ .downgrade_oplock = smb3_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb3_negotiate_wsize,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 05149862aea4..ed77f94dbf1d 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (tcon == NULL)
return 0;
- if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
+ if (smb2_command == SMB2_TREE_CONNECT)
return 0;
if (tcon->tidStatus == CifsExiting) {
@@ -426,16 +426,9 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf,
* SMB information in the SMB header. If the return code is zero, this
* function must have filled in request_buf pointer.
*/
-static int
-smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
- void **request_buf, unsigned int *total_len)
+static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
{
- int rc;
-
- rc = smb2_reconnect(smb2_command, tcon);
- if (rc)
- return rc;
-
/* BB eventually switch this to SMB2 specific small buf size */
if (smb2_command == SMB2_SET_INFO)
*request_buf = cifs_buf_get();
@@ -456,7 +449,31 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
cifs_stats_inc(&tcon->num_smbs_sent);
}
- return rc;
+ return 0;
+}
+
+static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ int rc;
+
+ rc = smb2_reconnect(smb2_command, tcon);
+ if (rc)
+ return rc;
+
+ return __smb2_plain_req_init(smb2_command, tcon, request_buf,
+ total_len);
+}
+
+static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf,
+ total_len);
+ }
+ return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len);
}
/* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */
@@ -791,7 +808,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
struct kvec rsp_iov;
int rc = 0;
int resp_buftype;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
int blob_offset, blob_length;
char *security_blob;
int flags = CIFS_NEG_OP;
@@ -813,7 +830,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE);
- if (strcmp(ses->server->vals->version_string,
+ if (strcmp(server->vals->version_string,
SMB3ANY_VERSION_STRING) == 0) {
req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID);
req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID);
@@ -829,7 +846,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
total_len += 8;
} else {
/* otherwise send specific dialect */
- req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
+ req->Dialects[0] = cpu_to_le16(server->vals->protocol_id);
req->DialectCount = cpu_to_le16(1);
total_len += 2;
}
@@ -1171,7 +1188,7 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
int rc;
struct cifs_ses *ses = sess_data->ses;
struct smb2_sess_setup_req *req;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
unsigned int total_len;
rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req,
@@ -1179,13 +1196,21 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
if (rc)
return rc;
- /* First session, not a reauthenticate */
- req->sync_hdr.SessionId = 0;
-
- /* if reconnect, we need to send previous sess id, otherwise it is 0 */
- req->PreviousSessionId = sess_data->previous_session;
-
- req->Flags = 0; /* MBZ */
+ if (sess_data->ses->binding) {
+ req->sync_hdr.SessionId = sess_data->ses->Suid;
+ req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
+ req->PreviousSessionId = 0;
+ req->Flags = SMB2_SESSION_REQ_FLAG_BINDING;
+ } else {
+ /* First session, not a reauthenticate */
+ req->sync_hdr.SessionId = 0;
+ /*
+ * if reconnect, we need to send previous sess id
+ * otherwise it is 0
+ */
+ req->PreviousSessionId = sess_data->previous_session;
+ req->Flags = 0; /* MBZ */
+ }
/* enough to enable echos and oplocks and one max size write */
req->sync_hdr.CreditRequest = cpu_to_le16(130);
@@ -1258,28 +1283,33 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
{
int rc = 0;
struct cifs_ses *ses = sess_data->ses;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
- mutex_lock(&ses->server->srv_mutex);
- if (ses->server->ops->generate_signingkey) {
- rc = ses->server->ops->generate_signingkey(ses);
+ mutex_lock(&server->srv_mutex);
+ if (server->ops->generate_signingkey) {
+ rc = server->ops->generate_signingkey(ses);
if (rc) {
cifs_dbg(FYI,
"SMB3 session key generation failed\n");
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
return rc;
}
}
- if (!ses->server->session_estab) {
- ses->server->sequence_number = 0x2;
- ses->server->session_estab = true;
+ if (!server->session_estab) {
+ server->sequence_number = 0x2;
+ server->session_estab = true;
}
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "SMB2/3 session established successfully\n");
- spin_lock(&GlobalMid_Lock);
- ses->status = CifsGood;
- ses->need_reconnect = false;
- spin_unlock(&GlobalMid_Lock);
+ /* keep existing ses state if binding */
+ if (!ses->binding) {
+ spin_lock(&GlobalMid_Lock);
+ ses->status = CifsGood;
+ ses->need_reconnect = false;
+ spin_unlock(&GlobalMid_Lock);
+ }
+
return rc;
}
@@ -1317,16 +1347,19 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
}
- ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
- GFP_KERNEL);
- if (!ses->auth_key.response) {
- cifs_dbg(VFS,
- "Kerberos can't allocate (%u bytes) memory",
- msg->sesskey_len);
- rc = -ENOMEM;
- goto out_put_spnego_key;
+ /* keep session key if binding */
+ if (!ses->binding) {
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto out_put_spnego_key;
+ }
+ ses->auth_key.len = msg->sesskey_len;
}
- ses->auth_key.len = msg->sesskey_len;
sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
sess_data->iov[1].iov_len = msg->secblob_len;
@@ -1336,9 +1369,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
goto out_put_spnego_key;
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->sync_hdr.SessionId;
-
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep session id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
rc = SMB2_sess_establish_session(sess_data);
out_put_spnego_key:
@@ -1432,9 +1467,11 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
-
- ses->Suid = rsp->sync_hdr.SessionId;
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep existing ses id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
out:
kfree(ntlmssp_blob);
@@ -1491,8 +1528,11 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
- ses->Suid = rsp->sync_hdr.SessionId;
- ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ /* keep existing ses id and flags if binding */
+ if (!ses->binding) {
+ ses->Suid = rsp->sync_hdr.SessionId;
+ ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ }
rc = SMB2_sess_establish_session(sess_data);
out:
@@ -1509,7 +1549,7 @@ SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data)
{
int type;
- type = smb2_select_sectype(ses->server, ses->sectype);
+ type = smb2_select_sectype(cifs_ses_server(ses), ses->sectype);
cifs_dbg(FYI, "sess setup type %d\n", type);
if (type == Unspecified) {
cifs_dbg(VFS,
@@ -1537,7 +1577,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
int rc = 0;
- struct TCP_Server_Info *server = ses->server;
+ struct TCP_Server_Info *server = cifs_ses_server(ses);
struct SMB2_sess_data *sess_data;
cifs_dbg(FYI, "Session Setup\n");
@@ -1563,7 +1603,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
/*
* Initialize the session hash with the server one.
*/
- memcpy(ses->preauth_sha_hash, ses->server->preauth_sha_hash,
+ memcpy(ses->preauth_sha_hash, server->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
while (sess_data->func)
@@ -1807,6 +1847,8 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
return 0;
+ close_shroot(&tcon->crfid);
+
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req,
&total_len);
if (rc)
@@ -2661,7 +2703,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
int rc;
char *in_data_buf;
- rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
+ rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len);
if (rc)
return rc;
@@ -2972,7 +3014,21 @@ int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
{
- return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+ int rc;
+ int tmp_rc;
+
+ rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+
+ /* retry close in a worker thread if this one is interrupted */
+ if (rc == -EINTR) {
+ tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid,
+ volatile_fid);
+ if (tmp_rc)
+ cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
+ persistent_fid, tmp_rc);
+ }
+
+ return rc;
}
int
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 0abfde6d0b05..f264e1d36fe1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1386,7 +1386,7 @@ struct smb2_oplock_break {
struct smb2_lease_break {
struct smb2_sync_hdr sync_hdr;
__le16 StructureSize; /* Must be 44 */
- __le16 Reserved;
+ __le16 Epoch;
__le32 Flags;
__u8 LeaseKey[16];
__le32 CurrentLeaseState;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 71b2930b8e0b..d21a5fcc8d06 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -46,7 +46,8 @@ extern int smb2_verify_signature(struct smb_rqst *, struct TCP_Server_Info *);
extern int smb2_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
- struct smb_rqst *rqst);
+ struct TCP_Server_Info *,
+ struct smb_rqst *rqst);
extern struct mid_q_entry *smb2_setup_async_request(
struct TCP_Server_Info *server, struct smb_rqst *rqst);
extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
@@ -212,6 +213,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
const u64 persistent_fid, const u64 volatile_fid,
const __u8 oplock_level);
+extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
+ __u64 persistent_fid,
+ __u64 volatile_fid);
extern int smb2_handle_cancelled_mid(char *buffer,
struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 148d7942c796..387c88704c52 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -98,6 +98,61 @@ err:
return rc;
}
+
+static
+int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+{
+ struct cifs_chan *chan;
+ struct cifs_ses *ses = NULL;
+ int i;
+ int rc = 0;
+
+ spin_lock(&cifs_tcp_ses_lock);
+
+ list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->Suid == ses_id)
+ goto found;
+ }
+ }
+ cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n",
+ __func__, ses_id);
+ rc = -ENOENT;
+ goto out;
+
+found:
+ if (ses->binding) {
+ /*
+ * If we are in the process of binding a new channel
+ * to an existing session, use the master connection
+ * session key
+ */
+ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE);
+ goto out;
+ }
+
+ /*
+ * Otherwise, use the channel key.
+ */
+
+ for (i = 0; i < ses->chan_count; i++) {
+ chan = ses->chans + i;
+ if (chan->server == server) {
+ memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE);
+ goto out;
+ }
+ }
+
+ cifs_dbg(VFS,
+ "%s: Could not find channel signing key for session 0x%llx\n",
+ __func__, ses_id);
+ rc = -ENOENT;
+
+out:
+ spin_unlock(&cifs_tcp_ses_lock);
+ return rc;
+}
+
static struct cifs_ses *
smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
{
@@ -328,21 +383,45 @@ generate_smb3signingkey(struct cifs_ses *ses,
{
int rc;
- rc = generate_key(ses, ptriplet->signing.label,
- ptriplet->signing.context, ses->smb3signingkey,
- SMB3_SIGN_KEY_SIZE);
- if (rc)
- return rc;
+ /*
+ * All channels use the same encryption/decryption keys but
+ * they have their own signing key.
+ *
+	 * When we generate the keys, check if it is for a new channel
+	 * (binding), in which case we only need to generate a signing
+	 * key and store it in the channel so as not to overwrite the
+	 * master connection signing key stored in the session
+ */
- rc = generate_key(ses, ptriplet->encryption.label,
- ptriplet->encryption.context, ses->smb3encryptionkey,
- SMB3_SIGN_KEY_SIZE);
- if (rc)
- return rc;
+ if (ses->binding) {
+ rc = generate_key(ses, ptriplet->signing.label,
+ ptriplet->signing.context,
+ cifs_ses_binding_channel(ses)->signkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+ } else {
+ rc = generate_key(ses, ptriplet->signing.label,
+ ptriplet->signing.context,
+ ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
- rc = generate_key(ses, ptriplet->decryption.label,
- ptriplet->decryption.context,
- ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+ memcpy(ses->chans[0].signkey, ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE);
+
+		rc = generate_key(ses, ptriplet->encryption.label,
+				  ptriplet->encryption.context,
+				  ses->smb3encryptionkey,
+				  SMB3_SIGN_KEY_SIZE);
+		if (rc)
+			return rc;
+
+ rc = generate_key(ses, ptriplet->decryption.label,
+ ptriplet->decryption.context,
+ ses->smb3decryptionkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+ }
if (rc)
return rc;
@@ -431,21 +510,19 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
unsigned char *sigptr = smb3_signature;
struct kvec *iov = rqst->rq_iov;
struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base;
- struct cifs_ses *ses;
struct shash_desc *shash = &server->secmech.sdesccmacaes->shash;
struct smb_rqst drqst;
+ u8 key[SMB3_SIGN_KEY_SIZE];
- ses = smb2_find_smb_ses(server, shdr->SessionId);
- if (!ses) {
- cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
+ rc = smb2_get_sign_key(shdr->SessionId, server, key);
+ if (rc)
return 0;
- }
memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
rc = crypto_shash_setkey(server->secmech.cmacaes,
- ses->smb3signingkey, SMB2_CMACAES_SIZE);
+ key, SMB2_CMACAES_SIZE);
if (rc) {
cifs_server_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
return rc;
@@ -494,16 +571,25 @@ static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int rc = 0;
- struct smb2_sync_hdr *shdr =
- (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ struct smb2_sync_hdr *shdr;
+ struct smb2_sess_setup_req *ssr;
+ bool is_binding;
+ bool is_signed;
- if (!(shdr->Flags & SMB2_FLAGS_SIGNED) ||
- server->tcpStatus == CifsNeedNegotiate)
- return rc;
+ shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
+ ssr = (struct smb2_sess_setup_req *)shdr;
+
+ is_binding = shdr->Command == SMB2_SESSION_SETUP &&
+ (ssr->Flags & SMB2_SESSION_REQ_FLAG_BINDING);
+ is_signed = shdr->Flags & SMB2_FLAGS_SIGNED;
- if (!server->session_estab) {
+ if (!is_signed)
+ return 0;
+ if (server->tcpStatus == CifsNeedNegotiate)
+ return 0;
+ if (!is_binding && !server->session_estab) {
strncpy(shdr->Signature, "BSRSPYL", 8);
- return rc;
+ return 0;
}
rc = server->ops->calc_signature(rqst, server);
@@ -610,18 +696,18 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
}
static int
-smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
- struct mid_q_entry **mid)
+smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
+ struct smb2_sync_hdr *shdr, struct mid_q_entry **mid)
{
- if (ses->server->tcpStatus == CifsExiting)
+ if (server->tcpStatus == CifsExiting)
return -ENOENT;
- if (ses->server->tcpStatus == CifsNeedReconnect) {
+ if (server->tcpStatus == CifsNeedReconnect) {
cifs_dbg(FYI, "tcp session dead - return to caller to retry\n");
return -EAGAIN;
}
- if (ses->server->tcpStatus == CifsNeedNegotiate &&
+ if (server->tcpStatus == CifsNeedNegotiate &&
shdr->Command != SMB2_NEGOTIATE)
return -EAGAIN;
@@ -638,11 +724,11 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
/* else ok - we are shutting down the session */
}
- *mid = smb2_mid_entry_alloc(shdr, ses->server);
+ *mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
return -ENOMEM;
spin_lock(&GlobalMid_Lock);
- list_add_tail(&(*mid)->qhead, &ses->server->pending_mid_q);
+ list_add_tail(&(*mid)->qhead, &server->pending_mid_q);
spin_unlock(&GlobalMid_Lock);
return 0;
@@ -675,24 +761,25 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
}
struct mid_q_entry *
-smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server,
+ struct smb_rqst *rqst)
{
int rc;
struct smb2_sync_hdr *shdr =
(struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base;
struct mid_q_entry *mid;
- smb2_seq_num_into_buf(ses->server, shdr);
+ smb2_seq_num_into_buf(server, shdr);
- rc = smb2_get_mid_entry(ses, shdr, &mid);
+ rc = smb2_get_mid_entry(ses, server, shdr, &mid);
if (rc) {
- revert_current_mid_from_hdr(ses->server, shdr);
+ revert_current_mid_from_hdr(server, shdr);
return ERR_PTR(rc);
}
- rc = smb2_sign_rqst(rqst, ses->server);
+ rc = smb2_sign_rqst(rqst, server);
if (rc) {
- revert_current_mid_from_hdr(ses->server, shdr);
+ revert_current_mid_from_hdr(server, shdr);
cifs_delete_mid(mid);
return ERR_PTR(rc);
}
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 3c91fa97c9a8..5b1b97e9e0c9 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1069,7 +1069,7 @@ static int smbd_post_send_data(
if (n_vec > SMBDIRECT_MAX_SGE) {
cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
- return -ENOMEM;
+ return -EINVAL;
}
sg_init_table(sgl, n_vec);
@@ -1476,6 +1476,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
info->transport_status = SMBD_DESTROYED;
destroy_workqueue(info->workqueue);
+ log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
}
@@ -1505,8 +1506,9 @@ create_conn:
log_rdma_event(INFO, "creating rdma session\n");
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- log_rdma_event(INFO, "created rdma session info=%p\n",
- server->smbd_conn);
+
+ if (server->smbd_conn)
+ cifs_dbg(VFS, "RDMA transport re-established\n");
return server->smbd_conn ? 0 : -ENOENT;
}
@@ -1970,7 +1972,7 @@ read_rfc1002_done:
if (info->transport_status != SMBD_CONNECTED) {
log_read(ERR, "disconnected\n");
- return 0;
+ return -ECONNABORTED;
}
goto again;
@@ -2269,12 +2271,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
- smbdirect_mr->dir);
- else if (smbdirect_mr->state == MR_ERROR) {
+ if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2602,11 +2599,20 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
*/
smbdirect_mr->state = MR_INVALIDATED;
- /*
- * Schedule the work to do MR recovery for future I/Os
- * MR recovery is slow and we don't want it to block the current I/O
- */
- queue_work(info->workqueue, &info->mr_recovery_work);
+ if (smbdirect_mr->state == MR_INVALIDATED) {
+ ib_dma_unmap_sg(
+ info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count,
+ smbdirect_mr->dir);
+ smbdirect_mr->state = MR_READY;
+ if (atomic_inc_return(&info->mr_ready_count) == 1)
+ wake_up_interruptible(&info->wait_mr);
+ } else
+ /*
+ * Schedule the work to do MR recovery for future I/Os;
+ * MR recovery is slow and we don't want it to block the
+ * current I/O.
+ */
+ queue_work(info->workqueue, &info->mr_recovery_work);
done:
if (atomic_dec_and_test(&info->mr_used_count))
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ca3de62688d6..3d2e11f85cba 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -93,8 +93,14 @@ static void _cifs_mid_q_entry_release(struct kref *refcount)
__u16 smb_cmd = le16_to_cpu(midEntry->command);
unsigned long now;
unsigned long roundtrip_time;
- struct TCP_Server_Info *server = midEntry->server;
#endif
+ struct TCP_Server_Info *server = midEntry->server;
+
+ if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
+ midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ server->ops->handle_cancelled_mid)
+ server->ops->handle_cancelled_mid(midEntry->resp_buf, server);
+
midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
if (midEntry->large_buf)
@@ -319,8 +325,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int val = 1;
__be32 rfc1002_marker;
- if (cifs_rdma_enabled(server) && server->smbd_conn) {
- rc = smbd_send(server, num_rqst, rqst);
+ if (cifs_rdma_enabled(server)) {
+ /* return -EAGAIN when connecting or reconnecting */
+ rc = -EAGAIN;
+ if (server->smbd_conn)
+ rc = smbd_send(server, num_rqst, rqst);
goto smbd_done;
}
@@ -927,7 +936,8 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
}
struct mid_q_entry *
-cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
+cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
+ struct smb_rqst *rqst)
{
int rc;
struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
@@ -999,7 +1009,18 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
return -EIO;
}
- server = ses->server;
+ if (!ses->binding) {
+ uint index = 0;
+
+ if (ses->chan_count > 1) {
+ index = (uint)atomic_inc_return(&ses->chan_seq);
+ index %= ses->chan_count;
+ }
+ server = ses->chans[index].server;
+ } else {
+ server = cifs_ses_server(ses);
+ }
+
if (server->tcpStatus == CifsExiting)
return -ENOENT;
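
For context, the channel choice added above is a plain round-robin: an atomically incremented sequence number, reduced modulo the channel count, spreads requests across the session's channels. A standalone sketch of the same idea (names are illustrative, not the cifs ones):

	#include <stdatomic.h>

	/* Round-robin pick: returns 0 when there is a single channel, otherwise
	 * the next channel index in sequence.  atomic_fetch_add() returns the
	 * previous value, which is fine for distribution purposes. */
	static unsigned int pick_channel(atomic_uint *chan_seq, unsigned int chan_count)
	{
		unsigned int index = 0;

		if (chan_count > 1)
			index = atomic_fetch_add(chan_seq, 1) % chan_count;
		return index;
	}
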
@@ -1044,7 +1065,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
for (i = 0; i < num_rqst; i++) {
- midQ[i] = server->ops->setup_request(ses, &rqst[i]);
+ midQ[i] = server->ops->setup_request(ses, server, &rqst[i]);
if (IS_ERR(midQ[i])) {
revert_current_mid(server, i);
for (j = 0; j < i; j++)
@@ -1119,8 +1140,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
+ midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
- midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
@@ -1287,7 +1308,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
rc = allocate_mid(ses, in_buf, &midQ);
if (rc) {
- mutex_unlock(&ses->server->srv_mutex);
+ mutex_unlock(&server->srv_mutex);
/* Update # of requests on wire to server */
add_credits(server, &credits, 0);
return rc;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index b7f9ffa1d5f1..aaad4ca1217e 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -48,8 +48,8 @@
#define elf_prstatus compat_elf_prstatus
#define elf_prpsinfo compat_elf_prpsinfo
-#undef ns_to_timeval
-#define ns_to_timeval ns_to_old_timeval32
+#undef ns_to_kernel_old_timeval
+#define ns_to_kernel_old_timeval ns_to_old_timeval32
/*
* To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a7ec2d3dff92..358ea2ecf36b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -11,8 +11,6 @@
* ioctls.
*/
-#include <linux/joystick.h>
-
#include <linux/types.h>
#include <linux/compat.h>
#include <linux/kernel.h>
@@ -27,13 +25,9 @@
#include <linux/file.h>
#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
-#include <linux/mtio.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>
-#include <linux/raw.h>
#include <linux/blkdev.h>
-#include <linux/rtc.h>
-#include <linux/pci.h>
#include <linux/serial.h>
#include <linux/ctype.h>
#include <linux/syscalls.h>
@@ -42,13 +36,6 @@
#include "internal.h"
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_sock.h>
-#include <net/bluetooth/rfcomm.h>
-
-#include <linux/capi.h>
-#include <linux/gigaset_dev.h>
-
#ifdef CONFIG_BLOCK
#include <linux/cdrom.h>
#include <linux/fd.h>
@@ -60,448 +47,11 @@
#include <linux/uaccess.h>
#include <linux/watchdog.h>
-#include <linux/soundcard.h>
-
#include <linux/hiddev.h>
#include <linux/sort.h>
-#ifdef CONFIG_SPARC
-#include <linux/fb.h>
-#include <asm/fbio.h>
-#endif
-
-#define convert_in_user(srcptr, dstptr) \
-({ \
- typeof(*srcptr) val; \
- \
- get_user(val, srcptr) || put_user(val, dstptr); \
-})
-
-static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- int err;
-
- err = security_file_ioctl(file, cmd, arg);
- if (err)
- return err;
-
- return vfs_ioctl(file, cmd, arg);
-}
-
-#ifdef CONFIG_BLOCK
-typedef struct sg_io_hdr32 {
- compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
- compat_int_t dxfer_direction; /* [i] data transfer direction */
- unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */
- unsigned char mx_sb_len; /* [i] max length to write to sbp */
- unsigned short iovec_count; /* [i] 0 implies no scatter gather */
- compat_uint_t dxfer_len; /* [i] byte count of data transfer */
- compat_uint_t dxferp; /* [i], [*io] points to data transfer memory
- or scatter gather list */
- compat_uptr_t cmdp; /* [i], [*i] points to command to perform */
- compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */
- compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */
- compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */
- compat_int_t pack_id; /* [i->o] unused internally (normally) */
- compat_uptr_t usr_ptr; /* [i->o] unused internally */
- unsigned char status; /* [o] scsi status */
- unsigned char masked_status; /* [o] shifted, masked scsi status */
- unsigned char msg_status; /* [o] messaging level data (optional) */
- unsigned char sb_len_wr; /* [o] byte count actually written to sbp */
- unsigned short host_status; /* [o] errors from host adapter */
- unsigned short driver_status; /* [o] errors from software driver */
- compat_int_t resid; /* [o] dxfer_len - actual_transferred */
- compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */
- compat_uint_t info; /* [o] auxiliary information */
-} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */
-
-typedef struct sg_iovec32 {
- compat_uint_t iov_base;
- compat_uint_t iov_len;
-} sg_iovec32_t;
-
-static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
-{
- sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
- sg_iovec32_t __user *iov32 = dxferp;
- int i;
-
- for (i = 0; i < iovec_count; i++) {
- u32 base, len;
-
- if (get_user(base, &iov32[i].iov_base) ||
- get_user(len, &iov32[i].iov_len) ||
- put_user(compat_ptr(base), &iov[i].iov_base) ||
- put_user(len, &iov[i].iov_len))
- return -EFAULT;
- }
-
- if (put_user(iov, &sgio->dxferp))
- return -EFAULT;
- return 0;
-}
-
-static int sg_ioctl_trans(struct file *file, unsigned int cmd,
- sg_io_hdr32_t __user *sgio32)
-{
- sg_io_hdr_t __user *sgio;
- u16 iovec_count;
- u32 data;
- void __user *dxferp;
- int err;
- int interface_id;
-
- if (get_user(interface_id, &sgio32->interface_id))
- return -EFAULT;
- if (interface_id != 'S')
- return do_ioctl(file, cmd, (unsigned long)sgio32);
-
- if (get_user(iovec_count, &sgio32->iovec_count))
- return -EFAULT;
-
- {
- void __user *top = compat_alloc_user_space(0);
- void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
- (iovec_count * sizeof(sg_iovec_t)));
- if (new > top)
- return -EINVAL;
-
- sgio = new;
- }
-
- /* Ok, now construct. */
- if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
- (2 * sizeof(int)) +
- (2 * sizeof(unsigned char)) +
- (1 * sizeof(unsigned short)) +
- (1 * sizeof(unsigned int))))
- return -EFAULT;
-
- if (get_user(data, &sgio32->dxferp))
- return -EFAULT;
- dxferp = compat_ptr(data);
- if (iovec_count) {
- if (sg_build_iovec(sgio, dxferp, iovec_count))
- return -EFAULT;
- } else {
- if (put_user(dxferp, &sgio->dxferp))
- return -EFAULT;
- }
-
- {
- unsigned char __user *cmdp;
- unsigned char __user *sbp;
-
- if (get_user(data, &sgio32->cmdp))
- return -EFAULT;
- cmdp = compat_ptr(data);
-
- if (get_user(data, &sgio32->sbp))
- return -EFAULT;
- sbp = compat_ptr(data);
-
- if (put_user(cmdp, &sgio->cmdp) ||
- put_user(sbp, &sgio->sbp))
- return -EFAULT;
- }
-
- if (copy_in_user(&sgio->timeout, &sgio32->timeout,
- 3 * sizeof(int)))
- return -EFAULT;
-
- if (get_user(data, &sgio32->usr_ptr))
- return -EFAULT;
- if (put_user(compat_ptr(data), &sgio->usr_ptr))
- return -EFAULT;
-
- err = do_ioctl(file, cmd, (unsigned long) sgio);
-
- if (err >= 0) {
- void __user *datap;
-
- if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
- sizeof(int)) ||
- get_user(datap, &sgio->usr_ptr) ||
- put_user((u32)(unsigned long)datap,
- &sgio32->usr_ptr) ||
- copy_in_user(&sgio32->status, &sgio->status,
- (4 * sizeof(unsigned char)) +
- (2 * sizeof(unsigned short)) +
- (3 * sizeof(int))))
- err = -EFAULT;
- }
-
- return err;
-}
-
-struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
- char req_state;
- char orphan;
- char sg_io_owned;
- char problem;
- int pack_id;
- compat_uptr_t usr_ptr;
- unsigned int duration;
- int unused;
-};
-
-static int sg_grt_trans(struct file *file,
- unsigned int cmd, struct compat_sg_req_info __user *o)
-{
- int err, i;
- sg_req_info_t __user *r;
- r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = do_ioctl(file, cmd, (unsigned long)r);
- if (err < 0)
- return err;
- for (i = 0; i < SG_MAX_QUEUE; i++) {
- void __user *ptr;
- int d;
-
- if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) ||
- get_user(ptr, &r[i].usr_ptr) ||
- get_user(d, &r[i].duration) ||
- put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) ||
- put_user(d, &o[i].duration))
- return -EFAULT;
- }
- return err;
-}
-#endif /* CONFIG_BLOCK */
-
-struct sock_fprog32 {
- unsigned short len;
- compat_caddr_t filter;
-};
-
-#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
-#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-
-static int ppp_sock_fprog_ioctl_trans(struct file *file,
- unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
-{
- struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
- void __user *fptr64;
- u32 fptr32;
- u16 flen;
-
- if (get_user(flen, &u_fprog32->len) ||
- get_user(fptr32, &u_fprog32->filter))
- return -EFAULT;
-
- fptr64 = compat_ptr(fptr32);
-
- if (put_user(flen, &u_fprog64->len) ||
- put_user(fptr64, &u_fprog64->filter))
- return -EFAULT;
-
- if (cmd == PPPIOCSPASS32)
- cmd = PPPIOCSPASS;
- else
- cmd = PPPIOCSACTIVE;
-
- return do_ioctl(file, cmd, (unsigned long) u_fprog64);
-}
-
-struct ppp_option_data32 {
- compat_caddr_t ptr;
- u32 length;
- compat_int_t transmit;
-};
-#define PPPIOCSCOMPRESS32 _IOW('t', 77, struct ppp_option_data32)
-
-struct ppp_idle32 {
- compat_time_t xmit_idle;
- compat_time_t recv_idle;
-};
-#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-
-static int ppp_gidle(struct file *file, unsigned int cmd,
- struct ppp_idle32 __user *idle32)
-{
- struct ppp_idle __user *idle;
- __kernel_time_t xmit, recv;
- int err;
-
- idle = compat_alloc_user_space(sizeof(*idle));
-
- err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
-
- if (!err) {
- if (get_user(xmit, &idle->xmit_idle) ||
- get_user(recv, &idle->recv_idle) ||
- put_user(xmit, &idle32->xmit_idle) ||
- put_user(recv, &idle32->recv_idle))
- err = -EFAULT;
- }
- return err;
-}
-
-static int ppp_scompress(struct file *file, unsigned int cmd,
- struct ppp_option_data32 __user *odata32)
-{
- struct ppp_option_data __user *odata;
- __u32 data;
- void __user *datap;
-
- odata = compat_alloc_user_space(sizeof(*odata));
-
- if (get_user(data, &odata32->ptr))
- return -EFAULT;
-
- datap = compat_ptr(data);
- if (put_user(datap, &odata->ptr))
- return -EFAULT;
-
- if (copy_in_user(&odata->length, &odata32->length,
- sizeof(__u32) + sizeof(int)))
- return -EFAULT;
-
- return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
-}
-
-#ifdef CONFIG_BLOCK
-struct mtget32 {
- compat_long_t mt_type;
- compat_long_t mt_resid;
- compat_long_t mt_dsreg;
- compat_long_t mt_gstat;
- compat_long_t mt_erreg;
- compat_daddr_t mt_fileno;
- compat_daddr_t mt_blkno;
-};
-#define MTIOCGET32 _IOR('m', 2, struct mtget32)
-
-struct mtpos32 {
- compat_long_t mt_blkno;
-};
-#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-
-static int mt_ioctl_trans(struct file *file,
- unsigned int cmd, void __user *argp)
-{
- /* NULL initialization to make gcc shut up */
- struct mtget __user *get = NULL;
- struct mtget32 __user *umget32;
- struct mtpos __user *pos = NULL;
- struct mtpos32 __user *upos32;
- unsigned long kcmd;
- void *karg;
- int err = 0;
-
- switch(cmd) {
- case MTIOCPOS32:
- kcmd = MTIOCPOS;
- pos = compat_alloc_user_space(sizeof(*pos));
- karg = pos;
- break;
- default: /* MTIOCGET32 */
- kcmd = MTIOCGET;
- get = compat_alloc_user_space(sizeof(*get));
- karg = get;
- break;
- }
- if (karg == NULL)
- return -EFAULT;
- err = do_ioctl(file, kcmd, (unsigned long)karg);
- if (err)
- return err;
- switch (cmd) {
- case MTIOCPOS32:
- upos32 = argp;
- err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
- break;
- case MTIOCGET32:
- umget32 = argp;
- err = convert_in_user(&get->mt_type, &umget32->mt_type);
- err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
- err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
- err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
- err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
- err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
- err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
- break;
- }
- return err ? -EFAULT: 0;
-}
-
-#endif /* CONFIG_BLOCK */
-
-/* Bluetooth ioctls */
-#define HCIUARTSETPROTO _IOW('U', 200, int)
-#define HCIUARTGETPROTO _IOR('U', 201, int)
-#define HCIUARTGETDEVICE _IOR('U', 202, int)
-#define HCIUARTSETFLAGS _IOW('U', 203, int)
-#define HCIUARTGETFLAGS _IOR('U', 204, int)
-
-#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
-#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
-#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
-#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-
-static int rtc_ioctl(struct file *file,
- unsigned cmd, void __user *argp)
-{
- unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
- int ret;
-
- if (valp == NULL)
- return -EFAULT;
- switch (cmd) {
- case RTC_IRQP_READ32:
- case RTC_EPOCH_READ32:
- ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
- RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)valp);
- if (ret)
- return ret;
- return convert_in_user(valp, (unsigned int __user *)argp);
- case RTC_IRQP_SET32:
- return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
- case RTC_EPOCH_SET32:
- return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
- }
-
- return -ENOIOCTLCMD;
-}
-
-/* on ia32 l_start is on a 32-bit boundary */
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-struct space_resv_32 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-};
-
-#define FS_IOC_RESVSP_32 _IOW ('X', 40, struct space_resv_32)
-#define FS_IOC_RESVSP64_32 _IOW ('X', 42, struct space_resv_32)
-
-/* just account for different alignment */
-static int compat_ioctl_preallocate(struct file *file,
- struct space_resv_32 __user *p32)
-{
- struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
-
- if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
- copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
- copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
- copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
- copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
- copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
- copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
-
- return ioctl_preallocate(file, p);
-}
-#endif
-
/*
* simple reversible transform to make our table more evenly
* distributed after sorting.
@@ -509,33 +59,7 @@ static int compat_ioctl_preallocate(struct file *file,
#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff)
#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd),
-/* ioctl should not be warned about even if it's not implemented.
- Valid reasons to use this:
- - It is implemented with ->compat_ioctl on some device, but programs
- call it on others too.
- - The ioctl is not implemented in the native kernel, but programs
- call it commonly anyways.
- Most other reasons are not valid. */
-#define IGNORE_IOCTL(cmd) COMPATIBLE_IOCTL(cmd)
-
static unsigned int ioctl_pointer[] = {
-/* compatible ioctls first */
-/* Little t */
-COMPATIBLE_IOCTL(TIOCOUTQ)
-/* Little f */
-COMPATIBLE_IOCTL(FIOCLEX)
-COMPATIBLE_IOCTL(FIONCLEX)
-COMPATIBLE_IOCTL(FIOASYNC)
-COMPATIBLE_IOCTL(FIONBIO)
-COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */
-COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
-/* 0x00 */
-COMPATIBLE_IOCTL(FIBMAP)
-COMPATIBLE_IOCTL(FIGETBSZ)
-/* 'X' - originally XFS but some now in the VFS */
-COMPATIBLE_IOCTL(FIFREEZE)
-COMPATIBLE_IOCTL(FITHAW)
-COMPATIBLE_IOCTL(FITRIM)
#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -547,43 +71,10 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
#endif
-/* Big V (don't complain on serial console) */
-IGNORE_IOCTL(VT_OPENQRY)
-IGNORE_IOCTL(VT_GETMODE)
-/* Little p (/dev/rtc, /dev/envctrl, etc.) */
-COMPATIBLE_IOCTL(RTC_AIE_ON)
-COMPATIBLE_IOCTL(RTC_AIE_OFF)
-COMPATIBLE_IOCTL(RTC_UIE_ON)
-COMPATIBLE_IOCTL(RTC_UIE_OFF)
-COMPATIBLE_IOCTL(RTC_PIE_ON)
-COMPATIBLE_IOCTL(RTC_PIE_OFF)
-COMPATIBLE_IOCTL(RTC_WIE_ON)
-COMPATIBLE_IOCTL(RTC_WIE_OFF)
-COMPATIBLE_IOCTL(RTC_ALM_SET)
-COMPATIBLE_IOCTL(RTC_ALM_READ)
-COMPATIBLE_IOCTL(RTC_RD_TIME)
-COMPATIBLE_IOCTL(RTC_SET_TIME)
-COMPATIBLE_IOCTL(RTC_WKALM_SET)
-COMPATIBLE_IOCTL(RTC_WKALM_RD)
-/*
- * These two are only for the sbus rtc driver, but
- * hwclock tries them on every rtc device first when
- * running on sparc. On other architectures the entries
- * are useless but harmless.
- */
-COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */
-COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */
-/* Little m */
-COMPATIBLE_IOCTL(MTIOCTOP)
-/* Socket level stuff */
-COMPATIBLE_IOCTL(FIOQSIZE)
#ifdef CONFIG_BLOCK
-/* md calls this on random blockdevs */
-IGNORE_IOCTL(RAID_VERSION)
-/* qemu/qemu-img might call these two on plain files for probing */
-IGNORE_IOCTL(CDROM_DRIVE_STATUS)
-IGNORE_IOCTL(FDGETPRM32)
/* SG stuff */
+COMPATIBLE_IOCTL(SG_IO)
+COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
COMPATIBLE_IOCTL(SG_EMULATED_HOST)
@@ -607,314 +98,6 @@ COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
#endif
-/* PPP stuff */
-COMPATIBLE_IOCTL(PPPIOCGFLAGS)
-COMPATIBLE_IOCTL(PPPIOCSFLAGS)
-COMPATIBLE_IOCTL(PPPIOCGASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCGUNIT)
-COMPATIBLE_IOCTL(PPPIOCGRASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSRASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCGMRU)
-COMPATIBLE_IOCTL(PPPIOCSMRU)
-COMPATIBLE_IOCTL(PPPIOCSMAXCID)
-COMPATIBLE_IOCTL(PPPIOCGXASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCSXASYNCMAP)
-COMPATIBLE_IOCTL(PPPIOCXFERUNIT)
-/* PPPIOCSCOMPRESS is translated */
-COMPATIBLE_IOCTL(PPPIOCGNPMODE)
-COMPATIBLE_IOCTL(PPPIOCSNPMODE)
-COMPATIBLE_IOCTL(PPPIOCGDEBUG)
-COMPATIBLE_IOCTL(PPPIOCSDEBUG)
-/* PPPIOCSPASS is translated */
-/* PPPIOCSACTIVE is translated */
-/* PPPIOCGIDLE is translated */
-COMPATIBLE_IOCTL(PPPIOCNEWUNIT)
-COMPATIBLE_IOCTL(PPPIOCATTACH)
-COMPATIBLE_IOCTL(PPPIOCDETACH)
-COMPATIBLE_IOCTL(PPPIOCSMRRU)
-COMPATIBLE_IOCTL(PPPIOCCONNECT)
-COMPATIBLE_IOCTL(PPPIOCDISCONN)
-COMPATIBLE_IOCTL(PPPIOCATTCHAN)
-COMPATIBLE_IOCTL(PPPIOCGCHAN)
-COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
-/* Big A */
-/* sparc only */
-/* Big Q for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_SEQ_RESET)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_SYNC)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_INFO)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_CTRLRATE)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETOUTCOUNT)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETINCOUNT)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_PERCMODE)
-COMPATIBLE_IOCTL(SNDCTL_FM_LOAD_INSTR)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_TESTMIDI)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_RESETSAMPLES)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_NRSYNTHS)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_NRMIDIS)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_INFO)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_THRESHOLD)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_MEMAVL)
-COMPATIBLE_IOCTL(SNDCTL_FM_4OP_ENABLE)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_PANIC)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_OUTOFBAND)
-COMPATIBLE_IOCTL(SNDCTL_SEQ_GETTIME)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_ID)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_CONTROL)
-COMPATIBLE_IOCTL(SNDCTL_SYNTH_REMOVESAMPLE)
-/* Big T for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_TMR_TIMEBASE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_START)
-COMPATIBLE_IOCTL(SNDCTL_TMR_STOP)
-COMPATIBLE_IOCTL(SNDCTL_TMR_CONTINUE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_TEMPO)
-COMPATIBLE_IOCTL(SNDCTL_TMR_SOURCE)
-COMPATIBLE_IOCTL(SNDCTL_TMR_METRONOME)
-COMPATIBLE_IOCTL(SNDCTL_TMR_SELECT)
-/* Little m for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_MIDI_PRETIME)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUMODE)
-COMPATIBLE_IOCTL(SNDCTL_MIDI_MPUCMD)
-/* Big P for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_DSP_RESET)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SYNC)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SPEED)
-COMPATIBLE_IOCTL(SNDCTL_DSP_STEREO)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETBLKSIZE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_CHANNELS)
-COMPATIBLE_IOCTL(SOUND_PCM_WRITE_FILTER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_POST)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SUBDIVIDE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETFRAGMENT)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETFMTS)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETFMT)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETOSPACE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETISPACE)
-COMPATIBLE_IOCTL(SNDCTL_DSP_NONBLOCK)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETCAPS)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETTRIGGER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETTRIGGER)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETIPTR)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETOPTR)
-/* SNDCTL_DSP_MAPINBUF, XXX needs translation */
-/* SNDCTL_DSP_MAPOUTBUF, XXX needs translation */
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETSYNCRO)
-COMPATIBLE_IOCTL(SNDCTL_DSP_SETDUPLEX)
-COMPATIBLE_IOCTL(SNDCTL_DSP_GETODELAY)
-COMPATIBLE_IOCTL(SNDCTL_DSP_PROFILE)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_RATE)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_CHANNELS)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_BITS)
-COMPATIBLE_IOCTL(SOUND_PCM_READ_FILTER)
-/* Big C for sound/OSS */
-COMPATIBLE_IOCTL(SNDCTL_COPR_RESET)
-COMPATIBLE_IOCTL(SNDCTL_COPR_LOAD)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RDATA)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RCODE)
-COMPATIBLE_IOCTL(SNDCTL_COPR_WDATA)
-COMPATIBLE_IOCTL(SNDCTL_COPR_WCODE)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RUN)
-COMPATIBLE_IOCTL(SNDCTL_COPR_HALT)
-COMPATIBLE_IOCTL(SNDCTL_COPR_SENDMSG)
-COMPATIBLE_IOCTL(SNDCTL_COPR_RCVMSG)
-/* Big M for sound/OSS */
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_VOLUME)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_BASS)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_TREBLE)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_SYNTH)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_PCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_SPEAKER)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_MIC)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_CD)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_IMIX)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_ALTPCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECLEV)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_IGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_OGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_LINE3)
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL1))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL2))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_DIGITAL3))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEIN))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_PHONEOUT))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_VIDEO))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_RADIO))
-COMPATIBLE_IOCTL(MIXER_READ(SOUND_MIXER_MONITOR))
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_MUTE)
-/* SOUND_MIXER_READ_ENHANCE, same value as READ_MUTE */
-/* SOUND_MIXER_READ_LOUD, same value as READ_MUTE */
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECSRC)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_DEVMASK)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_RECMASK)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_STEREODEVS)
-COMPATIBLE_IOCTL(SOUND_MIXER_READ_CAPS)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_VOLUME)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_BASS)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_TREBLE)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SYNTH)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_PCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_SPEAKER)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MIC)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_CD)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IMIX)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_ALTPCM)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECLEV)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_IGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_OGAIN)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_LINE3)
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL1))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL2))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_DIGITAL3))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEIN))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_PHONEOUT))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_VIDEO))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_RADIO))
-COMPATIBLE_IOCTL(MIXER_WRITE(SOUND_MIXER_MONITOR))
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_MUTE)
-/* SOUND_MIXER_WRITE_ENHANCE, same value as WRITE_MUTE */
-/* SOUND_MIXER_WRITE_LOUD, same value as WRITE_MUTE */
-COMPATIBLE_IOCTL(SOUND_MIXER_WRITE_RECSRC)
-COMPATIBLE_IOCTL(SOUND_MIXER_INFO)
-COMPATIBLE_IOCTL(SOUND_OLD_MIXER_INFO)
-COMPATIBLE_IOCTL(SOUND_MIXER_ACCESS)
-COMPATIBLE_IOCTL(SOUND_MIXER_AGC)
-COMPATIBLE_IOCTL(SOUND_MIXER_3DSE)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE1)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE2)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE3)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE4)
-COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
-COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
-COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
-COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* Raw devices */
-COMPATIBLE_IOCTL(RAW_SETBIND)
-COMPATIBLE_IOCTL(RAW_GETBIND)
-/* Watchdog */
-COMPATIBLE_IOCTL(WDIOC_GETSUPPORT)
-COMPATIBLE_IOCTL(WDIOC_GETSTATUS)
-COMPATIBLE_IOCTL(WDIOC_GETBOOTSTATUS)
-COMPATIBLE_IOCTL(WDIOC_GETTEMP)
-COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
-COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
-COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT)
-COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT)
-/* Big R */
-COMPATIBLE_IOCTL(RNDGETENTCNT)
-COMPATIBLE_IOCTL(RNDADDTOENTCNT)
-COMPATIBLE_IOCTL(RNDGETPOOL)
-COMPATIBLE_IOCTL(RNDADDENTROPY)
-COMPATIBLE_IOCTL(RNDZAPENTCNT)
-COMPATIBLE_IOCTL(RNDCLEARPOOL)
-/* Bluetooth */
-COMPATIBLE_IOCTL(HCIDEVUP)
-COMPATIBLE_IOCTL(HCIDEVDOWN)
-COMPATIBLE_IOCTL(HCIDEVRESET)
-COMPATIBLE_IOCTL(HCIDEVRESTAT)
-COMPATIBLE_IOCTL(HCIGETDEVLIST)
-COMPATIBLE_IOCTL(HCIGETDEVINFO)
-COMPATIBLE_IOCTL(HCIGETCONNLIST)
-COMPATIBLE_IOCTL(HCIGETCONNINFO)
-COMPATIBLE_IOCTL(HCIGETAUTHINFO)
-COMPATIBLE_IOCTL(HCISETRAW)
-COMPATIBLE_IOCTL(HCISETSCAN)
-COMPATIBLE_IOCTL(HCISETAUTH)
-COMPATIBLE_IOCTL(HCISETENCRYPT)
-COMPATIBLE_IOCTL(HCISETPTYPE)
-COMPATIBLE_IOCTL(HCISETLINKPOL)
-COMPATIBLE_IOCTL(HCISETLINKMODE)
-COMPATIBLE_IOCTL(HCISETACLMTU)
-COMPATIBLE_IOCTL(HCISETSCOMTU)
-COMPATIBLE_IOCTL(HCIBLOCKADDR)
-COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
-COMPATIBLE_IOCTL(HCIINQUIRY)
-COMPATIBLE_IOCTL(HCIUARTSETPROTO)
-COMPATIBLE_IOCTL(HCIUARTGETPROTO)
-COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
-COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
-COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
-COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
-COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
-COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
-COMPATIBLE_IOCTL(RFCOMMGETDEVINFO)
-COMPATIBLE_IOCTL(RFCOMMSTEALDLC)
-/* CAPI */
-COMPATIBLE_IOCTL(CAPI_REGISTER)
-COMPATIBLE_IOCTL(CAPI_GET_MANUFACTURER)
-COMPATIBLE_IOCTL(CAPI_GET_VERSION)
-COMPATIBLE_IOCTL(CAPI_GET_SERIAL)
-COMPATIBLE_IOCTL(CAPI_GET_PROFILE)
-COMPATIBLE_IOCTL(CAPI_MANUFACTURER_CMD)
-COMPATIBLE_IOCTL(CAPI_GET_ERRCODE)
-COMPATIBLE_IOCTL(CAPI_INSTALLED)
-COMPATIBLE_IOCTL(CAPI_GET_FLAGS)
-COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
-COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
-COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
-COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
-/* Misc. */
-COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */
-COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */
-COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
-COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
-COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
-COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* hiddev */
-COMPATIBLE_IOCTL(HIDIOCGVERSION)
-COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
-COMPATIBLE_IOCTL(HIDIOCGDEVINFO)
-COMPATIBLE_IOCTL(HIDIOCGSTRING)
-COMPATIBLE_IOCTL(HIDIOCINITREPORT)
-COMPATIBLE_IOCTL(HIDIOCGREPORT)
-COMPATIBLE_IOCTL(HIDIOCSREPORT)
-COMPATIBLE_IOCTL(HIDIOCGREPORTINFO)
-COMPATIBLE_IOCTL(HIDIOCGFIELDINFO)
-COMPATIBLE_IOCTL(HIDIOCGUSAGE)
-COMPATIBLE_IOCTL(HIDIOCSUSAGE)
-COMPATIBLE_IOCTL(HIDIOCGUCODE)
-COMPATIBLE_IOCTL(HIDIOCGFLAG)
-COMPATIBLE_IOCTL(HIDIOCSFLAG)
-COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX)
-COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO)
-/* joystick */
-COMPATIBLE_IOCTL(JSIOCGVERSION)
-COMPATIBLE_IOCTL(JSIOCGAXES)
-COMPATIBLE_IOCTL(JSIOCGBUTTONS)
-COMPATIBLE_IOCTL(JSIOCGNAME(0))
-
-/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
- but we don't want warnings on other file systems. So declare
- them as compatible here. */
-#define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2])
-#define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2])
-
-IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32)
-IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32)
-
-#ifdef CONFIG_SPARC
-/* Sparc framebuffers, handled in sbusfb_compat_ioctl() */
-IGNORE_IOCTL(FBIOGTYPE)
-IGNORE_IOCTL(FBIOSATTR)
-IGNORE_IOCTL(FBIOGATTR)
-IGNORE_IOCTL(FBIOSVIDEO)
-IGNORE_IOCTL(FBIOGVIDEO)
-IGNORE_IOCTL(FBIOSCURPOS)
-IGNORE_IOCTL(FBIOGCURPOS)
-IGNORE_IOCTL(FBIOGCURMAX)
-IGNORE_IOCTL(FBIOPUTCMAP32)
-IGNORE_IOCTL(FBIOGETCMAP32)
-IGNORE_IOCTL(FBIOSCURSOR32)
-IGNORE_IOCTL(FBIOGCURSOR32)
-#endif
};
/*
@@ -927,51 +110,12 @@ IGNORE_IOCTL(FBIOGCURSOR32)
static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
- void __user *argp = compat_ptr(arg);
-
- switch (cmd) {
- case PPPIOCGIDLE32:
- return ppp_gidle(file, cmd, argp);
- case PPPIOCSCOMPRESS32:
- return ppp_scompress(file, cmd, argp);
- case PPPIOCSPASS32:
- case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
-#ifdef CONFIG_BLOCK
- case SG_IO:
- return sg_ioctl_trans(file, cmd, argp);
- case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(file, cmd, argp);
- case MTIOCGET32:
- case MTIOCPOS32:
- return mt_ioctl_trans(file, cmd, argp);
-#endif
- /* Not implemented in the native kernel */
- case RTC_IRQP_READ32:
- case RTC_IRQP_SET32:
- case RTC_EPOCH_READ32:
- case RTC_EPOCH_SET32:
- return rtc_ioctl(file, cmd, argp);
- }
-
- /*
- * These take an integer instead of a pointer as 'arg',
- * so we must not do a compat_ptr() translation.
- */
- switch (cmd) {
- /* RAID */
- case HOT_REMOVE_DISK:
- case HOT_ADD_DISK:
- case SET_DISK_FAULTY:
- case SET_BITMAP_FILE:
- return vfs_ioctl(file, cmd, arg);
- }
-
return -ENOIOCTLCMD;
}
static int compat_ioctl_check_table(unsigned int xcmd)
{
+#ifdef CONFIG_BLOCK
int i;
const int max = ARRAY_SIZE(ioctl_pointer) - 1;
@@ -990,6 +134,9 @@ static int compat_ioctl_check_table(unsigned int xcmd)
i--;
return ioctl_pointer[i] == xcmd;
+#else
+ return 0;
+#endif
}
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
@@ -1006,44 +153,62 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (error)
goto out_fput;
- /*
- * To allow the compat_ioctl handlers to be self contained
- * we need to check the common ioctls here first.
- * Just handle them with the standard handlers below.
- */
switch (cmd) {
+ /* these are never seen by ->ioctl(), no argument or int argument */
case FIOCLEX:
case FIONCLEX:
+ case FIFREEZE:
+ case FITHAW:
+ case FICLONE:
+ goto do_ioctl;
+ /* these are never seen by ->ioctl(), pointer argument */
case FIONBIO:
case FIOASYNC:
case FIOQSIZE:
- break;
-
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+ case FS_IOC_FIEMAP:
+ case FIGETBSZ:
+ case FICLONERANGE:
+ case FIDEDUPERANGE:
+ goto found_handler;
+ /*
+ * The next group is the stuff handled inside file_ioctl().
+ * For regular files these never reach ->ioctl(); for
+ * devices, sockets, etc. they do and one (FIONREAD) is
+ * even accepted in some cases. In all those cases the
+ * argument has the same type, so we can handle these
+ * here, shunting them towards do_vfs_ioctl().
+ * ->compat_ioctl() will never see any of those.
+ */
+ /* pointer argument, never actually handled by ->ioctl() */
+ case FIBMAP:
+ goto found_handler;
+ /* handled by some ->ioctl(); always a pointer to int */
+ case FIONREAD:
+ goto found_handler;
+ /* these get messy on amd64 due to alignment differences */
+#if defined(CONFIG_X86_64)
case FS_IOC_RESVSP_32:
case FS_IOC_RESVSP64_32:
- error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
+ error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
+ goto out_fput;
+ case FS_IOC_UNRESVSP_32:
+ case FS_IOC_UNRESVSP64_32:
+ error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
+ compat_ptr(arg));
+ goto out_fput;
+ case FS_IOC_ZERO_RANGE_32:
+ error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
+ compat_ptr(arg));
goto out_fput;
#else
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- error = ioctl_preallocate(f.file, compat_ptr(arg));
- goto out_fput;
+ case FS_IOC_UNRESVSP:
+ case FS_IOC_UNRESVSP64:
+ case FS_IOC_ZERO_RANGE:
+ goto found_handler;
#endif
- case FICLONE:
- case FICLONERANGE:
- case FIDEDUPERANGE:
- case FS_IOC_FIEMAP:
- goto do_ioctl;
-
- case FIBMAP:
- case FIGETBSZ:
- case FIONREAD:
- if (S_ISREG(file_inode(f.file)->i_mode))
- break;
- /*FALL THROUGH*/
-
default:
if (f.file->f_op->compat_ioctl) {
error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index d12ea28836a5..2f04024c3588 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -958,8 +958,8 @@ static int cramfs_get_tree(struct fs_context *fc)
if (IS_ENABLED(CONFIG_CRAMFS_MTD)) {
ret = get_tree_mtd(fc, cramfs_mtd_fill_super);
- if (ret < 0)
- return ret;
+ if (!ret)
+ return 0;
}
if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV))
ret = get_tree_bdev(fc, cramfs_blkdev_fill_super);
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 82da2510721f..1f4b8a277060 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -26,7 +26,7 @@
#include <linux/namei.h>
#include "fscrypt_private.h"
-static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
+void fscrypt_decrypt_bio(struct bio *bio)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -37,37 +37,10 @@ static void __fscrypt_decrypt_bio(struct bio *bio, bool done)
bv->bv_offset);
if (ret)
SetPageError(page);
- else if (done)
- SetPageUptodate(page);
- if (done)
- unlock_page(page);
}
}
-
-void fscrypt_decrypt_bio(struct bio *bio)
-{
- __fscrypt_decrypt_bio(bio, false);
-}
EXPORT_SYMBOL(fscrypt_decrypt_bio);
-static void completion_pages(struct work_struct *work)
-{
- struct fscrypt_ctx *ctx = container_of(work, struct fscrypt_ctx, work);
- struct bio *bio = ctx->bio;
-
- __fscrypt_decrypt_bio(bio, true);
- fscrypt_release_ctx(ctx);
- bio_put(bio);
-}
-
-void fscrypt_enqueue_decrypt_bio(struct fscrypt_ctx *ctx, struct bio *bio)
-{
- INIT_WORK(&ctx->work, completion_pages);
- ctx->bio = bio;
- fscrypt_enqueue_decrypt_work(&ctx->work);
-}
-EXPORT_SYMBOL(fscrypt_enqueue_decrypt_bio);
-
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
sector_t pblk, unsigned int len)
{
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 32a7ad0098cc..3719efa546c6 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -27,29 +27,20 @@
#include <linux/ratelimit.h>
#include <linux/dcache.h>
#include <linux/namei.h>
-#include <crypto/aes.h>
#include <crypto/skcipher.h>
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
-static unsigned int num_prealloc_crypto_ctxs = 128;
module_param(num_prealloc_crypto_pages, uint, 0444);
MODULE_PARM_DESC(num_prealloc_crypto_pages,
"Number of crypto pages to preallocate");
-module_param(num_prealloc_crypto_ctxs, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
- "Number of crypto contexts to preallocate");
static mempool_t *fscrypt_bounce_page_pool = NULL;
-static LIST_HEAD(fscrypt_free_ctxs);
-static DEFINE_SPINLOCK(fscrypt_ctx_lock);
-
static struct workqueue_struct *fscrypt_read_workqueue;
static DEFINE_MUTEX(fscrypt_init_mutex);
-static struct kmem_cache *fscrypt_ctx_cachep;
struct kmem_cache *fscrypt_info_cachep;
void fscrypt_enqueue_decrypt_work(struct work_struct *work)
@@ -58,62 +49,6 @@ void fscrypt_enqueue_decrypt_work(struct work_struct *work)
}
EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work);
-/**
- * fscrypt_release_ctx() - Release a decryption context
- * @ctx: The decryption context to release.
- *
- * If the decryption context was allocated from the pre-allocated pool, return
- * it to that pool. Else, free it.
- */
-void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
-{
- unsigned long flags;
-
- if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
- kmem_cache_free(fscrypt_ctx_cachep, ctx);
- } else {
- spin_lock_irqsave(&fscrypt_ctx_lock, flags);
- list_add(&ctx->free_list, &fscrypt_free_ctxs);
- spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
- }
-}
-EXPORT_SYMBOL(fscrypt_release_ctx);
-
-/**
- * fscrypt_get_ctx() - Get a decryption context
- * @gfp_flags: The gfp flag for memory allocation
- *
- * Allocate and initialize a decryption context.
- *
- * Return: A new decryption context on success; an ERR_PTR() otherwise.
- */
-struct fscrypt_ctx *fscrypt_get_ctx(gfp_t gfp_flags)
-{
- struct fscrypt_ctx *ctx;
- unsigned long flags;
-
- /*
- * First try getting a ctx from the free list so that we don't have to
- * call into the slab allocator.
- */
- spin_lock_irqsave(&fscrypt_ctx_lock, flags);
- ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
- struct fscrypt_ctx, free_list);
- if (ctx)
- list_del(&ctx->free_list);
- spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
- if (!ctx) {
- ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- }
- return ctx;
-}
-EXPORT_SYMBOL(fscrypt_get_ctx);
-
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
{
return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
@@ -138,14 +73,17 @@ EXPORT_SYMBOL(fscrypt_free_bounce_page);
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci)
{
+ u8 flags = fscrypt_policy_flags(&ci->ci_policy);
+
memset(iv, 0, ci->ci_mode->ivsize);
- iv->lblk_num = cpu_to_le64(lblk_num);
- if (fscrypt_is_direct_key_policy(&ci->ci_policy))
+ if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
+ WARN_ON_ONCE((u32)lblk_num != lblk_num);
+ lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+ } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
-
- if (ci->ci_essiv_tfm != NULL)
- crypto_cipher_encrypt_one(ci->ci_essiv_tfm, iv->raw, iv->raw);
+ }
+ iv->lblk_num = cpu_to_le64(lblk_num);
}
/* Encrypt or decrypt a single filesystem block of file contents */
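
To make the IV_INO_LBLK_64 layout in the hunk above concrete: the low 32 bits of the 64-bit block number carry the file's logical block number and the high 32 bits carry the inode number, so inode 7, block 5 yields 0x0000000700000005 before the little-endian conversion. A tiny standalone illustration, with hypothetical names:

	#include <stdint.h>
	#include <stdio.h>

	/* Illustrative only: compose the 64-bit value that fscrypt_generate_iv()
	 * stores (little-endian) in iv->lblk_num for IV_INO_LBLK_64 policies. */
	static uint64_t iv_ino_lblk_64(uint32_t ino, uint32_t lblk)
	{
		return ((uint64_t)ino << 32) | lblk;
	}

	int main(void)
	{
		printf("0x%016llx\n", (unsigned long long)iv_ino_lblk_64(7, 5));
		return 0;
	}
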
@@ -396,17 +334,6 @@ const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
-static void fscrypt_destroy(void)
-{
- struct fscrypt_ctx *pos, *n;
-
- list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
- kmem_cache_free(fscrypt_ctx_cachep, pos);
- INIT_LIST_HEAD(&fscrypt_free_ctxs);
- mempool_destroy(fscrypt_bounce_page_pool);
- fscrypt_bounce_page_pool = NULL;
-}
-
/**
* fscrypt_initialize() - allocate major buffers for fs encryption.
* @cop_flags: fscrypt operations flags
@@ -414,11 +341,11 @@ static void fscrypt_destroy(void)
* We only call this when we start accessing encrypted files, since it
* results in memory getting allocated that wouldn't otherwise be used.
*
- * Return: Zero on success, non-zero otherwise.
+ * Return: 0 on success; -errno on failure
*/
int fscrypt_initialize(unsigned int cop_flags)
{
- int i, res = -ENOMEM;
+ int err = 0;
/* No need to allocate a bounce page pool if this FS won't use it. */
if (cop_flags & FS_CFLG_OWN_PAGES)
@@ -426,29 +353,18 @@ int fscrypt_initialize(unsigned int cop_flags)
mutex_lock(&fscrypt_init_mutex);
if (fscrypt_bounce_page_pool)
- goto already_initialized;
-
- for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
- struct fscrypt_ctx *ctx;
-
- ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
- if (!ctx)
- goto fail;
- list_add(&ctx->free_list, &fscrypt_free_ctxs);
- }
+ goto out_unlock;
+ err = -ENOMEM;
fscrypt_bounce_page_pool =
mempool_create_page_pool(num_prealloc_crypto_pages, 0);
if (!fscrypt_bounce_page_pool)
- goto fail;
+ goto out_unlock;
-already_initialized:
- mutex_unlock(&fscrypt_init_mutex);
- return 0;
-fail:
- fscrypt_destroy();
+ err = 0;
+out_unlock:
mutex_unlock(&fscrypt_init_mutex);
- return res;
+ return err;
}
void fscrypt_msg(const struct inode *inode, const char *level,
@@ -494,13 +410,9 @@ static int __init fscrypt_init(void)
if (!fscrypt_read_workqueue)
goto fail;
- fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
- if (!fscrypt_ctx_cachep)
- goto fail_free_queue;
-
fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
if (!fscrypt_info_cachep)
- goto fail_free_ctx;
+ goto fail_free_queue;
err = fscrypt_init_keyring();
if (err)
@@ -510,8 +422,6 @@ static int __init fscrypt_init(void)
fail_free_info:
kmem_cache_destroy(fscrypt_info_cachep);
-fail_free_ctx:
- kmem_cache_destroy(fscrypt_ctx_cachep);
fail_free_queue:
destroy_workqueue(fscrypt_read_workqueue);
fail:
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index e84efc01512e..130b50e5a011 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -163,11 +163,8 @@ struct fscrypt_info {
/* The actual crypto transform used for encryption and decryption */
struct crypto_skcipher *ci_ctfm;
- /*
- * Cipher for ESSIV IV generation. Only set for CBC contents
- * encryption, otherwise is NULL.
- */
- struct crypto_cipher *ci_essiv_tfm;
+ /* True if the key should be freed when this fscrypt_info is freed */
+ bool ci_owns_key;
/*
* Encryption mode used for this inode. It corresponds to either the
@@ -209,8 +206,6 @@ typedef enum {
FS_ENCRYPT,
} fscrypt_direction_t;
-#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-
static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
u32 filenames_mode)
{
@@ -289,7 +284,8 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
*/
#define HKDF_CONTEXT_KEY_IDENTIFIER 1
#define HKDF_CONTEXT_PER_FILE_KEY 2
-#define HKDF_CONTEXT_PER_MODE_KEY 3
+#define HKDF_CONTEXT_DIRECT_KEY 3
+#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4
extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
@@ -386,8 +382,14 @@ struct fscrypt_master_key {
struct list_head mk_decrypted_inodes;
spinlock_t mk_decrypted_inodes_lock;
- /* Per-mode tfms for DIRECT_KEY policies, allocated on-demand */
- struct crypto_skcipher *mk_mode_keys[__FSCRYPT_MODE_MAX + 1];
+ /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */
+ struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1];
+
+ /*
+ * Crypto API transforms for filesystem-layer implementation of
+ * IV_INO_LBLK_64 policies, allocated on-demand.
+ */
+ struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1];
} __randomize_layout;
@@ -443,8 +445,7 @@ struct fscrypt_mode {
const char *cipher_str;
int keysize;
int ivsize;
- bool logged_impl_name;
- bool needs_essiv;
+ int logged_impl_name;
};
static inline bool
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index c34fa7c61b43..040df1f5e1c8 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -43,8 +43,10 @@ static void free_master_key(struct fscrypt_master_key *mk)
wipe_master_key_secret(&mk->mk_secret);
- for (i = 0; i < ARRAY_SIZE(mk->mk_mode_keys); i++)
- crypto_free_skcipher(mk->mk_mode_keys[i]);
+ for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) {
+ crypto_free_skcipher(mk->mk_direct_tfms[i]);
+ crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]);
+ }
key_put(mk->mk_users);
kzfree(mk);
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index d71c2d6dd162..f577bb6613f9 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -8,15 +8,11 @@
* Heavily modified since then.
*/
-#include <crypto/aes.h>
-#include <crypto/sha.h>
#include <crypto/skcipher.h>
#include <linux/key.h>
#include "fscrypt_private.h"
-static struct crypto_shash *essiv_hash_tfm;
-
static struct fscrypt_mode available_modes[] = {
[FSCRYPT_MODE_AES_256_XTS] = {
.friendly_name = "AES-256-XTS",
@@ -31,11 +27,10 @@ static struct fscrypt_mode available_modes[] = {
.ivsize = 16,
},
[FSCRYPT_MODE_AES_128_CBC] = {
- .friendly_name = "AES-128-CBC",
- .cipher_str = "cbc(aes)",
+ .friendly_name = "AES-128-CBC-ESSIV",
+ .cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
- .needs_essiv = true,
},
[FSCRYPT_MODE_AES_128_CTS] = {
.friendly_name = "AES-128-CTS-CBC",
@@ -86,15 +81,13 @@ struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode,
mode->cipher_str, PTR_ERR(tfm));
return tfm;
}
- if (unlikely(!mode->logged_impl_name)) {
+ if (!xchg(&mode->logged_impl_name, 1)) {
/*
* fscrypt performance can vary greatly depending on which
* crypto algorithm implementation is used. Help people debug
* performance problems by logging the ->cra_driver_name the
- * first time a mode is used. Note that multiple threads can
- * race here, but it doesn't really matter.
+ * first time a mode is used.
*/
- mode->logged_impl_name = true;
pr_info("fscrypt: %s using implementation \"%s\"\n",
mode->friendly_name,
crypto_skcipher_alg(tfm)->base.cra_driver_name);
@@ -111,131 +104,64 @@ err_free_tfm:
return ERR_PTR(err);
}
-static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
-{
- struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm);
-
- /* init hash transform on demand */
- if (unlikely(!tfm)) {
- struct crypto_shash *prev_tfm;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- if (PTR_ERR(tfm) == -ENOENT) {
- fscrypt_warn(NULL,
- "Missing crypto API support for SHA-256");
- return -ENOPKG;
- }
- fscrypt_err(NULL,
- "Error allocating SHA-256 transform: %ld",
- PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
- prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
- if (prev_tfm) {
- crypto_free_shash(tfm);
- tfm = prev_tfm;
- }
- }
-
- {
- SHASH_DESC_ON_STACK(desc, tfm);
- desc->tfm = tfm;
-
- return crypto_shash_digest(desc, key, keysize, salt);
- }
-}
-
-static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key,
- int keysize)
-{
- int err;
- struct crypto_cipher *essiv_tfm;
- u8 salt[SHA256_DIGEST_SIZE];
-
- if (WARN_ON(ci->ci_mode->ivsize != AES_BLOCK_SIZE))
- return -EINVAL;
-
- essiv_tfm = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(essiv_tfm))
- return PTR_ERR(essiv_tfm);
-
- ci->ci_essiv_tfm = essiv_tfm;
-
- err = derive_essiv_salt(raw_key, keysize, salt);
- if (err)
- goto out;
-
- /*
- * Using SHA256 to derive the salt/key will result in AES-256 being
- * used for IV generation. File contents encryption will still use the
- * configured keysize (AES-128) nevertheless.
- */
- err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt));
- if (err)
- goto out;
-
-out:
- memzero_explicit(salt, sizeof(salt));
- return err;
-}
-
-/* Given the per-file key, set up the file's crypto transform object(s) */
+/* Given the per-file key, set up the file's crypto transform object */
int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key)
{
- struct fscrypt_mode *mode = ci->ci_mode;
- struct crypto_skcipher *ctfm;
- int err;
-
- ctfm = fscrypt_allocate_skcipher(mode, derived_key, ci->ci_inode);
- if (IS_ERR(ctfm))
- return PTR_ERR(ctfm);
+ struct crypto_skcipher *tfm;
- ci->ci_ctfm = ctfm;
+ tfm = fscrypt_allocate_skcipher(ci->ci_mode, derived_key, ci->ci_inode);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
- if (mode->needs_essiv) {
- err = init_essiv_generator(ci, derived_key, mode->keysize);
- if (err) {
- fscrypt_warn(ci->ci_inode,
- "Error initializing ESSIV generator: %d",
- err);
- return err;
- }
- }
+ ci->ci_ctfm = tfm;
+ ci->ci_owns_key = true;
return 0;
}
static int setup_per_mode_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk)
+ struct fscrypt_master_key *mk,
+ struct crypto_skcipher **tfms,
+ u8 hkdf_context, bool include_fs_uuid)
{
+ const struct inode *inode = ci->ci_inode;
+ const struct super_block *sb = inode->i_sb;
struct fscrypt_mode *mode = ci->ci_mode;
u8 mode_num = mode - available_modes;
struct crypto_skcipher *tfm, *prev_tfm;
u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
+ u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
+ unsigned int hkdf_infolen = 0;
int err;
- if (WARN_ON(mode_num >= ARRAY_SIZE(mk->mk_mode_keys)))
+ if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX))
return -EINVAL;
/* pairs with cmpxchg() below */
- tfm = READ_ONCE(mk->mk_mode_keys[mode_num]);
+ tfm = READ_ONCE(tfms[mode_num]);
if (likely(tfm != NULL))
goto done;
BUILD_BUG_ON(sizeof(mode_num) != 1);
+ BUILD_BUG_ON(sizeof(sb->s_uuid) != 16);
+ BUILD_BUG_ON(sizeof(hkdf_info) != 17);
+ hkdf_info[hkdf_infolen++] = mode_num;
+ if (include_fs_uuid) {
+ memcpy(&hkdf_info[hkdf_infolen], &sb->s_uuid,
+ sizeof(sb->s_uuid));
+ hkdf_infolen += sizeof(sb->s_uuid);
+ }
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
- HKDF_CONTEXT_PER_MODE_KEY,
- &mode_num, sizeof(mode_num),
+ hkdf_context, hkdf_info, hkdf_infolen,
mode_key, mode->keysize);
if (err)
return err;
- tfm = fscrypt_allocate_skcipher(mode, mode_key, ci->ci_inode);
+ tfm = fscrypt_allocate_skcipher(mode, mode_key, inode);
memzero_explicit(mode_key, mode->keysize);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
/* pairs with READ_ONCE() above */
- prev_tfm = cmpxchg(&mk->mk_mode_keys[mode_num], NULL, tfm);
+ prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm);
if (prev_tfm != NULL) {
crypto_free_skcipher(tfm);
tfm = prev_tfm;
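
The hkdf_info construction above packs one byte of mode number, optionally followed by the 16-byte filesystem UUID, which is why the buffer is declared as 17 bytes and why the BUILD_BUG_ONs check that 1 + 16 = 17. A minimal sketch of the same packing outside the kernel, with assumed names:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Pack the per-mode HKDF application info: [mode_num][fs uuid?], giving a
	 * length of 1 byte (DIRECT_KEY style) or 17 bytes (IV_INO_LBLK_64 style). */
	static size_t build_hkdf_info(uint8_t info[17], uint8_t mode_num,
				      const uint8_t uuid[16], int include_uuid)
	{
		size_t len = 0;

		info[len++] = mode_num;
		if (include_uuid) {
			memcpy(&info[len], uuid, 16);
			len += 16;
		}
		return len;
	}
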
@@ -266,7 +192,19 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
ci->ci_mode->friendly_name);
return -EINVAL;
}
- return setup_per_mode_key(ci, mk);
+ return setup_per_mode_key(ci, mk, mk->mk_direct_tfms,
+ HKDF_CONTEXT_DIRECT_KEY, false);
+ } else if (ci->ci_policy.v2.flags &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
+ /*
+ * IV_INO_LBLK_64: encryption keys are derived from (master_key,
+ * mode_num, filesystem_uuid), and inode number is included in
+ * the IVs. This format is optimized for use with inline
+ * encryption hardware compliant with the UFS or eMMC standards.
+ */
+ return setup_per_mode_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms,
+ HKDF_CONTEXT_IV_INO_LBLK_64_KEY,
+ true);
}
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
@@ -388,13 +326,10 @@ static void put_crypt_info(struct fscrypt_info *ci)
if (!ci)
return;
- if (ci->ci_direct_key) {
+ if (ci->ci_direct_key)
fscrypt_put_direct_key(ci->ci_direct_key);
- } else if ((ci->ci_ctfm != NULL || ci->ci_essiv_tfm != NULL) &&
- !fscrypt_is_direct_key_policy(&ci->ci_policy)) {
+ else if (ci->ci_owns_key)
crypto_free_skcipher(ci->ci_ctfm);
- crypto_free_cipher(ci->ci_essiv_tfm);
- }
key = ci->ci_master_key;
if (key) {
@@ -415,6 +350,7 @@ static void put_crypt_info(struct fscrypt_info *ci)
key_invalidate(key);
key_put(key);
}
+ memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
}
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index ad1a36c370c3..5298ef22aa85 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -270,10 +270,6 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci,
return -EINVAL;
}
- /* ESSIV implies 16-byte IVs which implies !DIRECT_KEY */
- if (WARN_ON(mode->needs_essiv))
- return -EINVAL;
-
dk = fscrypt_get_direct_key(ci, raw_master_key);
if (IS_ERR(dk))
return PTR_ERR(dk);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 4072ba644595..96f528071bed 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -29,6 +29,40 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static bool supported_iv_ino_lblk_64_policy(
+ const struct fscrypt_policy_v2 *policy,
+ const struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ int ino_bits = 64, lblk_bits = 64;
+
+ if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
+ fscrypt_warn(inode,
+ "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive");
+ return false;
+ }
+ /*
+ * It's unsafe to include inode numbers in the IVs if the filesystem can
+ * potentially renumber inodes, e.g. via filesystem shrinking.
+ */
+ if (!sb->s_cop->has_stable_inodes ||
+ !sb->s_cop->has_stable_inodes(sb)) {
+ fscrypt_warn(inode,
+ "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers",
+ sb->s_id);
+ return false;
+ }
+ if (sb->s_cop->get_ino_and_lblk_bits)
+ sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits);
+ if (ino_bits > 32 || lblk_bits > 32) {
+ fscrypt_warn(inode,
+ "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers",
+ sb->s_id);
+ return false;
+ }
+ return true;
+}
+
/**
* fscrypt_supported_policy - check whether an encryption policy is supported
*
@@ -55,7 +89,8 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
return false;
}
- if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) {
+ if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK |
+ FSCRYPT_POLICY_FLAG_DIRECT_KEY)) {
fscrypt_warn(inode,
"Unsupported encryption flags (0x%02x)",
policy->flags);
@@ -83,6 +118,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
return false;
}
+ if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) &&
+ !supported_iv_ino_lblk_64_policy(policy, inode))
+ return false;
+
if (memchr_inv(policy->__reserved, 0,
sizeof(policy->__reserved))) {
fscrypt_warn(inode,
diff --git a/fs/dax.c b/fs/dax.c
index 2cc43cd914eb..1f1f0201cad1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1091,7 +1091,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
@@ -1248,7 +1248,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1293,7 +1294,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the file system block size to be equal the page size, which means
* that we never have to deal with more than a single extent here.
*/
- error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
if (iomap_errp)
*iomap_errp = error;
if (error) {
@@ -1472,7 +1473,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
pgoff_t max_pgoff;
void *entry;
loff_t pos;
@@ -1547,7 +1549,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* to look up our filesystem block.
*/
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
- error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+ error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
+ &srcmap);
if (error)
goto unlock_entry;
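The dax changes above track an iomap interface change: ->iomap_begin() now takes a second "srcmap" argument describing where reads should be served from, and callers such as dax_iomap_pte_fault() pre-initialise both maps to IOMAP_HOLE. A minimal sketch of the new callback shape for a filesystem that never reports a separate source mapping (ext2 further down is converted in exactly this way); "example_iomap_begin" is an illustrative name, not a real kernel symbol:

#include <linux/fs.h>
#include <linux/iomap.h>

static int example_iomap_begin(struct inode *inode, loff_t offset,
			       loff_t length, unsigned int flags,
			       struct iomap *iomap, struct iomap *srcmap)
{
	/* Fill in the mapping that covers [offset, offset + length). */
	iomap->type   = IOMAP_HOLE;
	iomap->offset = offset;
	iomap->length = length;
	/* srcmap is deliberately left as the caller-initialised IOMAP_HOLE. */
	return 0;
}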
diff --git a/fs/dcache.c b/fs/dcache.c
index e88cf0554e65..f7931b682a0d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1319,7 +1319,7 @@ resume:
if (!list_empty(&dentry->d_subdirs)) {
spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, 1, _RET_IP_);
+ spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 87846aad594b..dede25247b81 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -420,20 +420,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u8(const char *name, umode_t mode,
- struct dentry *parent, u8 *value)
+void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent,
+ u8 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8,
&fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -465,20 +456,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u16(const char *name, umode_t mode,
- struct dentry *parent, u16 *value)
+void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent,
+ u16 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16,
&fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -556,20 +538,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
* This function creates a file in debugfs with the given name that
* contains the value of the variable @value. If the @mode variable is so
* set, it can be read from, and written to.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_u64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent,
+ u64 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64,
&fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
@@ -660,10 +633,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x8(const char *name, umode_t mode,
- struct dentry *parent, u8 *value)
+void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent,
+ u8 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8,
&fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -678,10 +651,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x16(const char *name, umode_t mode,
- struct dentry *parent, u16 *value)
+void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent,
+ u16 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16,
&fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -696,10 +669,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x32(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent,
+ u32 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32,
&fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -714,10 +687,10 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_x64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent,
+ u64 *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64,
&fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -748,12 +721,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
- struct dentry *parent, size_t *value)
+void debugfs_create_size_t(const char *name, umode_t mode,
+ struct dentry *parent, size_t *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value,
- &fops_size_t, &fops_size_t_ro,
- &fops_size_t_wo);
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -785,12 +757,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
* @value: a pointer to the variable that the file should read to and write
* from.
*/
-struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
- struct dentry *parent, atomic_t *value)
+void debugfs_create_atomic_t(const char *name, umode_t mode,
+ struct dentry *parent, atomic_t *value)
{
- return debugfs_create_mode_unsafe(name, mode, parent, value,
- &fops_atomic_t, &fops_atomic_t_ro,
- &fops_atomic_t_wo);
+ debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
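With the conversions above, the debugfs_create_u*/x*/size_t/atomic_t helpers return void, so callers no longer get a dentry back and cannot (and need not) check for errors; cleanup happens by removing the parent directory. A short usage sketch with made-up names:

#include <linux/debugfs.h>

static u8 example_counter;
static struct dentry *example_dir;

static void example_debugfs_init(void)
{
	example_dir = debugfs_create_dir("example", NULL);
	/* No return value to check any more; a failure just means no file. */
	debugfs_create_u8("counter", 0644, example_dir, &example_counter);
}

static void example_debugfs_exit(void)
{
	debugfs_remove_recursive(example_dir);
}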
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9329ced91f1d..0ec4f270139f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -221,27 +221,6 @@ static inline struct page *dio_get_page(struct dio *dio,
}
/*
- * Warn about a page cache invalidation failure during a direct io write.
- */
-void dio_warn_stale_pagecache(struct file *filp)
-{
- static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
- char pathname[128];
- struct inode *inode = file_inode(filp);
- char *path;
-
- errseq_set(&inode->i_mapping->wb_err, -EIO);
- if (__ratelimit(&_rs)) {
- path = file_path(filp, pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n");
- pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
- current->comm);
- }
-}
-
-/*
* dio_complete() - called when all DIO BIO I/O has been completed
*
* This drops i_dio_count, lets interested parties know that a DIO operation
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index feecb57defa7..5fb45d865ce5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -378,6 +378,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return rc;
switch (cmd) {
+ case FITRIM:
case FS_IOC32_GETFLAGS:
case FS_IOC32_SETFLAGS:
case FS_IOC32_GETVERSION:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 18426f4855f1..e23752d9a79f 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
- dget(lower_dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
fsstack_copy_attr_times(dir, lower_dir_inode);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode->i_ctime = dir->i_ctime;
- d_drop(dentry);
out_unlock:
- unlock_dir(lower_dir_dentry);
dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
+ if (!rc)
+ d_drop(dentry);
return rc;
}
@@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct inode *inode, *lower_inode = d_inode(lower_dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
- struct vfsmount *lower_mnt;
int rc = 0;
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
@@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
return ERR_PTR(-ENOMEM);
}
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(lower_dentry->d_parent));
+ d_inode(path->dentry));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = lower_mnt;
+ dentry_info->lower_path.mnt = mntget(path->mnt);
dentry_info->lower_path.dentry = lower_dentry;
- if (d_really_is_negative(lower_dentry)) {
+ /*
+ * negative dentry can go positive under us here - its parent is not
+ * locked. That's OK and that could happen just as we return from
+ * ecryptfs_lookup() anyway. Just need to be careful and fetch
+ * ->d_inode only once - it's not stable here.
+ */
+ lower_inode = READ_ONCE(lower_dentry->d_inode);
+
+ if (!lower_inode) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
return NULL;
@@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- dget(dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- dget(lower_dentry);
- rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
- dput(lower_dentry);
- if (!rc && d_really_is_positive(dentry))
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+ if (!rc) {
clear_nlink(d_inode(dentry));
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
- unlock_dir(lower_dir_dentry);
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ set_nlink(dir, lower_dir_inode->i_nlink);
+ }
+ dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
if (!rc)
d_drop(dentry);
- dput(dentry);
return rc;
}
@@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
- struct dentry *trap = NULL;
+ struct dentry *trap;
struct inode *target_inode;
if (flags)
return -EINVAL;
+ lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent);
+ lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent);
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
- dget(lower_old_dentry);
- dget(lower_new_dentry);
- lower_old_dir_dentry = dget_parent(lower_old_dentry);
- lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
target_inode = d_inode(new_dentry);
+
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dget(lower_new_dentry);
rc = -EINVAL;
if (lower_old_dentry->d_parent != lower_old_dir_dentry)
goto out_lock;
@@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_dir != old_dir)
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
- unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
- dput(lower_new_dir_dentry);
- dput(lower_old_dir_dentry);
dput(lower_new_dentry);
- dput(lower_old_dentry);
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
return rc;
}
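The unlink, rmdir and rename changes above converge on one locking pattern: take the lower parent's inode lock (or lock_rename() for the two parents), then re-check that the lower dentry is still hashed and still a child of that parent before calling the vfs_*() operation, instead of trusting a cached parent pointer. The unlink case shown in isolation, with an illustrative function name:

#include <linux/fs.h>
#include <linux/dcache.h>

static int example_lower_unlink(struct dentry *lower_dir_dentry,
				struct dentry *lower_dentry)
{
	struct inode *lower_dir_inode = d_inode(lower_dir_dentry);
	int rc;

	inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
	dget(lower_dentry);	/* don't even try to make the lower negative */
	if (lower_dentry->d_parent != lower_dir_dentry ||
	    d_unhashed(lower_dentry))
		rc = -EINVAL;	/* raced with a rename/unlink under us */
	else
		rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
	dput(lower_dentry);
	inode_unlock(lower_dir_inode);
	return rc;
}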
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 9d634d3a1845..74b0aaa7114c 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,6 +3,7 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight
read-only file system with modern designs (eg. page-sized
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 19f89f9fb10c..2890a67a1ded 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -73,7 +73,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
+ victim = erofs_allocpage(pagepool, GFP_KERNEL);
if (!victim)
return -ENOMEM;
victim->mapping = Z_EROFS_MAPPING_STAGING;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b1ee5654750d..385fa49c7749 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -11,6 +11,8 @@
#define EROFS_SUPER_OFFSET 1024
+#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
* be incompatible with this kernel version.
@@ -37,7 +39,6 @@ struct erofs_super_block {
__u8 uuid[16]; /* 128-bit uuid for volume */
__u8 volume_name[16]; /* volume name */
__le32 feature_incompat;
-
__u8 reserved2[44];
};
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 544a453f3076..1ed5beff7d11 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -85,6 +85,7 @@ struct erofs_sb_info {
u8 uuid[16]; /* 128-bit uuid for volume */
u8 volume_name[16]; /* volume name */
+ u32 feature_compat;
u32 feature_incompat;
unsigned int mount_opt;
@@ -278,9 +279,7 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
extern const struct super_operations erofs_sops;
extern const struct address_space_operations erofs_raw_access_aops;
-#ifdef CONFIG_EROFS_FS_ZIP
-extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
-#endif
+extern const struct address_space_operations z_erofs_aops;
/*
* Logical to physical block mapping, used by erofs_map_blocks()
@@ -382,7 +381,7 @@ int erofs_namei(struct inode *dir, struct qstr *name,
extern const struct file_operations erofs_dir_fops;
/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
#if (EROFS_PCPUBUF_NR_PAGES > 0)
void *erofs_get_pcpubuf(unsigned int pagenr);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 0e369494f2f2..057e6d7b5b7f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -9,6 +9,7 @@
#include <linux/statfs.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
+#include <linux/crc32c.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -46,6 +47,30 @@ void _erofs_info(struct super_block *sb, const char *function,
va_end(args);
}
+static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata)
+{
+ struct erofs_super_block *dsb;
+ u32 expected_crc, crc;
+
+ dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET,
+ EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL);
+ if (!dsb)
+ return -ENOMEM;
+
+ expected_crc = le32_to_cpu(dsb->checksum);
+ dsb->checksum = 0;
+ /* to allow for x86 boot sectors and other oddities. */
+ crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET);
+ kfree(dsb);
+
+ if (crc != expected_crc) {
+ erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+ return 0;
+}
+
static void erofs_inode_init_once(void *ptr)
{
struct erofs_inode *vi = ptr;
@@ -112,7 +137,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi = EROFS_SB(sb);
- data = kmap_atomic(page);
+ data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -121,6 +146,13 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
+ sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
+ if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+ ret = erofs_superblock_csum_verify(sb, data);
+ if (ret)
+ goto out;
+ }
+
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
@@ -155,7 +187,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = 0;
out:
- kunmap_atomic(data);
+ kunmap(page);
put_page(page);
return ret;
}
@@ -566,9 +598,6 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",cache_strategy=readahead");
} else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
seq_puts(seq, ",cache_strategy=readaround");
- } else {
- seq_puts(seq, ",cache_strategy=(unknown)");
- DBG_BUGON(1);
}
#endif
return 0;
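erofs_superblock_csum_verify() above checksums the 3072 bytes from the superblock offset (1024) to the end of the 4 KiB block, with the on-disk checksum field zeroed first. A standalone userspace sketch of the same check, assuming the __le32 checksum sits right after the 32-bit magic at the start of the superblock; crc32c_sw() mirrors the kernel's crc32c() semantics (caller-supplied seed, no final inversion):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* blk: the 4096-byte block that contains the superblock at offset 1024 */
static int example_verify_erofs_sb_checksum(const uint8_t blk[4096])
{
	uint8_t copy[4096 - 1024];
	uint32_t expected, crc;

	memcpy(copy, blk + 1024, sizeof(copy));
	expected = copy[4] | copy[5] << 8 | copy[6] << 16 |
		   (uint32_t)copy[7] << 24;	/* little-endian checksum field */
	memset(&copy[4], 0, 4);			/* dsb->checksum = 0 */
	crc = crc32c_sw(~0U, copy, sizeof(copy));
	return crc == expected ? 0 : -1;	/* -1 stands in for -EBADMSG */
}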
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index d92b3e753a6f..1e8e1450d5b0 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -7,7 +7,7 @@
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
{
struct page *page;
@@ -16,7 +16,7 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
DBG_BUGON(page_ref_count(page) != 1);
list_del(&page->lru);
} else {
- page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
+ page = alloc_page(gfp);
}
return page;
}
@@ -149,8 +149,7 @@ static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
- struct erofs_workgroup *grp,
- bool cleanup)
+ struct erofs_workgroup *grp)
{
/*
* If managed cache is on, refcount of workgroups
@@ -188,8 +187,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
- unsigned long nr_shrink,
- bool cleanup)
+ unsigned long nr_shrink)
{
pgoff_t first_index = 0;
void *batch[PAGEVEC_SIZE];
@@ -208,7 +206,7 @@ repeat:
first_index = grp->index + 1;
/* try to shrink each valid workgroup */
- if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+ if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
++freed;
@@ -245,7 +243,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
- erofs_shrink_workstation(sbi, ~0UL, true);
+ /* clean up all remaining workgroups in memory */
+ erofs_shrink_workstation(sbi, ~0UL);
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
@@ -294,7 +293,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
- freed += erofs_shrink_workstation(sbi, nr, false);
+ freed += erofs_shrink_workstation(sbi, nr);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fad80c97d247..ca99425a4536 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -337,9 +337,9 @@ retry:
return COLLECT_PRIMARY; /* :( better luck next time */
}
-static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
struct z_erofs_pcluster *pcl;
@@ -349,20 +349,20 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
if (!grp)
- return NULL;
+ return -ENOENT;
pcl = container_of(grp, struct z_erofs_pcluster, obj);
if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
cl = z_erofs_primarycollection(pcl);
if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
length = READ_ONCE(pcl->length);
@@ -370,7 +370,7 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
DBG_BUGON(1);
erofs_workgroup_put(grp);
- return ERR_PTR(-EFSCORRUPTED);
+ return -EFSCORRUPTED;
}
} else {
unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
@@ -394,12 +394,12 @@ static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
clt->tailpcl = NULL;
clt->pcl = pcl;
clt->cl = cl;
- return cl;
+ return 0;
}
-static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_collection(struct z_erofs_collector *clt,
+ struct inode *inode,
+ struct erofs_map_blocks *map)
{
struct z_erofs_pcluster *pcl;
struct z_erofs_collection *cl;
@@ -408,7 +408,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
/* no available workgroup, let's allocate one */
pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
if (!pcl)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
z_erofs_pcluster_init_always(pcl);
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
@@ -442,7 +442,7 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
if (err) {
mutex_unlock(&cl->lock);
kmem_cache_free(pcluster_cachep, pcl);
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
}
/* used to check tail merging loop due to corrupted images */
if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
@@ -450,14 +450,14 @@ static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
clt->owned_head = &pcl->next;
clt->pcl = pcl;
clt->cl = cl;
- return cl;
+ return 0;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_collection *cl;
+ int ret;
DBG_BUGON(clt->cl);
@@ -471,19 +471,22 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
}
repeat:
- cl = cllookup(clt, inode, map);
- if (!cl) {
- cl = clregister(clt, inode, map);
+ ret = z_erofs_lookup_collection(clt, inode, map);
+ if (ret == -ENOENT) {
+ ret = z_erofs_register_collection(clt, inode, map);
- if (cl == ERR_PTR(-EAGAIN))
+ /* someone registered at the same time, give another try */
+ if (ret == -EAGAIN) {
+ cond_resched();
goto repeat;
+ }
}
- if (IS_ERR(cl))
- return PTR_ERR(cl);
+ if (ret)
+ return ret;
z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- cl->pagevec, cl->vcnt);
+ clt->cl->pagevec, clt->cl->vcnt);
clt->compressedpages = clt->pcl->compressed_pages;
if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
@@ -543,15 +546,6 @@ static bool z_erofs_collector_end(struct z_erofs_collector *clt)
return true;
}
-static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
- gfp_t gfp)
-{
- struct page *page = erofs_allocpage(pagepool, gfp, true);
-
- page->mapping = Z_EROFS_MAPPING_STAGING;
- return page;
-}
-
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
unsigned int cachestrategy,
erofs_off_t la)
@@ -571,7 +565,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct list_head *pagepool)
{
struct inode *const inode = fe->inode;
- struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
@@ -658,8 +652,9 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- __stagingpage_alloc(pagepool, GFP_NOFS);
+ erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE);
if (!err)
@@ -698,13 +693,11 @@ err_out:
goto out;
}
-static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ bool sync, int bios)
{
- tagptr1_t t = tagptr_init(tagptr1_t, ptr);
- struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
- bool background = tagptr_unfold_tags(t);
-
- if (!background) {
+ /* wake up the caller thread for sync decompression */
+ if (sync) {
unsigned long flags;
spin_lock_irqsave(&io->u.wait.lock, flags);
@@ -718,37 +711,30 @@ static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
queue_work(z_erofs_workqueue, &io->u.work);
}
-static inline void z_erofs_vle_read_endio(struct bio *bio)
+static void z_erofs_decompressqueue_endio(struct bio *bio)
{
- struct erofs_sb_info *sbi = NULL;
+ tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
+ struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
blk_status_t err = bio->bi_status;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
- bool cachemngd = false;
DBG_BUGON(PageUptodate(page));
DBG_BUGON(!page->mapping);
- if (!sbi && !z_erofs_page_is_staging(page))
- sbi = EROFS_SB(page->mapping->host->i_sb);
-
- /* sbi should already be gotten if the page is managed */
- if (sbi)
- cachemngd = erofs_page_is_managed(sbi, page);
-
if (err)
SetPageError(page);
- else if (cachemngd)
- SetPageUptodate(page);
- if (cachemngd)
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
unlock_page(page);
+ }
}
-
- z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+ z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
bio_put(bio);
}
@@ -953,9 +939,8 @@ out:
return err;
}
-static void z_erofs_vle_unzip_all(struct super_block *sb,
- struct z_erofs_unzip_io *io,
- struct list_head *pagepool)
+static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
+ struct list_head *pagepool)
{
z_erofs_next_pcluster_t owned = io->head;
@@ -971,21 +956,21 @@ static void z_erofs_vle_unzip_all(struct super_block *sb,
pcl = container_of(owned, struct z_erofs_pcluster, next);
owned = READ_ONCE(pcl->next);
- z_erofs_decompress_pcluster(sb, pcl, pagepool);
+ z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
}
}
-static void z_erofs_vle_unzip_wq(struct work_struct *work)
+static void z_erofs_decompressqueue_work(struct work_struct *work)
{
- struct z_erofs_unzip_io_sb *iosb =
- container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
+ struct z_erofs_decompressqueue *bgq =
+ container_of(work, struct z_erofs_decompressqueue, u.work);
LIST_HEAD(pagepool);
- DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ z_erofs_decompress_queue(bgq, &pagepool);
put_pages_list(&pagepool);
- kvfree(iosb);
+ kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
@@ -994,8 +979,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
struct address_space *mc,
gfp_t gfp)
{
- /* determined at compile time to avoid too many #ifdefs */
- const bool nocache = __builtin_constant_p(mc) ? !mc : false;
const pgoff_t index = pcl->obj.index;
bool tocache = false;
@@ -1016,7 +999,7 @@ repeat:
* the cached page has not been allocated and
* a placeholder is out there, prepare it now.

*/
- if (!nocache && page == PAGE_UNALLOCATED) {
+ if (page == PAGE_UNALLOCATED) {
tocache = true;
goto out_allocpage;
}
@@ -1029,21 +1012,6 @@ repeat:
mapping = READ_ONCE(page->mapping);
/*
- * if managed cache is disabled, it's no way to
- * get such a cached-like page.
- */
- if (nocache) {
- /* if managed cache is disabled, it is impossible `justfound' */
- DBG_BUGON(justfound);
-
- /* and it should be locked, not uptodate, and not truncated */
- DBG_BUGON(!PageLocked(page));
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(!mapping);
- goto out;
- }
-
- /*
* unmanaged (file) pages are all locked solidly,
* therefore it is impossible for `mapping' to be NULL.
*/
@@ -1093,50 +1061,52 @@ repeat:
unlock_page(page);
put_page(page);
out_allocpage:
- page = __stagingpage_alloc(pagepool, gfp);
- if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- list_add(&page->lru, pagepool);
- cpu_relax();
- goto repeat;
- }
- if (nocache || !tocache)
- goto out;
- if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
+ if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+ /* non-LRU / non-movable temporary page is needed */
page->mapping = Z_EROFS_MAPPING_STAGING;
- goto out;
+ tocache = false;
}
+ if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+ if (tocache) {
+ /* since it added to managed cache successfully */
+ unlock_page(page);
+ put_page(page);
+ } else {
+ list_add(&page->lru, pagepool);
+ }
+ cond_resched();
+ goto repeat;
+ }
set_page_private(page, (unsigned long)pcl);
SetPagePrivate(page);
out: /* the only exit (for tracing and debugging) */
return page;
}
-static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
- struct z_erofs_unzip_io *io,
- bool foreground)
+static struct z_erofs_decompressqueue *
+jobqueue_init(struct super_block *sb,
+ struct z_erofs_decompressqueue *fgq, bool *fg)
{
- struct z_erofs_unzip_io_sb *iosb;
-
- if (foreground) {
- /* waitqueue available for foreground io */
- DBG_BUGON(!io);
+ struct z_erofs_decompressqueue *q;
- init_waitqueue_head(&io->u.wait);
- atomic_set(&io->pending_bios, 0);
- goto out;
+ if (fg && !*fg) {
+ q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
+ if (!q) {
+ *fg = true;
+ goto fg_out;
+ }
+ INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+ } else {
+fg_out:
+ q = fgq;
+ init_waitqueue_head(&fgq->u.wait);
+ atomic_set(&fgq->pending_bios, 0);
}
-
- iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
- DBG_BUGON(!iosb);
-
- /* initialize fields in the allocated descriptor */
- io = &iosb->io;
- iosb->sb = sb;
- INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
-out:
- io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
- return io;
+ q->sb = sb;
+ q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ return q;
}
/* define decompression jobqueue types */
@@ -1147,22 +1117,17 @@ enum {
};
static void *jobqueueset_init(struct super_block *sb,
- z_erofs_next_pcluster_t qtail[],
- struct z_erofs_unzip_io *q[],
- struct z_erofs_unzip_io *fgq,
- bool forcefg)
+ struct z_erofs_decompressqueue *q[],
+ struct z_erofs_decompressqueue *fgq, bool *fg)
{
/*
* if managed cache is enabled, bypass jobqueue is needed,
* no need to read from device for all pclusters in this queue.
*/
- q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
- qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
-
- q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
- qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
+ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
+ q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);
- return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
+ return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
@@ -1184,9 +1149,8 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
- unsigned int nr_bios,
- bool force_fg)
+static bool postsubmit_is_all_bypassed(struct z_erofs_decompressqueue *q[],
+ unsigned int nr_bios, bool force_fg)
{
/*
* although background is preferred, no one is pending for submission.
@@ -1195,19 +1159,19 @@ static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
if (force_fg || nr_bios)
return false;
- kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
+ kvfree(q[JQ_SUBMIT]);
return true;
}
-static bool z_erofs_vle_submit_all(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
- struct list_head *pagepool,
- struct z_erofs_unzip_io *fgq,
- bool force_fg)
+static bool z_erofs_submit_queue(struct super_block *sb,
+ z_erofs_next_pcluster_t owned_head,
+ struct list_head *pagepool,
+ struct z_erofs_decompressqueue *fgq,
+ bool *force_fg)
{
- struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
- struct z_erofs_unzip_io *q[NR_JOBQUEUES];
+ struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
struct bio *bio;
void *bi_private;
/* since bio will be NULL, no need to initialize last_index */
@@ -1221,7 +1185,9 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
force_submit = false;
bio = NULL;
nr_bios = 0;
- bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
+ bi_private = jobqueueset_init(sb, q, fgq, force_fg);
+ qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
+ qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
/* by default, all need io submission */
q[JQ_SUBMIT]->head = owned_head;
@@ -1268,7 +1234,7 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- bio->bi_end_io = z_erofs_vle_read_endio;
+ bio->bi_end_io = z_erofs_decompressqueue_endio;
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)(first_index + i) <<
LOG_SECTORS_PER_BLOCK;
@@ -1297,40 +1263,38 @@ skippage:
if (bio)
submit_bio(bio);
- if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
+ if (postsubmit_is_all_bypassed(q, nr_bios, *force_fg))
return true;
- z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
+ z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
return true;
}
-static void z_erofs_submit_and_unzip(struct super_block *sb,
- struct z_erofs_collector *clt,
- struct list_head *pagepool,
- bool force_fg)
+static void z_erofs_runqueue(struct super_block *sb,
+ struct z_erofs_collector *clt,
+ struct list_head *pagepool, bool force_fg)
{
- struct z_erofs_unzip_io io[NR_JOBQUEUES];
+ struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (!z_erofs_vle_submit_all(sb, clt->owned_head,
- pagepool, io, force_fg))
+ if (!z_erofs_submit_queue(sb, clt->owned_head,
+ pagepool, io, &force_fg))
return;
- /* decompress no I/O pclusters immediately */
- z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
+ /* handle bypass queue (no i/o pclusters) immediately */
+ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
if (!force_fg)
return;
/* wait until all bios are completed */
- wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ io_wait_event(io[JQ_SUBMIT].u.wait,
+ !atomic_read(&io[JQ_SUBMIT].pending_bios));
- /* let's synchronous decompression */
- z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
+ /* handle synchronous decompress queue in the caller context */
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
-static int z_erofs_vle_normalaccess_readpage(struct file *file,
- struct page *page)
+static int z_erofs_readpage(struct file *file, struct page *page)
{
struct inode *const inode = page->mapping->host;
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
@@ -1345,7 +1309,7 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1364,10 +1328,8 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
return nr <= sbi->max_sync_decompress_pages;
}
-static int z_erofs_vle_normalaccess_readpages(struct file *filp,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned int nr_pages)
+static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned int nr_pages)
{
struct inode *const inode = mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -1422,7 +1384,7 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp,
(void)z_erofs_collector_end(&f.clt);
- z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
@@ -1432,8 +1394,8 @@ static int z_erofs_vle_normalaccess_readpages(struct file *filp,
return 0;
}
-const struct address_space_operations z_erofs_vle_normalaccess_aops = {
- .readpage = z_erofs_vle_normalaccess_readpage,
- .readpages = z_erofs_vle_normalaccess_readpages,
+const struct address_space_operations z_erofs_aops = {
+ .readpage = z_erofs_readpage,
+ .readpages = z_erofs_readpages,
};
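jobqueueset_init() above folds the submit queue pointer and the foreground/background flag into bio->bi_private as a single tagged pointer, which z_erofs_decompressqueue_endio() unfolds again. The kernel uses the tagptr1_t helpers for this; a standalone sketch of the underlying idea, which only needs the low pointer bit of a sufficiently aligned allocation (function names are illustrative):

#include <stdint.h>

static inline void *fold_tag(void *ptr, unsigned int tag)	/* tag is 0 or 1 */
{
	return (void *)((uintptr_t)ptr | (tag & 1));
}

static inline void *unfold_ptr(void *tagged)
{
	return (void *)((uintptr_t)tagged & ~(uintptr_t)1);
}

static inline unsigned int unfold_tag(void *tagged)
{
	return (uintptr_t)tagged & 1;
}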
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index faf950189bd7..7824f5563a55 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -84,7 +84,8 @@ struct z_erofs_pcluster {
#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
-struct z_erofs_unzip_io {
+struct z_erofs_decompressqueue {
+ struct super_block *sb;
atomic_t pending_bios;
z_erofs_next_pcluster_t head;
@@ -94,11 +95,6 @@ struct z_erofs_unzip_io {
} u;
};
-struct z_erofs_unzip_io_sb {
- struct z_erofs_unzip_io io;
- struct super_block *sb;
-};
-
#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
struct page *page)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 6a26c293ae2d..736db3a4cdef 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -22,11 +22,11 @@ int z_erofs_fill_inode(struct inode *inode)
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
}
- inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
+ inode->i_mapping->a_ops = &z_erofs_aops;
return 0;
}
-static int fill_inode_lazy(struct inode *inode)
+static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
@@ -138,8 +138,8 @@ static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
return 0;
}
-static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned long lcn)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@@ -311,13 +311,13 @@ out:
return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
}
-static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn)
+static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
+ unsigned int lcn)
{
const unsigned int datamode = EROFS_I(m->inode)->datalayout;
if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
- return vle_legacy_load_cluster_from_disk(m, lcn);
+ return legacy_load_cluster_from_disk(m, lcn);
if (datamode == EROFS_INODE_FLAT_COMPRESSION)
return compacted_load_cluster_from_disk(m, lcn);
@@ -325,8 +325,8 @@ static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
return -EINVAL;
}
-static int vle_extent_lookback(struct z_erofs_maprecorder *m,
- unsigned int lookback_distance)
+static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
+ unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
struct erofs_map_blocks *const map = m->map;
@@ -343,7 +343,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m,
/* load extent head logical cluster if needed */
lcn -= lookback_distance;
- err = vle_load_cluster_from_disk(m, lcn);
+ err = z_erofs_load_cluster_from_disk(m, lcn);
if (err)
return err;
@@ -356,7 +356,7 @@ static int vle_extent_lookback(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- return vle_extent_lookback(m, m->delta[0]);
+ return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
map->m_flags &= ~EROFS_MAP_ZIPPED;
/* fallthrough */
@@ -396,7 +396,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
goto out;
}
- err = fill_inode_lazy(inode);
+ err = z_erofs_fill_inode_lazy(inode);
if (err)
goto out;
@@ -405,7 +405,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
m.lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = vle_load_cluster_from_disk(&m, m.lcn);
+ err = z_erofs_load_cluster_from_disk(&m, m.lcn);
if (err)
goto unmap_out;
@@ -436,7 +436,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
/* fallthrough */
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
/* get the corresponding first chunk */
- err = vle_extent_lookback(&m, m.delta[0]);
+ err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
break;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c4159bcc05d9..67a395039268 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -551,28 +551,23 @@ out_unlock:
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct nested_calls poll_safewake_ncalls;
-
-static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
-{
- unsigned long flags;
- wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
-
- spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
- wake_up_locked_poll(wqueue, EPOLLIN);
- spin_unlock_irqrestore(&wqueue->lock, flags);
-
- return 0;
-}
+static DEFINE_PER_CPU(int, wakeup_nest);
static void ep_poll_safewake(wait_queue_head_t *wq)
{
- int this_cpu = get_cpu();
-
- ep_call_nested(&poll_safewake_ncalls,
- ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+ unsigned long flags;
+ int subclass;
- put_cpu();
+ local_irq_save(flags);
+ preempt_disable();
+ subclass = __this_cpu_read(wakeup_nest);
+ spin_lock_nested(&wq->lock, subclass + 1);
+ __this_cpu_inc(wakeup_nest);
+ wake_up_locked_poll(wq, POLLIN);
+ __this_cpu_dec(wakeup_nest);
+ spin_unlock(&wq->lock);
+ local_irq_restore(flags);
+ preempt_enable();
}
#else
@@ -671,7 +666,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
void *priv, int depth, bool ep_locked)
{
__poll_t res;
- int pwake = 0;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
@@ -738,26 +732,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
*/
list_splice(&txlist, &ep->rdllist);
__pm_relax(ep->ws);
-
- if (!list_empty(&ep->rdllist)) {
- /*
- * Wake up (if active) both the eventpoll wait list and
- * the ->poll() wait list (delayed after we release the lock).
- */
- if (waitqueue_active(&ep->wq))
- wake_up(&ep->wq);
- if (waitqueue_active(&ep->poll_wait))
- pwake++;
- }
write_unlock_irq(&ep->lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
- /* We have to call this outside the lock */
- if (pwake)
- ep_poll_safewake(&ep->poll_wait);
-
return res;
}
@@ -2370,11 +2349,6 @@ static int __init eventpoll_init(void)
*/
ep_nested_calls_init(&poll_loop_ncalls);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /* Initialize the structure used to perform safe poll wait head wake ups */
- ep_nested_calls_init(&poll_safewake_ncalls);
-#endif
-
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
diff --git a/fs/exec.c b/fs/exec.c
index 555e93c7dec8..74d88dab98dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
-#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
@@ -1015,7 +1014,7 @@ static int exec_mmap(struct mm_struct *mm)
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
- mm_release(tsk, old_mm);
+ exec_mm_release(tsk, old_mm);
if (old_mm) {
sync_mm_rss(old_mm);
@@ -1132,7 +1131,7 @@ static int de_thread(struct task_struct *tsk)
* also take its birthdate (always earlier than our own).
*/
tsk->start_time = leader->start_time;
- tsk->real_start_time = leader->real_start_time;
+ tsk->start_boottime = leader->start_boottime;
BUG_ON(!same_thread_group(leader, tsk));
BUG_ON(has_group_leader_pid(tsk));
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 09bc68708d28..2dd55b172d57 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
* inode is actually connected to the parent.
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
- if (!err) {
- inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir,
- strlen(nbuf));
- inode_unlock(target_dir->d_inode);
- if (!IS_ERR(nresult)) {
- if (nresult->d_inode) {
- dput(result);
- result = nresult;
- } else
- dput(nresult);
- }
+ if (err) {
+ dput(target_dir);
+ goto err_result;
}
+ inode_lock(target_dir->d_inode);
+ nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ if (!IS_ERR(nresult)) {
+ if (unlikely(nresult->d_inode != result->d_inode)) {
+ dput(nresult);
+ nresult = ERR_PTR(-ESTALE);
+ }
+ }
+ inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
*/
dput(target_dir);
+ if (IS_ERR(nresult)) {
+ err = PTR_ERR(nresult);
+ goto err_result;
+ }
+ dput(result);
+ result = nresult;
+
/*
* And finally make sure the dentry is actually acceptable
* to NFSD.
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e0cc55164505..fa9c951d3471 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -269,7 +269,7 @@ goal_in_my_reservation(struct ext2_reserve_window *rsv, ext2_grpblk_t grp_goal,
ext2_fsblk_t group_first_block, group_last_block;
group_first_block = ext2_group_first_block_no(sb, group);
- group_last_block = group_first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1;
+ group_last_block = ext2_group_last_block_no(sb, group);
if ((rsv->_rsv_start > group_last_block) ||
(rsv->_rsv_end < group_first_block))
@@ -666,37 +666,24 @@ ext2_try_to_allocate(struct super_block *sb, int group,
unsigned long *count,
struct ext2_reserve_window *my_rsv)
{
- ext2_fsblk_t group_first_block;
+ ext2_fsblk_t group_first_block = ext2_group_first_block_no(sb, group);
+ ext2_fsblk_t group_last_block = ext2_group_last_block_no(sb, group);
ext2_grpblk_t start, end;
unsigned long num = 0;
+ start = 0;
+ end = group_last_block - group_first_block + 1;
/* we do allocation within the reservation window if we have a window */
if (my_rsv) {
- group_first_block = ext2_group_first_block_no(sb, group);
if (my_rsv->_rsv_start >= group_first_block)
start = my_rsv->_rsv_start - group_first_block;
- else
- /* reservation window cross group boundary */
- start = 0;
- end = my_rsv->_rsv_end - group_first_block + 1;
- if (end > EXT2_BLOCKS_PER_GROUP(sb))
- /* reservation window crosses group boundary */
- end = EXT2_BLOCKS_PER_GROUP(sb);
- if ((start <= grp_goal) && (grp_goal < end))
- start = grp_goal;
- else
+ if (my_rsv->_rsv_end < group_last_block)
+ end = my_rsv->_rsv_end - group_first_block + 1;
+ if (grp_goal < start || grp_goal >= end)
grp_goal = -1;
- } else {
- if (grp_goal > 0)
- start = grp_goal;
- else
- start = 0;
- end = EXT2_BLOCKS_PER_GROUP(sb);
}
-
BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb));
-repeat:
if (grp_goal < 0) {
grp_goal = find_next_usable_block(start, bitmap_bh, end);
if (grp_goal < 0)
@@ -711,32 +698,23 @@ repeat:
;
}
}
- start = grp_goal;
- if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group), grp_goal,
- bitmap_bh->b_data)) {
- /*
- * The block was allocated by another thread, or it was
- * allocated and then freed by another thread
- */
- start++;
- grp_goal++;
- if (start >= end)
- goto fail_access;
- goto repeat;
- }
- num++;
- grp_goal++;
- while (num < *count && grp_goal < end
- && !ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
+ for (; num < *count && grp_goal < end; grp_goal++) {
+ if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
grp_goal, bitmap_bh->b_data)) {
+ if (num == 0)
+ continue;
+ break;
+ }
num++;
- grp_goal++;
}
+
+ if (num == 0)
+ goto fail_access;
+
*count = num;
return grp_goal - num;
fail_access:
- *count = num;
return -1;
}
@@ -754,10 +732,9 @@ fail_access:
* but we will shift to the place where start_block is,
* then start from there, when looking for a reservable space.
*
- * @size: the target new reservation window size
+ * @sb: the super block.
*
- * @group_first_block: the first block we consider to start
- * the real search from
+ * @start_block: the first block we consider to start the real search from
*
* @last_block:
* the maximum block number that our goal reservable space
@@ -908,7 +885,7 @@ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
spinlock_t *rsv_lock = &EXT2_SB(sb)->s_rsv_window_lock;
group_first_block = ext2_group_first_block_no(sb, group);
- group_end_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ group_end_block = ext2_group_last_block_no(sb, group);
if (grp_goal < 0)
start_block = group_first_block;
@@ -1115,7 +1092,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
* first block is the block number of the first block in this group
*/
group_first_block = ext2_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ group_last_block = ext2_group_last_block_no(sb, group);
/*
* Basically we will allocate a new block from inode's reservation
@@ -1313,6 +1290,13 @@ retry_alloc:
if (free_blocks > 0) {
grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
EXT2_BLOCKS_PER_GROUP(sb));
+ /*
+ * In case we retry allocation (due to fs reservation not
+ * working out or fs corruption), bitmap_bh is a non-NULL
+ * pointer and we have to release it before calling
+ * read_block_bitmap().
+ */
+ brelse(bitmap_bh);
bitmap_bh = read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
@@ -1404,6 +1388,7 @@ allocated:
* use. So we may want to selectively mark some of the blocks
* as free
*/
+ num = *count;
goto retry_alloc;
}
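The rewritten ext2_try_to_allocate() loop above replaces the old repeat/goto logic: skip used blocks until the first free bit is claimed, then keep claiming consecutive bits until *count blocks are taken, a used bit is hit, or the group ends. A userspace rendering of the same loop over a plain byte-per-block bitmap (the test-and-set here stands in for ext2_set_bit_atomic(); names are illustrative):

#include <stddef.h>

/* Returns the first allocated index and updates *count, or -1 on failure. */
static long try_to_allocate_run(unsigned char *bitmap, long goal, long end,
				unsigned long *count)
{
	unsigned long num = 0;

	for (; num < *count && goal < end; goal++) {
		if (bitmap[goal]) {		/* block already in use */
			if (num == 0)
				continue;	/* still searching for a start */
			break;			/* contiguous run ended */
		}
		bitmap[goal] = 1;		/* claim the block */
		num++;
	}
	if (num == 0)
		return -1;
	*count = num;
	return goal - num;
}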
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 10ab238de9a6..8178bd38a9d6 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -813,6 +813,18 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
}
+static inline ext2_fsblk_t
+ext2_group_last_block_no(struct super_block *sb, unsigned long group_no)
+{
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+ if (group_no == sbi->s_groups_count - 1)
+ return le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+ else
+ return ext2_group_first_block_no(sb, group_no) +
+ EXT2_BLOCKS_PER_GROUP(sb) - 1;
+}
+
#define ext2_set_bit __test_and_set_bit_le
#define ext2_clear_bit __test_and_clear_bit_le
#define ext2_test_bit test_bit_le
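The new ext2_group_last_block_no() helper replaces the open-coded first_block + EXT2_BLOCKS_PER_GROUP(sb) - 1 pattern and clamps the final group to s_blocks_count - 1. Below is a rough userspace model of that arithmetic; first_data_block, blocks_per_group and blocks_count are made-up sample values, not read from a real superblock.

/* Illustration of per-group first/last block numbering with a short last group. */
#include <stdio.h>

int main(void)
{
	unsigned long first_data_block = 1;	/* 1K block size filesystem */
	unsigned long blocks_per_group = 8192;
	unsigned long blocks_count = 20000;	/* not a multiple of 8192 */
	unsigned long groups_count =
		(blocks_count - first_data_block + blocks_per_group - 1) /
		blocks_per_group;

	for (unsigned long g = 0; g < groups_count; g++) {
		unsigned long first = first_data_block + g * blocks_per_group;
		unsigned long last = (g == groups_count - 1) ?
			blocks_count - 1 : first + blocks_per_group - 1;

		printf("group %lu: %lu..%lu\n", g, first, last);
	}
	return 0;
}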
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7004ce581a32..119667e65890 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -701,10 +701,13 @@ static int ext2_get_blocks(struct inode *inode,
if (!partial) {
count++;
mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
goto got_it;
}
+
+ if (err) {
+ mutex_unlock(&ei->truncate_mutex);
+ goto cleanup;
+ }
}
/*
@@ -801,7 +804,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1b853fb0b163..32a8d10b579d 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -145,10 +145,13 @@ setversion_out:
if (ei->i_block_alloc_info){
struct ext2_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
rsv->rsv_goal_size = rsv_window_size;
+ } else {
+ ret = -ENOMEM;
}
+
mutex_unlock(&ei->truncate_mutex);
mnt_drop_write_file(filp);
- return 0;
+ return ret;
}
default:
return -ENOTTY;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 30c630d73f0f..bcffe25da2f0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -702,13 +702,7 @@ static int ext2_check_descriptors(struct super_block *sb)
for (i = 0; i < sbi->s_groups_count; i++) {
struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL);
ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i);
- ext2_fsblk_t last_block;
-
- if (i == sbi->s_groups_count - 1)
- last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
- else
- last_block = first_block +
- (EXT2_BLOCKS_PER_GROUP(sb) - 1);
+ ext2_fsblk_t last_block = ext2_group_last_block_no(sb, i);
if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
le32_to_cpu(gdp->bg_block_bitmap) > last_block)
@@ -806,7 +800,6 @@ static unsigned long descriptor_loc(struct super_block *sb,
{
struct ext2_sb_info *sbi = EXT2_SB(sb);
unsigned long bg, first_meta_bg;
- int has_super = 0;
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
@@ -814,10 +807,8 @@ static unsigned long descriptor_loc(struct super_block *sb,
nr < first_meta_bg)
return (logic_sb_block + nr + 1);
bg = sbi->s_desc_per_block * nr;
- if (ext2_bg_has_super(sb, bg))
- has_super = 1;
- return ext2_group_first_block_no(sb, bg) + has_super;
+ return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg);
}
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index cbb5ca830e57..ef42ab040905 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -106,3 +106,20 @@ config EXT4_DEBUG
If you select Y here, then you will be able to turn on debugging
with a command such as:
echo 1 > /sys/module/ext4/parameters/mballoc_debug
+
+config EXT4_KUNIT_TESTS
+ bool "KUnit tests for ext4"
+ select EXT4_FS
+ depends on KUNIT
+ help
+ This builds the ext4 KUnit tests.
+
+ KUnit tests run during boot and output the results to the debug log
+ in TAP format (http://testanything.org/). These tests are only useful
+ for kernel devs running the KUnit test harness and are not intended for
+ inclusion in a production build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index b17ddc229ac5..840b91d040f1 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -13,4 +13,5 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_KUNIT_TESTS) += inode-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 03db3e71676c..f8578caba40d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -198,6 +198,12 @@ struct ext4_system_blocks {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
+struct ext4_io_end_vec {
+ struct list_head list; /* list of io_end_vec */
+ loff_t offset; /* offset in the file */
+ ssize_t size; /* size of the extent */
+};
+
/*
* For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
@@ -211,8 +217,7 @@ typedef struct ext4_io_end {
* bios covering the extent */
unsigned int flag; /* unwritten or not */
atomic_t count; /* reference counter */
- loff_t offset; /* offset in the file */
- ssize_t size; /* size of the extent */
+ struct list_head list_vec; /* list of ext4_io_end_vec */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -1579,7 +1584,6 @@ enum {
EXT4_STATE_NO_EXPAND, /* No space for expansion */
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
- EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
@@ -1678,6 +1682,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
+#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1779,6 +1784,7 @@ EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
@@ -2560,8 +2566,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2604,7 +2608,6 @@ extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
-extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
@@ -3264,6 +3267,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
+ ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -3296,6 +3301,10 @@ extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
ext4_lblk_t lblk2, ext4_lblk_t count,
int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
+extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred);
+
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -3322,6 +3331,8 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc,
bool keep_towrite);
+extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
+extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
@@ -3379,6 +3390,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
}
extern const struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_report_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 7c70b08d104c..d3b8cdea5df7 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -65,12 +65,14 @@ static int ext4_journal_check_start(struct super_block *sb)
}
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks)
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds)
{
journal_t *journal;
int err;
- trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+ trace_ext4_journal_start(sb, blocks, rsv_blocks, revoke_creds,
+ _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0)
return ERR_PTR(err);
@@ -78,8 +80,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
journal = EXT4_SB(sb)->s_journal;
if (!journal)
return ext4_get_nojournal();
- return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
- type, line);
+ return jbd2__journal_start(journal, blocks, rsv_blocks, revoke_creds,
+ GFP_NOFS, type, line);
}
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -119,8 +121,8 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return ext4_get_nojournal();
sb = handle->h_journal->j_private;
- trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
- _RET_IP_);
+ trace_ext4_journal_start_reserved(sb,
+ jbd2_handle_buffer_credits(handle), _RET_IP_);
err = ext4_journal_check_start(sb);
if (err < 0) {
jbd2_journal_free_reserved(handle);
@@ -133,6 +135,19 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred)
+{
+ if (!ext4_handle_valid(handle))
+ return 0;
+ if (jbd2_handle_buffer_credits(handle) >= check_cred &&
+ handle->h_revoke_credits >= revoke_cred)
+ return 0;
+ extend_cred = max(0, extend_cred - jbd2_handle_buffer_credits(handle));
+ revoke_cred = max(0, revoke_cred - handle->h_revoke_credits);
+ return ext4_journal_extend(handle, extend_cred, revoke_cred);
+}
+
static void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh,
@@ -278,7 +293,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle), err);
return err;
}
ext4_error_inode(inode, where, line,
@@ -289,7 +304,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
handle->h_type,
handle->h_line_no,
handle->h_requested_credits,
- handle->h_buffer_credits, err);
+ jbd2_handle_buffer_credits(handle),
+ err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index ef8fcf7d0d3b..a6b9b66dbfad 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -261,7 +261,8 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
- int type, int blocks, int rsv_blocks);
+ int type, int blocks, int rsv_blocks,
+ int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -288,28 +289,41 @@ static inline int ext4_handle_is_aborted(handle_t *handle)
return 0;
}
-static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
+ int blocks)
{
- if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
- return 0;
- return 1;
+ /* Freeing each metadata block can result in freeing one cluster */
+ return blocks * EXT4_SB(sb)->s_cluster_ratio;
+}
+
+static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
+{
+ return ext4_free_metadata_revoke_credits(sb, 8);
}
#define ext4_journal_start_sb(sb, type, nblocks) \
- __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits(sb))
#define ext4_journal_start(inode, type, nblocks) \
- __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+ __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0, \
+ ext4_trans_default_revoke_credits((inode)->i_sb))
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
+ ext4_trans_default_revoke_credits((inode)->i_sb))
-#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
- __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
+#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
+ __ext4_journal_start((inode), __LINE__, (type), (blocks), 0, \
+ (revoke_creds))
static inline handle_t *__ext4_journal_start(struct inode *inode,
unsigned int line, int type,
- int blocks, int rsv_blocks)
+ int blocks, int rsv_blocks,
+ int revoke_creds)
{
return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
- rsv_blocks);
+ rsv_blocks, revoke_creds);
}
#define ext4_journal_stop(handle) \
@@ -332,20 +346,68 @@ static inline handle_t *ext4_journal_current_handle(void)
return journal_current_handle();
}
-static inline int ext4_journal_extend(handle_t *handle, int nblocks)
+static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_extend(handle, nblocks);
+ return jbd2_journal_extend(handle, nblocks, revoke);
return 0;
}
-static inline int ext4_journal_restart(handle_t *handle, int nblocks)
+static inline int ext4_journal_restart(handle_t *handle, int nblocks,
+ int revoke)
{
if (ext4_handle_valid(handle))
- return jbd2_journal_restart(handle, nblocks);
+ return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
return 0;
}
+int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
+ int extend_cred, int revoke_cred);
+
+
+/*
+ * Ensure @handle has at least @check_cred credits available. If not, the
+ * transaction will be extended or restarted to contain at least @extend_cred
+ * credits. Before restarting the transaction, @fn is executed to allow for
+ * cleanup before the transaction is restarted.
+ *
+ * The return value is < 0 in case of error, 0 in case the handle has enough
+ * credits or transaction extension succeeded, 1 in case transaction had to be
+ * restarted.
+ */
+#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred, \
+ revoke_cred, fn) \
+({ \
+ __label__ __ensure_end; \
+ int err = __ext4_journal_ensure_credits((handle), (check_cred), \
+ (extend_cred), (revoke_cred)); \
+ \
+ if (err <= 0) \
+ goto __ensure_end; \
+ err = (fn); \
+ if (err < 0) \
+ goto __ensure_end; \
+ err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
+ if (err == 0) \
+ err = 1; \
+__ensure_end: \
+ err; \
+})
+
+/*
+ * Ensure given handle has at least requested amount of credits available,
+ * possibly restarting transaction if needed. We also make sure the transaction
+ * has space for at least ext4_trans_default_revoke_credits(sb) revoke records
+ * as freeing one or two blocks is a very common pattern and requesting this is
+ * very cheap.
+ */
+static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
+ int revoke_creds)
+{
+ return ext4_journal_ensure_credits_fn(handle, credits, credits,
+ revoke_creds, 0);
+}
+
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
@@ -407,6 +469,7 @@ static inline int ext4_inode_journal_mode(struct inode *inode)
return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */
/* We do not support data journalling with delayed allocation */
if (!S_ISREG(inode->i_mode) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
!test_opt(inode->i_sb, DELALLOC))) {
@@ -437,6 +500,19 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
}
+static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
+{
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 0;
+ if (!ext4_should_journal_data(inode))
+ return 0;
+ /*
+ * Data blocks in one extent are contiguous, just account for partial
+ * clusters at extent boundaries
+ */
+ return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
+}
+
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
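For orientation, ext4_journal_ensure_credits_fn() introduced above returns a negative errno on failure, 0 when the handle already has (or could be extended to) the requested credits, and 1 when the transaction had to be restarted after running the cleanup expression. The sketch below is a minimal userspace model of that calling convention only; the fake handle, ensure_credits() and cleanup() are invented for illustration and do not correspond to kernel APIs.

/* Toy model of the 0 / 1 / <0 ensure-credits convention. */
#include <stdio.h>

struct handle { int credits; };

/* Pretend extending is never possible, so low credits always force a restart. */
static int ensure_credits(struct handle *h, int check, int restart,
			  int (*cleanup)(void))
{
	int err;

	if (h->credits >= check)
		return 0;		/* enough credits, nothing to do */
	err = cleanup();		/* let the caller sync dirty state first */
	if (err < 0)
		return err;
	h->credits = restart;		/* "restart" the transaction */
	return 1;
}

static int cleanup(void)
{
	printf("flushing dirty state before restart\n");
	return 0;
}

int main(void)
{
	struct handle h = { .credits = 3 };

	for (int block = 0; block < 5; block++) {
		int ret = ensure_credits(&h, 2, 8, cleanup);

		if (ret < 0)
			return 1;
		if (ret > 0)
			printf("transaction restarted before block %d\n", block);
		h.credits -= 2;		/* each iteration consumes credits */
		printf("freed block %d (credits left %d)\n", block, h.credits);
	}
	return 0;
}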
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index fb0f99dc8c22..0e8708b77da6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,29 +100,41 @@ static int ext4_split_extent_at(handle_t *handle,
static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status *newes);
-static int ext4_ext_truncate_extend_restart(handle_t *handle,
- struct inode *inode,
- int needed)
+static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
- int err;
-
- if (!ext4_handle_valid(handle))
- return 0;
- if (handle->h_buffer_credits >= needed)
- return 0;
/*
- * If we need to extend the journal get a few extra blocks
- * while we're at it for efficiency's sake.
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
*/
- needed += 3;
- err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
- if (err <= 0)
- return err;
- err = ext4_truncate_restart_trans(handle, inode, needed);
- if (err == 0)
- err = -EAGAIN;
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
- return err;
+/*
+ * Make sure 'handle' has at least 'check_cred' credits. If not, restart
+ * transaction with 'restart_cred' credits. The function drops i_data_sem
+ * when restarting transaction and gets it after transaction is restarted.
+ *
+ * The function returns 0 on success, 1 if transaction had to be restarted,
+ * and < 0 in case of fatal error.
+ */
+int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
+ int check_cred, int restart_cred,
+ int revoke_cred)
+{
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
+ revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ return ret;
}
/*
@@ -1753,16 +1765,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
- /*
- * The check for IO to unwritten extent is somewhat racy as we
- * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
- * dropping i_data_sem. But reserved blocks should save us in that
- * case.
- */
+
if (ext4_ext_is_unwritten(ex1) &&
- (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
- atomic_read(&EXT4_I(inode)->i_unwritten) ||
- (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
+ ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1840,7 +1845,8 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
* group descriptor to release the extent tree block. If we
* can't get the journal credits, give up.
*/
- if (ext4_journal_extend(handle, 2))
+ if (ext4_journal_extend(handle, 2,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
return;
/*
@@ -2727,7 +2733,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int err = 0, correct_index = 0;
- int depth = ext_depth(inode), credits;
+ int depth = ext_depth(inode), credits, revoke_credits;
struct ext4_extent_header *eh;
ext4_lblk_t a, b;
unsigned num;
@@ -2819,10 +2825,23 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += (ext_depth(inode)) + 1;
}
credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- if (err)
+ /*
+ * We may end up freeing some index blocks and data from the
+ * punched range. Note that partial clusters are accounted for
+ * by ext4_free_data_revoke_credits().
+ */
+ revoke_credits =
+ ext4_free_metadata_revoke_credits(inode->i_sb,
+ ext_depth(inode)) +
+ ext4_free_data_revoke_credits(inode, b - a + 1);
+
+ err = ext4_datasem_ensure_credits(handle, inode, credits,
+ credits, revoke_credits);
+ if (err) {
+ if (err > 0)
+ err = -EAGAIN;
goto out;
+ }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2948,7 +2967,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug("truncate since %u to %u\n", start, end);
/* probably first extent we're gonna free will be last in block */
- handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1);
+ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
+ depth + 1,
+ ext4_free_metadata_revoke_credits(inode->i_sb, depth));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -4962,23 +4983,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
int ret = 0;
int ret2 = 0;
struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- /*
- * This is somewhat ugly but the idea is clear: When transaction is
- * reserved, everything goes into it. Otherwise we rather start several
- * smaller transactions for conversion of each extent separately.
- */
- if (handle) {
- handle = ext4_journal_start_reserved(handle,
- EXT4_HT_EXT_CONVERT);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- credits = 0;
- } else {
+ if (!handle) {
/*
* credits to insert 1 extent into extent tree
*/
@@ -5009,11 +5020,40 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
if (ret <= 0 || ret2)
break;
}
- if (!credits)
- ret2 = ext4_journal_stop(handle);
return ret > 0 ? ret2 : ret;
}
+int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
+{
+ int ret, err = 0;
+ struct ext4_io_end_vec *io_end_vec;
+
+ /*
+ * This is somewhat ugly but the idea is clear: When transaction is
+ * reserved, everything goes into it. Otherwise we rather start several
+ * smaller transactions for conversion of each extent separately.
+ */
+ if (handle) {
+ handle = ext4_journal_start_reserved(handle,
+ EXT4_HT_EXT_CONVERT);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ }
+
+ list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
+ ret = ext4_convert_unwritten_extents(handle, io_end->inode,
+ io_end_vec->offset,
+ io_end_vec->size);
+ if (ret)
+ break;
+ }
+
+ if (handle)
+ err = ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : err;
+}
+
/*
* If newes is not existing extent (newes->ec_pblk equals zero) find
* delayed extent at start of newes and update newes accordingly and
@@ -5206,13 +5246,10 @@ ext4_access_path(handle_t *handle, struct inode *inode,
* descriptor) for each block group; assume two block
* groups
*/
- if (handle->h_buffer_credits < 7) {
- credits = ext4_writepage_trans_blocks(inode);
- err = ext4_ext_truncate_extend_restart(handle, inode, credits);
- /* EAGAIN is success */
- if (err && err != -EAGAIN)
- return err;
- }
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
+ if (err < 0)
+ return err;
err = ext4_ext_get_access(handle, inode, path);
return err;
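The revoke-credit estimates used in ext4_ext_rm_leaf() above boil down to simple arithmetic: one revoke record per possibly-freed cluster for metadata, plus the data blocks of the punched extent with an allowance for partial clusters at its edges. A back-of-the-envelope model with sample numbers follows; cluster_ratio, depth and extent_blocks are invented, and the real kernel additionally skips data revoke credits unless the inode journals its data.

/* Worked example of the revoke credit arithmetic, sample values only. */
#include <stdio.h>

static int metadata_revoke_credits(int cluster_ratio, int blocks)
{
	/* Freeing each metadata block can free at most one cluster. */
	return blocks * cluster_ratio;
}

static int data_revoke_credits(int cluster_ratio, int blocks)
{
	/* Contiguous data extent: only partial clusters at the edges add up. */
	return blocks + 2 * (cluster_ratio - 1);
}

int main(void)
{
	int cluster_ratio = 16;		/* e.g. bigalloc, 16 blocks per cluster */
	int depth = 3;			/* extent tree depth */
	int extent_blocks = 128;	/* blocks freed from the punched range */
	int revoke = metadata_revoke_credits(cluster_ratio, depth) +
		     data_revoke_credits(cluster_ratio, extent_blocks);

	printf("revoke credits reserved: %d\n", revoke);
	return 0;
}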
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8d2bbcc2d813..6a7293a5cda2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,10 +29,58 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+ if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ return false;
+ if (fsverity_active(inode))
+ return false;
+ if (ext4_should_journal_data(inode))
+ return false;
+ if (ext4_has_inline_data(inode))
+ return false;
+ return true;
+}
+
+static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock_shared(inode);
+ /*
+ * Fall back to buffered I/O if the operation being performed on
+ * the inode is not supported by direct I/O. The IOCB_DIRECT
+ * flag needs to be cleared here in order to ensure that the
+ * direct I/O path within generic_file_read_iter() is not
+ * taken.
+ */
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ return generic_file_read_iter(iocb, to);
+ }
+
+ ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
+ inode_unlock_shared(inode);
+
+ file_accessed(iocb->ki_filp);
+ return ret;
+}
#ifdef CONFIG_FS_DAX
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
- if (IS_DAX(file_inode(iocb->ki_filp)))
+ if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_read_iter(iocb, to);
+
return generic_file_read_iter(iocb, to);
}
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_unwritten_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
/*
* This tests whether the IO in question is block-aligned or not.
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ if (unlikely(IS_IMMUTABLE(inode)))
+ return -EPERM;
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
return ret;
- if (unlikely(IS_IMMUTABLE(inode)))
- return -EPERM;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return -EFBIG;
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
}
+
+ ret = file_modified(iocb->ki_filp);
+ if (ret)
+ return ret;
+
return iov_iter_count(from);
}
-#ifdef CONFIG_FS_DAX
-static ssize_t
-ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
- }
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
- ret = file_remove_privs(iocb->ki_filp);
- if (ret)
- goto out;
- ret = file_update_time(iocb->ki_filp);
- if (ret)
- goto out;
- ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+
out:
inode_unlock(inode);
- if (ret > 0)
+ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
+ }
+
return ret;
}
-#endif
-static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
+ ssize_t written, size_t count)
{
+ handle_t *handle;
+ bool truncate = false;
+ u8 blkbits = inode->i_blkbits;
+ ext4_lblk_t written_blk, end_blk;
+
+ /*
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But, the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
+ */
+ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
+ if (offset + count <= EXT4_I(inode)->i_disksize) {
+ /*
+ * We need to ensure that the inode is removed from the orphan
+ * list if it has been added prematurely, due to writeback of
+ * delalloc blocks.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ return PTR_ERR(handle);
+ }
+
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ return written;
+ }
+
+ if (written < 0)
+ goto truncate;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ written = PTR_ERR(handle);
+ goto truncate;
+ }
+
+ if (ext4_update_inode_size(inode, offset + written))
+ ext4_mark_inode_dirty(handle, inode);
+
+ /*
+ * We may need to truncate allocated but not written blocks beyond EOF.
+ */
+ written_blk = ALIGN(offset + written, 1 << blkbits);
+ end_blk = ALIGN(offset + count, 1 << blkbits);
+ if (written_blk < end_blk && ext4_can_truncate(inode))
+ truncate = true;
+
+ /*
+ * Remove the inode from the orphan list if it has been extended and
+ * everything went OK.
+ */
+ if (!truncate && inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+
+ if (truncate) {
+truncate:
+ ext4_truncate_failed_write(inode);
+ /*
+ * If the truncate operation failed early, then the inode may
+ * still be on the orphan list. In that case, we need to try to
+ * remove the inode from the in-memory linked list.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ return written;
+}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned int flags)
+{
+ loff_t offset = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
- int unaligned_aio = 0;
- int overwrite = 0;
+
+ if (error)
+ return error;
+
+ if (size && flags & IOMAP_DIO_UNWRITTEN)
+ return ext4_convert_unwritten_extents(NULL, inode,
+ offset, size);
+
+ return 0;
+}
+
+static const struct iomap_dio_ops ext4_dio_write_ops = {
+ .end_io = ext4_dio_write_end_io,
+};
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ bool extend = false, overwrite = false, unaligned_aio = false;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
- return -EIO;
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock(inode);
+ }
+
+ if (!ext4_dio_supported(inode)) {
+ inode_unlock(inode);
+ /*
+ * Fall back to buffered I/O if the inode does not support
+ * direct I/O.
+ */
+ return ext4_buffered_write_iter(iocb, from);
+ }
+
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0) {
+ inode_unlock(inode);
+ return ret;
+ }
+
+ /*
+ * Unaligned asynchronous direct I/O must be serialized among each
+ * other as the zeroing of partial blocks of two competing unaligned
+ * asynchronous direct I/O writes can result in data corruption.
+ */
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+ unaligned_aio = true;
+ inode_dio_wait(inode);
+ }
+
+ /*
+ * Determine whether the I/O will overwrite allocated and initialized
+ * blocks. If so, check to see whether it is possible to take the
+ * dioread_nolock path.
+ */
+ if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+ ext4_should_dioread_nolock(inode)) {
+ overwrite = true;
+ downgrade_write(&inode->i_rwsem);
+ }
+
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
+
+ extend = true;
+ ext4_journal_stop(handle);
+ }
+
+ ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_aio || extend);
+
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
+
+out:
+ if (overwrite)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+
+ if (ret >= 0 && iov_iter_count(from)) {
+ ssize_t err;
+ loff_t endbyte;
+
+ offset = iocb->ki_pos;
+ err = ext4_buffered_write_iter(iocb, from);
+ if (err < 0)
+ return err;
+
+ /*
+ * We need to ensure that the pages within the page cache for
+ * the range covered by this I/O are written to disk and
+ * invalidated. This is an attempt to preserve the expected
+ * direct I/O semantics in the case we fall back to buffered I/O
+ * to complete the I/O request.
+ */
+ ret += err;
+ endbyte = offset + err - 1;
+ err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
+ offset, endbyte);
+ if (!err)
+ invalidate_mapping_pages(iocb->ki_filp->f_mapping,
+ offset >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
+ }
+
+ return ret;
+}
#ifdef CONFIG_FS_DAX
- if (IS_DAX(inode))
- return ext4_dax_write_iter(iocb, from);
-#endif
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ size_t count;
+ loff_t offset;
+ handle_t *handle;
+ bool extend = false;
+ struct inode *inode = file_inode(iocb->ki_filp);
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
@@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
goto out;
- /*
- * Unaligned direct AIO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned AIOs can result in data
- * corruption.
- */
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) &&
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
- unaligned_aio = 1;
- ext4_unwritten_wait(inode);
- }
+ offset = iocb->ki_pos;
+ count = iov_iter_count(from);
- iocb->private = &overwrite;
- /* Check whether we do a DIO overwrite or not */
- if (o_direct && !unaligned_aio) {
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
- if (ext4_should_dioread_nolock(inode))
- overwrite = 1;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
+ if (offset + count > EXT4_I(inode)->i_disksize) {
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
goto out;
}
- }
- ret = __generic_file_write_iter(iocb, from);
- /*
- * Unaligned direct AIO must be the only IO in flight. Otherwise
- * overlapping aligned IO after unaligned might result in data
- * corruption.
- */
- if (ret == -EIOCBQUEUED && unaligned_aio)
- ext4_unwritten_wait(inode);
- inode_unlock(inode);
+ ret = ext4_orphan_add(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out;
+ }
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
+ extend = true;
+ ext4_journal_stop(handle);
+ }
- return ret;
+ ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
+ if (extend)
+ ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
inode_unlock(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
return ret;
}
+#endif
+
+static ssize_t
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ return -EIO;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return ext4_dax_write_iter(iocb, from);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_write_iter(iocb, from);
+
+ return ext4_buffered_write_iter(iocb, from);
+}
#ifdef CONFIG_FS_DAX
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
maxbytes, i_size_read(inode));
case SEEK_HOLE:
inode_lock_shared(inode);
- offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_hole(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
case SEEK_DATA:
inode_lock_shared(inode);
- offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+ offset = iomap_seek_data(inode, offset,
+ &ext4_iomap_report_ops);
inode_unlock_shared(inode);
break;
}
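When iomap_dio_rw() in ext4_dio_write_iter() completes only part of the request, the remainder is written through the page cache and the affected range is then written back and invalidated so a later direct read still sees the data on disk. The sketch below is a loose userspace analogue of that fallback flow, assuming Linux-specific O_DIRECT, sync_file_range() and posix_fadvise(); the file name and sizes are made up and this is not the kernel path itself.

/* Userspace analogue: direct write, buffered fallback, then flush + drop. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "dio-demo.tmp";
	size_t len = 4096 + 100;		/* deliberately not block aligned */
	char *buf;

	if (posix_memalign((void **)&buf, 4096, 8192))
		return 1;
	memset(buf, 'x', 8192);

	int fd = open(path, O_CREAT | O_WRONLY | O_DIRECT, 0644);
	if (fd < 0)
		return 1;

	/* Direct write of the block-aligned prefix... */
	ssize_t done = pwrite(fd, buf, 4096, 0);
	if (done < 0)
		done = 0;
	close(fd);

	/* ...then fall back to buffered I/O for whatever is left. */
	if ((size_t)done < len) {
		int bfd = open(path, O_WRONLY);

		if (bfd >= 0) {
			ssize_t more = pwrite(bfd, buf + done, len - done, done);

			/* Write back and drop the buffered range, mimicking the
			 * writeback + invalidation done in the kernel path. */
			if (more > 0) {
				sync_file_range(bfd, done, more,
						SYNC_FILE_RANGE_WRITE |
						SYNC_FILE_RANGE_WAIT_AFTER);
				posix_fadvise(bfd, done, more,
					      POSIX_FADV_DONTNEED);
			}
			close(bfd);
		}
	}
	free(buf);
	unlink(path);
	return 0;
}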
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..e10206e7f4bb 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -80,6 +80,43 @@ static int ext4_sync_parent(struct inode *inode)
return ret;
}
+static int ext4_fsync_nojournal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ int ret, err;
+
+ ret = sync_mapping_buffers(inode->i_mapping);
+ if (!(inode->i_state & I_DIRTY_ALL))
+ return ret;
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ return ret;
+
+ err = sync_inode_metadata(inode, 1);
+ if (!ret)
+ ret = err;
+
+ if (!ret)
+ ret = ext4_sync_parent(inode);
+ if (test_opt(inode->i_sb, BARRIER))
+ *needs_barrier = true;
+
+ return ret;
+}
+
+static int ext4_fsync_journal(struct inode *inode, bool datasync,
+ bool *needs_barrier)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+ tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ *needs_barrier = true;
+
+ return jbd2_complete_transaction(journal, commit_tid);
+}
+
/*
* akpm: A new design for ext4_sync_file().
*
@@ -91,17 +128,14 @@ static int ext4_sync_parent(struct inode *inode)
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
-
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0, err;
- tid_t commit_tid;
bool needs_barrier = false;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -111,23 +145,15 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (sb_rdonly(inode->i_sb)) {
/* Make sure that we read updated s_mount_flags value */
smp_rmb();
- if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ret = -EROFS;
goto out;
}
- if (!journal) {
- ret = __generic_file_fsync(file, start, end, datasync);
- if (!ret)
- ret = ext4_sync_parent(inode);
- if (test_opt(inode->i_sb, BARRIER))
- goto issue_flush;
- goto out;
- }
-
ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
+
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -142,18 +168,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
+ if (!sbi->s_journal)
+ ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier);
+ else if (ext4_should_journal_data(inode))
ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ else
+ ret = ext4_fsync_journal(inode, datasync, &needs_barrier);
- commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (journal->j_flags & JBD2_BARRIER &&
- !jbd2_trans_will_send_data_barrier(journal, commit_tid))
- needs_barrier = true;
- ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
- issue_flush:
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
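The fsync rework above splits the decision into three paths: no journal (sync buffers and inode metadata directly), data journalling (force a full commit), and the common case (wait for the relevant commit), with the need for a block-device flush reported through a flag. Below is a toy model of just that dispatch, using plain booleans instead of the real superblock and journal state; in the kernel the flush decision also depends on whether the commit itself already issues a barrier.

/* Toy dispatch model of the fsync paths; not kernel code. */
#include <stdio.h>
#include <stdbool.h>

enum fsync_path { FSYNC_NOJOURNAL, FSYNC_FORCE_COMMIT, FSYNC_WAIT_COMMIT };

static enum fsync_path pick_path(bool has_journal, bool journals_data,
				 bool barrier_opt, bool *needs_barrier)
{
	if (!has_journal) {
		*needs_barrier = barrier_opt;
		return FSYNC_NOJOURNAL;
	}
	if (journals_data)
		return FSYNC_FORCE_COMMIT;
	*needs_barrier = true;	/* assume the commit will not flush for us */
	return FSYNC_WAIT_COMMIT;
}

int main(void)
{
	bool needs_barrier = false;
	enum fsync_path p = pick_path(true, false, true, &needs_barrier);

	printf("path=%d needs_barrier=%d\n", p, needs_barrier);
	return 0;
}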
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 764ff4c56233..dc333e8e51e8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -265,13 +265,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_debug("freeing inode %lu\n", ino);
trace_ext4_free_inode(inode);
- /*
- * Note: we must free any quota before locking the superblock,
- * as writing the quota to disk may need the lock as well.
- */
dquot_initialize(inode);
dquot_free_inode(inode);
- dquot_drop(inode);
is_directory = S_ISDIR(inode->i_mode);
@@ -927,7 +922,7 @@ repeat_in_this_group:
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(dir->i_sb, line_no,
handle_type, nblocks,
- 0);
+ 0, 0);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36699a131168..3a4ab70fe9e0 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -331,11 +331,14 @@ static int ext4_alloc_branch(handle_t *handle,
for (i = 0; i <= indirect_blks; i++) {
if (i == indirect_blks) {
new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
- } else
+ } else {
ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
ar->inode, ar->goal,
ar->flags & EXT4_MB_DELALLOC_RESERVED,
NULL, &err);
+ /* Simplify error cleanup... */
+ branch[i+1].bh = NULL;
+ }
if (err) {
i--;
goto failed;
@@ -377,18 +380,25 @@ static int ext4_alloc_branch(handle_t *handle,
}
return 0;
failed:
+ if (i == indirect_blks) {
+ /* Free data blocks */
+ ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
+ ar->len, 0);
+ i--;
+ }
for (; i >= 0; i--) {
/*
* We want to ext4_forget() only freshly allocated indirect
- * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
- * buffer at branch[0].bh is indirect block / inode already
- * existing before ext4_alloc_branch() was called.
+ * blocks. Buffer for new_blocks[i] is at branch[i+1].bh
+ * (buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called). Also
+ * because blocks are freshly allocated, we don't need to
+ * revoke them which is why we don't set
+ * EXT4_FREE_BLOCKS_METADATA.
*/
- if (i > 0 && i != indirect_blks && branch[i].bh)
- ext4_forget(handle, 1, ar->inode, branch[i].bh,
- branch[i].bh->b_blocknr);
- ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
- (i == indirect_blks) ? ar->len : 1, 0);
+ ext4_free_blocks(handle, ar->inode, branch[i+1].bh,
+ new_blocks[i], 1,
+ branch[i+1].bh ? EXT4_FREE_BLOCKS_FORGET : 0);
}
return err;
}
@@ -689,27 +699,63 @@ int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
}
+static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int *dropped)
+{
+ int err;
+
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (unlikely(err))
+ return err;
+ }
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(err))
+ return err;
+ /*
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
+ * moment, get_block can be called only for blocks inside i_size since
+ * page cache has been already dropped and writes are blocked by
+ * i_mutex. So we can safely drop the i_data_sem here.
+ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ ext4_discard_preallocations(inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ *dropped = 1;
+ return 0;
+}
+
/*
* Truncate transactions can be complex and absolutely huge. So we need to
 * be able to restart the transaction at a convenient checkpoint to make
* sure we don't overflow the journal.
*
* Try to extend this transaction for the purposes of truncation. If
- * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct
- *
- * Returns 0 if we managed to create more room. If we can't create more
- * room, and the transaction must be restarted we return 1.
+ * extend fails, we restart the transaction.
*/
-static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+static int ext4_ind_truncate_ensure_credits(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ int revoke_creds)
{
- if (!ext4_handle_valid(handle))
- return 0;
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
- return 0;
- return 1;
+ int ret;
+ int dropped = 0;
+
+ ret = ext4_journal_ensure_credits_fn(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_blocks_for_truncate(inode), revoke_creds,
+ ext4_ind_trunc_restart_fn(handle, inode, bh, &dropped));
+ if (dropped)
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (ret <= 0)
+ return ret;
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ret = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(ret))
+ return ret;
+ }
+ return 0;
}
/*
@@ -844,27 +890,10 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
return 1;
}
- if (try_to_extend_transaction(handle, inode)) {
- if (bh) {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (unlikely(err))
- goto out_err;
- }
- err = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(err))
- goto out_err;
- err = ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- if (unlikely(err))
- goto out_err;
- if (bh) {
- BUFFER_TRACE(bh, "retaking write access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err))
- goto out_err;
- }
- }
+ err = ext4_ind_truncate_ensure_credits(handle, inode, bh,
+ ext4_free_data_revoke_credits(inode, count));
+ if (err < 0)
+ goto out_err;
for (p = first; p < last; p++)
*p = 0;
@@ -1047,11 +1076,11 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
*/
if (ext4_handle_is_aborted(handle))
return;
- if (try_to_extend_transaction(handle, inode)) {
- ext4_mark_inode_dirty(handle, inode);
- ext4_truncate_restart_trans(handle, inode,
- ext4_blocks_for_truncate(inode));
- }
+ if (ext4_ind_truncate_ensure_credits(handle, inode,
+ NULL,
+ ext4_free_metadata_revoke_credits(
+ inode->i_sb, 1)) < 0)
+ return;
/*
* The forget flag here is critical because if
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
new file mode 100644
index 000000000000..92a9da1774aa
--- /dev/null
+++ b/fs/ext4/inode-test.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test of ext4 inode that verifies the seconds part of [a/c/m]
+ * timestamps in ext4 inode structs is decoded correctly.
+ */
+
+#include <kunit/test.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+
+#include "ext4.h"
+
+/*
+ * For constructing the nonnegative timestamp lower bound value.
+ * binary: 00000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_0 0L
+/*
+ * For constructing the nonnegative timestamp upper bound value.
+ * binary: 01111111 11111111 11111111 11111111
+ *
+ */
+#define UPPER_MSB_0 0x7fffffffL
+/*
+ * For constructing the negative timestamp lower bound value.
+ * binary: 10000000 00000000 00000000 00000000
+ */
+#define LOWER_MSB_1 (-0x80000000L)
+/*
+ * For constructing the negative timestamp upper bound value.
+ * binary: 11111111 11111111 11111111 11111111
+ */
+#define UPPER_MSB_1 (-1L)
+/*
+ * Upper bound for nanoseconds value supported by the encoding.
+ * binary: 00111111 11111111 11111111 11111111
+ */
+#define MAX_NANOSECONDS ((1L << 30) - 1)
+
+#define CASE_NAME_FORMAT "%s: msb:%x lower_bound:%x extra_bits: %x"
+
+#define LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1901-12-13 Lower bound of 32bit < 0 timestamp, no extra bits"
+#define UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE\
+ "1969-12-31 Upper bound of 32bit < 0 timestamp, no extra bits"
+#define LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "1970-01-01 Lower bound of 32bit >=0 timestamp, no extra bits"
+#define UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE\
+ "2038-01-19 Upper bound of 32bit >=0 timestamp, no extra bits"
+#define LOWER_BOUND_NEG_LO_1_CASE\
+ "2038-01-19 Lower bound of 32bit <0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NEG_LO_1_CASE\
+ "2106-02-07 Upper bound of 32bit <0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NONNEG_LO_1_CASE\
+ "2106-02-07 Lower bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define UPPER_BOUND_NONNEG_LO_1_CASE\
+ "2174-02-25 Upper bound of 32bit >=0 timestamp, lo extra sec bit on"
+#define LOWER_BOUND_NEG_HI_1_CASE\
+ "2174-02-25 Lower bound of 32bit <0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NEG_HI_1_CASE\
+ "2242-03-16 Upper bound of 32bit <0 timestamp, hi extra sec bit on"
+#define LOWER_BOUND_NONNEG_HI_1_CASE\
+ "2242-03-16 Lower bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_CASE\
+ "2310-04-04 Upper bound of 32bit >=0 timestamp, hi extra sec bit on"
+#define UPPER_BOUND_NONNEG_HI_1_NS_1_CASE\
+ "2310-04-04 Upper bound of 32bit>=0 timestamp, hi extra sec bit 1. 1 ns"
+#define LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE\
+ "2378-04-22 Lower bound of 32bit>= timestamp. Extra sec bits 1. Max ns"
+#define LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2378-04-22 Lower bound of 32bit >=0 timestamp. All extra sec bits on"
+#define UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE\
+ "2446-05-10 Upper bound of 32bit >=0 timestamp. All extra sec bits on"
+
+struct timestamp_expectation {
+ const char *test_case_name;
+ struct timespec64 expected;
+ u32 extra_bits;
+ bool msb_set;
+ bool lower_bound;
+};
+
+static time64_t get_32bit_time(const struct timestamp_expectation * const test)
+{
+ if (test->msb_set) {
+ if (test->lower_bound)
+ return LOWER_MSB_1;
+
+ return UPPER_MSB_1;
+ }
+
+ if (test->lower_bound)
+ return LOWER_MSB_0;
+ return UPPER_MSB_0;
+}
+
+
+/*
+ * Test data is derived from the table in the Inode Timestamps section of
+ * Documentation/filesystems/ext4/inodes.rst.
+ */
+static void inode_test_xtimestamp_decoding(struct kunit *test)
+{
+ const struct timestamp_expectation test_data[] = {
+ {
+ .test_case_name = LOWER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_NO_EXTRA_BITS_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = -1LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0,
+ .expected = {0LL, 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_NO_EXTRA_BITS_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 0,
+ .expected = {.tv_sec = 0x7fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x80000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_LO_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0xffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x100000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_LO_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 1,
+ .expected = {.tv_sec = 0x17fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x180000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NEG_HI_1_CASE,
+ .msb_set = true,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x1ffffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x200000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 2,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_HI_1_NS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 6,
+ .expected = {.tv_sec = 0x27fffffffLL, .tv_nsec = 1L},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_HI_1_NS_MAX_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 0xFFFFFFFF,
+ .expected = {.tv_sec = 0x300000000LL,
+ .tv_nsec = MAX_NANOSECONDS},
+ },
+
+ {
+ .test_case_name = LOWER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = true,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x300000000LL, .tv_nsec = 0L},
+ },
+
+ {
+ .test_case_name = UPPER_BOUND_NONNEG_EXTRA_BITS_1_CASE,
+ .msb_set = false,
+ .lower_bound = false,
+ .extra_bits = 3,
+ .expected = {.tv_sec = 0x37fffffffLL, .tv_nsec = 0L},
+ }
+ };
+
+ struct timespec64 timestamp;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_data); ++i) {
+ timestamp.tv_sec = get_32bit_time(&test_data[i]);
+ ext4_decode_extra_time(&timestamp,
+ cpu_to_le32(test_data[i].extra_bits));
+
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_sec,
+ timestamp.tv_sec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ KUNIT_EXPECT_EQ_MSG(test,
+ test_data[i].expected.tv_nsec,
+ timestamp.tv_nsec,
+ CASE_NAME_FORMAT,
+ test_data[i].test_case_name,
+ test_data[i].msb_set,
+ test_data[i].lower_bound,
+ test_data[i].extra_bits);
+ }
+}
+
+static struct kunit_case ext4_inode_test_cases[] = {
+ KUNIT_CASE(inode_test_xtimestamp_decoding),
+ {}
+};
+
+static struct kunit_suite ext4_inode_test_suite = {
+ .name = "ext4_inode_test",
+ .test_cases = ext4_inode_test_cases,
+};
+
+kunit_test_suite(ext4_inode_test_suite);
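For readers of the test table above: the encoding being exercised stores a signed 32-bit seconds field plus an extra 32-bit word whose low two bits extend the epoch past 2038 and whose remaining 30 bits hold nanoseconds, per Documentation/filesystems/ext4/inodes.rst. The sketch below is a standalone illustration of that decoding and is not the kernel's ext4_decode_extra_time().

/* Standalone illustration of the extended timestamp decoding. */
#include <stdio.h>
#include <stdint.h>

struct ts { int64_t sec; long nsec; };

static struct ts decode(int32_t raw_sec, uint32_t extra)
{
	struct ts t;

	/* Low two extra bits become bits 32-33 of the seconds value. */
	t.sec = (int64_t)raw_sec + ((int64_t)(extra & 0x3) << 32);
	/* Remaining 30 bits carry the nanoseconds. */
	t.nsec = extra >> 2;
	return t;
}

int main(void)
{
	/* 1901-12-13 lower bound, no extra bits. */
	struct ts a = decode(INT32_MIN, 0);
	/* Upper bound with both epoch bits set and maximum nanoseconds. */
	struct ts b = decode(INT32_MAX, 0xffffffff);

	printf("a: %lld.%09ld\n", (long long)a.sec, a.nsec);
	printf("b: %lld.%09ld\n", (long long)b.sec, b.nsec);
	return 0;
}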
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 516faa280ced..28f28de0c1b6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -164,39 +164,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * Restart the transaction associated with *handle. This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
- int nblocks)
-{
- int ret;
-
- /*
- * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
- * moment, get_block can be called only for blocks inside i_size since
- * page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
- */
- BUG_ON(EXT4_JOURNAL(inode) == NULL);
- jbd_debug(2, "restarting handle %p\n", handle);
- up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, nblocks);
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
-
- return ret;
-}
-
-/*
* Called at the last iput() if i_nlink is zero.
*/
void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
- int extra_credits = 3;
+ /*
+ * Credits for final inode cleanup and freeing:
+ * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+ * (xattr block freeing), bitmap, group descriptor (inode freeing)
+ */
+ int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode)
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+ /*
+ * Block bitmap, group descriptor, and inode are accounted in both
+ * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+extra_credits);
+ ext4_blocks_for_truncate(inode) + extra_credits - 3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -827,136 +810,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
#define DIO_MAX_BLOCKS 4096
/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int flags)
-{
- int dio_credits;
- handle_t *handle;
- int retries = 0;
- int ret;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
- bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
- dio_credits = ext4_chunk_trans_blocks(inode,
- bh_result->b_size >> inode->i_blkbits);
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = _ext4_get_block(inode, iblock, bh_result, flags);
- ext4_journal_stop(handle);
-
- if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- if (!create)
- return _ext4_get_block(inode, iblock, bh, 0);
- return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * When doing DIO using unwritten extents, we need io_end to convert
- * unwritten extents to written on IO completion. We allocate io_end
- * once we spot unwritten extent and store it in b_private. Generic
- * DIO code keeps b_private set and furthermore passes the value to
- * our completion callback in 'private' argument.
- */
- if (!ret && buffer_unwritten(bh_result)) {
- if (!bh_result->b_private) {
- ext4_io_end_t *io_end;
-
- io_end = ext4_init_io_end(inode, GFP_KERNEL);
- if (!io_end)
- return -ENOMEM;
- bh_result->b_private = io_end;
- ext4_set_io_unwritten_flag(inode, io_end);
- }
- set_buffer_defer_completion(bh_result);
- }
-
- return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
- sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int ret;
-
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = ext4_get_block_trans(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
- /*
- * Mark inode as having pending DIO writes to unwritten extents.
- * ext4_direct_IO_write() checks this flag and converts extents to
- * written.
- */
- if (!ret && buffer_unwritten(bh_result))
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
- return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
-
- ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
- inode->i_ino, create);
- /* We don't expect handle for direct IO */
- WARN_ON_ONCE(ext4_journal_current_handle());
-
- ret = _ext4_get_block(inode, iblock, bh_result, 0);
- /*
- * Blocks should have been preallocated! ext4_file_write_iter() checks
- * that.
- */
- WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
- return ret;
-}
-
-
-/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -2341,6 +2194,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
}
/*
+ * mpage_process_page - update page buffers corresponding to a changed extent
+ *                      and may submit the fully mapped page for IO
+ *
+ * @mpd - description of extent to map, on return next extent to map
+ * @m_lblk - logical block mapping.
+ * @m_pblk - corresponding physical mapping.
+ * @map_bh - determines on return whether this page requires any further
+ * mapping or not.
+ * Scan the page buffers corresponding to the changed extent and update the
+ * buffer state according to the new extent state. We map delalloc buffers
+ * to their physical location and clear the unwritten bits. If the given
+ * page is not fully mapped, we update @map to the next extent in the page
+ * that needs mapping and return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+ ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+ bool *map_bh)
+{
+ struct buffer_head *head, *bh;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ ext4_lblk_t lblk = *m_lblk;
+ ext4_fsblk_t pblock = *m_pblk;
+ int err = 0;
+ int blkbits = mpd->inode->i_blkbits;
+ ssize_t io_end_size = 0;
+ struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+ bh = head = page_buffers(page);
+ do {
+ if (lblk < mpd->map.m_lblk)
+ continue;
+ if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+ /*
+ * Buffer after end of mapped extent.
+ * Find next buffer in the page to map.
+ */
+ mpd->map.m_len = 0;
+ mpd->map.m_flags = 0;
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+
+ err = mpage_process_page_bufs(mpd, head, bh, lblk);
+ if (err > 0)
+ err = 0;
+ if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec)) {
+ err = PTR_ERR(io_end_vec);
+ goto out;
+ }
+ io_end_vec->offset = mpd->map.m_lblk << blkbits;
+ }
+ *map_bh = true;
+ goto out;
+ }
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock++;
+ }
+ clear_buffer_unwritten(bh);
+ io_end_size += (1 << blkbits);
+ } while (lblk++, (bh = bh->b_this_page) != head);
+
+ io_end_vec->size += io_end_size;
+ io_end_size = 0;
+ *map_bh = false;
+out:
+ *m_lblk = lblk;
+ *m_pblk = pblock;
+ return err;
+}
+
+/*
* mpage_map_buffers - update buffers corresponding to changed extent and
* submit fully mapped pages for IO
*
@@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
struct pagevec pvec;
int nr_pages, i;
struct inode *inode = mpd->inode;
- struct buffer_head *head, *bh;
int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
- sector_t pblock;
+ ext4_fsblk_t pblock;
int err;
+ bool map_bh = false;
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- bh = head = page_buffers(page);
- do {
- if (lblk < mpd->map.m_lblk)
- continue;
- if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
- /*
- * Buffer after end of mapped extent.
- * Find next buffer in the page to map.
- */
- mpd->map.m_len = 0;
- mpd->map.m_flags = 0;
- /*
- * FIXME: If dioread_nolock supports
- * blocksize < pagesize, we need to make
- * sure we add size mapped so far to
- * io_end->size as the following call
- * can submit the page for IO.
- */
- err = mpage_process_page_bufs(mpd, head,
- bh, lblk);
- pagevec_release(&pvec);
- if (err > 0)
- err = 0;
- return err;
- }
- if (buffer_delay(bh)) {
- clear_buffer_delay(bh);
- bh->b_blocknr = pblock++;
- }
- clear_buffer_unwritten(bh);
- } while (lblk++, (bh = bh->b_this_page) != head);
-
+ err = mpage_process_page(mpd, page, &lblk, &pblock,
+ &map_bh);
/*
- * FIXME: This is going to break if dioread_nolock
- * supports blocksize < pagesize as we will try to
- * convert potentially unmapped parts of inode.
+			 * If map_bh is true, the page may require further bh
+			 * mapping, or the page may already have been submitted
+			 * for IO. Return so that the next extent can be mapped.
*/
- mpd->io_submit.io_end->size += PAGE_SIZE;
+ if (err < 0 || map_bh == true)
+ goto out;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
- if (err < 0) {
- pagevec_release(&pvec);
- return err;
- }
+ if (err < 0)
+ goto out;
}
pagevec_release(&pvec);
}
@@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
mpd->map.m_len = 0;
mpd->map.m_flags = 0;
return 0;
+out:
+ pagevec_release(&pvec);
+ return err;
}
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle,
int err;
loff_t disksize;
int progress = 0;
+ ext4_io_end_t *io_end = mpd->io_submit.io_end;
+ struct ext4_io_end_vec *io_end_vec;
- mpd->io_submit.io_end->offset =
- ((loff_t)map->m_lblk) << inode->i_blkbits;
+ io_end_vec = ext4_alloc_io_end_vec(io_end);
+ if (IS_ERR(io_end_vec))
+ return PTR_ERR(io_end_vec);
+ io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
do {
err = mpage_map_one_extent(handle, mpd);
if (err < 0) {
@@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode->i_state & I_DIRTY_DATASYNC;
}
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+ struct ext4_map_blocks *map, loff_t offset,
+ loff_t length)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned int blkbits = inode->i_blkbits;
- unsigned long first_block, last_block;
- struct ext4_map_blocks map;
- bool delalloc = false;
- int ret;
-
- if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
- return -EINVAL;
- first_block = offset >> blkbits;
- last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
- EXT4_MAX_LOGICAL_BLOCK);
-
- if (flags & IOMAP_REPORT) {
- if (ext4_has_inline_data(inode)) {
- ret = ext4_inline_data_iomap(inode, iomap);
- if (ret != -EAGAIN) {
- if (ret == 0 && offset >= iomap->length)
- ret = -ENOENT;
- return ret;
- }
- }
- } else {
- if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
- return -ERANGE;
- }
-
- map.m_lblk = first_block;
- map.m_len = last_block - first_block + 1;
-
- if (flags & IOMAP_REPORT) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
-
- if (ret == 0) {
- ext4_lblk_t end = map.m_lblk + map.m_len - 1;
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map.m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end) {
- /* entire range is a hole */
- } else if (es.es_lblk > map.m_lblk) {
- /* range starts with a hole */
- map.m_len = es.es_lblk - map.m_lblk;
- } else {
- ext4_lblk_t offs = 0;
-
- if (es.es_lblk < map.m_lblk)
- offs = map.m_lblk - es.es_lblk;
- map.m_lblk = es.es_lblk + offs;
- map.m_len = es.es_len - offs;
- delalloc = true;
- }
- }
- } else if (flags & IOMAP_WRITE) {
- int dio_credits;
- handle_t *handle;
- int retries = 0;
-
- /* Trim mapping request to maximum we can map at once for DIO */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
- /*
- * Either we allocate blocks and then we don't get unwritten
- * extent so we have reserved enough credits, or the blocks
- * are already allocated and unwritten and in that case
- * extent conversion fits in the credits as well.
- */
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
-
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_ZERO);
- if (ret < 0) {
- ext4_journal_stop(handle);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
- return ret;
- }
-
- /*
- * If we added blocks beyond i_size, we need to make sure they
- * will get truncated if we crash before updating i_size in
- * ext4_iomap_end(). For faults we don't need to do that (and
- * even cannot because for orphan list operations inode_lock is
- * required) - if we happen to instantiate block beyond i_size,
- * it is because we race with truncate which has already added
- * the inode to the orphan list.
- */
- if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
- (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
- int err;
-
- err = ext4_orphan_add(handle, inode);
- if (err < 0) {
- ext4_journal_stop(handle);
- return err;
- }
- }
- ext4_journal_stop(handle);
- } else {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0)
- return ret;
- }
+ u8 blkbits = inode->i_blkbits;
+ /*
+ * Writes that span EOF might trigger an I/O size update on completion,
+ * so consider them to be dirty for the purpose of O_DSYNC, even if
+	 * there are no other metadata changes being made or pending.
+ */
iomap->flags = 0;
- if (ext4_inode_datasync_dirty(inode))
+ if (ext4_inode_datasync_dirty(inode) ||
+ offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
+
+ if (map->m_flags & EXT4_MAP_NEW)
+ iomap->flags |= IOMAP_F_NEW;
+
iomap->bdev = inode->i_sb->s_bdev;
- iomap->dax_dev = sbi->s_daxdev;
- iomap->offset = (u64)first_block << blkbits;
- iomap->length = (u64)map.m_len << blkbits;
+ iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+ iomap->offset = (u64) map->m_lblk << blkbits;
+ iomap->length = (u64) map->m_len << blkbits;
- if (ret == 0) {
- iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
+ /*
+ * Flags passed to ext4_map_blocks() for direct I/O writes can result
+ * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
+ * set. In order for any allocated unwritten extents to be converted
+ * into written extents correctly within the ->end_io() handler, we
+	 * need to ensure that the iomap->type is set appropriately. Hence, the
+	 * EXT4_MAP_UNWRITTEN bit is checked first.
+ */
+ if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->addr = (u64) map->m_pblk << blkbits;
+ } else if (map->m_flags & EXT4_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (u64) map->m_pblk << blkbits;
} else {
- if (map.m_flags & EXT4_MAP_MAPPED) {
- iomap->type = IOMAP_MAPPED;
- } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- iomap->type = IOMAP_UNWRITTEN;
- } else {
- WARN_ON_ONCE(1);
- return -EIO;
- }
- iomap->addr = (u64)map.m_pblk << blkbits;
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
}
-
- if (map.m_flags & EXT4_MAP_NEW)
- iomap->flags |= IOMAP_F_NEW;
-
- return 0;
}
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
- ssize_t written, unsigned flags, struct iomap *iomap)
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int flags)
{
- int ret = 0;
handle_t *handle;
- int blkbits = inode->i_blkbits;
- bool truncate = false;
-
- if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
- return 0;
+ u8 blkbits = inode->i_blkbits;
+ int ret, dio_credits, m_flags = 0, retries = 0;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto orphan_del;
- }
- if (ext4_update_inode_size(inode, offset + written))
- ext4_mark_inode_dirty(handle, inode);
/*
- * We may need to truncate allocated but not written blocks beyond EOF.
+ * Trim the mapping request to the maximum value that we can map at
+ * once for direct I/O.
*/
- if (iomap->offset + iomap->length >
- ALIGN(inode->i_size, 1 << blkbits)) {
- ext4_lblk_t written_blk, end_blk;
+ if (map->m_len > DIO_MAX_BLOCKS)
+ map->m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
- written_blk = (offset + written) >> blkbits;
- end_blk = (offset + length) >> blkbits;
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
- }
+retry:
/*
- * Remove inode from orphan list if we were extending a inode and
- * everything went fine.
+	 * Either we allocate blocks and then don't get an unwritten extent, in
+	 * which case we have reserved enough credits; or the blocks are already
+	 * allocated and unwritten, in which case the extent conversion fits
+	 * into the credits as well.
*/
- if (!truncate && inode->i_nlink &&
- !list_empty(&EXT4_I(inode)->i_orphan))
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- if (truncate) {
- ext4_truncate_failed_write(inode);
-orphan_del:
- /*
- * If truncate failed early the inode might still be on the
- * orphan list; we need to make sure the inode is removed from
- * the orphan list in that case.
- */
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
- }
- return ret;
-}
-
-const struct iomap_ops ext4_iomap_ops = {
- .iomap_begin = ext4_iomap_begin,
- .iomap_end = ext4_iomap_end,
-};
-
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
- ssize_t size, void *private)
-{
- ext4_io_end_t *io_end = private;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- /* if not async direct IO just return */
- if (!io_end)
- return 0;
+ /*
+ * DAX and direct I/O are the only two operations that are currently
+ * supported with IOMAP_WRITE.
+ */
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+ if (IS_DAX(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+ /*
+ * We use i_size instead of i_disksize here because delalloc writeback
+ * can complete at any point during the I/O and subsequently push the
+ * i_disksize out to i_size. This could be beyond where direct I/O is
+ * happening and thus expose allocated blocks to direct I/O reads.
+ */
+ else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE;
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ext_debug("ext4_end_io_dio(): io_end 0x%p "
- "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- io_end, io_end->inode->i_ino, iocb, offset, size);
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
- * Error during AIO DIO. We cannot convert unwritten extents as the
- * data was not written. Just clear the unwritten flag and drop io_end.
+ * We cannot fill holes in indirect tree based inodes as that could
+ * expose stale data in the case of a crash. Use the magic error code
+	 * to fall back to buffered I/O.
*/
- if (size <= 0) {
- ext4_clear_io_unwritten_flag(io_end);
- size = 0;
- }
- io_end->offset = offset;
- io_end->size = size;
- ext4_put_io_end(io_end);
+ if (!m_flags && !ret)
+ ret = -ENOTBLK;
- return 0;
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ return ret;
}
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list. So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct ext4_inode_info *ei = EXT4_I(inode);
- ssize_t ret;
- loff_t offset = iocb->ki_pos;
- size_t count = iov_iter_count(iter);
- int overwrite = 0;
- get_block_t *get_block_func = NULL;
- int dio_flags = 0;
- loff_t final_size = offset + count;
- int orphan = 0;
- handle_t *handle;
+ int ret;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
- if (final_size > inode->i_size || final_size > ei->i_disksize) {
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
- orphan = 1;
- ext4_update_i_disksize(inode, inode->i_size);
- ext4_journal_stop(handle);
- }
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
- BUG_ON(iocb->private == NULL);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
/*
- * Make all waiters for direct IO properly wait also for extent
- * conversion. This also disallows race between truncate() and
- * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+	 * Calculate the first and last logical blocks of the request.
*/
- inode_dio_begin(inode);
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+ if (flags & IOMAP_WRITE)
+ ret = ext4_iomap_alloc(inode, &map, flags);
+ else
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+ if (ret < 0)
+ return ret;
- /* If we do a overwrite dio, i_mutex locking can be released */
- overwrite = *((int *)iocb->private);
+ ext4_set_iomap(inode, iomap, &map, offset, length);
- if (overwrite)
- inode_unlock(inode);
+ return 0;
+}
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+ ssize_t written, unsigned flags, struct iomap *iomap)
+{
/*
- * For extent mapped files we could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as unwritten to prevent
- * parallel buffered read to expose the stale data before DIO complete
- * the data IO.
- *
- * As to previously fallocated extents, ext4 get_block will just simply
- * mark the buffer mapped but still keep the extents unwritten.
- *
- * For non AIO case, we will convert those unwritten extents to written
- * after return back from blockdev_direct_IO. That way we save us from
- * allocating io_end structure and also the overhead of offloading
- * the extent convertion to a workqueue.
- *
- * For async DIO, the conversion needs to be deferred when the
- * IO is completed. The ext4 end_io callback function will be
- * called to take care of the conversion work. Here for async
- * case, we allocate an io_end structure to hook to the iocb.
+ * Check to see whether an error occurred while writing out the data to
+ * the allocated blocks. If so, return the magic error code so that we
+ * fallback to buffered I/O and attempt to complete the remainder of
+ * the I/O. Any blocks that may have been allocated in preparation for
+ * the direct I/O will be reused during buffered I/O.
*/
- iocb->private = NULL;
- if (overwrite)
- get_block_func = ext4_dio_get_block_overwrite;
- else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
- round_down(offset, i_blocksize(inode)) >= inode->i_size) {
- get_block_func = ext4_dio_get_block;
- dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
- } else if (is_sync_kiocb(iocb)) {
- get_block_func = ext4_dio_get_block_unwritten_sync;
- dio_flags = DIO_LOCKING;
- } else {
- get_block_func = ext4_dio_get_block_unwritten_async;
- dio_flags = DIO_LOCKING;
- }
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- get_block_func, ext4_end_io_dio, NULL,
- dio_flags);
+ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ return -ENOTBLK;
- if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(NULL, inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
+ return 0;
+}
- inode_dio_end(inode);
- /* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite)
- inode_lock(inode);
+const struct iomap_ops ext4_iomap_ops = {
+ .iomap_begin = ext4_iomap_begin,
+ .iomap_end = ext4_iomap_end,
+};
- if (ret < 0 && final_size > inode->i_size)
- ext4_truncate_failed_write(inode);
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ struct extent_status es;
+ ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
- /* Handle extending of i_size after direct IO write */
- if (orphan) {
- int err;
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+ map->m_lblk, end, &es);
- /* Credits for sb + inode write */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- /*
- * We wrote the data but cannot extend
- * i_size. Bail out. In async io case, we do
- * not return error here because we have
- * already submmitted the corresponding
- * bio. Returning error here makes the caller
- * think that this IO is done and failed
- * resulting in race with bio's completion
- * handler.
- */
- if (!ret)
- ret = PTR_ERR(handle);
- if (inode->i_nlink)
- ext4_orphan_del(NULL, inode);
+ if (!es.es_len || es.es_lblk > end)
+ return false;
- goto out;
- }
- if (inode->i_nlink)
- ext4_orphan_del(handle, inode);
- if (ret > 0) {
- loff_t end = offset + ret;
- if (end > inode->i_size || end > ei->i_disksize) {
- ext4_update_i_disksize(inode, end);
- if (end > inode->i_size)
- i_size_write(inode, end);
- /*
- * We're going to return a positive `ret'
- * here due to non-zero-length I/O, so there's
- * no way of reporting error returns from
- * ext4_mark_inode_dirty() to userspace. So
- * ignore it.
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- }
- err = ext4_journal_stop(handle);
- if (ret == 0)
- ret = err;
+ if (es.es_lblk > map->m_lblk) {
+ map->m_len = es.es_lblk - map->m_lblk;
+ return false;
}
-out:
- return ret;
-}
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct address_space *mapping = iocb->ki_filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
- ssize_t ret;
+ offset = map->m_lblk - es.es_lblk;
+ map->m_len = es.es_len - offset;
- /*
- * Shared inode_lock is enough for us - it protects against concurrent
- * writes & truncates and since we take care of writing back page cache,
- * we are protected against page writeback as well.
- */
- inode_lock_shared(inode);
- ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1);
- if (ret)
- goto out_unlock;
- ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
- inode_unlock_shared(inode);
- return ret;
+ return true;
}
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- size_t count = iov_iter_count(iter);
- loff_t offset = iocb->ki_pos;
- ssize_t ret;
+ int ret;
+ bool delalloc = false;
+ struct ext4_map_blocks map;
+ u8 blkbits = inode->i_blkbits;
-#ifdef CONFIG_FS_ENCRYPTION
- if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
- return 0;
-#endif
- if (fsverity_active(inode))
- return 0;
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ if (ext4_has_inline_data(inode)) {
+ ret = ext4_inline_data_iomap(inode, iomap);
+ if (ret != -EAGAIN) {
+ if (ret == 0 && offset >= iomap->length)
+ ret = -ENOENT;
+ return ret;
+ }
+ }
/*
- * If we are doing data journalling we don't support O_DIRECT
+	 * Calculate the first and last logical blocks of the request.
*/
- if (ext4_should_journal_data(inode))
- return 0;
+ map.m_lblk = offset >> blkbits;
+ map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
- /* Let buffer I/O handle the inline data case. */
- if (ext4_has_inline_data(inode))
- return 0;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ delalloc = ext4_iomap_is_delalloc(inode, &map);
- trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == READ)
- ret = ext4_direct_IO_read(iocb, iter);
- else
- ret = ext4_direct_IO_write(iocb, iter);
- trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
- return ret;
+ ext4_set_iomap(inode, iomap, &map, offset, length);
+ if (delalloc && iomap->type == IOMAP_HOLE)
+ iomap->type = IOMAP_DELALLOC;
+
+ return 0;
}
+const struct iomap_ops ext4_iomap_report_ops = {
+ .iomap_begin = ext4_iomap_begin_report,
+};
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_journalled_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
@@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = {
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
+ .direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_SIZE ==
- * blocksize case
+ * If the page is fully truncated, we don't need to wait for any commit
+	 * (and should not, as __ext4_journalled_invalidatepage() may strip all
+	 * buffers from the page but keep the page dirty, which can then confuse
+	 * e.g. a concurrent ext4_writepage() seeing a dirty page without
+	 * buffers). We also don't need to wait for any commit if all buffers in
+	 * the page remain valid. This is most beneficial for the common case of
+	 * blocksize == PAGE_SIZE.
*/
- if (offset > PAGE_SIZE - i_blocksize(inode))
+ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
@@ -5717,12 +5385,15 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (flags & EXT4_NODUMP_FL)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (flags & EXT4_VERITY_FL)
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_COMPRESSED |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
- STATX_ATTR_NODUMP);
+ STATX_ATTR_NODUMP |
+ STATX_ATTR_VERITY);
generic_fillattr(inode, stat);
return 0;
@@ -5912,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -5965,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
* If this is felt to be critical, then e2fsck should be run to
* force a large enough s_min_extra_isize.
*/
- if (ext4_handle_valid(handle) &&
- jbd2_journal_extend(handle,
- EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+ if (ext4_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
return -ENOSPC;
if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
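Taken together, the ext4_set_iomap()/ext4_iomap_alloc() split above replaces the old get_block-based direct I/O path with a single place that translates ext4_map_blocks() results into iomap types. A minimal stand-alone sketch of that flag-to-type decision (the flag bits and enum here are simplified stand-ins, not the kernel ABI):

    #include <stdio.h>

    enum iomap_type { IOMAP_HOLE, IOMAP_MAPPED, IOMAP_UNWRITTEN };

    #define MAP_MAPPED    0x1u
    #define MAP_UNWRITTEN 0x2u

    /*
     * Unwritten is tested before mapped: direct I/O writes can set both bits,
     * and the ->end_io() handler only converts extents when the type is
     * IOMAP_UNWRITTEN.
     */
    static enum iomap_type classify(unsigned int m_flags)
    {
            if (m_flags & MAP_UNWRITTEN)
                    return IOMAP_UNWRITTEN;
            if (m_flags & MAP_MAPPED)
                    return IOMAP_MAPPED;
            return IOMAP_HOLE;
    }

    int main(void)
    {
            printf("%d\n", classify(MAP_MAPPED | MAP_UNWRITTEN)); /* prints 2 */
            return 0;
    }

The ordering is the point of the comment in ext4_set_iomap() above; the report-only path then only has to upgrade IOMAP_HOLE to IOMAP_DELALLOC when the extent-status cache says the range is delayed.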
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0b7f316fd30f..e8870fff8224 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1360,6 +1360,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
case EXT4_IOC_MOVE_EXT:
case EXT4_IOC_RESIZE_FS:
+ case FITRIM:
case EXT4_IOC_PRECACHE_EXTENTS:
case EXT4_IOC_SET_ENCRYPTION_POLICY:
case EXT4_IOC_GET_ENCRYPTION_PWSALT:
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b1e4d359f73b..89725fa42573 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -50,29 +50,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
needed = ext4_ext_calc_credits_for_single_extent(inode,
lb->last_block - lb->first_block + 1, path);
- /*
- * Make sure the credit we accumalated is not really high
- */
- if (needed && ext4_handle_has_enough_credits(handle,
- EXT4_RESERVE_TRANS_BLOCKS)) {
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- } else if (needed) {
- retval = ext4_journal_extend(handle, needed);
- if (retval) {
- /*
- * IF not able to extend the journal restart the journal
- */
- up_write((&EXT4_I(inode)->i_data_sem));
- retval = ext4_journal_restart(handle, needed);
- down_write((&EXT4_I(inode)->i_data_sem));
- if (retval)
- goto err_out;
- }
- }
+ retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
+ if (retval < 0)
+ goto err_out;
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
err_out:
up_write((&EXT4_I(inode)->i_data_sem));
@@ -196,42 +176,30 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
}
-static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
-{
- int retval = 0, needed;
-
- if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
- return 0;
- /*
- * We are freeing a blocks. During this we touch
- * superblock, group descriptor and block bitmap.
- * So allocate a credit of 3. We may update
- * quota (user and group).
- */
- needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
-
- if (ext4_journal_extend(handle, needed) != 0)
- retval = ext4_journal_restart(handle, needed);
-
- return retval;
-}
-
static int free_dind_blocks(handle_t *handle,
struct inode *inode, __le32 i_data)
{
int i;
__le32 *tmp_idata;
struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
+ int err;
- bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+ bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
tmp_idata = (__le32 *)bh->b_data;
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0) {
+ put_bh(bh);
+ return err;
+ }
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(tmp_idata[i]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -239,7 +207,10 @@ static int free_dind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(sb, 1));
+ if (err < 0)
+ return err;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -270,7 +241,10 @@ static int free_tind_blocks(handle_t *handle,
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
@@ -283,7 +257,11 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle,
+ EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL,
le32_to_cpu(i_data[0]), 1,
EXT4_FREE_BLOCKS_METADATA |
@@ -318,12 +296,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* One credit accounted for writing the
* i_data field of the original inode
*/
- retval = ext4_journal_extend(handle, 1);
- if (retval) {
- retval = ext4_journal_restart(handle, 1);
- if (retval)
- goto err_out;
- }
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto err_out;
i_data[0] = ei->i_data[EXT4_IND_BLOCK];
i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
@@ -391,15 +366,20 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
ix = EXT_FIRST_INDEX(eh);
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) {
retval = free_ext_idx(handle, inode, ix);
- if (retval)
- break;
+ if (retval) {
+ put_bh(bh);
+ return retval;
+ }
}
}
put_bh(bh);
- extend_credit_for_blkdel(handle, inode);
+ retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (retval < 0)
+ return retval;
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
- return retval;
+ return 0;
}
/*
@@ -574,9 +554,9 @@ err_out:
}
/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
- if (ext4_journal_extend(handle, 1) != 0)
- ext4_journal_restart(handle, 1);
-
+ retval = ext4_journal_ensure_credits(handle, 1, 0);
+ if (retval < 0)
+ goto out_stop;
/*
* Mark the tmp_inode as of size zero
*/
@@ -594,6 +574,7 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
+out_stop:
ext4_journal_stop(handle);
out:
unlock_new_inode(tmp_inode);
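The migrate.c hunks above collapse several open-coded "extend the handle or restart the transaction" sequences into ext4_journal_ensure_credits() calls. A toy model of that pattern, with stand-in journal helpers rather than the real jbd2 API, might look like the following; the return-value convention (<0 error, 0 nothing to do or extended in place, >0 restarted) is the part the callers rely on.

    #include <stdio.h>

    struct handle { int credits; };

    /* Stand-in for jbd2: 0 = extended in place, 1 = no room in the transaction. */
    static int journal_extend(struct handle *h, int more)
    {
            if (h->credits + more > 64)
                    return 1;
            h->credits += more;
            return 0;
    }

    static int journal_restart(struct handle *h, int credits)
    {
            h->credits = credits;   /* a real restart also commits the transaction */
            return 0;
    }

    /*
     * Return <0 on error, 0 if enough credits were already present or the
     * handle could be extended in place, and >0 if the handle had to be
     * restarted, in which case callers may need to re-take locks or
     * re-dirty buffers.
     */
    static int ensure_credits(struct handle *h, int check, int restart_with)
    {
            int err;

            if (h->credits >= check)
                    return 0;
            err = journal_extend(h, restart_with - h->credits);
            if (err <= 0)
                    return err;
            err = journal_restart(h, restart_with);
            return err < 0 ? err : 1;
    }

    int main(void)
    {
            struct handle h = { .credits = 1 };

            printf("%d\n", ensure_credits(&h, 8, 16));  /* extended: prints 0 */
            printf("%d\n", ensure_credits(&h, 80, 80)); /* restarted: prints 1 */
            return 0;
    }

This is also why the converted callers above now test for `retval < 0` rather than `retval != 0`: a positive return is not an error, only a signal that the transaction was restarted.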
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a427d2031a8d..a856997d87b5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2547,18 +2547,29 @@ static void ext4_dec_count(handle_t *handle, struct inode *inode)
}
+/*
+ * Add a non-directory inode to a directory. On success, the inode reference
+ * is consumed by the dentry instantiation; this is also indicated by clearing
+ * of the *inodep pointer. On failure, the caller is responsible for dropping
+ * the inode reference in a safe context.
+ */
static int ext4_add_nondir(handle_t *handle,
- struct dentry *dentry, struct inode *inode)
+ struct dentry *dentry, struct inode **inodep)
{
+ struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
ext4_mark_inode_dirty(handle, inode);
+ if (IS_DIRSYNC(dir))
+ ext4_handle_sync(handle);
d_instantiate_new(dentry, inode);
+ *inodep = NULL;
return 0;
}
drop_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
- iput(inode);
return err;
}
@@ -2592,12 +2603,12 @@ retry:
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2624,12 +2635,12 @@ retry:
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &ext4_special_inode_operations;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
+ err = ext4_add_nondir(handle, dentry, &inode);
}
if (handle)
ext4_journal_stop(handle);
+ if (!IS_ERR_OR_NULL(inode))
+ iput(inode);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -2779,10 +2790,12 @@ retry:
if (err) {
out_clear_inode:
clear_nlink(inode);
+ ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
iput(inode);
- goto out_stop;
+ goto out_retry;
}
ext4_inc_count(handle, dir);
ext4_update_dx_flag(dir);
@@ -2796,6 +2809,7 @@ out_clear_inode:
out_stop:
if (handle)
ext4_journal_stop(handle);
+out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
@@ -3182,18 +3196,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (inode->i_nlink == 0) {
- ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
- set_nlink(inode, 1);
- }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
+ else
+ drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
@@ -3328,12 +3341,11 @@ static int ext4_symlink(struct inode *dir,
inode->i_size = disk_link.len - 1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
- err = ext4_add_nondir(handle, dentry, inode);
- if (!err && IS_DIRSYNC(dir))
- ext4_handle_sync(handle);
-
+ err = ext4_add_nondir(handle, dentry, &inode);
if (handle)
ext4_journal_stop(handle);
+ if (inode)
+ iput(inode);
goto out_free_encrypted_link;
err_drop_inode:
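The ext4_add_nondir() rework above changes who owns the last inode reference on the error path: the helper no longer calls iput() itself, and every caller now drops whatever is left in `inode` only after ext4_journal_stop(). A toy, user-space model of that convention (types and names here are illustrative only):

    #include <stdio.h>
    #include <stdlib.h>

    struct inode { int count; };

    static void iput(struct inode *inode)
    {
            if (inode && --inode->count == 0)
                    free(inode);
    }

    /*
     * On success the reference is handed to the (not modelled) dentry and
     * *inodep is cleared; on failure *inodep stays set so the caller can
     * iput() it after the journal handle has been stopped.
     */
    static int add_nondir(struct inode **inodep, int simulate_failure)
    {
            if (simulate_failure)
                    return -1;
            *inodep = NULL;         /* reference now owned by the dentry */
            return 0;
    }

    int main(void)
    {
            struct inode *inode = calloc(1, sizeof(*inode));
            int err;

            if (!inode)
                    return 1;
            inode->count = 1;
            err = add_nondir(&inode, 1);
            /* ...the journal handle would be stopped here... */
            if (inode)
                    iput(inode);    /* only the failure path still holds it */
            return err ? 1 : 0;
    }

Moving the final iput() out of the helper keeps the inode teardown outside the running journal handle.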
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 12ceadef32c5..24aeedb8fc75 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,18 +31,56 @@
#include "acl.h"
static struct kmem_cache *io_end_cachep;
+static struct kmem_cache *io_end_vec_cachep;
int __init ext4_init_pageio(void)
{
io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT);
if (io_end_cachep == NULL)
return -ENOMEM;
+
+ io_end_vec_cachep = KMEM_CACHE(ext4_io_end_vec, 0);
+ if (io_end_vec_cachep == NULL) {
+ kmem_cache_destroy(io_end_cachep);
+ return -ENOMEM;
+ }
return 0;
}
void ext4_exit_pageio(void)
{
kmem_cache_destroy(io_end_cachep);
+ kmem_cache_destroy(io_end_vec_cachep);
+}
+
+struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec;
+
+ io_end_vec = kmem_cache_zalloc(io_end_vec_cachep, GFP_NOFS);
+ if (!io_end_vec)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&io_end_vec->list);
+ list_add_tail(&io_end_vec->list, &io_end->list_vec);
+ return io_end_vec;
+}
+
+static void ext4_free_io_end_vec(ext4_io_end_t *io_end)
+{
+ struct ext4_io_end_vec *io_end_vec, *tmp;
+
+ if (list_empty(&io_end->list_vec))
+ return;
+ list_for_each_entry_safe(io_end_vec, tmp, &io_end->list_vec, list) {
+ list_del(&io_end_vec->list);
+ kmem_cache_free(io_end_vec_cachep, io_end_vec);
+ }
+}
+
+struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end)
+{
+ BUG_ON(list_empty(&io_end->list_vec));
+ return list_last_entry(&io_end->list_vec, struct ext4_io_end_vec, list);
}
/*
@@ -125,6 +163,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
ext4_finish_bio(bio);
bio_put(bio);
}
+ ext4_free_io_end_vec(io_end);
kmem_cache_free(io_end_cachep, io_end);
}
@@ -136,29 +175,26 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
* cannot get to ext4_ext_truncate() before all IOs overlapping that range are
* completed (happens from ext4_free_ioend()).
*/
-static int ext4_end_io(ext4_io_end_t *io)
+static int ext4_end_io_end(ext4_io_end_t *io_end)
{
- struct inode *inode = io->inode;
- loff_t offset = io->offset;
- ssize_t size = io->size;
- handle_t *handle = io->handle;
+ struct inode *inode = io_end->inode;
+ handle_t *handle = io_end->handle;
int ret = 0;
- ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
- io, inode->i_ino, io->list.next, io->list.prev);
+ io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
+ io_end->handle = NULL; /* Following call will use up the handle */
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
- "(inode %lu, offset %llu, size %zd, error %d)",
- inode->i_ino, offset, size, ret);
+ "(inode %lu, error %d)", inode->i_ino, ret);
}
- ext4_clear_io_unwritten_flag(io);
- ext4_release_io_end(io);
+ ext4_clear_io_unwritten_flag(io_end);
+ ext4_release_io_end(io_end);
return ret;
}
@@ -166,21 +202,21 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
+ ext4_io_end_t *io_end, *io_end0, *io_end1;
if (list_empty(head))
return;
ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
- list_for_each_entry(io, head, list) {
- cur = &io->list;
+ list_for_each_entry(io_end, head, list) {
+ cur = &io_end->list;
before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
+ io_end0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
+ io_end1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
+ io_end, inode->i_ino, io_end0, io_end1);
}
#endif
}
@@ -207,7 +243,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
static int ext4_do_flush_completed_IO(struct inode *inode,
struct list_head *head)
{
- ext4_io_end_t *io;
+ ext4_io_end_t *io_end;
struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -219,11 +255,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
while (!list_empty(&unwritten)) {
- io = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
- list_del_init(&io->list);
+ io_end = list_entry(unwritten.next, ext4_io_end_t, list);
+ BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ list_del_init(&io_end->list);
- err = ext4_end_io(io);
+ err = ext4_end_io_end(io_end);
if (unlikely(!ret && err))
ret = err;
}
@@ -242,19 +278,22 @@ void ext4_end_io_rsv_work(struct work_struct *work)
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
- ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
- if (io) {
- io->inode = inode;
- INIT_LIST_HEAD(&io->list);
- atomic_set(&io->count, 1);
+ ext4_io_end_t *io_end = kmem_cache_zalloc(io_end_cachep, flags);
+
+ if (io_end) {
+ io_end->inode = inode;
+ INIT_LIST_HEAD(&io_end->list);
+ INIT_LIST_HEAD(&io_end->list_vec);
+ atomic_set(&io_end->count, 1);
}
- return io;
+ return io_end;
}
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (atomic_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
+ list_empty(&io_end->list_vec)) {
ext4_release_io_end(io_end);
return;
}
@@ -268,9 +307,8 @@ int ext4_put_io_end(ext4_io_end_t *io_end)
if (atomic_dec_and_test(&io_end->count)) {
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_extents(io_end->handle,
- io_end->inode, io_end->offset,
- io_end->size);
+ err = ext4_convert_unwritten_io_end_vec(io_end->handle,
+ io_end);
io_end->handle = NULL;
ext4_clear_io_unwritten_flag(io_end);
}
@@ -307,10 +345,8 @@ static void ext4_end_bio(struct bio *bio)
struct inode *inode = io_end->inode;
ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
+ "starting block %llu)",
bio->bi_status, inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
mapping_set_error(inode->i_mapping,
@@ -358,14 +394,16 @@ void ext4_io_submit_init(struct ext4_io_submit *io,
io->io_end = NULL;
}
-static int io_submit_init_bio(struct ext4_io_submit *io,
- struct buffer_head *bh)
+static void io_submit_init_bio(struct ext4_io_submit *io,
+ struct buffer_head *bh)
{
struct bio *bio;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- if (!bio)
- return -ENOMEM;
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
@@ -373,13 +411,12 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
wbc_init_bio(io->io_wbc, bio);
- return 0;
}
-static int io_submit_add_bh(struct ext4_io_submit *io,
- struct inode *inode,
- struct page *page,
- struct buffer_head *bh)
+static void io_submit_add_bh(struct ext4_io_submit *io,
+ struct inode *inode,
+ struct page *page,
+ struct buffer_head *bh)
{
int ret;
@@ -388,9 +425,7 @@ submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
- ret = io_submit_init_bio(io, bh);
- if (ret)
- return ret;
+ io_submit_init_bio(io, bh);
io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
@@ -398,7 +433,6 @@ submit_and_retry:
goto submit_and_retry;
wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
io->io_next_block++;
- return 0;
}
int ext4_bio_write_page(struct ext4_io_submit *io,
@@ -491,8 +525,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
- bounce_page = NULL;
- goto out;
+
+ printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
+ redirty_page_for_writepage(wbc, page);
+ do {
+ clear_buffer_async_write(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ goto unlock;
}
}
@@ -500,30 +540,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
- if (ret) {
- /*
- * We only get here on ENOMEM. Not much else
- * we can do but mark the page as dirty, and
- * better luck next time.
- */
- break;
- }
+ io_submit_add_bh(io, inode,
+ bounce_page ? bounce_page : page, bh);
nr_submitted++;
clear_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- /* Error stopped previous loop? Clean up buffers... */
- if (ret) {
- out:
- fscrypt_free_bounce_page(bounce_page);
- printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
- redirty_page_for_writepage(wbc, page);
- do {
- clear_buffer_async_write(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
+unlock:
unlock_page(page);
/* Nothing submitted - we have to end page writeback */
if (!nr_submitted)
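The page-io.c changes above replace the single offset/size pair on an io_end with a list of ext4_io_end_vec entries, so one writeback submission can record several discontiguous ranges that later need unwritten-extent conversion. A minimal user-space analogue of that bookkeeping (toy types, not ext4's) could look like this:

    #include <stdio.h>
    #include <stdlib.h>

    struct io_vec {
            long long offset;
            long long size;
            struct io_vec *next;
    };

    struct io_end {
            struct io_vec *head, *tail;     /* ranges to convert at completion */
    };

    /* Start a new range; plays the role of ext4_alloc_io_end_vec(). */
    static struct io_vec *alloc_io_vec(struct io_end *end, long long offset)
    {
            struct io_vec *vec = calloc(1, sizeof(*vec));

            if (!vec)
                    return NULL;
            vec->offset = offset;
            if (end->tail)
                    end->tail->next = vec;
            else
                    end->head = vec;
            end->tail = vec;
            return vec;
    }

    int main(void)
    {
            struct io_end end = { 0 };
            struct io_vec *vec = alloc_io_vec(&end, 0);

            if (!vec)
                    return 1;
            vec->size += 4096;                      /* grow the current range */
            vec = alloc_io_vec(&end, 1 << 20);      /* start a discontiguous one */
            if (!vec)
                    return 1;
            vec->size += 8192;
            for (vec = end.head; vec; vec = vec->next)
                    printf("convert [%lld, +%lld)\n", vec->offset, vec->size);
            while (end.head) {
                    vec = end.head;
                    end.head = vec->next;
                    free(vec);
            }
            return 0;
    }

In the kernel the list lives on the io_end and is handed to ext4_convert_unwritten_io_end_vec() at completion time, which appears to be what allows the dioread_nolock block-size restriction (removed later in this patch) to be lifted.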
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a30b203fa461..fef7755300c3 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -360,10 +360,12 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (bio == NULL) {
struct bio_post_read_ctx *ctx;
+ /*
+ * bio_alloc will _always_ be able to allocate a bio if
+ * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
+ */
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio)
- goto set_error_page;
ctx = get_bio_post_read_ctx(inode, bio, page->index);
if (IS_ERR(ctx)) {
bio_put(bio);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c0e9aef376a7..a8c0f2b5b6e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -388,28 +388,10 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
return bh;
}
-/*
- * If we have fewer than thresh credits, extend by EXT4_MAX_TRANS_DATA.
- * If that fails, restart the transaction & regain write access for the
- * buffer head which is used for block_bitmap modifications.
- */
-static int extend_or_restart_transaction(handle_t *handle, int thresh)
+static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits)
{
- int err;
-
- if (ext4_handle_has_enough_credits(handle, thresh))
- return 0;
-
- err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
- if (err < 0)
- return err;
- if (err) {
- err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
- if (err)
- return err;
- }
-
- return 0;
+ return ext4_journal_ensure_credits_fn(handle, credits,
+ EXT4_MAX_TRANS_DATA, 0, 0);
}
/*
@@ -451,8 +433,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
continue;
}
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
@@ -544,8 +526,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
struct buffer_head *gdb;
ext4_debug("update backup group %#04llx\n", block);
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
gdb = sb_getblk(sb, block);
@@ -602,8 +584,8 @@ handle_bb:
/* Initialize block bitmap of the @group */
block = group_data[i].block_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
bh = bclean(handle, sb, block);
@@ -631,8 +613,8 @@ handle_ib:
/* Initialize inode bitmap of the @group */
block = group_data[i].inode_bitmap;
- err = extend_or_restart_transaction(handle, 1);
- if (err)
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
goto out;
/* Mark unused entries in inode bitmap used */
bh = bclean(handle, sb, block);
@@ -1109,10 +1091,8 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
ext4_fsblk_t backup_block;
/* Out of journal space, and can't get more - abort - so sad */
- if (ext4_handle_valid(handle) &&
- handle->h_buffer_credits == 0 &&
- ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
- (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ err = ext4_resize_ensure_credits_batch(handle, 1);
+ if (err < 0)
break;
if (meta_bg == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dd654e53ba3d..1d82b56d9b11 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1172,9 +1172,9 @@ void ext4_clear_inode(struct inode *inode)
{
invalidate_inode_buffers(inode);
clear_inode(inode);
- dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1345,6 +1345,18 @@ static bool ext4_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
+static bool ext4_has_stable_inodes(struct super_block *sb)
+{
+ return ext4_has_feature_stable_inodes(sb);
+}
+
+static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
+ *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
+}
+
static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
@@ -1352,6 +1364,8 @@ static const struct fscrypt_operations ext4_cryptops = {
.dummy_context = ext4_dummy_context,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
+ .has_stable_inodes = ext4_has_stable_inodes,
+ .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits,
};
#endif
@@ -1374,7 +1388,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1392,7 +1405,7 @@ static const struct dquot_operations ext4_quota_operations = {
.destroy_dquot = dquot_destroy,
.get_projid = ext4_get_projid,
.get_inode_usage = ext4_get_inode_usage,
- .get_next_id = ext4_get_next_id,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -2051,7 +2064,7 @@ static int parse_options(char *options, struct super_block *sb,
unsigned int *journal_ioprio,
int is_remount)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
substring_t args[MAX_OPT_ARGS];
int token;
@@ -2105,16 +2118,6 @@ static int parse_options(char *options, struct super_block *sb,
}
}
#endif
- if (test_opt(sb, DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
-
- if (blocksize < PAGE_SIZE) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "dioread_nolock if block size != PAGE_SIZE");
- return 0;
- }
- }
return 1;
}
@@ -3555,12 +3558,15 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ unsigned def_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = 0;
+ return;
+ }
+ if (sbi->s_want_extra_isize < 4) {
+ sbi->s_want_extra_isize = def_extra_isize;
if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
@@ -3573,10 +3579,10 @@ static void ext4_clamp_want_extra_isize(struct super_block *sb)
}
}
/* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
+ if ((sbi->s_want_extra_isize > sbi->s_inode_size) ||
+ (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+ sbi->s_inode_size)) {
+ sbi->s_want_extra_isize = def_extra_isize;
ext4_msg(sb, KERN_INFO,
"required extra inode space not available");
}
@@ -4439,13 +4445,6 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_SIZE)) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported blocksize for fs encryption");
- goto failed_mount_wq;
- }
-
if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
goto failed_mount_wq;
@@ -5835,7 +5834,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
if (err)
lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
iput(qf_inode);
@@ -6019,18 +6018,6 @@ out:
}
return len;
}
-
-static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
-{
- const struct quota_format_ops *ops;
-
- if (!sb_has_quota_loaded(sb, qid->type))
- return -ESRCH;
- ops = sb_dqopt(sb)->ops[qid->type];
- if (!ops || !ops->get_next_id)
- return -ENOSYS;
- return dquot_get_next_id(sb, qid);
-}
#endif
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 491f9ee4040e..8966a5439a22 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -967,55 +967,6 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
return credits;
}
-static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
- int credits, struct buffer_head *bh,
- bool dirty, bool block_csum)
-{
- int error;
-
- if (!ext4_handle_valid(handle))
- return 0;
-
- if (handle->h_buffer_credits >= credits)
- return 0;
-
- error = ext4_journal_extend(handle, credits - handle->h_buffer_credits);
- if (!error)
- return 0;
- if (error < 0) {
- ext4_warning(inode->i_sb, "Extend journal (error %d)", error);
- return error;
- }
-
- if (bh && dirty) {
- if (block_csum)
- ext4_xattr_block_csum_set(inode, bh);
- error = ext4_handle_dirty_metadata(handle, NULL, bh);
- if (error) {
- ext4_warning(inode->i_sb, "Handle metadata (error %d)",
- error);
- return error;
- }
- }
-
- error = ext4_journal_restart(handle, credits);
- if (error) {
- ext4_warning(inode->i_sb, "Restart journal (error %d)", error);
- return error;
- }
-
- if (bh) {
- error = ext4_journal_get_write_access(handle, bh);
- if (error) {
- ext4_warning(inode->i_sb,
- "Get write access failed (error %d)",
- error);
- return error;
- }
- }
- return 0;
-}
-
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
@@ -1149,6 +1100,24 @@ cleanup:
return saved_err;
}
+static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, bool block_csum, bool dirty)
+{
+ int error;
+
+ if (bh && dirty) {
+ if (block_csum)
+ ext4_xattr_block_csum_set(inode, bh);
+ error = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (error) {
+ ext4_warning(inode->i_sb, "Handle metadata (error %d)",
+ error);
+ return error;
+ }
+ }
+ return 0;
+}
+
static void
ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
struct buffer_head *bh,
@@ -1185,13 +1154,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
continue;
}
- err = ext4_xattr_ensure_credits(handle, parent, credits, bh,
- dirty, block_csum);
- if (err) {
+ err = ext4_journal_ensure_credits_fn(handle, credits, credits,
+ ext4_free_metadata_revoke_credits(parent->i_sb, 1),
+ ext4_xattr_restart_fn(handle, parent, bh, block_csum,
+ dirty));
+ if (err < 0) {
ext4_warning_inode(ea_inode, "Ensure credits err=%d",
err);
continue;
}
+ if (err > 0) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_warning_inode(ea_inode,
+ "Re-get write access err=%d",
+ err);
+ continue;
+ }
+ }
err = ext4_xattr_inode_dec_ref(handle, ea_inode);
if (err) {
@@ -2335,7 +2315,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
flags & XATTR_CREATE);
brelse(bh);
- if (!ext4_handle_has_enough_credits(handle, credits)) {
+ if (jbd2_handle_buffer_credits(handle) < credits) {
error = -ENOSPC;
goto cleanup;
}
@@ -2862,11 +2842,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
struct inode *ea_inode;
int error;
- error = ext4_xattr_ensure_credits(handle, inode, extra_credits,
- NULL /* bh */,
- false /* dirty */,
- false /* block_csum */);
- if (error) {
+ error = ext4_journal_ensure_credits(handle, extra_credits,
+ ext4_free_metadata_revoke_credits(inode->i_sb, 1));
+ if (error < 0) {
EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error);
goto cleanup;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index a0eef95b9e0e..ffdaba0c55d2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -581,7 +581,7 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi)
if (time_to_inject(sbi, FAULT_ORPHAN)) {
spin_unlock(&im->ino_lock);
- f2fs_show_injection_info(FAULT_ORPHAN);
+ f2fs_show_injection_info(sbi, FAULT_ORPHAN);
return -ENOSPC;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5755e897a5f0..a034cd0ce021 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -29,6 +29,7 @@
#define NUM_PREALLOC_POST_READ_CTXS 128
static struct kmem_cache *bio_post_read_ctx_cache;
+static struct kmem_cache *bio_entry_slab;
static mempool_t *bio_post_read_ctx_pool;
static bool __is_cp_guaranteed(struct page *page)
@@ -167,9 +168,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio)
static void f2fs_read_end_io(struct bio *bio)
{
- if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)),
- FAULT_READ_IO)) {
- f2fs_show_injection_info(FAULT_READ_IO);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+
+ if (time_to_inject(sbi, FAULT_READ_IO)) {
+ f2fs_show_injection_info(sbi, FAULT_READ_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -191,7 +193,7 @@ static void f2fs_write_end_io(struct bio *bio)
struct bvec_iter_all iter_all;
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
- f2fs_show_injection_info(FAULT_WRITE_IO);
+ f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
@@ -543,6 +545,126 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
return io_type_is_mergeable(io, fio);
}
+static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
+ struct page *page, enum temp_type temp)
+{
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct bio_entry *be;
+
+ be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS);
+ be->bio = bio;
+ bio_get(bio);
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
+ f2fs_bug_on(sbi, 1);
+
+ down_write(&io->bio_list_lock);
+ list_add_tail(&be->list, &io->bio_list);
+ up_write(&io->bio_list_lock);
+}
+
+static void del_bio_entry(struct bio_entry *be)
+{
+ list_del(&be->list);
+ kmem_cache_free(bio_entry_slab, be);
+}
+
+static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio,
+ struct page *page)
+{
+ enum temp_type temp;
+ bool found = false;
+ int ret = -EAGAIN;
+
+ for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) {
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct list_head *head = &io->bio_list;
+ struct bio_entry *be;
+
+ down_write(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (be->bio != *bio)
+ continue;
+
+ found = true;
+
+ if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
+ ret = 0;
+ break;
+ }
+
+ /* bio is full */
+ del_bio_entry(be);
+ __submit_bio(sbi, *bio, DATA);
+ break;
+ }
+ up_write(&io->bio_list_lock);
+ }
+
+ if (ret) {
+ bio_put(*bio);
+ *bio = NULL;
+ }
+
+ return ret;
+}
+
+void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
+ struct bio **bio, struct page *page)
+{
+ enum temp_type temp;
+ bool found = false;
+ struct bio *target = bio ? *bio : NULL;
+
+ for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) {
+ struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
+ struct list_head *head = &io->bio_list;
+ struct bio_entry *be;
+
+ if (list_empty(head))
+ continue;
+
+ down_read(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (target)
+ found = (target == be->bio);
+ else
+ found = __has_merged_page(be->bio, NULL,
+ page, 0);
+ if (found)
+ break;
+ }
+ up_read(&io->bio_list_lock);
+
+ if (!found)
+ continue;
+
+ found = false;
+
+ down_write(&io->bio_list_lock);
+ list_for_each_entry(be, head, list) {
+ if (target)
+ found = (target == be->bio);
+ else
+ found = __has_merged_page(be->bio, NULL,
+ page, 0);
+ if (found) {
+ target = be->bio;
+ del_bio_entry(be);
+ break;
+ }
+ }
+ up_write(&io->bio_list_lock);
+ }
+
+ if (found)
+ __submit_bio(sbi, target, DATA);
+ if (bio && *bio) {
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
int f2fs_merge_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio = *fio->bio;
@@ -557,20 +679,17 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
f2fs_trace_ios(fio, 0);
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
- fio->new_blkaddr)) {
- __submit_bio(fio->sbi, bio, fio->type);
- bio = NULL;
- }
+ fio->new_blkaddr))
+ f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL);
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_PAGES);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
- }
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- __submit_bio(fio->sbi, bio, fio->type);
- bio = NULL;
- goto alloc_new;
+ add_bio_entry(fio->sbi, bio, page, fio->temp);
+ } else {
+ if (add_ipu_page(fio->sbi, &bio, page))
+ goto alloc_new;
}
if (fio->io_wbc)
@@ -584,19 +703,6 @@ alloc_new:
return 0;
}
-static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio,
- struct page *page)
-{
- if (!bio)
- return;
-
- if (!__has_merged_page(*bio, NULL, page, 0))
- return;
-
- __submit_bio(sbi, *bio, DATA);
- *bio = NULL;
-}
-
void f2fs_submit_page_write(struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
@@ -2098,7 +2204,7 @@ static int __write_data_page(struct page *page, bool *submitted,
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_SHIFT;
- loff_t psize = (page->index + 1) << PAGE_SHIFT;
+ loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
@@ -2215,14 +2321,12 @@ out:
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task) {
- f2fs_submit_ipu_bio(sbi, bio, page);
+ !F2FS_I(inode)->cp_task)
f2fs_balance_fs(sbi, need_balance_fs);
- }
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_submit_ipu_bio(sbi, bio, page);
f2fs_submit_merged_write(sbi, DATA);
+ f2fs_submit_merged_ipu_write(sbi, bio, NULL);
submitted = NULL;
}
@@ -2342,13 +2446,11 @@ continue_unlock:
}
if (PageWriteback(page)) {
- if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
DATA, true, true);
- f2fs_submit_ipu_bio(sbi, &bio, page);
- } else {
+ else
goto continue_unlock;
- }
}
if (!clear_page_dirty_for_io(page))
@@ -2406,7 +2508,7 @@ continue_unlock:
NULL, 0, DATA);
/* submit cached bio of IPU write */
if (bio)
- __submit_bio(sbi, bio, DATA);
+ f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
return ret;
}
@@ -3211,8 +3313,22 @@ fail:
return -ENOMEM;
}
-void __exit f2fs_destroy_post_read_processing(void)
+void f2fs_destroy_post_read_processing(void)
{
mempool_destroy(bio_post_read_ctx_pool);
kmem_cache_destroy(bio_post_read_ctx_cache);
}
+
+int __init f2fs_init_bio_entry_cache(void)
+{
+ bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+ sizeof(struct bio_entry));
+ if (!bio_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void __exit f2fs_destroy_bio_entry_cache(void)
+{
+ kmem_cache_destroy(bio_entry_slab);
+}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 4033778bcbbf..c967cacf979e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -628,7 +628,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name,
start:
if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) {
- f2fs_show_injection_info(FAULT_DIR_DEPTH);
+ f2fs_show_injection_info(F2FS_I_SB(dir), FAULT_DIR_DEPTH);
return -ENOSPC;
}
@@ -919,8 +919,9 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
bit_pos++;
ctx->pos = start_pos + bit_pos;
printk_ratelimited(
- "%s, invalid namelen(0), ino:%u, run fsck to fix.",
- KERN_WARNING, le32_to_cpu(de->ino));
+ "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
+ KERN_WARNING, sbi->sb->s_id,
+ le32_to_cpu(de->ino));
set_sbi_flag(sbi, SBI_NEED_FSCK);
continue;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4024790028aa..5a888a063c7f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -890,6 +890,7 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
+ CURSEG_COLD_DATA_PINNED, /* cold data for pinned file */
};
struct flush_cmd {
@@ -1068,6 +1069,11 @@ struct f2fs_io_info {
unsigned char version; /* version of the node */
};
+struct bio_entry {
+ struct bio *bio;
+ struct list_head list;
+};
+
#define is_read_io(rw) ((rw) == READ)
struct f2fs_bio_info {
struct f2fs_sb_info *sbi; /* f2fs superblock */
@@ -1077,6 +1083,8 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
+ struct list_head bio_list; /* bio entry list head */
+ struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1289,11 +1297,13 @@ struct f2fs_sb_info {
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
/* for skip statistic */
+ unsigned int atomic_files; /* # of opened atomic files */
unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
+ struct rw_semaphore pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -1365,9 +1375,10 @@ struct f2fs_private_dio {
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define f2fs_show_injection_info(type) \
- printk_ratelimited("%sF2FS-fs : inject %s in %s of %pS\n", \
- KERN_INFO, f2fs_fault_name[type], \
+#define f2fs_show_injection_info(sbi, type) \
+ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \
+ KERN_INFO, sbi->sb->s_id, \
+ f2fs_fault_name[type], \
__func__, __builtin_return_address(0))
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
@@ -1387,7 +1398,7 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
return false;
}
#else
-#define f2fs_show_injection_info(type) do { } while (0)
+#define f2fs_show_injection_info(sbi, type) do { } while (0)
static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
{
return false;
@@ -1772,7 +1783,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
return ret;
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
release = *count;
goto release_quota;
}
@@ -2024,7 +2035,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
}
if (time_to_inject(sbi, FAULT_BLOCK)) {
- f2fs_show_injection_info(FAULT_BLOCK);
+ f2fs_show_injection_info(sbi, FAULT_BLOCK);
goto enospc;
}
@@ -2139,7 +2150,8 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
return page;
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) {
- f2fs_show_injection_info(FAULT_PAGE_ALLOC);
+ f2fs_show_injection_info(F2FS_M_SB(mapping),
+ FAULT_PAGE_ALLOC);
return NULL;
}
}
@@ -2154,7 +2166,7 @@ static inline struct page *f2fs_pagecache_get_page(
int fgp_flags, gfp_t gfp_mask)
{
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
- f2fs_show_injection_info(FAULT_PAGE_GET);
+ f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET);
return NULL;
}
@@ -2223,7 +2235,7 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
return bio;
}
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
- f2fs_show_injection_info(FAULT_ALLOC_BIO);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
}
@@ -2704,6 +2716,20 @@ static inline void clear_file(struct inode *inode, int type)
f2fs_mark_inode_dirty_sync(inode, true);
}
+static inline bool f2fs_is_time_consistent(struct inode *inode)
+{
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
+ &F2FS_I(inode)->i_crtime))
+ return false;
+ return true;
+}
+
static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
{
bool ret;
@@ -2721,14 +2747,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
i_size_read(inode) & ~PAGE_MASK)
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
- &F2FS_I(inode)->i_crtime))
+ if (!f2fs_is_time_consistent(inode))
return false;
down_read(&F2FS_I(inode)->i_sem);
@@ -2783,7 +2802,7 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
void *ret;
if (time_to_inject(sbi, FAULT_KMALLOC)) {
- f2fs_show_injection_info(FAULT_KMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KMALLOC);
return NULL;
}
@@ -2804,7 +2823,7 @@ static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
if (time_to_inject(sbi, FAULT_KVMALLOC)) {
- f2fs_show_injection_info(FAULT_KVMALLOC);
+ f2fs_show_injection_info(sbi, FAULT_KVMALLOC);
return NULL;
}
@@ -3102,7 +3121,7 @@ void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
unsigned int start, unsigned int end);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
@@ -3188,10 +3207,14 @@ void f2fs_destroy_checkpoint_caches(void);
*/
int f2fs_init_post_read_processing(void);
void f2fs_destroy_post_read_processing(void);
+int f2fs_init_bio_entry_cache(void);
+void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct inode *inode, struct page *page,
nid_t ino, enum page_type type);
+void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
+ struct bio **bio, struct page *page);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 29bc0a542759..85af112e868d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -681,7 +681,7 @@ int f2fs_truncate(struct inode *inode)
trace_f2fs_truncate(inode);
if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) {
- f2fs_show_injection_info(FAULT_TRUNCATE);
+ f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_TRUNCATE);
return -EIO;
}
@@ -726,11 +726,14 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (flags & F2FS_NODUMP_FL)
stat->attributes |= STATX_ATTR_NODUMP;
+ if (IS_VERITY(inode))
+ stat->attributes |= STATX_ATTR_VERITY;
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
- STATX_ATTR_NODUMP);
+ STATX_ATTR_NODUMP |
+ STATX_ATTR_VERITY);
generic_fillattr(inode, stat);
@@ -1139,7 +1142,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
dn.ofs_in_node++;
i++;
- new_size = (dst + i) << PAGE_SHIFT;
+ new_size = (loff_t)(dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
} while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
@@ -1545,12 +1548,44 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (off_end)
map.m_len++;
- if (f2fs_is_pinned_file(inode))
- map.m_seg_type = CURSEG_COLD_DATA;
+ if (!map.m_len)
+ return 0;
+
+ if (f2fs_is_pinned_file(inode)) {
+ block_t len = (map.m_len >> sbi->log_blocks_per_seg) <<
+ sbi->log_blocks_per_seg;
+ block_t done = 0;
+
+ if (map.m_len % sbi->blocks_per_seg)
+ len += sbi->blocks_per_seg;
+
+ map.m_len = sbi->blocks_per_seg;
+next_alloc:
+ if (has_not_enough_free_secs(sbi, 0,
+ GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
+ mutex_lock(&sbi->gc_mutex);
+ err = f2fs_gc(sbi, true, false, NULL_SEGNO);
+ if (err && err != -ENODATA && err != -EAGAIN)
+ goto out_err;
+ }
+
+ down_write(&sbi->pin_sem);
+ map.m_seg_type = CURSEG_COLD_DATA_PINNED;
+ f2fs_allocate_new_segments(sbi, CURSEG_COLD_DATA);
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+ up_write(&sbi->pin_sem);
- err = f2fs_map_blocks(inode, &map, 1, (f2fs_is_pinned_file(inode) ?
- F2FS_GET_BLOCK_PRE_DIO :
- F2FS_GET_BLOCK_PRE_AIO));
+ done += map.m_len;
+ len -= map.m_len;
+ map.m_lblk += map.m_len;
+ if (!err && len)
+ goto next_alloc;
+
+ map.m_len = done;
+ } else {
+ err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ }
+out_err:
if (err) {
pgoff_t last_off;
@@ -1890,6 +1925,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (list_empty(&fi->inmem_ilist))
list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+ sbi->atomic_files++;
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
/* add inode in inmem_list first and set atomic_file */
@@ -3403,6 +3439,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_RELEASE_VOLATILE_WRITE:
case F2FS_IOC_ABORT_VOLATILE_WRITE:
case F2FS_IOC_SHUTDOWN:
+ case FITRIM:
case F2FS_IOC_SET_ENCRYPTION_POLICY:
case F2FS_IOC_GET_ENCRYPTION_PWSALT:
case F2FS_IOC_GET_ENCRYPTION_POLICY:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5877bd729689..b3d399623290 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -54,7 +54,7 @@ static int gc_thread_func(void *data)
}
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -1012,8 +1012,14 @@ next_step:
block_t start_bidx;
nid_t nid = le32_to_cpu(entry->nid);
- /* stop BG_GC if there is not enough free sections. */
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
+ /*
+ * stop BG_GC if there are not enough free sections.
+ * Or, stop GC if the segment becomes fully valid due to a
+ * race condition with SSR block allocation.
+ */
+ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
+ get_valid_blocks(sbi, segno, false) ==
+ sbi->blocks_per_seg)
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1437,11 +1443,20 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
raw_sb->block_count = cpu_to_le64(block_count +
(long long)segs * sbi->blocks_per_seg);
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+ int dev_segs =
+ le32_to_cpu(raw_sb->devs[last_dev].total_segments);
+
+ raw_sb->devs[last_dev].total_segments =
+ cpu_to_le32(dev_segs + segs);
+ }
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
int segs = secs * sbi->segs_per_sec;
+ long long blks = (long long)segs * sbi->blocks_per_seg;
long long user_block_count =
le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
@@ -1449,8 +1464,20 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
- F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count +
- (long long)segs * sbi->blocks_per_seg);
+ F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
+
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+
+ FDEV(last_dev).total_segments =
+ (int)FDEV(last_dev).total_segments + segs;
+ FDEV(last_dev).end_blk =
+ (long long)FDEV(last_dev).end_blk + blks;
+#ifdef CONFIG_BLK_DEV_ZONED
+ FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
+ (int)(blks >> sbi->log_blocks_per_blkz);
+#endif
+ }
}
int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
@@ -1465,6 +1492,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
if (block_count > old_block_count)
return -EINVAL;
+ if (f2fs_is_multi_device(sbi)) {
+ int last_dev = sbi->s_ndevs - 1;
+ __u64 last_segs = FDEV(last_dev).total_segments;
+
+ if (block_count + last_segs * sbi->blocks_per_seg <=
+ old_block_count)
+ return -EINVAL;
+ }
+
/* new fs size should align to section size */
div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
if (rem)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index db4fec30c30d..502bd491336a 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -615,7 +615,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
+ /*
+ * atime could be updated without dirtying f2fs inode in lazytime mode
+ */
+ if (f2fs_is_time_consistent(inode) &&
+ !is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
if (!f2fs_is_checkpoint_ready(sbi))
@@ -677,7 +681,7 @@ retry:
err = f2fs_truncate(inode);
if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
+ f2fs_show_injection_info(sbi, FAULT_EVICT_INODE);
err = -EIO;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 4faf06e8bf89..a1c507b0b4ac 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -981,7 +981,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
- F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
@@ -1141,7 +1142,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
down_write(&F2FS_I(old_inode)->i_sem);
- file_lost_pino(old_inode);
+ if (!old_dir_entry)
+ file_lost_pino(old_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
@@ -1156,7 +1161,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
down_write(&F2FS_I(new_inode)->i_sem);
- file_lost_pino(new_inode);
+ if (!new_dir_entry)
+ file_lost_pino(new_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(new_inode, old_dir->i_ino);
up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 8b66bc4c004b..3314a0f3405e 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2349,7 +2349,6 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (ret) {
up_read(&nm_i->nat_tree_lock);
- f2fs_bug_on(sbi, !mount);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2399,7 +2398,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
struct free_nid *i = NULL;
retry:
if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
- f2fs_show_injection_info(FAULT_ALLOC_NID);
+ f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
return false;
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 783773e4560d..76477f71d4ee 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -711,7 +711,7 @@ next:
f2fs_put_page(page, 1);
}
if (!err)
- f2fs_allocate_new_segments(sbi);
+ f2fs_allocate_new_segments(sbi, NO_CHECK_TYPE);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 808709581481..56e81447e2f3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -288,6 +288,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure)
struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
struct inode *inode;
struct f2fs_inode_info *fi;
+ unsigned int count = sbi->atomic_files;
+ unsigned int looped = 0;
next:
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (list_empty(head)) {
@@ -296,22 +298,26 @@ next:
}
fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
inode = igrab(&fi->vfs_inode);
+ if (inode)
+ list_move_tail(&fi->inmem_ilist, head);
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
if (inode) {
if (gc_failure) {
- if (fi->i_gc_failures[GC_FAILURE_ATOMIC])
- goto drop;
- goto skip;
+ if (!fi->i_gc_failures[GC_FAILURE_ATOMIC])
+ goto skip;
}
-drop:
set_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
f2fs_drop_inmem_pages(inode);
+skip:
iput(inode);
}
-skip:
congestion_wait(BLK_RW_ASYNC, HZ/50);
cond_resched();
+ if (gc_failure) {
+ if (++looped >= count)
+ return;
+ }
goto next;
}
@@ -327,13 +333,16 @@ void f2fs_drop_inmem_pages(struct inode *inode)
mutex_unlock(&fi->inmem_lock);
}
- clear_inode_flag(inode, FI_ATOMIC_FILE);
fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
stat_dec_atomic_write(inode);
spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
if (!list_empty(&fi->inmem_ilist))
list_del_init(&fi->inmem_ilist);
+ if (f2fs_is_atomic_file(inode)) {
+ clear_inode_flag(inode, FI_ATOMIC_FILE);
+ sbi->atomic_files--;
+ }
spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
}
@@ -480,7 +489,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
- f2fs_show_injection_info(FAULT_CHECKPOINT);
+ f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
f2fs_stop_checkpoint(sbi, false);
}
@@ -1008,8 +1017,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
if (dc->error)
printk_ratelimited(
- "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d",
- KERN_INFO, dc->lstart, dc->start, dc->len, dc->error);
+ "%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
+ KERN_INFO, sbi->sb->s_id,
+ dc->lstart, dc->start, dc->len, dc->error);
__detach_discard_cmd(dcc, dc);
}
@@ -1149,7 +1159,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
dc->len += len;
if (time_to_inject(sbi, FAULT_DISCARD)) {
- f2fs_show_injection_info(FAULT_DISCARD);
+ f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
goto submit;
}
@@ -1771,7 +1781,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
return -EIO;
}
trace_f2fs_issue_reset_zone(bdev, blkstart);
- return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS);
+ return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sector, nr_sects, GFP_NOFS);
}
/* For conventional zones, use regular discard if supported */
@@ -2690,7 +2701,7 @@ unlock:
up_read(&SM_I(sbi)->curseg_lock);
}
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg;
unsigned int old_segno;
@@ -2699,10 +2710,17 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ if (type != NO_CHECK_TYPE && i != type)
+ continue;
+
curseg = CURSEG_I(sbi, i);
- old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
- locate_dirty_segment(sbi, old_segno);
+ if (type == NO_CHECK_TYPE || curseg->next_blkoff ||
+ get_valid_blocks(sbi, curseg->segno, false) ||
+ get_ckpt_valid_blocks(sbi, curseg->segno)) {
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_segno);
+ }
}
up_write(&SIT_I(sbi)->sentry_lock);
@@ -3068,6 +3086,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
+ bool put_pin_sem = false;
+
+ if (type == CURSEG_COLD_DATA) {
+ /* GC during CURSEG_COLD_DATA_PINNED allocation */
+ if (down_read_trylock(&sbi->pin_sem)) {
+ put_pin_sem = true;
+ } else {
+ type = CURSEG_WARM_DATA;
+ curseg = CURSEG_I(sbi, type);
+ }
+ } else if (type == CURSEG_COLD_DATA_PINNED) {
+ type = CURSEG_COLD_DATA;
+ }
down_read(&SM_I(sbi)->curseg_lock);
@@ -3133,6 +3164,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
up_read(&SM_I(sbi)->curseg_lock);
+
+ if (put_pin_sem)
+ up_read(&sbi->pin_sem);
}
static void update_device_state(struct f2fs_io_info *fio)
@@ -3379,7 +3413,10 @@ void f2fs_wait_on_page_writeback(struct page *page,
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ /* submit cached LFS IO */
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+ /* submit cached IPU IO */
+ f2fs_submit_merged_ipu_write(sbi, NULL, page);
if (ordered) {
wait_on_page_writeback(page);
f2fs_bug_on(sbi, locked && PageWriteback(page));
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 325781a1ae4d..a95467b202ea 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -313,6 +313,8 @@ struct sit_entry_set {
*/
static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
{
+ if (type == CURSEG_COLD_DATA_PINNED)
+ type = CURSEG_COLD_DATA;
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1443cee15863..5111e1ffe58a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1213,9 +1213,13 @@ static int f2fs_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_bsoftlimit)
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
+ if (dquot->dq_dqb.dqb_bhardlimit &&
+ (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
+ limit = dquot->dq_dqb.dqb_bhardlimit;
+
if (limit && buf->f_blocks > limit) {
curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
buf->f_blocks = limit;
@@ -1224,9 +1228,13 @@ static int f2fs_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_isoftlimit)
+ limit = dquot->dq_dqb.dqb_isoftlimit;
+ if (dquot->dq_dqb.dqb_ihardlimit &&
+ (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
+ limit = dquot->dq_dqb.dqb_ihardlimit;
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
@@ -1932,7 +1940,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
- err = dquot_enable(qf_inode, type, format_id, flags);
+ err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
iput(qf_inode);
return err;
}
@@ -2308,13 +2316,27 @@ static bool f2fs_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
}
+static bool f2fs_has_stable_inodes(struct super_block *sb)
+{
+ return true;
+}
+
+static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
+ int *ino_bits_ret, int *lblk_bits_ret)
+{
+ *ino_bits_ret = 8 * sizeof(nid_t);
+ *lblk_bits_ret = 8 * sizeof(block_t);
+}
+
static const struct fscrypt_operations f2fs_cryptops = {
- .key_prefix = "f2fs:",
- .get_context = f2fs_get_context,
- .set_context = f2fs_set_context,
- .dummy_context = f2fs_dummy_context,
- .empty_dir = f2fs_empty_dir,
- .max_namelen = F2FS_NAME_LEN,
+ .key_prefix = "f2fs:",
+ .get_context = f2fs_get_context,
+ .set_context = f2fs_set_context,
+ .dummy_context = f2fs_dummy_context,
+ .empty_dir = f2fs_empty_dir,
+ .max_namelen = F2FS_NAME_LEN,
+ .has_stable_inodes = f2fs_has_stable_inodes,
+ .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits,
};
#endif
@@ -2604,6 +2626,21 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
+ if (RDEV(0).path[0]) {
+ block_t dev_seg_count = le32_to_cpu(RDEV(0).total_segments);
+ int i = 1;
+
+ while (i < MAX_DEVICES && RDEV(i).path[0]) {
+ dev_seg_count += le32_to_cpu(RDEV(i).total_segments);
+ i++;
+ }
+ if (segment_count != dev_seg_count) {
+ f2fs_info(sbi, "Segment count (%u) mismatch with total segments from devices (%u)",
+ segment_count, dev_seg_count);
+ return -EFSCORRUPTED;
+ }
+ }
+
if (secs_per_zone > total_sections || !secs_per_zone) {
f2fs_info(sbi, "Wrong secs_per_zone / total_sections (%u, %u)",
secs_per_zone, total_sections);
@@ -2838,6 +2875,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
spin_lock_init(&sbi->dev_lock);
init_rwsem(&sbi->sb_lock);
+ init_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -2857,15 +2895,21 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_BLK_DEV_ZONED
+static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct f2fs_dev_info *dev = data;
+
+ if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
+ set_bit(idx, dev->blkz_seq);
+ return 0;
+}
+
static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
{
struct block_device *bdev = FDEV(devi).bdev;
sector_t nr_sectors = bdev->bd_part->nr_sects;
- sector_t sector = 0;
- struct blk_zone *zones;
- unsigned int i, nr_zones;
- unsigned int n = 0;
- int err = -EIO;
+ int ret;
if (!f2fs_sb_has_blkzoned(sbi))
return 0;
@@ -2890,38 +2934,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (!FDEV(devi).blkz_seq)
return -ENOMEM;
-#define F2FS_REPORT_NR_ZONES 4096
-
- zones = f2fs_kzalloc(sbi,
- array_size(F2FS_REPORT_NR_ZONES,
- sizeof(struct blk_zone)),
- GFP_KERNEL);
- if (!zones)
- return -ENOMEM;
-
/* Get block zones type */
- while (zones && sector < nr_sectors) {
-
- nr_zones = F2FS_REPORT_NR_ZONES;
- err = blkdev_report_zones(bdev, sector, zones, &nr_zones);
- if (err)
- break;
- if (!nr_zones) {
- err = -EIO;
- break;
- }
-
- for (i = 0; i < nr_zones; i++) {
- if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL)
- set_bit(n, FDEV(devi).blkz_seq);
- sector += zones[i].len;
- n++;
- }
- }
-
- kvfree(zones);
+ ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, f2fs_report_zone_cb,
+ &FDEV(devi));
+ if (ret < 0)
+ return ret;
- return err;
+ return 0;
}
#endif
@@ -2951,6 +2970,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Unable to read %dth superblock",
block + 1);
err = -EIO;
+ *recovery = 1;
continue;
}
@@ -2960,6 +2980,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
f2fs_err(sbi, "Can't find valid F2FS filesystem in %dth superblock",
block + 1);
brelse(bh);
+ *recovery = 1;
continue;
}
@@ -2972,10 +2993,6 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
brelse(bh);
}
- /* Fail to read any one of the superblocks*/
- if (err < 0)
- *recovery = 1;
-
/* No valid superblock */
if (!*raw_super)
kvfree(super);
@@ -3329,6 +3346,8 @@ try_onemore:
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
+ INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
+ init_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
@@ -3740,8 +3759,13 @@ static int __init init_f2fs_fs(void)
err = f2fs_init_post_read_processing();
if (err)
goto free_root_stats;
+ err = f2fs_init_bio_entry_cache();
+ if (err)
+ goto free_post_read;
return 0;
+free_post_read:
+ f2fs_destroy_post_read_processing();
free_root_stats:
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
@@ -3765,6 +3789,7 @@ fail:
static void __exit exit_f2fs_fs(void)
{
+ f2fs_destroy_bio_entry_cache();
f2fs_destroy_post_read_processing();
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index b558b64a4c9c..70945ceb9c0c 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -154,6 +154,8 @@ static ssize_t features_show(struct f2fs_attr *a,
if (f2fs_sb_has_casefold(sbi))
len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "pin_file");
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
@@ -443,6 +445,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, main_blkaddr, main_blkaddr);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
@@ -510,6 +513,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
+ ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
ATTR_LIST(discard_granularity),
ATTR_LIST(batched_trim_sections),
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 181900af2576..296b3189448a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -539,8 +539,9 @@ out:
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
- void *base_addr;
+ void *base_addr, *last_base_addr;
int error = 0;
size_t rest = buffer_size;
@@ -550,6 +551,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
+ last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
@@ -557,6 +560,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
size_t prefix_len;
size_t size;
+ if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
+ f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr",
+ inode->i_ino);
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ error = -EFSCORRUPTED;
+ goto cleanup;
+ }
+
if (!handler || (handler->list && !handler->list(dentry)))
continue;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 4614c0ba5f1c..bdc4503c00a3 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -172,15 +172,6 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
}
-#ifdef CONFIG_COMPAT
-static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
-
-{
- return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
static int fat_file_release(struct inode *inode, struct file *filp)
{
if ((filp->f_mode & FMODE_WRITE) &&
@@ -215,9 +206,7 @@ const struct file_operations fat_file_operations = {
.mmap = generic_file_mmap,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = fat_generic_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3d40771e8e7c..41b6438bd2d9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -261,7 +261,7 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
static bool rw_hint_valid(enum rw_hint hint)
{
switch (hint) {
- case RWF_WRITE_LIFE_NOT_SET:
+ case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
case RWH_WRITE_LIFE_SHORT:
case RWH_WRITE_LIFE_MEDIUM:
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed1abc9e33cf..d4e6691d2d92 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -705,7 +705,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->pipebufs++;
cs->nr_segs--;
} else {
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
page = alloc_page(GFP_HIGHUSER);
@@ -881,7 +881,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
struct pipe_buffer *buf;
int err;
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
err = unlock_request(cs->req);
@@ -1343,7 +1343,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (!fud)
return -EPERM;
- bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
+ bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
if (!bufs)
return -ENOMEM;
@@ -1355,7 +1355,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (ret < 0)
goto out;
- if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+ if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
ret = -EIO;
goto out;
}
@@ -1937,6 +1937,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
+ unsigned int head, tail, mask, count;
unsigned nbuf;
unsigned idx;
struct pipe_buffer *bufs;
@@ -1951,8 +1952,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
- bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
- GFP_KERNEL);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+ count = head - tail;
+
+ bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs) {
pipe_unlock(pipe);
return -ENOMEM;
@@ -1960,8 +1965,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
nbuf = 0;
rem = 0;
- for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
- rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+ for (idx = tail; idx < head && rem < len; idx++)
+ rem += pipe->bufs[idx & mask].len;
ret = -EINVAL;
if (rem < len)
@@ -1972,16 +1977,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- BUG_ON(nbuf >= pipe->buffers);
- BUG_ON(!pipe->nrbufs);
- ibuf = &pipe->bufs[pipe->curbuf];
+ BUG_ON(nbuf >= pipe->ring_size);
+ BUG_ON(tail == head);
+ ibuf = &pipe->bufs[tail & mask];
obuf = &bufs[nbuf];
if (rem >= ibuf->len) {
*obuf = *ibuf;
ibuf->ops = NULL;
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
} else {
if (!pipe_buf_get(pipe, ibuf))
goto out_free;
@@ -2262,7 +2267,7 @@ const struct file_operations fuse_dev_operations = {
.release = fuse_dev_release,
.fasync = fuse_dev_fasync,
.unlocked_ioctl = fuse_dev_ioctl,
- .compat_ioctl = fuse_dev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f63df54a08c6..516103248272 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
}
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
- unsigned flags, struct iomap *iomap)
+ unsigned flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct metapath mp = { .mp_aheight = 1, };
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 997b326247e2..d07a295f9cac 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/compat.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
@@ -354,6 +355,31 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -ENOTTY;
}
+#ifdef CONFIG_COMPAT
+static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch(cmd) {
+ /* These are just misnamed; they actually get/put an int from/to user */
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ /* Keep this list in sync with gfs2_ioctl */
+ case FITRIM:
+ case FS_IOC_GETFSLABEL:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#else
+#define gfs2_compat_ioctl NULL
+#endif
+
/**
* gfs2_size_hint - Give a hint to the size of a write request
* @filep: The struct file
@@ -732,7 +758,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto out_uninit;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
gfs2_glock_dq(&gh);
out_uninit:
@@ -767,7 +794,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (offset + len > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
out:
gfs2_glock_dq(&gh);
@@ -1293,6 +1321,7 @@ const struct file_operations gfs2_file_fops = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1308,6 +1337,7 @@ const struct file_operations gfs2_file_fops = {
const struct file_operations gfs2_dir_fops = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
@@ -1324,6 +1354,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1337,6 +1368,7 @@ const struct file_operations gfs2_file_fops_nolock = {
const struct file_operations gfs2_dir_fops_nolock = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d85230c84ef2..f32f15669996 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -325,4 +325,5 @@ const struct file_operations hpfs_dir_ops =
.release = hpfs_dir_release,
.fsync = hpfs_file_fsync,
.unlocked_ioctl = hpfs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 1ecec124e76f..b36abf9cb345 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -215,6 +215,7 @@ const struct file_operations hpfs_file_ops =
.fsync = hpfs_file_fsync,
.splice_read = generic_file_splice_read,
.unlocked_ioctl = hpfs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
const struct inode_operations hpfs_file_iops =
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a478df035651..d5c2a3158610 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
@@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
+static int do_hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t dev,
+ bool tmpfile)
{
struct inode *inode;
int error = -ENOSPC;
@@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (inode) {
dir->i_ctime = dir->i_mtime = current_time(dir);
- d_instantiate(dentry, inode);
- dget(dentry); /* Extra count - pin the dentry in core */
+ if (tmpfile) {
+ d_tmpfile(dentry, inode);
+ } else {
+ d_instantiate(dentry, inode);
+ dget(dentry); /* Extra count - pin the dentry in core */
+ }
error = 0;
}
return error;
}
+static int hugetlbfs_mknod(struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+}
+
static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
@@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo
return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
+static int hugetlbfs_tmpfile(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+}
+
static int hugetlbfs_symlink(struct inode *dir,
struct dentry *dentry, const char *symname)
{
@@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = {
.mknod = hugetlbfs_mknod,
.rename = simple_rename,
.setattr = hugetlbfs_setattr,
+ .tmpfile = hugetlbfs_tmpfile,
};
static const struct inode_operations hugetlbfs_inode_operations = {
@@ -1461,28 +1481,41 @@ static int __init init_hugetlbfs_fs(void)
sizeof(struct hugetlbfs_inode_info),
0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
- goto out2;
+ goto out;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
- goto out;
+ goto out_free;
+ /* default hstate mount is required */
+ mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ if (IS_ERR(mnt)) {
+ error = PTR_ERR(mnt);
+ goto out_unreg;
+ }
+ hugetlbfs_vfsmount[default_hstate_idx] = mnt;
+
+ /* other hstates are optional */
i = 0;
for_each_hstate(h) {
+ if (i == default_hstate_idx)
+ continue;
+
mnt = mount_one_hugetlbfs(h);
- if (IS_ERR(mnt) && i == 0) {
- error = PTR_ERR(mnt);
- goto out;
- }
- hugetlbfs_vfsmount[i] = mnt;
+ if (IS_ERR(mnt))
+ hugetlbfs_vfsmount[i] = NULL;
+ else
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
return 0;
- out:
+ out_unreg:
+ (void)unregister_filesystem(&hugetlbfs_fs_type);
+ out_free:
kmem_cache_destroy(hugetlbfs_inode_cachep);
- out2:
+ out:
return error;
}
fs_initcall(init_hugetlbfs_fs)
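With the .tmpfile hook added above, an anonymous huge-page backed file can be created directly with O_TMPFILE instead of creating and then unlinking a named file; the d_tmpfile() call in do_hugetlbfs_mknod() is what leaves the inode unlinked from the start. A minimal userspace sketch (the /dev/hugepages mount point is an assumption):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* O_TMPFILE | O_RDWR on the mount directory creates an unnamed inode */
	int fd = open("/dev/hugepages", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	/* size it and mmap() it like any other hugetlbfs file */
	if (ftruncate(fd, 2 * 1024 * 1024) < 0)
		perror("ftruncate");
	close(fd);
	return 0;
}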
diff --git a/fs/io-wq.c b/fs/io-wq.c
new file mode 100644
index 000000000000..91b85df0861e
--- /dev/null
+++ b/fs/io-wq.c
@@ -0,0 +1,1094 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic worker thread pool for io_uring
+ *
+ * Copyright (C) 2019 Jens Axboe
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/sched/signal.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/rculist_nulls.h>
+
+#include "io-wq.h"
+
+#define WORKER_IDLE_TIMEOUT (5 * HZ)
+
+enum {
+ IO_WORKER_F_UP = 1, /* up and active */
+ IO_WORKER_F_RUNNING = 2, /* account as running */
+ IO_WORKER_F_FREE = 4, /* worker on free list */
+ IO_WORKER_F_EXITING = 8, /* worker exiting */
+ IO_WORKER_F_FIXED = 16, /* static idle worker */
+ IO_WORKER_F_BOUND = 32, /* is doing bounded work */
+};
+
+enum {
+ IO_WQ_BIT_EXIT = 0, /* wq exiting */
+ IO_WQ_BIT_CANCEL = 1, /* cancel work on list */
+ IO_WQ_BIT_ERROR = 2, /* error on setup */
+};
+
+enum {
+ IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
+};
+
+/*
+ * One for each thread in a wqe pool
+ */
+struct io_worker {
+ refcount_t ref;
+ unsigned flags;
+ struct hlist_nulls_node nulls_node;
+ struct list_head all_list;
+ struct task_struct *task;
+ wait_queue_head_t wait;
+ struct io_wqe *wqe;
+
+ struct io_wq_work *cur_work;
+ spinlock_t lock;
+
+ struct rcu_head rcu;
+ struct mm_struct *mm;
+ const struct cred *creds;
+ struct files_struct *restore_files;
+};
+
+#if BITS_PER_LONG == 64
+#define IO_WQ_HASH_ORDER 6
+#else
+#define IO_WQ_HASH_ORDER 5
+#endif
+
+struct io_wqe_acct {
+ unsigned nr_workers;
+ unsigned max_workers;
+ atomic_t nr_running;
+};
+
+enum {
+ IO_WQ_ACCT_BOUND,
+ IO_WQ_ACCT_UNBOUND,
+};
+
+/*
+ * Per-node worker thread pool
+ */
+struct io_wqe {
+ struct {
+ spinlock_t lock;
+ struct io_wq_work_list work_list;
+ unsigned long hash_map;
+ unsigned flags;
+ } ____cacheline_aligned_in_smp;
+
+ int node;
+ struct io_wqe_acct acct[2];
+
+ struct hlist_nulls_head free_list;
+ struct hlist_nulls_head busy_list;
+ struct list_head all_list;
+
+ struct io_wq *wq;
+};
+
+/*
+ * Per io_wq state
+ */
+struct io_wq {
+ struct io_wqe **wqes;
+ unsigned long state;
+
+ get_work_fn *get_work;
+ put_work_fn *put_work;
+
+ struct task_struct *manager;
+ struct user_struct *user;
+ struct cred *creds;
+ struct mm_struct *mm;
+ refcount_t refs;
+ struct completion done;
+};
+
+static bool io_worker_get(struct io_worker *worker)
+{
+ return refcount_inc_not_zero(&worker->ref);
+}
+
+static void io_worker_release(struct io_worker *worker)
+{
+ if (refcount_dec_and_test(&worker->ref))
+ wake_up_process(worker->task);
+}
+
+/*
+ * Note: drops the wqe->lock if returning true! The caller must re-acquire
+ * the lock in that case. Some callers need to restart handling if this
+ * happens, so we can't just re-acquire the lock on behalf of the caller.
+ */
+static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
+{
+ bool dropped_lock = false;
+
+ if (worker->creds) {
+ revert_creds(worker->creds);
+ worker->creds = NULL;
+ }
+
+ if (current->files != worker->restore_files) {
+ __acquire(&wqe->lock);
+ spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+
+ task_lock(current);
+ current->files = worker->restore_files;
+ task_unlock(current);
+ }
+
+ /*
+ * If we have an active mm, we need to drop the wq lock before unusing
+ * it. If we do, return true and let the caller retry the idle loop.
+ */
+ if (worker->mm) {
+ if (!dropped_lock) {
+ __acquire(&wqe->lock);
+ spin_unlock_irq(&wqe->lock);
+ dropped_lock = true;
+ }
+ __set_current_state(TASK_RUNNING);
+ set_fs(KERNEL_DS);
+ unuse_mm(worker->mm);
+ mmput(worker->mm);
+ worker->mm = NULL;
+ }
+
+ return dropped_lock;
+}
+
+static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe,
+ struct io_wq_work *work)
+{
+ if (work->flags & IO_WQ_WORK_UNBOUND)
+ return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+
+ return &wqe->acct[IO_WQ_ACCT_BOUND];
+}
+
+static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe,
+ struct io_worker *worker)
+{
+ if (worker->flags & IO_WORKER_F_BOUND)
+ return &wqe->acct[IO_WQ_ACCT_BOUND];
+
+ return &wqe->acct[IO_WQ_ACCT_UNBOUND];
+}
+
+static void io_worker_exit(struct io_worker *worker)
+{
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+ unsigned nr_workers;
+
+ /*
+ * If we're not at zero, someone else is holding a brief reference
+ * to the worker. Wait for that to go away.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!refcount_dec_and_test(&worker->ref))
+ schedule();
+ __set_current_state(TASK_RUNNING);
+
+ preempt_disable();
+ current->flags &= ~PF_IO_WORKER;
+ if (worker->flags & IO_WORKER_F_RUNNING)
+ atomic_dec(&acct->nr_running);
+ if (!(worker->flags & IO_WORKER_F_BOUND))
+ atomic_dec(&wqe->wq->user->processes);
+ worker->flags = 0;
+ preempt_enable();
+
+ spin_lock_irq(&wqe->lock);
+ hlist_nulls_del_rcu(&worker->nulls_node);
+ list_del_rcu(&worker->all_list);
+ if (__io_worker_unuse(wqe, worker)) {
+ __release(&wqe->lock);
+ spin_lock_irq(&wqe->lock);
+ }
+ acct->nr_workers--;
+ nr_workers = wqe->acct[IO_WQ_ACCT_BOUND].nr_workers +
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers;
+ spin_unlock_irq(&wqe->lock);
+
+ /* all workers gone, wq exit can proceed */
+ if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
+ complete(&wqe->wq->done);
+
+ kfree_rcu(worker, rcu);
+}
+
+static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+ __must_hold(wqe->lock)
+{
+ if (!wq_list_empty(&wqe->work_list) &&
+ !(wqe->flags & IO_WQE_FLAG_STALLED))
+ return true;
+ return false;
+}
+
+/*
+ * Check head of free list for an available worker. If one isn't available,
+ * caller must wake up the wq manager to create one.
+ */
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+ __must_hold(RCU)
+{
+ struct hlist_nulls_node *n;
+ struct io_worker *worker;
+
+ n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
+ if (is_a_nulls(n))
+ return false;
+
+ worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
+ if (io_worker_get(worker)) {
+ wake_up(&worker->wait);
+ io_worker_release(worker);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We need a worker. If we find a free one, we're good. If not, and we're
+ * below the max number of workers, wake up the manager to create one.
+ */
+static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+{
+ bool ret;
+
+ /*
+ * Most likely an attempt to queue unbounded work on an io_wq that
+ * wasn't setup with any unbounded workers.
+ */
+ WARN_ON_ONCE(!acct->max_workers);
+
+ rcu_read_lock();
+ ret = io_wqe_activate_free_worker(wqe);
+ rcu_read_unlock();
+
+ if (!ret && acct->nr_workers < acct->max_workers)
+ wake_up_process(wqe->wq->manager);
+}
+
+static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+ atomic_inc(&acct->nr_running);
+}
+
+static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
+ __must_hold(wqe->lock)
+{
+ struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);
+
+ if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
+ io_wqe_wake_worker(wqe, acct);
+}
+
+static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
+{
+ allow_kernel_signal(SIGINT);
+
+ current->flags |= PF_IO_WORKER;
+
+ worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+ worker->restore_files = current->files;
+ io_wqe_inc_running(wqe, worker);
+}
+
+/*
+ * Worker will start processing some work. Move it to the busy list, if
+ * it's currently on the freelist
+ */
+static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wq_work *work)
+ __must_hold(wqe->lock)
+{
+ bool worker_bound, work_bound;
+
+ if (worker->flags & IO_WORKER_F_FREE) {
+ worker->flags &= ~IO_WORKER_F_FREE;
+ hlist_nulls_del_init_rcu(&worker->nulls_node);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->busy_list);
+ }
+
+ /*
+ * If worker is moving from bound to unbound (or vice versa), then
+ * ensure we update the running accounting.
+ */
+ worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+ work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+ if (worker_bound != work_bound) {
+ io_wqe_dec_running(wqe, worker);
+ if (work_bound) {
+ worker->flags |= IO_WORKER_F_BOUND;
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
+ wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
+ atomic_dec(&wqe->wq->user->processes);
+ } else {
+ worker->flags &= ~IO_WORKER_F_BOUND;
+ wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
+ wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
+ atomic_inc(&wqe->wq->user->processes);
+ }
+ io_wqe_inc_running(wqe, worker);
+ }
+}
+
+/*
+ * No work, worker going to sleep. Move to freelist, and unuse mm if we
+ * have one attached. Dropping the mm may potentially sleep, so we drop
+ * the lock in that case and return success. Since the caller has to
+ * retry the loop in that case (we changed task state), we don't regrab
+ * the lock if we return success.
+ */
+static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
+ __must_hold(wqe->lock)
+{
+ if (!(worker->flags & IO_WORKER_F_FREE)) {
+ worker->flags |= IO_WORKER_F_FREE;
+ hlist_nulls_del_init_rcu(&worker->nulls_node);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ }
+
+ return __io_worker_unuse(wqe, worker);
+}
+
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
+ __must_hold(wqe->lock)
+{
+ struct io_wq_work_node *node, *prev;
+ struct io_wq_work *work;
+
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
+ /* not hashed, can run anytime */
+ if (!(work->flags & IO_WQ_WORK_HASHED)) {
+ wq_node_del(&wqe->work_list, node, prev);
+ return work;
+ }
+
+ /* hashed, can run if not already running */
+ *hash = work->flags >> IO_WQ_HASH_SHIFT;
+ if (!(wqe->hash_map & BIT_ULL(*hash))) {
+ wqe->hash_map |= BIT_ULL(*hash);
+ wq_node_del(&wqe->work_list, node, prev);
+ return work;
+ }
+ }
+
+ return NULL;
+}
+
+static void io_worker_handle_work(struct io_worker *worker)
+ __releases(wqe->lock)
+{
+ struct io_wq_work *work, *old_work = NULL, *put_work = NULL;
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+
+ do {
+ unsigned hash = -1U;
+
+ /*
+ * If we got some work, mark us as busy. If we didn't, but
+ * the list isn't empty, it means we stalled on hashed work.
+ * Mark us stalled so we don't keep looking for work when we
+ * can't make progress, any work completion or insertion will
+ * clear the stalled flag.
+ */
+ work = io_get_next_work(wqe, &hash);
+ if (work)
+ __io_worker_busy(wqe, worker, work);
+ else if (!wq_list_empty(&wqe->work_list))
+ wqe->flags |= IO_WQE_FLAG_STALLED;
+
+ spin_unlock_irq(&wqe->lock);
+ if (put_work && wq->put_work)
+ wq->put_work(old_work);
+ if (!work)
+ break;
+next:
+ /* flush any pending signals before assigning new work */
+ if (signal_pending(current))
+ flush_signals(current);
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = work;
+ spin_unlock_irq(&worker->lock);
+
+ if (work->flags & IO_WQ_WORK_CB)
+ work->func(&work);
+
+ if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
+ current->files != work->files) {
+ task_lock(current);
+ current->files = work->files;
+ task_unlock(current);
+ }
+ if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
+ wq->mm && mmget_not_zero(wq->mm)) {
+ use_mm(wq->mm);
+ set_fs(USER_DS);
+ worker->mm = wq->mm;
+ }
+ if (!worker->creds)
+ worker->creds = override_creds(wq->creds);
+ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
+ work->flags |= IO_WQ_WORK_CANCEL;
+ if (worker->mm)
+ work->flags |= IO_WQ_WORK_HAS_MM;
+
+ if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+ put_work = work;
+ wq->get_work(work);
+ }
+
+ old_work = work;
+ work->func(&work);
+
+ spin_lock_irq(&worker->lock);
+ worker->cur_work = NULL;
+ spin_unlock_irq(&worker->lock);
+
+ spin_lock_irq(&wqe->lock);
+
+ if (hash != -1U) {
+ wqe->hash_map &= ~BIT_ULL(hash);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ }
+ if (work && work != old_work) {
+ spin_unlock_irq(&wqe->lock);
+
+ if (put_work && wq->put_work) {
+ wq->put_work(put_work);
+ put_work = NULL;
+ }
+
+ /* dependent work not hashed */
+ hash = -1U;
+ goto next;
+ }
+ } while (1);
+}
+
+static int io_wqe_worker(void *data)
+{
+ struct io_worker *worker = data;
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+ DEFINE_WAIT(wait);
+
+ io_worker_start(wqe, worker);
+
+ while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
+
+ spin_lock_irq(&wqe->lock);
+ if (io_wqe_run_queue(wqe)) {
+ __set_current_state(TASK_RUNNING);
+ io_worker_handle_work(worker);
+ continue;
+ }
+ /* drops the lock on success, retry */
+ if (__io_worker_idle(wqe, worker)) {
+ __release(&wqe->lock);
+ continue;
+ }
+ spin_unlock_irq(&wqe->lock);
+ if (signal_pending(current))
+ flush_signals(current);
+ if (schedule_timeout(WORKER_IDLE_TIMEOUT))
+ continue;
+ /* timed out, exit unless we're the fixed worker */
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
+ !(worker->flags & IO_WORKER_F_FIXED))
+ break;
+ }
+
+ finish_wait(&worker->wait, &wait);
+
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ spin_lock_irq(&wqe->lock);
+ if (!wq_list_empty(&wqe->work_list))
+ io_worker_handle_work(worker);
+ else
+ spin_unlock_irq(&wqe->lock);
+ }
+
+ io_worker_exit(worker);
+ return 0;
+}
+
+/*
+ * Called when a worker is scheduled in. Mark us as currently running.
+ */
+void io_wq_worker_running(struct task_struct *tsk)
+{
+ struct io_worker *worker = kthread_data(tsk);
+ struct io_wqe *wqe = worker->wqe;
+
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+ if (worker->flags & IO_WORKER_F_RUNNING)
+ return;
+ worker->flags |= IO_WORKER_F_RUNNING;
+ io_wqe_inc_running(wqe, worker);
+}
+
+/*
+ * Called when worker is going to sleep. If there are no workers currently
+ * running and we have work pending, wake up a free one or have the manager
+ * set one up.
+ */
+void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+ struct io_worker *worker = kthread_data(tsk);
+ struct io_wqe *wqe = worker->wqe;
+
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+ if (!(worker->flags & IO_WORKER_F_RUNNING))
+ return;
+
+ worker->flags &= ~IO_WORKER_F_RUNNING;
+
+ spin_lock_irq(&wqe->lock);
+ io_wqe_dec_running(wqe, worker);
+ spin_unlock_irq(&wqe->lock);
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+{
+ struct io_wqe_acct *acct = &wqe->acct[index];
+ struct io_worker *worker;
+
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
+ if (!worker)
+ return false;
+
+ refcount_set(&worker->ref, 1);
+ worker->nulls_node.pprev = NULL;
+ init_waitqueue_head(&worker->wait);
+ worker->wqe = wqe;
+ spin_lock_init(&worker->lock);
+
+ worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
+ "io_wqe_worker-%d/%d", index, wqe->node);
+ if (IS_ERR(worker->task)) {
+ kfree(worker);
+ return false;
+ }
+
+ spin_lock_irq(&wqe->lock);
+ hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+ list_add_tail_rcu(&worker->all_list, &wqe->all_list);
+ worker->flags |= IO_WORKER_F_FREE;
+ if (index == IO_WQ_ACCT_BOUND)
+ worker->flags |= IO_WORKER_F_BOUND;
+ if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND))
+ worker->flags |= IO_WORKER_F_FIXED;
+ acct->nr_workers++;
+ spin_unlock_irq(&wqe->lock);
+
+ if (index == IO_WQ_ACCT_UNBOUND)
+ atomic_inc(&wq->user->processes);
+
+ wake_up_process(worker->task);
+ return true;
+}
+
+static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
+ __must_hold(wqe->lock)
+{
+ struct io_wqe_acct *acct = &wqe->acct[index];
+
+ /* if we have available workers or no work, no need */
+ if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
+ return false;
+ return acct->nr_workers < acct->max_workers;
+}
+
+/*
+ * Manager thread. Tasked with creating new workers, if we need them.
+ */
+static int io_wq_manager(void *data)
+{
+ struct io_wq *wq = data;
+ int workers_to_create = num_possible_nodes();
+ int node;
+
+ /* create fixed workers */
+ refcount_set(&wq->refs, workers_to_create);
+ for_each_node(node) {
+ if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+ goto err;
+ workers_to_create--;
+ }
+
+ complete(&wq->done);
+
+ while (!kthread_should_stop()) {
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+ bool fork_worker[2] = { false, false };
+
+ spin_lock_irq(&wqe->lock);
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND))
+ fork_worker[IO_WQ_ACCT_BOUND] = true;
+ if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND))
+ fork_worker[IO_WQ_ACCT_UNBOUND] = true;
+ spin_unlock_irq(&wqe->lock);
+ if (fork_worker[IO_WQ_ACCT_BOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND);
+ if (fork_worker[IO_WQ_ACCT_UNBOUND])
+ create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND);
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ }
+
+ return 0;
+err:
+ set_bit(IO_WQ_BIT_ERROR, &wq->state);
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (refcount_sub_and_test(workers_to_create, &wq->refs))
+ complete(&wq->done);
+ return 0;
+}
+
+static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
+ struct io_wq_work *work)
+{
+ bool free_worker;
+
+ if (!(work->flags & IO_WQ_WORK_UNBOUND))
+ return true;
+ if (atomic_read(&acct->nr_running))
+ return true;
+
+ rcu_read_lock();
+ free_worker = !hlist_nulls_empty(&wqe->free_list);
+ rcu_read_unlock();
+ if (free_worker)
+ return true;
+
+ if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers &&
+ !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN)))
+ return false;
+
+ return true;
+}
+
+static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
+{
+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ unsigned long flags;
+
+ /*
+ * Do an early check to see if we need a new unbound worker, and if we do,
+ * if we're allowed to do so. This isn't 100% accurate as there's a
+ * gap between this check and incrementing the value, but that's OK.
+ * It's close enough to not be an issue, fork() has the same delay.
+ */
+ if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return;
+ }
+
+ spin_lock_irqsave(&wqe->lock, flags);
+ wq_list_add_tail(&work->list, &wqe->work_list);
+ wqe->flags &= ~IO_WQE_FLAG_STALLED;
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (!atomic_read(&acct->nr_running))
+ io_wqe_wake_worker(wqe, acct);
+}
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
+{
+ struct io_wqe *wqe = wq->wqes[numa_node_id()];
+
+ io_wqe_enqueue(wqe, work);
+}
+
+/*
+ * Enqueue work, hashed by some key. Work items that hash to the same value
+ * will not be done in parallel. Used to limit concurrent writes, generally
+ * hashed by inode.
+ */
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
+{
+ struct io_wqe *wqe = wq->wqes[numa_node_id()];
+ unsigned bit;
+
+ bit = hash_ptr(val, IO_WQ_HASH_ORDER);
+ work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+ io_wqe_enqueue(wqe, work);
+}
+
+static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
+{
+ send_sig(SIGINT, worker->task, 1);
+ return false;
+}
+
+/*
+ * Iterate the passed in list and call the specific function for each
+ * worker that isn't exiting
+ */
+static bool io_wq_for_each_worker(struct io_wqe *wqe,
+ bool (*func)(struct io_worker *, void *),
+ void *data)
+{
+ struct io_worker *worker;
+ bool ret = false;
+
+ list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
+ if (io_worker_get(worker)) {
+ ret = func(worker, data);
+ io_worker_release(worker);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+void io_wq_cancel_all(struct io_wq *wq)
+{
+ int node;
+
+ set_bit(IO_WQ_BIT_CANCEL, &wq->state);
+
+ /*
+ * Browse both lists, as there's a gap between handing work off
+ * to a worker and the worker putting itself on the busy_list
+ */
+ rcu_read_lock();
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
+ }
+ rcu_read_unlock();
+}
+
+struct io_cb_cancel_data {
+ struct io_wqe *wqe;
+ work_cancel_fn *cancel;
+ void *caller_data;
+};
+
+static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
+{
+ struct io_cb_cancel_data *data = cancel_data;
+ unsigned long flags;
+ bool ret = false;
+
+ /*
+ * Hold the lock to avoid ->cur_work going out of scope, caller
+ * may dereference the passed in work.
+ */
+ spin_lock_irqsave(&worker->lock, flags);
+ if (worker->cur_work &&
+ data->cancel(worker->cur_work, data->caller_data)) {
+ send_sig(SIGINT, worker->task, 1);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+ return ret;
+}
+
+static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
+ work_cancel_fn *cancel,
+ void *cancel_data)
+{
+ struct io_cb_cancel_data data = {
+ .wqe = wqe,
+ .cancel = cancel,
+ .caller_data = cancel_data,
+ };
+ struct io_wq_work_node *node, *prev;
+ struct io_wq_work *work;
+ unsigned long flags;
+ bool found = false;
+
+ spin_lock_irqsave(&wqe->lock, flags);
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
+ if (cancel(work, cancel_data)) {
+ wq_node_del(&wqe->work_list, node, prev);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (found) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return IO_WQ_CANCEL_OK;
+ }
+
+ rcu_read_lock();
+ found = io_wq_for_each_worker(wqe, io_work_cancel, &data);
+ rcu_read_unlock();
+ return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data)
+{
+ enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+ int node;
+
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ ret = io_wqe_cancel_cb_work(wqe, cancel, data);
+ if (ret != IO_WQ_CANCEL_NOTFOUND)
+ break;
+ }
+
+ return ret;
+}
+
+static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
+{
+ struct io_wq_work *work = data;
+ unsigned long flags;
+ bool ret = false;
+
+ if (worker->cur_work != work)
+ return false;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ if (worker->cur_work == work) {
+ send_sig(SIGINT, worker->task, 1);
+ ret = true;
+ }
+ spin_unlock_irqrestore(&worker->lock, flags);
+
+ return ret;
+}
+
+static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
+ struct io_wq_work *cwork)
+{
+ struct io_wq_work_node *node, *prev;
+ struct io_wq_work *work;
+ unsigned long flags;
+ bool found = false;
+
+ cwork->flags |= IO_WQ_WORK_CANCEL;
+
+ /*
+ * First check pending list, if we're lucky we can just remove it
+ * from there. CANCEL_OK means that the work is returned as-new,
+ * no completion will be posted for it.
+ */
+ spin_lock_irqsave(&wqe->lock, flags);
+ wq_list_for_each(node, prev, &wqe->work_list) {
+ work = container_of(node, struct io_wq_work, list);
+
+ if (work == cwork) {
+ wq_node_del(&wqe->work_list, node, prev);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&wqe->lock, flags);
+
+ if (found) {
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ return IO_WQ_CANCEL_OK;
+ }
+
+ /*
+ * Now check if a free (going busy) or busy worker has the work
+ * currently running. If we find it there, we'll return CANCEL_RUNNING
+ * as an indication that we attempted to signal cancellation. The
+ * completion will run normally in this case.
+ */
+ rcu_read_lock();
+ found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork);
+ rcu_read_unlock();
+ return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND;
+}
+
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
+{
+ enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
+ int node;
+
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ ret = io_wqe_cancel_work(wqe, cwork);
+ if (ret != IO_WQ_CANCEL_NOTFOUND)
+ break;
+ }
+
+ return ret;
+}
+
+struct io_wq_flush_data {
+ struct io_wq_work work;
+ struct completion done;
+};
+
+static void io_wq_flush_func(struct io_wq_work **workptr)
+{
+ struct io_wq_work *work = *workptr;
+ struct io_wq_flush_data *data;
+
+ data = container_of(work, struct io_wq_flush_data, work);
+ complete(&data->done);
+}
+
+/*
+ * Doesn't wait for previously queued work to finish. When this completes,
+ * it just means that previously queued work was started.
+ */
+void io_wq_flush(struct io_wq *wq)
+{
+ struct io_wq_flush_data data;
+ int node;
+
+ for_each_node(node) {
+ struct io_wqe *wqe = wq->wqes[node];
+
+ init_completion(&data.done);
+ INIT_IO_WORK(&data.work, io_wq_flush_func);
+ data.work.flags |= IO_WQ_WORK_INTERNAL;
+ io_wqe_enqueue(wqe, &data.work);
+ wait_for_completion(&data.done);
+ }
+}
+
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
+{
+ int ret = -ENOMEM, node;
+ struct io_wq *wq;
+
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+ if (!wq)
+ return ERR_PTR(-ENOMEM);
+
+ wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
+ if (!wq->wqes) {
+ kfree(wq);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ wq->get_work = data->get_work;
+ wq->put_work = data->put_work;
+
+ /* caller must already hold a reference to this */
+ wq->user = data->user;
+ wq->creds = data->creds;
+
+ for_each_node(node) {
+ struct io_wqe *wqe;
+
+ wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
+ if (!wqe)
+ goto err;
+ wq->wqes[node] = wqe;
+ wqe->node = node;
+ wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
+ atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
+ if (wq->user) {
+ wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+ task_rlimit(current, RLIMIT_NPROC);
+ }
+ atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+ wqe->node = node;
+ wqe->wq = wq;
+ spin_lock_init(&wqe->lock);
+ INIT_WQ_LIST(&wqe->work_list);
+ INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+ INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
+ INIT_LIST_HEAD(&wqe->all_list);
+ }
+
+ init_completion(&wq->done);
+
+ /* caller must have already done mmgrab() on this mm */
+ wq->mm = data->mm;
+
+ wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
+ if (!IS_ERR(wq->manager)) {
+ wake_up_process(wq->manager);
+ wait_for_completion(&wq->done);
+ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ reinit_completion(&wq->done);
+ return wq;
+ }
+
+ ret = PTR_ERR(wq->manager);
+ complete(&wq->done);
+err:
+ for_each_node(node)
+ kfree(wq->wqes[node]);
+ kfree(wq->wqes);
+ kfree(wq);
+ return ERR_PTR(ret);
+}
+
+static bool io_wq_worker_wake(struct io_worker *worker, void *data)
+{
+ wake_up_process(worker->task);
+ return false;
+}
+
+void io_wq_destroy(struct io_wq *wq)
+{
+ int node;
+
+ set_bit(IO_WQ_BIT_EXIT, &wq->state);
+ if (wq->manager)
+ kthread_stop(wq->manager);
+
+ rcu_read_lock();
+ for_each_node(node)
+ io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
+ rcu_read_unlock();
+
+ wait_for_completion(&wq->done);
+
+ for_each_node(node)
+ kfree(wq->wqes[node]);
+ kfree(wq->wqes);
+ kfree(wq);
+}
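A work item is an io_wq_work embedded in the caller's own structure; its func is invoked with a pointer to the work pointer, so the handler can substitute a dependent item before returning and io_worker_handle_work() will run it next without requeueing. A minimal caller-side sketch (the struct and function names here are hypothetical):

struct my_work {
	struct io_wq_work wq_work;	/* must be embedded, not pointed to */
	int payload;
};

static void my_work_fn(struct io_wq_work **workptr)
{
	struct my_work *mw = container_of(*workptr, struct my_work, wq_work);

	if ((*workptr)->flags & IO_WQ_WORK_CANCEL) {
		/* io_wq_cancel_*() or wq teardown marked us cancelled */
		return;
	}
	pr_info("io-wq handled payload %d\n", mw->payload);
	/* leaving *workptr untouched means no dependent work is chained */
}

static void my_submit(struct io_wq *wq, struct my_work *mw)
{
	INIT_IO_WORK(&mw->wq_work, my_work_fn);
	io_wq_enqueue(wq, &mw->wq_work);
	/* io_wq_enqueue_hashed(wq, &mw->wq_work, inode) would instead
	 * serialize against all other items hashed to the same key */
}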
diff --git a/fs/io-wq.h b/fs/io-wq.h
new file mode 100644
index 000000000000..600e0158cba7
--- /dev/null
+++ b/fs/io-wq.h
@@ -0,0 +1,127 @@
+#ifndef INTERNAL_IO_WQ_H
+#define INTERNAL_IO_WQ_H
+
+struct io_wq;
+
+enum {
+ IO_WQ_WORK_CANCEL = 1,
+ IO_WQ_WORK_HAS_MM = 2,
+ IO_WQ_WORK_HASHED = 4,
+ IO_WQ_WORK_NEEDS_USER = 8,
+ IO_WQ_WORK_NEEDS_FILES = 16,
+ IO_WQ_WORK_UNBOUND = 32,
+ IO_WQ_WORK_INTERNAL = 64,
+ IO_WQ_WORK_CB = 128,
+
+ IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
+};
+
+enum io_wq_cancel {
+ IO_WQ_CANCEL_OK, /* cancelled before started */
+ IO_WQ_CANCEL_RUNNING, /* found, running, and cancellation attempted */
+ IO_WQ_CANCEL_NOTFOUND, /* work not found */
+};
+
+struct io_wq_work_node {
+ struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+ struct io_wq_work_node *first;
+ struct io_wq_work_node *last;
+};
+
+static inline void wq_list_add_tail(struct io_wq_work_node *node,
+ struct io_wq_work_list *list)
+{
+ if (!list->first) {
+ list->first = list->last = node;
+ } else {
+ list->last->next = node;
+ list->last = node;
+ }
+}
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+ struct io_wq_work_node *node,
+ struct io_wq_work_node *prev)
+{
+ if (node == list->first)
+ list->first = node->next;
+ if (node == list->last)
+ list->last = prev;
+ if (prev)
+ prev->next = node->next;
+}
+
+#define wq_list_for_each(pos, prv, head) \
+ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list) ((list)->first == NULL)
+#define INIT_WQ_LIST(list) do { \
+ (list)->first = NULL; \
+ (list)->last = NULL; \
+} while (0)
+
+struct io_wq_work {
+ union {
+ struct io_wq_work_node list;
+ void *data;
+ };
+ void (*func)(struct io_wq_work **);
+ struct files_struct *files;
+ unsigned flags;
+};
+
+#define INIT_IO_WORK(work, _func) \
+ do { \
+ (work)->list.next = NULL; \
+ (work)->func = _func; \
+ (work)->flags = 0; \
+ (work)->files = NULL; \
+ } while (0)
+
+typedef void (get_work_fn)(struct io_wq_work *);
+typedef void (put_work_fn)(struct io_wq_work *);
+
+struct io_wq_data {
+ struct mm_struct *mm;
+ struct user_struct *user;
+ struct cred *creds;
+
+ get_work_fn *get_work;
+ put_work_fn *put_work;
+};
+
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
+void io_wq_destroy(struct io_wq *wq);
+
+void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
+void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
+void io_wq_flush(struct io_wq *wq);
+
+void io_wq_cancel_all(struct io_wq *wq);
+enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
+
+typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
+
+enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
+ void *data);
+
+#if defined(CONFIG_IO_WQ)
+extern void io_wq_worker_sleeping(struct task_struct *);
+extern void io_wq_worker_running(struct task_struct *);
+#else
+static inline void io_wq_worker_sleeping(struct task_struct *tsk)
+{
+}
+static inline void io_wq_worker_running(struct task_struct *tsk)
+{
+}
+#endif
+
+static inline bool io_wq_current_is_worker(void)
+{
+ return in_task() && (current->flags & PF_IO_WORKER);
+}
+#endif
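Later in this patch, io_uring sets the pool up along these lines (condensed from the fs/io_uring.c changes below; the get_work/put_work hook names are illustrative): the caller hands over its mm, user and creds in io_wq_data, sizes the bounded side from its own concurrency limit, and tears the pool down when the context goes away.

	struct io_wq_data data = {
		.mm		= ctx->sqo_mm,	/* caller already did mmgrab() */
		.user		= ctx->user,	/* caller already holds a ref  */
		.creds		= ctx->creds,
		.get_work	= io_get_work,	/* optional per-item ref hooks */
		.put_work	= io_put_work,
	};

	/* the unbounded (may-block) side is capped from RLIMIT_NPROC inside
	 * io_wq_create() whenever a user is supplied */
	ctx->io_wq = io_wq_create(concurrency, &data);
	if (IS_ERR(ctx->io_wq))
		return PTR_ERR(ctx->io_wq);

	/* per request */
	INIT_IO_WORK(&req->work, io_wq_submit_work);
	io_wq_enqueue(ctx->io_wq, &req->work);

	/* teardown */
	io_wq_cancel_all(ctx->io_wq);
	io_wq_destroy(ctx->io_wq);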
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f9a38998f2fc..ec53aa7cdc94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -56,7 +56,6 @@
#include <linux/mmu_context.h>
#include <linux/percpu.h>
#include <linux/slab.h>
-#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
@@ -70,13 +69,26 @@
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
+#include <linux/highmem.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "internal.h"
+#include "io-wq.h"
#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_FIXED_FILES 1024
+#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
+
+/*
+ * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
+ */
+#define IORING_FILE_TABLE_SHIFT 9
+#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
+#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
+#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
@@ -161,14 +173,8 @@ struct io_mapped_ubuf {
unsigned int nr_bvecs;
};
-struct async_list {
- spinlock_t lock;
- atomic_t cnt;
- struct list_head list;
-
- struct file *file;
- off_t io_start;
- size_t io_len;
+struct fixed_file_table {
+ struct file **files;
};
struct io_ring_ctx {
@@ -180,6 +186,8 @@ struct io_ring_ctx {
unsigned int flags;
bool compat;
bool account_mem;
+ bool cq_overflow_flushed;
+ bool drain_next;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
@@ -198,38 +206,30 @@ struct io_ring_ctx {
unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
+ atomic_t cached_cq_overflow;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
struct list_head timeout_list;
+ struct list_head cq_overflow_list;
+
+ wait_queue_head_t inflight_wait;
} ____cacheline_aligned_in_smp;
+ struct io_rings *rings;
+
/* IO offload */
- struct workqueue_struct *sqo_wq[2];
+ struct io_wq *io_wq;
struct task_struct *sqo_thread; /* if using sq thread polling */
struct mm_struct *sqo_mm;
wait_queue_head_t sqo_wait;
- struct completion sqo_thread_started;
-
- struct {
- unsigned cached_cq_tail;
- atomic_t cached_cq_overflow;
- unsigned cq_entries;
- unsigned cq_mask;
- struct wait_queue_head cq_wait;
- struct fasync_struct *cq_fasync;
- struct eventfd_ctx *cq_ev_fd;
- atomic_t cq_timeouts;
- } ____cacheline_aligned_in_smp;
-
- struct io_rings *rings;
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
- struct file **user_files;
+ struct fixed_file_table *file_table;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
@@ -238,7 +238,27 @@ struct io_ring_ctx {
struct user_struct *user;
- struct completion ctx_done;
+ struct cred *creds;
+
+ /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
+ struct completion *completions;
+
+ /* if all else fails... */
+ struct io_kiocb *fallback_req;
+
+#if defined(CONFIG_UNIX)
+ struct socket *ring_sock;
+#endif
+
+ struct {
+ unsigned cached_cq_tail;
+ unsigned cq_entries;
+ unsigned cq_mask;
+ atomic_t cq_timeouts;
+ struct wait_queue_head cq_wait;
+ struct fasync_struct *cq_fasync;
+ struct eventfd_ctx *cq_ev_fd;
+ } ____cacheline_aligned_in_smp;
struct {
struct mutex uring_lock;
@@ -255,23 +275,11 @@ struct io_ring_ctx {
* manipulate the list, hence no extra locking is needed there.
*/
struct list_head poll_list;
- struct list_head cancel_list;
- } ____cacheline_aligned_in_smp;
-
- struct async_list pending_async[2];
-
-#if defined(CONFIG_UNIX)
- struct socket *ring_sock;
-#endif
-};
+ struct rb_root cancel_tree;
-struct sqe_submit {
- const struct io_uring_sqe *sqe;
- unsigned short index;
- u32 sequence;
- bool has_user;
- bool needs_lock;
- bool needs_fixed_file;
+ spinlock_t inflight_lock;
+ struct list_head inflight_list;
+ } ____cacheline_aligned_in_smp;
};
/*
@@ -284,12 +292,20 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry wait;
+ struct wait_queue_entry *wait;
+};
+
+struct io_timeout_data {
+ struct io_kiocb *req;
+ struct hrtimer timer;
+ struct timespec64 ts;
+ enum hrtimer_mode mode;
+ u32 seq_offset;
};
struct io_timeout {
struct file *file;
- struct hrtimer timer;
+ struct io_timeout_data *data;
};
/*
@@ -306,31 +322,45 @@ struct io_kiocb {
struct io_timeout timeout;
};
- struct sqe_submit submit;
+ const struct io_uring_sqe *sqe;
+ struct file *ring_file;
+ int ring_fd;
+ bool has_user;
+ bool in_async;
+ bool needs_fixed_file;
struct io_ring_ctx *ctx;
- struct list_head list;
+ union {
+ struct list_head list;
+ struct rb_node rb_node;
+ };
struct list_head link_list;
unsigned int flags;
refcount_t refs;
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
-#define REQ_F_SEQ_PREV 8 /* sequential with previous */
+#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
-#define REQ_F_LINK_DONE 128 /* linked sqes done */
+#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
-#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */
+#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
+#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
+#define REQ_F_INFLIGHT 16384 /* on inflight list */
+#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
- struct work_struct work;
+ struct list_head inflight_entry;
+
+ struct io_wq_work work;
};
#define IO_PLUG_THRESHOLD 2
@@ -356,10 +386,14 @@ struct io_submit_state {
unsigned int ios_left;
};
-static void io_sq_wq_submit_work(struct work_struct *work);
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
- long res);
+static void io_wq_submit_work(struct io_wq_work **workptr);
+static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
+static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
@@ -382,57 +416,67 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
- complete(&ctx->ctx_done);
+ complete(&ctx->completions[0]);
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
+ if (!ctx->fallback_req)
+ goto err;
+
+ ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL);
+ if (!ctx->completions)
+ goto err;
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
- PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
- kfree(ctx);
- return NULL;
- }
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
+ goto err;
ctx->flags = p->flags;
init_waitqueue_head(&ctx->cq_wait);
- init_completion(&ctx->ctx_done);
- init_completion(&ctx->sqo_thread_started);
+ INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ init_completion(&ctx->completions[0]);
+ init_completion(&ctx->completions[1]);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
- for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
- spin_lock_init(&ctx->pending_async[i].lock);
- INIT_LIST_HEAD(&ctx->pending_async[i].list);
- atomic_set(&ctx->pending_async[i].cnt, 0);
- }
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list);
- INIT_LIST_HEAD(&ctx->cancel_list);
+ ctx->cancel_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
+ init_waitqueue_head(&ctx->inflight_wait);
+ spin_lock_init(&ctx->inflight_lock);
+ INIT_LIST_HEAD(&ctx->inflight_list);
return ctx;
+err:
+ if (ctx->fallback_req)
+ kmem_cache_free(req_cachep, ctx->fallback_req);
+ kfree(ctx->completions);
+ kfree(ctx);
+ return NULL;
}
-static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static inline bool __req_need_defer(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
+ atomic_read(&ctx->cached_cq_overflow);
}
-static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static inline bool req_need_defer(struct io_kiocb *req)
{
- if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
- return false;
+ if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+ return __req_need_defer(req);
- return __io_sequence_defer(ctx, req);
+ return false;
}
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
@@ -440,7 +484,7 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
struct io_kiocb *req;
req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list);
- if (req && !io_sequence_defer(ctx, req)) {
+ if (req && !req_need_defer(req)) {
list_del_init(&req->list);
return req;
}
@@ -453,9 +497,13 @@ static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
struct io_kiocb *req;
req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
- if (req && !__io_sequence_defer(ctx, req)) {
- list_del_init(&req->list);
- return req;
+ if (req) {
+ if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ return NULL;
+ if (!__req_need_defer(req)) {
+ list_del_init(&req->list);
+ return req;
+ }
}
return NULL;
@@ -476,33 +524,80 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
}
}
-static inline void io_queue_async_work(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
+static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
{
- int rw = 0;
+ u8 opcode = READ_ONCE(sqe->opcode);
+
+ return !(opcode == IORING_OP_READ_FIXED ||
+ opcode == IORING_OP_WRITE_FIXED);
+}
- if (req->submit.sqe) {
- switch (req->submit.sqe->opcode) {
+static inline bool io_prep_async_work(struct io_kiocb *req,
+ struct io_kiocb **link)
+{
+ bool do_hashed = false;
+
+ if (req->sqe) {
+ switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- rw = !(req->rw.ki_flags & IOCB_DIRECT);
+ do_hashed = true;
+ /* fall-through */
+ case IORING_OP_READV:
+ case IORING_OP_READ_FIXED:
+ case IORING_OP_SENDMSG:
+ case IORING_OP_RECVMSG:
+ case IORING_OP_ACCEPT:
+ case IORING_OP_POLL_ADD:
+ case IORING_OP_CONNECT:
+ /*
+ * We know REQ_F_ISREG is not set on some of these
+ * opcodes, but this enables us to keep the check in
+ * just one place.
+ */
+ if (!(req->flags & REQ_F_ISREG))
+ req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
+ if (io_sqe_needs_user(req->sqe))
+ req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+ }
+
+ *link = io_prep_linked_timeout(req);
+ return do_hashed;
+}
+
+static inline void io_queue_async_work(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *link;
+ bool do_hashed;
+
+ do_hashed = io_prep_async_work(req, &link);
+
+ trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
+ req->flags);
+ if (!do_hashed) {
+ io_wq_enqueue(ctx->io_wq, &req->work);
+ } else {
+ io_wq_enqueue_hashed(ctx->io_wq, &req->work,
+ file_inode(req->file));
}
- queue_work(ctx->sqo_wq[rw], &req->work);
+ if (link)
+ io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
int ret;
- ret = hrtimer_try_to_cancel(&req->timeout.timer);
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
- list_del(&req->list);
- io_cqring_fill_event(req->ctx, req->user_data, 0);
- __io_free_req(req);
+ list_del_init(&req->list);
+ io_cqring_fill_event(req, 0);
+ io_put_req(req);
}
}
@@ -526,13 +621,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
- if (req->flags & REQ_F_SHADOW_DRAIN) {
- /* Just for drain, free it. */
- __io_free_req(req);
- continue;
- }
req->flags |= REQ_F_IO_DRAINED;
- io_queue_async_work(ctx, req);
+ io_queue_async_work(req);
}
}
@@ -554,10 +644,77 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return &rings->cqes[tail & ctx->cq_mask];
}
-static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
- long res)
+static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
+ if (ctx->cq_ev_fd)
+ eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+/* Returns true if there are no backlogged entries after the flush */
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+{
+ struct io_rings *rings = ctx->rings;
struct io_uring_cqe *cqe;
+ struct io_kiocb *req;
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ if (!force) {
+ if (list_empty_careful(&ctx->cq_overflow_list))
+ return true;
+ if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
+ rings->cq_ring_entries))
+ return false;
+ }
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+
+ /* if force is set, the ring is going away. always drop after that */
+ if (force)
+ ctx->cq_overflow_flushed = true;
+
+ cqe = NULL;
+ while (!list_empty(&ctx->cq_overflow_list)) {
+ cqe = io_get_cqring(ctx);
+ if (!cqe && !force)
+ break;
+
+ req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
+ list);
+ list_move(&req->list, &list);
+ if (cqe) {
+ WRITE_ONCE(cqe->user_data, req->user_data);
+ WRITE_ONCE(cqe->res, req->result);
+ WRITE_ONCE(cqe->flags, 0);
+ } else {
+ WRITE_ONCE(ctx->rings->cq_overflow,
+ atomic_inc_return(&ctx->cached_cq_overflow));
+ }
+ }
+
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
+
+ while (!list_empty(&list)) {
+ req = list_first_entry(&list, struct io_kiocb, list);
+ list_del(&req->list);
+ io_put_req(req);
+ }
+
+ return cqe != NULL;
+}
+
+static void io_cqring_fill_event(struct io_kiocb *req, long res)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_uring_cqe *cqe;
+
+ trace_io_uring_complete(ctx, req->user_data, res);
/*
* If we can't get a cq entry, userspace overflowed the
@@ -565,39 +722,50 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
* the ring.
*/
cqe = io_get_cqring(ctx);
- if (cqe) {
- WRITE_ONCE(cqe->user_data, ki_user_data);
+ if (likely(cqe)) {
+ WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, 0);
- } else {
+ } else if (ctx->cq_overflow_flushed) {
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
+ } else {
+ refcount_inc(&req->refs);
+ req->result = res;
+ list_add_tail(&req->list, &ctx->cq_overflow_list);
}
}
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
-{
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
- if (waitqueue_active(&ctx->sqo_wait))
- wake_up(&ctx->sqo_wait);
- if (ctx->cq_ev_fd)
- eventfd_signal(ctx->cq_ev_fd, 1);
-}
-
-static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
- long res)
+static void io_cqring_add_event(struct io_kiocb *req, long res)
{
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags);
- io_cqring_fill_event(ctx, user_data, res);
+ io_cqring_fill_event(req, res);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
}
+static inline bool io_is_fallback_req(struct io_kiocb *req)
+{
+ return req == (struct io_kiocb *)
+ ((unsigned long) req->ctx->fallback_req & ~1UL);
+}
+
+static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req;
+
+ req = ctx->fallback_req;
+ if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req))
+ return req;
+
+ return NULL;
+}
+
static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
@@ -610,7 +778,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
if (!state) {
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
- goto out;
+ goto fallback;
} else if (!state->free_reqs) {
size_t sz;
int ret;
@@ -625,7 +793,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
if (unlikely(ret <= 0)) {
state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
if (!state->reqs[0])
- goto out;
+ goto fallback;
ret = 1;
}
state->free_reqs = ret - 1;
@@ -637,14 +805,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
state->cur_req++;
}
+got_it:
+ req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
/* one is dropped after submission, the other at completion */
refcount_set(&req->refs, 2);
req->result = 0;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
-out:
+fallback:
+ req = io_get_fallback_req(ctx);
+ if (req)
+ goto got_it;
percpu_ref_put(&ctx->refs);
return NULL;
}
@@ -660,15 +834,56 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
static void __io_free_req(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (req->flags & REQ_F_FREE_SQE)
+ kfree(req->sqe);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
- percpu_ref_put(&req->ctx->refs);
- kmem_cache_free(req_cachep, req);
+ if (req->flags & REQ_F_INFLIGHT) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->inflight_lock, flags);
+ list_del(&req->inflight_entry);
+ if (waitqueue_active(&ctx->inflight_wait))
+ wake_up(&ctx->inflight_wait);
+ spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+ }
+ if (req->flags & REQ_F_TIMEOUT)
+ kfree(req->timeout.data);
+ percpu_ref_put(&ctx->refs);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
}
-static void io_req_link_next(struct io_kiocb *req)
+static bool io_link_cancel_timeout(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ if (ret != -1) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
+ req->flags &= ~REQ_F_LINK;
+ io_put_req(req);
+ return true;
+ }
+
+ return false;
+}
+
+static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
+ bool wake_ev = false;
+
+ /* Already got next link */
+ if (req->flags & REQ_F_LINK_NEXT)
+ return;
/*
* The list should never be empty when we are called here. But could
@@ -676,18 +891,30 @@ static void io_req_link_next(struct io_kiocb *req)
* safe side.
*/
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
- if (nxt) {
- list_del(&nxt->list);
+ while (nxt) {
+ list_del_init(&nxt->list);
+
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ (nxt->flags & REQ_F_TIMEOUT)) {
+ wake_ev |= io_link_cancel_timeout(nxt);
+ nxt = list_first_entry_or_null(&req->link_list,
+ struct io_kiocb, list);
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
+ continue;
+ }
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
- nxt->flags |= REQ_F_LINK_DONE;
- INIT_WORK(&nxt->work, io_sq_wq_submit_work);
- io_queue_async_work(req->ctx, nxt);
+ *nxtptr = nxt;
+ break;
}
+
+ req->flags |= REQ_F_LINK_NEXT;
+ if (wake_ev)
+ io_cqring_ev_posted(ctx);
}
/*
@@ -695,33 +922,86 @@ static void io_req_link_next(struct io_kiocb *req)
*/
static void io_fail_links(struct io_kiocb *req)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
link = list_first_entry(&req->link_list, struct io_kiocb, list);
- list_del(&link->list);
+ list_del_init(&link->list);
- io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
- __io_free_req(link);
+ trace_io_uring_fail_link(req, link);
+
+ if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+ link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+ io_link_cancel_timeout(link);
+ } else {
+ io_cqring_fill_event(link, -ECANCELED);
+ __io_double_put_req(link);
+ }
+ req->flags &= ~REQ_F_LINK_TIMEOUT;
}
+
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
}
-static void io_free_req(struct io_kiocb *req)
+static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
+ if (likely(!(req->flags & REQ_F_LINK)))
+ return;
+
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
- if (req->flags & REQ_F_LINK) {
- if (req->flags & REQ_F_FAIL_LINK)
- io_fail_links(req);
- else
- io_req_link_next(req);
+ if (req->flags & REQ_F_FAIL_LINK) {
+ io_fail_links(req);
+ } else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
+ REQ_F_LINK_TIMEOUT) {
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long flags;
+
+ /*
+ * If this is a timeout link, we could be racing with the
+ * timeout timer. Grab the completion lock for this case to
+ * protect against that.
+ */
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ io_req_link_next(req, nxt);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ io_req_link_next(req, nxt);
}
+}
+static void io_free_req(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt = NULL;
+
+ io_req_find_next(req, &nxt);
__io_free_req(req);
+
+ if (nxt)
+ io_queue_async_work(nxt);
+}
+
+/*
+ * Drop reference to request, return next in chain (if there is one) if this
+ * was the last reference to this request.
+ */
+__attribute__((nonnull))
+static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+{
+ io_req_find_next(req, nxtptr);
+
+ if (refcount_dec_and_test(&req->refs))
+ __io_free_req(req);
}
static void io_put_req(struct io_kiocb *req)
@@ -730,8 +1010,38 @@ static void io_put_req(struct io_kiocb *req)
io_free_req(req);
}
-static unsigned io_cqring_events(struct io_rings *rings)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
+{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ __io_free_req(req);
+}
+
+static void io_double_put_req(struct io_kiocb *req)
{
+ /* drop both submit and complete references */
+ if (refcount_sub_and_test(2, &req->refs))
+ io_free_req(req);
+}
+
+static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
+{
+ struct io_rings *rings = ctx->rings;
+
+ /*
+ * noflush == true is from the waitqueue handler, just ensure we wake
+ * up the task, and the next invocation will flush the entries. We
+ * cannot safely do it from here.
+ */
+ if (noflush && !list_empty(&ctx->cq_overflow_list))
+ return -1U;
+
+ io_cqring_overflow_flush(ctx, false);
+
/* See comment at the top of this file */
smp_rmb();
return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
@@ -760,7 +1070,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
- io_cqring_fill_event(ctx, req->user_data, req->result);
+ io_cqring_fill_event(req, req->result);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs)) {
@@ -769,8 +1079,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
- REQ_F_FIXED_FILE) {
+ if (((req->flags &
+ (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
+ REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
@@ -887,7 +1198,7 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
- if (io_cqring_events(ctx->rings))
+ if (io_cqring_events(ctx, false))
break;
/*
@@ -947,7 +1258,7 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
-static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
@@ -956,10 +1267,28 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
if ((req->flags & REQ_F_LINK) && res != req->result)
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, req->user_data, res);
+ io_cqring_add_event(req, res);
+}
+
+static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+
+ io_complete_rw_common(kiocb, res);
io_put_req(req);
}
+static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res)
+{
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
+ struct io_kiocb *nxt = NULL;
+
+ io_complete_rw_common(kiocb, res);
+ io_put_req_find_next(req, &nxt);
+
+ return nxt;
+}
+
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
@@ -1067,10 +1396,9 @@ static bool io_file_supports_async(struct file *file)
return false;
}
-static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
- bool force_nonblock)
+static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
struct io_ring_ctx *ctx = req->ctx;
struct kiocb *kiocb = &req->rw;
unsigned ioprio;
@@ -1154,9 +1482,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
-static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
- const struct io_uring_sqe *sqe,
- struct iov_iter *iter)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
+ bool in_async)
+{
+ if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
+ *nxt = __io_complete_rw(kiocb, ret);
+ else
+ io_rw_done(kiocb, ret);
+}
+
+static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
+ const struct io_uring_sqe *sqe,
+ struct iov_iter *iter)
{
size_t len = READ_ONCE(sqe->len);
struct io_mapped_ubuf *imu;
@@ -1225,14 +1562,13 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
}
}
- return 0;
+ return len;
}
-static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
- const struct sqe_submit *s, struct iovec **iovec,
- struct iov_iter *iter)
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+ struct iovec **iovec, struct iov_iter *iter)
{
- const struct io_uring_sqe *sqe = s->sqe;
+ const struct io_uring_sqe *sqe = req->sqe;
void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
size_t sqe_len = READ_ONCE(sqe->len);
u8 opcode;
@@ -1246,18 +1582,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
* flag.
*/
opcode = READ_ONCE(sqe->opcode);
- if (opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED) {
- ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+ if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
*iovec = NULL;
- return ret;
+ return io_import_fixed(req->ctx, rw, sqe, iter);
}
- if (!s->has_user)
+ if (!req->has_user)
return -EFAULT;
#ifdef CONFIG_COMPAT
- if (ctx->compat)
+ if (req->ctx->compat)
return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
iovec, iter);
#endif
@@ -1265,65 +1599,6 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
}
-static inline bool io_should_merge(struct async_list *al, struct kiocb *kiocb)
-{
- if (al->file == kiocb->ki_filp) {
- off_t start, end;
-
- /*
- * Allow merging if we're anywhere in the range of the same
- * page. Generally this happens for sub-page reads or writes,
- * and it's beneficial to allow the first worker to bring the
- * page in and the piggy backed work can then work on the
- * cached page.
- */
- start = al->io_start & PAGE_MASK;
- end = (al->io_start + al->io_len + PAGE_SIZE - 1) & PAGE_MASK;
- if (kiocb->ki_pos >= start && kiocb->ki_pos <= end)
- return true;
- }
-
- al->file = NULL;
- return false;
-}
-
-/*
- * Make a note of the last file/offset/direction we punted to async
- * context. We'll use this information to see if we can piggy back a
- * sequential request onto the previous one, if it's still hasn't been
- * completed by the async worker.
- */
-static void io_async_list_note(int rw, struct io_kiocb *req, size_t len)
-{
- struct async_list *async_list = &req->ctx->pending_async[rw];
- struct kiocb *kiocb = &req->rw;
- struct file *filp = kiocb->ki_filp;
-
- if (io_should_merge(async_list, kiocb)) {
- unsigned long max_bytes;
-
- /* Use 8x RA size as a decent limiter for both reads/writes */
- max_bytes = filp->f_ra.ra_pages << (PAGE_SHIFT + 3);
- if (!max_bytes)
- max_bytes = VM_READAHEAD_PAGES << (PAGE_SHIFT + 3);
-
- /* If max len are exceeded, reset the state */
- if (async_list->io_len + len <= max_bytes) {
- req->flags |= REQ_F_SEQ_PREV;
- async_list->io_len += len;
- } else {
- async_list->file = NULL;
- }
- }
-
- /* New file? Reset state. */
- if (async_list->file != filp) {
- async_list->io_start = kiocb->ki_pos;
- async_list->io_len = len;
- async_list->file = filp;
- }
-}
-
/*
* For files that don't have ->read_iter() and ->write_iter(), handle them
* by looping over ->read() or ->write() manually.
@@ -1344,9 +1619,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return -EAGAIN;
while (iov_iter_count(iter)) {
- struct iovec iovec = iov_iter_iovec(iter);
+ struct iovec iovec;
ssize_t nr;
+ if (!iov_iter_is_bvec(iter)) {
+ iovec = iov_iter_iovec(iter);
+ } else {
+ /* fixed buffers import bvec */
+ iovec.iov_base = kmap(iter->bvec->bv_page)
+ + iter->iov_offset;
+ iovec.iov_len = min(iter->count,
+ iter->bvec->bv_len - iter->iov_offset);
+ }
+
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, &kiocb->ki_pos);
@@ -1355,6 +1640,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
iovec.iov_len, &kiocb->ki_pos);
}
+ if (iov_iter_is_bvec(iter))
+ kunmap(iter->bvec->bv_page);
+
if (nr < 0) {
if (!ret)
ret = nr;
@@ -1369,7 +1657,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
-static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -1379,7 +1667,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
ssize_t read_size, ret;
- ret = io_prep_rw(req, s, force_nonblock);
+ ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
file = kiocb->ki_filp;
@@ -1387,7 +1675,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_READ)))
return -EBADF;
- ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
+ ret = io_import_iovec(READ, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1418,23 +1706,16 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
ret2 > 0 && ret2 < read_size)
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || ret2 != -EAGAIN) {
- io_rw_done(kiocb, ret2);
- } else {
- /*
- * If ->needs_lock is true, we're already in async
- * context.
- */
- if (!s->needs_lock)
- io_async_list_note(READ, req, iov_count);
+ if (!force_nonblock || ret2 != -EAGAIN)
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
+ else
ret = -EAGAIN;
- }
}
kfree(iovec);
return ret;
}
-static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
+static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -1444,7 +1725,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
size_t iov_count;
ssize_t ret;
- ret = io_prep_rw(req, s, force_nonblock);
+ ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
@@ -1452,7 +1733,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
if (unlikely(!(file->f_mode & FMODE_WRITE)))
return -EBADF;
- ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
+ ret = io_import_iovec(WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
@@ -1462,12 +1743,8 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
iov_count = iov_iter_count(&iter);
ret = -EAGAIN;
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) {
- /* If ->needs_lock is true, we're already in async context. */
- if (!s->needs_lock)
- io_async_list_note(WRITE, req, iov_count);
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
goto out_free;
- }
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
@@ -1492,17 +1769,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
ret2 = call_write_iter(file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
- if (!force_nonblock || ret2 != -EAGAIN) {
- io_rw_done(kiocb, ret2);
- } else {
- /*
- * If ->needs_lock is true, we're already in async
- * context.
- */
- if (!s->needs_lock)
- io_async_list_note(WRITE, req, iov_count);
+ if (!force_nonblock || ret2 != -EAGAIN)
+ kiocb_done(kiocb, ret2, nxt, req->in_async);
+ else
ret = -EAGAIN;
- }
}
out_free:
kfree(iovec);
@@ -1512,15 +1782,14 @@ out_free:
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
-static int io_nop(struct io_kiocb *req, u64 user_data)
+static int io_nop(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- long err = 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- io_cqring_add_event(ctx, user_data, err);
+ io_cqring_add_event(req, 0);
io_put_req(req);
return 0;
}
@@ -1541,7 +1810,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
{
loff_t sqe_off = READ_ONCE(sqe->off);
loff_t sqe_len = READ_ONCE(sqe->len);
@@ -1567,8 +1836,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
return 0;
}
@@ -1590,6 +1859,7 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_sync_file_range(struct io_kiocb *req,
const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt,
bool force_nonblock)
{
loff_t sqe_off;
@@ -1613,14 +1883,14 @@ static int io_sync_file_range(struct io_kiocb *req,
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
return 0;
}
#if defined(CONFIG_NET)
static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock,
+ struct io_kiocb **nxt, bool force_nonblock,
long (*fn)(struct socket *, struct user_msghdr __user *,
unsigned int))
{
@@ -1649,59 +1919,161 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return ret;
}
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
- io_put_req(req);
+ io_cqring_add_event(req, ret);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, nxt);
return 0;
}
#endif
static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+ return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+ __sys_sendmsg_sock);
#else
return -EOPNOTSUPP;
#endif
}
static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- bool force_nonblock)
+ struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+ return io_send_recvmsg(req, sqe, nxt, force_nonblock,
+ __sys_recvmsg_sock);
#else
return -EOPNOTSUPP;
#endif
}
+static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct sockaddr __user *addr;
+ int __user *addr_len;
+ unsigned file_flags;
+ int flags, ret;
+
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->len || sqe->buf_index)
+ return -EINVAL;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2);
+ flags = READ_ONCE(sqe->accept_flags);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags);
+ if (ret == -EAGAIN && force_nonblock) {
+ req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
+ return -EAGAIN;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+ struct sockaddr __user *addr;
+ unsigned file_flags;
+ int addr_len, ret;
+
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+ return -EINVAL;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = READ_ONCE(sqe->addr2);
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
+ if (ret == -EAGAIN && force_nonblock)
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
+ io_put_req_find_next(req, nxt);
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
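The new IORING_OP_ACCEPT handler above takes its arguments straight from the SQE: fd is the listening socket, addr/addr2 point at the sockaddr and its length, and accept_flags carries the accept4() flags; ioprio, len and buf_index must be zero, and rings set up with IOPOLL or SQPOLL are rejected. IORING_OP_CONNECT is analogous, except addr2 holds the address length itself. A hedged userspace sketch of filling such an SQE, assuming liburing's io_uring_get_sqe() and uapi headers that already carry the new opcode; tag values are arbitrary:

#include <liburing.h>
#include <string.h>
#include <sys/socket.h>

/* Sketch only: queue an accept on a listening socket. Ring setup and
 * submission are omitted; io_uring_get_sqe() returns NULL if the SQ
 * ring is full. */
static void prep_accept(struct io_uring *ring, int listen_fd,
			struct sockaddr *addr, socklen_t *addrlen)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd = listen_fd;
	sqe->addr = (unsigned long) addr;	/* struct sockaddr __user * */
	sqe->addr2 = (unsigned long) addrlen;	/* length pointer */
	sqe->accept_flags = 0;			/* e.g. SOCK_CLOEXEC */
	sqe->user_data = 0x42;			/* tag echoed back in the CQE */
}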
+static inline void io_poll_remove_req(struct io_kiocb *req)
+{
+ if (!RB_EMPTY_NODE(&req->rb_node)) {
+ rb_erase(&req->rb_node, &req->ctx->cancel_tree);
+ RB_CLEAR_NODE(&req->rb_node);
+ }
+}
+
static void io_poll_remove_one(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait.entry)) {
- list_del_init(&poll->wait.entry);
- io_queue_async_work(req->ctx, req);
+ if (!list_empty(&poll->wait->entry)) {
+ list_del_init(&poll->wait->entry);
+ io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
-
- list_del_init(&req->list);
+ io_poll_remove_req(req);
}
static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
+ struct rb_node *node;
struct io_kiocb *req;
spin_lock_irq(&ctx->completion_lock);
- while (!list_empty(&ctx->cancel_list)) {
- req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list);
+ while ((node = rb_first(&ctx->cancel_tree)) != NULL) {
+ req = rb_entry(node, struct io_kiocb, rb_node);
io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
}
+static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
+{
+ struct rb_node *p, *parent = NULL;
+ struct io_kiocb *req;
+
+ p = ctx->cancel_tree.rb_node;
+ while (p) {
+ parent = p;
+ req = rb_entry(parent, struct io_kiocb, rb_node);
+ if (sqe_addr < req->user_data) {
+ p = p->rb_left;
+ } else if (sqe_addr > req->user_data) {
+ p = p->rb_right;
+ } else {
+ io_poll_remove_one(req);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
/*
* Find a running poll command that matches one specified in sqe->addr,
* and remove it if found.
@@ -1709,8 +2081,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_kiocb *poll_req, *next;
- int ret = -ENOENT;
+ int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
@@ -1719,37 +2090,48 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
spin_lock_irq(&ctx->completion_lock);
- list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) {
- if (READ_ONCE(sqe->addr) == poll_req->user_data) {
- io_poll_remove_one(poll_req);
- ret = 0;
- break;
- }
- }
+ ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr));
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_add_event(req->ctx, sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
return 0;
}
-static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req,
- __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
+ struct io_ring_ctx *ctx = req->ctx;
+
req->poll.done = true;
- io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask));
+ kfree(req->poll.wait);
+ if (error)
+ io_cqring_fill_event(req, error);
+ else
+ io_cqring_fill_event(req, mangle_poll(mask));
io_commit_cqring(ctx);
}
-static void io_poll_complete_work(struct work_struct *work)
+static void io_poll_complete_work(struct io_wq_work **workptr)
{
+ struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_poll_iocb *poll = &req->poll;
struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *nxt = NULL;
__poll_t mask = 0;
+ int ret = 0;
+
+ if (work->flags & IO_WQ_WORK_CANCEL) {
+ WRITE_ONCE(poll->canceled, true);
+ ret = -ECANCELED;
+ } else if (READ_ONCE(poll->canceled)) {
+ ret = -ECANCELED;
+ }
- if (!READ_ONCE(poll->canceled))
+ if (ret != -ECANCELED)
mask = vfs_poll(poll->file, &pt) & poll->events;
/*
@@ -1760,24 +2142,28 @@ static void io_poll_complete_work(struct work_struct *work)
* avoid further branches in the fast path.
*/
spin_lock_irq(&ctx->completion_lock);
- if (!mask && !READ_ONCE(poll->canceled)) {
- add_wait_queue(poll->head, &poll->wait);
+ if (!mask && ret != -ECANCELED) {
+ add_wait_queue(poll->head, poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
- list_del_init(&req->list);
- io_poll_complete(ctx, req, mask);
+ io_poll_remove_req(req);
+ io_poll_complete(req, mask, ret);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- io_put_req(req);
+
+ if (ret < 0 && req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, &nxt);
+ if (nxt)
+ *workptr = &nxt->work;
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
- struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
- wait);
+ struct io_poll_iocb *poll = wait->private;
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
@@ -1787,17 +2173,24 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
+ /*
+ * Run completion inline if we can. We're using trylock here because
+ * we are violating the completion_lock -> poll wq lock ordering.
+ * If we have a link timeout we're going to need the completion_lock
+ * for finalizing the request, mark us as having grabbed that already.
+ */
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
- list_del(&req->list);
- io_poll_complete(ctx, req, mask);
+ io_poll_remove_req(req);
+ io_poll_complete(req, mask, 0);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- io_put_req(req);
} else {
- io_queue_async_work(ctx, req);
+ io_queue_async_work(req);
}
return 1;
@@ -1821,10 +2214,30 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
pt->error = 0;
pt->req->poll.head = head;
- add_wait_queue(head, &pt->req->poll.wait);
+ add_wait_queue(head, pt->req->poll.wait);
}
-static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_poll_req_insert(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct rb_node **p = &ctx->cancel_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct io_kiocb *tmp;
+
+ while (*p) {
+ parent = *p;
+ tmp = rb_entry(parent, struct io_kiocb, rb_node);
+ if (req->user_data < tmp->user_data)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&req->rb_node, parent, p);
+ rb_insert_color(&req->rb_node, &ctx->cancel_tree);
+}
+
+static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt)
{
struct io_poll_iocb *poll = &req->poll;
struct io_ring_ctx *ctx = req->ctx;
@@ -1840,10 +2253,15 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (!poll->file)
return -EBADF;
- req->submit.sqe = NULL;
- INIT_WORK(&req->work, io_poll_complete_work);
+ poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
+ if (!poll->wait)
+ return -ENOMEM;
+
+ req->sqe = NULL;
+ INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+ RB_CLEAR_NODE(&req->rb_node);
poll->head = NULL;
poll->done = false;
@@ -1855,8 +2273,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait.entry);
- init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ INIT_LIST_HEAD(&poll->wait->entry);
+ init_waitqueue_func_entry(poll->wait, io_poll_wake);
+ poll->wait->private = poll;
INIT_LIST_HEAD(&req->list);
@@ -1865,94 +2284,196 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait.entry))) {
+ if (unlikely(list_empty(&poll->wait->entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
- list_del_init(&poll->wait.entry);
+ list_del_init(&poll->wait->entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
- list_add_tail(&req->list, &ctx->cancel_list);
+ io_poll_req_insert(req);
spin_unlock(&poll->head->lock);
}
if (mask) { /* no async, we'd stolen it */
ipt.error = 0;
- io_poll_complete(ctx, req, mask);
+ io_poll_complete(req, mask, 0);
}
spin_unlock_irq(&ctx->completion_lock);
if (mask) {
io_cqring_ev_posted(ctx);
- io_put_req(req);
+ io_put_req_find_next(req, nxt);
}
return ipt.error;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
- struct io_ring_ctx *ctx;
- struct io_kiocb *req, *prev;
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
+ struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
- req = container_of(timer, struct io_kiocb, timeout.timer);
- ctx = req->ctx;
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
/*
- * Adjust the reqs sequence before the current one because it
- * will consume a slot in the cq_ring and the the cq_tail pointer
- * will be increased, otherwise other timeout reqs may return in
- * advance without waiting for enough wait_nr.
+ * We could be racing with timeout deletion. If the list is empty,
+ * then timeout lookup already found it and will be handling it.
*/
- prev = req;
- list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
- prev->sequence++;
- list_del(&req->list);
+ if (!list_empty(&req->list)) {
+ struct io_kiocb *prev;
- io_cqring_fill_event(ctx, req->user_data, -ETIME);
+ /*
+ * Adjust the reqs sequence before the current one because it
+		 * will consume a slot in the cq_ring and the cq_tail
+ * pointer will be increased, otherwise other timeout reqs may
+ * return in advance without waiting for enough wait_nr.
+ */
+ prev = req;
+ list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+ prev->sequence++;
+ list_del_init(&req->list);
+ }
+
+ io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
-
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
return HRTIMER_NORESTART;
}
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
+{
+ struct io_kiocb *req;
+ int ret = -ENOENT;
+
+ list_for_each_entry(req, &ctx->timeout_list, list) {
+ if (user_data == req->user_data) {
+ list_del_init(&req->list);
+ ret = 0;
+ break;
+ }
+ }
+
+ if (ret == -ENOENT)
+ return ret;
+
+ ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+ if (ret == -1)
+ return -EALREADY;
+
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_fill_event(req, -ECANCELED);
+ io_put_req(req);
+ return 0;
+}
+
+/*
+ * Remove or update an existing timeout command
+ */
+static int io_timeout_remove(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
{
- unsigned count;
struct io_ring_ctx *ctx = req->ctx;
- struct list_head *entry;
- struct timespec64 ts;
- unsigned span = 0;
+ unsigned flags;
+ int ret;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
- if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags ||
- sqe->len != 1)
+ if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->timeout_flags);
+ if (flags)
return -EINVAL;
- if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+ spin_lock_irq(&ctx->completion_lock);
+ ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr));
+
+ io_cqring_fill_event(req, ret);
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+ if (ret < 0 && req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req(req);
+ return 0;
+}
+
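io_timeout_remove() above identifies its target purely by sqe->addr, which must equal the user_data of the pending IORING_OP_TIMEOUT; sqe->flags, ioprio, buf_index, len and timeout_flags must all be clear, and the CQE result is 0 on success, -ENOENT if no such timeout is queued, or -EALREADY if the hrtimer could not be stopped. A hedged sketch of the matching submission (liburing assumed, tag values arbitrary):

#include <liburing.h>
#include <string.h>

/* Sketch only: cancel a pending timeout that was queued with
 * user_data == timeout_tag. */
static void prep_timeout_remove(struct io_uring *ring, __u64 timeout_tag)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_TIMEOUT_REMOVE;
	sqe->addr = timeout_tag;	/* user_data of the timeout to remove */
	sqe->user_data = 0x43;		/* tag for this request's own CQE */
}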
+static int io_timeout_setup(struct io_kiocb *req)
+{
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct io_timeout_data *data;
+ unsigned flags;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
+ return -EINVAL;
+ flags = READ_ONCE(sqe->timeout_flags);
+ if (flags & ~IORING_TIMEOUT_ABS)
+ return -EINVAL;
+
+ data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->req = req;
+ req->timeout.data = data;
+ req->flags |= REQ_F_TIMEOUT;
+
+ if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
return -EFAULT;
+ if (flags & IORING_TIMEOUT_ABS)
+ data->mode = HRTIMER_MODE_ABS;
+ else
+ data->mode = HRTIMER_MODE_REL;
+
+ hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+ return 0;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ unsigned count;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_timeout_data *data;
+ struct list_head *entry;
+ unsigned span = 0;
+ int ret;
+
+ ret = io_timeout_setup(req);
+ /* common setup allows flags (like links) set, we don't */
+ if (!ret && sqe->flags)
+ ret = -EINVAL;
+ if (ret)
+ return ret;
+
/*
* sqe->off holds how many events that need to occur for this
- * timeout event to be satisfied.
+ * timeout event to be satisfied. If it isn't set, then this is
+ * a pure timeout request, sequence isn't used.
*/
count = READ_ONCE(sqe->off);
- if (!count)
- count = 1;
+ if (!count) {
+ req->flags |= REQ_F_TIMEOUT_NOSEQ;
+ spin_lock_irq(&ctx->completion_lock);
+ entry = ctx->timeout_list.prev;
+ goto add;
+ }
req->sequence = ctx->cached_sq_head + count - 1;
- /* reuse it to store the count */
- req->submit.sequence = count;
- req->flags |= REQ_F_TIMEOUT;
+ req->timeout.data->seq_offset = count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -1963,14 +2484,18 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
unsigned nxt_sq_head;
long long tmp, tmp_nxt;
+ u32 nxt_offset = nxt->timeout.data->seq_offset;
+
+ if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+ continue;
/*
* Since cached_sq_head + count - 1 can overflow, use type long
* long to store it.
*/
tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+ nxt_sq_head = nxt->sequence - nxt_offset + 1;
+ tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
/*
* cached_sq_head may overflow, and it will never overflow twice
@@ -1990,22 +2515,96 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
nxt->sequence++;
}
req->sequence -= span;
+add:
list_add(&req->list, entry);
+ data = req->timeout.data;
+ data->timer.function = io_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->completion_lock);
+ return 0;
+}
+
+static bool io_cancel_cb(struct io_wq_work *work, void *data)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ return req->user_data == (unsigned long) data;
+}
+
+static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
+{
+ enum io_wq_cancel cancel_ret;
+ int ret = 0;
+
+ cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr);
+ switch (cancel_ret) {
+ case IO_WQ_CANCEL_OK:
+ ret = 0;
+ break;
+ case IO_WQ_CANCEL_RUNNING:
+ ret = -EALREADY;
+ break;
+ case IO_WQ_CANCEL_NOTFOUND:
+ ret = -ENOENT;
+ break;
+ }
+
+ return ret;
+}
+
+static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
+ struct io_kiocb *req, __u64 sqe_addr,
+ struct io_kiocb **nxt, int success_ret)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
+ if (ret != -ENOENT) {
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ goto done;
+ }
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ ret = io_timeout_cancel(ctx, sqe_addr);
+ if (ret != -ENOENT)
+ goto done;
+ ret = io_poll_cancel(ctx, sqe_addr);
+done:
+ if (!ret)
+ ret = success_ret;
+ io_cqring_fill_event(req, ret);
+ io_commit_cqring(ctx);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ io_cqring_ev_posted(ctx);
- hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- req->timeout.timer.function = io_timeout_fn;
- hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
- HRTIMER_MODE_REL);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, nxt);
+}
+
+static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+ if (sqe->flags || sqe->ioprio || sqe->off || sqe->len ||
+ sqe->cancel_flags)
+ return -EINVAL;
+
+ io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
return 0;
}
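IORING_OP_ASYNC_CANCEL follows the same pattern from userspace: sqe->addr carries the user_data of the request to cancel, sqe->flags/ioprio/off/len/cancel_flags must be zero, and the lookup falls through io-wq, then the timeout list, then the poll tree, completing with 0, -EALREADY or -ENOENT as mapped in io_async_cancel_one(). A hedged sketch that submits a cancel and interprets the result (liburing assumed; a real program would match CQEs by user_data instead of taking the next one):

#include <errno.h>
#include <liburing.h>
#include <stdio.h>
#include <string.h>

/* Sketch only: cancel the request previously tagged 'target' and report
 * the outcome codes produced by io_async_find_and_cancel(). */
static int cancel_and_report(struct io_uring *ring, __u64 target)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int res;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->addr = target;		/* user_data of the victim request */
	sqe->user_data = 0x44;		/* tag for the cancel's own CQE */

	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -EIO;
	res = cqe->res;
	io_uring_cqe_seen(ring, cqe);

	if (res == 0)
		puts("cancelled before it ran");
	else if (res == -EALREADY)
		puts("already running, could not cancel");
	else if (res == -ENOENT)
		puts("no matching request found");
	return res;
}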
-static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+static int io_req_defer(struct io_kiocb *req)
{
struct io_uring_sqe *sqe_copy;
+ struct io_ring_ctx *ctx = req->ctx;
- if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
+ /* Still need defer if there is pending req in defer list. */
+ if (!req_need_defer(req) && list_empty(&ctx->defer_list))
return 0;
sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
@@ -2013,72 +2612,82 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
return -EAGAIN;
spin_lock_irq(&ctx->completion_lock);
- if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
+ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(sqe_copy);
return 0;
}
- memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
- req->submit.sqe = sqe_copy;
+ memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
+ req->flags |= REQ_F_FREE_SQE;
+ req->sqe = sqe_copy;
- INIT_WORK(&req->work, io_sq_wq_submit_work);
+ trace_io_uring_defer(ctx, req, req->user_data);
list_add_tail(&req->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
-static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct sqe_submit *s, bool force_nonblock)
+__attribute__((nonnull))
+static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+ bool force_nonblock)
{
int ret, opcode;
+ struct io_ring_ctx *ctx = req->ctx;
- req->user_data = READ_ONCE(s->sqe->user_data);
-
- if (unlikely(s->index >= ctx->sq_entries))
- return -EINVAL;
-
- opcode = READ_ONCE(s->sqe->opcode);
+ opcode = READ_ONCE(req->sqe->opcode);
switch (opcode) {
case IORING_OP_NOP:
- ret = io_nop(req, req->user_data);
+ ret = io_nop(req);
break;
case IORING_OP_READV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
- ret = io_read(req, s, force_nonblock);
+ ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITEV:
- if (unlikely(s->sqe->buf_index))
+ if (unlikely(req->sqe->buf_index))
return -EINVAL;
- ret = io_write(req, s, force_nonblock);
+ ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_READ_FIXED:
- ret = io_read(req, s, force_nonblock);
+ ret = io_read(req, nxt, force_nonblock);
break;
case IORING_OP_WRITE_FIXED:
- ret = io_write(req, s, force_nonblock);
+ ret = io_write(req, nxt, force_nonblock);
break;
case IORING_OP_FSYNC:
- ret = io_fsync(req, s->sqe, force_nonblock);
+ ret = io_fsync(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_POLL_ADD:
- ret = io_poll_add(req, s->sqe);
+ ret = io_poll_add(req, req->sqe, nxt);
break;
case IORING_OP_POLL_REMOVE:
- ret = io_poll_remove(req, s->sqe);
+ ret = io_poll_remove(req, req->sqe);
break;
case IORING_OP_SYNC_FILE_RANGE:
- ret = io_sync_file_range(req, s->sqe, force_nonblock);
+ ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
- ret = io_sendmsg(req, s->sqe, force_nonblock);
+ ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
- ret = io_recvmsg(req, s->sqe, force_nonblock);
+ ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
- ret = io_timeout(req, s->sqe);
+ ret = io_timeout(req, req->sqe);
+ break;
+ case IORING_OP_TIMEOUT_REMOVE:
+ ret = io_timeout_remove(req, req->sqe);
+ break;
+ case IORING_OP_ACCEPT:
+ ret = io_accept(req, req->sqe, nxt, force_nonblock);
+ break;
+ case IORING_OP_CONNECT:
+ ret = io_connect(req, req->sqe, nxt, force_nonblock);
+ break;
+ case IORING_OP_ASYNC_CANCEL:
+ ret = io_async_cancel(req, req->sqe, nxt);
break;
default:
ret = -EINVAL;
@@ -2093,187 +2702,76 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return -EAGAIN;
/* workqueue context doesn't hold uring_lock, grab it now */
- if (s->needs_lock)
+ if (req->in_async)
mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (s->needs_lock)
+ if (req->in_async)
mutex_unlock(&ctx->uring_lock);
}
return 0;
}
-static struct async_list *io_async_list_from_sqe(struct io_ring_ctx *ctx,
- const struct io_uring_sqe *sqe)
+static void io_link_work_cb(struct io_wq_work **workptr)
{
- switch (sqe->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- return &ctx->pending_async[READ];
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- return &ctx->pending_async[WRITE];
- default:
- return NULL;
- }
-}
+ struct io_wq_work *work = *workptr;
+ struct io_kiocb *link = work->data;
-static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
-{
- u8 opcode = READ_ONCE(sqe->opcode);
-
- return !(opcode == IORING_OP_READ_FIXED ||
- opcode == IORING_OP_WRITE_FIXED);
+ io_queue_linked_timeout(link);
+ work->func = io_wq_submit_work;
}
-static void io_sq_wq_submit_work(struct work_struct *work)
+static void io_wq_submit_work(struct io_wq_work **workptr)
{
+ struct io_wq_work *work = *workptr;
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
- struct io_ring_ctx *ctx = req->ctx;
- struct mm_struct *cur_mm = NULL;
- struct async_list *async_list;
- LIST_HEAD(req_list);
- mm_segment_t old_fs;
- int ret;
+ struct io_kiocb *nxt = NULL;
+ int ret = 0;
- async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
-restart:
- do {
- struct sqe_submit *s = &req->submit;
- const struct io_uring_sqe *sqe = s->sqe;
- unsigned int flags = req->flags;
+ /* Ensure we clear previously set non-block flag */
+ req->rw.ki_flags &= ~IOCB_NOWAIT;
- /* Ensure we clear previously set non-block flag */
- req->rw.ki_flags &= ~IOCB_NOWAIT;
+ if (work->flags & IO_WQ_WORK_CANCEL)
+ ret = -ECANCELED;
- ret = 0;
- if (io_sqe_needs_user(sqe) && !cur_mm) {
- if (!mmget_not_zero(ctx->sqo_mm)) {
- ret = -EFAULT;
- } else {
- cur_mm = ctx->sqo_mm;
- use_mm(cur_mm);
- old_fs = get_fs();
- set_fs(USER_DS);
- }
- }
+ if (!ret) {
+ req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+ req->in_async = true;
+ do {
+ ret = io_issue_sqe(req, &nxt, false);
+ /*
+ * We can get EAGAIN for polled IO even though we're
+ * forcing a sync submission from here, since we can't
+ * wait for request slots on the block side.
+ */
+ if (ret != -EAGAIN)
+ break;
+ cond_resched();
+ } while (1);
+ }
- if (!ret) {
- s->has_user = cur_mm != NULL;
- s->needs_lock = true;
- do {
- ret = __io_submit_sqe(ctx, req, s, false);
- /*
- * We can get EAGAIN for polled IO even though
- * we're forcing a sync submission from here,
- * since we can't wait for request slots on the
- * block side.
- */
- if (ret != -EAGAIN)
- break;
- cond_resched();
- } while (1);
- }
+ /* drop submission reference */
+ io_put_req(req);
- /* drop submission reference */
+ if (ret) {
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_cqring_add_event(req, ret);
io_put_req(req);
-
- if (ret) {
- io_cqring_add_event(ctx, sqe->user_data, ret);
- io_put_req(req);
- }
-
- /* async context always use a copy of the sqe */
- kfree(sqe);
-
- /* req from defer and link list needn't decrease async cnt */
- if (flags & (REQ_F_IO_DRAINED | REQ_F_LINK_DONE))
- goto out;
-
- if (!async_list)
- break;
- if (!list_empty(&req_list)) {
- req = list_first_entry(&req_list, struct io_kiocb,
- list);
- list_del(&req->list);
- continue;
- }
- if (list_empty(&async_list->list))
- break;
-
- req = NULL;
- spin_lock(&async_list->lock);
- if (list_empty(&async_list->list)) {
- spin_unlock(&async_list->lock);
- break;
- }
- list_splice_init(&async_list->list, &req_list);
- spin_unlock(&async_list->lock);
-
- req = list_first_entry(&req_list, struct io_kiocb, list);
- list_del(&req->list);
- } while (req);
-
- /*
- * Rare case of racing with a submitter. If we find the count has
- * dropped to zero AND we have pending work items, then restart
- * the processing. This is a tiny race window.
- */
- if (async_list) {
- ret = atomic_dec_return(&async_list->cnt);
- while (!ret && !list_empty(&async_list->list)) {
- spin_lock(&async_list->lock);
- atomic_inc(&async_list->cnt);
- list_splice_init(&async_list->list, &req_list);
- spin_unlock(&async_list->lock);
-
- if (!list_empty(&req_list)) {
- req = list_first_entry(&req_list,
- struct io_kiocb, list);
- list_del(&req->list);
- goto restart;
- }
- ret = atomic_dec_return(&async_list->cnt);
- }
- }
-
-out:
- if (cur_mm) {
- set_fs(old_fs);
- unuse_mm(cur_mm);
- mmput(cur_mm);
}
-}
-
-/*
- * See if we can piggy back onto previously submitted work, that is still
- * running. We currently only allow this if the new request is sequential
- * to the previous one we punted.
- */
-static bool io_add_to_prev_work(struct async_list *list, struct io_kiocb *req)
-{
- bool ret;
- if (!list)
- return false;
- if (!(req->flags & REQ_F_SEQ_PREV))
- return false;
- if (!atomic_read(&list->cnt))
- return false;
+ /* if a dependent link is ready, pass it back */
+ if (!ret && nxt) {
+ struct io_kiocb *link;
- ret = true;
- spin_lock(&list->lock);
- list_add_tail(&req->list, &list->list);
- /*
- * Ensure we see a simultaneous modification from io_sq_wq_submit_work()
- */
- smp_mb();
- if (!atomic_read(&list->cnt)) {
- list_del_init(&req->list);
- ret = false;
+ io_prep_async_work(nxt, &link);
+ *workptr = &nxt->work;
+ if (link) {
+ nxt->work.flags |= IO_WQ_WORK_CB;
+ nxt->work.func = io_link_work_cb;
+ nxt->work.data = link;
+ }
}
- spin_unlock(&list->lock);
- return ret;
}
static bool io_op_needs_file(const struct io_uring_sqe *sqe)
@@ -2283,42 +2781,53 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
switch (op) {
case IORING_OP_NOP:
case IORING_OP_POLL_REMOVE:
+ case IORING_OP_TIMEOUT:
+ case IORING_OP_TIMEOUT_REMOVE:
+ case IORING_OP_ASYNC_CANCEL:
+ case IORING_OP_LINK_TIMEOUT:
return false;
default:
return true;
}
}
-static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb *req)
+static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
+ int index)
{
+ struct fixed_file_table *table;
+
+ table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
+ return table->files[index & IORING_FILE_TABLE_MASK];
+}
+
+static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd;
- flags = READ_ONCE(s->sqe->flags);
- fd = READ_ONCE(s->sqe->fd);
+ flags = READ_ONCE(req->sqe->flags);
+ fd = READ_ONCE(req->sqe->fd);
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = s->sequence;
- if (!io_op_needs_file(s->sqe))
+ if (!io_op_needs_file(req->sqe))
return 0;
if (flags & IOSQE_FIXED_FILE) {
- if (unlikely(!ctx->user_files ||
+ if (unlikely(!ctx->file_table ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
- req->file = ctx->user_files[fd];
+ fd = array_index_nospec(fd, ctx->nr_user_files);
+ req->file = io_file_from_index(ctx, fd);
+ if (!req->file)
+ return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
} else {
- if (s->needs_fixed_file)
+ if (req->needs_fixed_file)
return -EBADF;
+ trace_io_uring_file_get(ctx, fd);
req->file = io_file_get(state, fd);
if (unlikely(!req->file))
return -EBADF;
@@ -2327,12 +2836,116 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
return 0;
}
-static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s)
+static int io_grab_files(struct io_kiocb *req)
{
+ int ret = -EBADF;
+ struct io_ring_ctx *ctx = req->ctx;
+
+ rcu_read_lock();
+ spin_lock_irq(&ctx->inflight_lock);
+ /*
+ * We use the f_ops->flush() handler to ensure that we can flush
+ * out work accessing these files if the fd is closed. Check if
+ * the fd has changed since we started down this path, and disallow
+ * this operation if it has.
+ */
+ if (fcheck(req->ring_fd) == req->ring_file) {
+ list_add(&req->inflight_entry, &ctx->inflight_list);
+ req->flags |= REQ_F_INFLIGHT;
+ req->work.files = current->files;
+ ret = 0;
+ }
+ spin_unlock_irq(&ctx->inflight_lock);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
+{
+ struct io_timeout_data *data = container_of(timer,
+ struct io_timeout_data, timer);
+ struct io_kiocb *req = data->req;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_kiocb *prev = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+
+ /*
+	 * We don't expect the list to be empty; that will only happen if we
+ * race with the completion of the linked work.
+ */
+ if (!list_empty(&req->list)) {
+ prev = list_entry(req->list.prev, struct io_kiocb, link_list);
+ if (refcount_inc_not_zero(&prev->refs)) {
+ list_del_init(&req->list);
+ prev->flags &= ~REQ_F_LINK_TIMEOUT;
+ } else
+ prev = NULL;
+ }
+
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+ if (prev) {
+ if (prev->flags & REQ_F_LINK)
+ prev->flags |= REQ_F_FAIL_LINK;
+ io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
+ -ETIME);
+ io_put_req(prev);
+ } else {
+ io_cqring_add_event(req, -ETIME);
+ io_put_req(req);
+ }
+ return HRTIMER_NORESTART;
+}
+
+static void io_queue_linked_timeout(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ /*
+ * If the list is now empty, then our linked request finished before
+	 * we got a chance to set up the timer.
+ */
+ spin_lock_irq(&ctx->completion_lock);
+ if (!list_empty(&req->list)) {
+ struct io_timeout_data *data = req->timeout.data;
+
+ data->timer.function = io_link_timeout_fn;
+ hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
+ data->mode);
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ /* drop submission reference */
+ io_put_req(req);
+}
+
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
+{
+ struct io_kiocb *nxt;
+
+ if (!(req->flags & REQ_F_LINK))
+ return NULL;
+
+ nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+ if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
+ return NULL;
+
+ req->flags |= REQ_F_LINK_TIMEOUT;
+ return nxt;
+}
+
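The helper above arms a linked timeout only when the first entry in the head request's link list is an IORING_OP_LINK_TIMEOUT, so from userspace the timeout SQE has to be queued immediately after the request it guards, with IOSQE_IO_LINK set on that request; io_timeout_setup() expects a struct __kernel_timespec at sqe->addr with len == 1, io_submit_sqe() rejects a non-zero off here, and the timeout's own CQE carries -ETIME when it fires. A hedged sketch (liburing assumed, error handling omitted, tags arbitrary):

#include <liburing.h>
#include <poll.h>
#include <string.h>

/* Sketch only: poll for POLLIN on 'fd' and cancel the poll if it takes
 * longer than *ts. io_uring_get_sqe() NULL checks are omitted. */
static void prep_poll_with_timeout(struct io_uring *ring, int fd,
				   struct __kernel_timespec *ts)
{
	struct io_uring_sqe *sqe;

	sqe = io_uring_get_sqe(ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_POLL_ADD;
	sqe->fd = fd;
	sqe->poll_events = POLLIN;
	sqe->flags = IOSQE_IO_LINK;	/* the next SQE hangs off this one */
	sqe->user_data = 1;

	sqe = io_uring_get_sqe(ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_LINK_TIMEOUT;
	sqe->addr = (unsigned long) ts;	/* struct __kernel_timespec */
	sqe->len = 1;			/* io_timeout_setup() requires len == 1 */
	sqe->timeout_flags = 0;		/* relative; IORING_TIMEOUT_ABS for absolute */
	sqe->user_data = 2;
}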
+static void __io_queue_sqe(struct io_kiocb *req)
+{
+ struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *nxt = NULL;
int ret;
- ret = __io_submit_sqe(ctx, req, s, true);
+ ret = io_issue_sqe(req, &nxt, true);
+ if (nxt)
+ io_queue_async_work(nxt);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2342,133 +2955,103 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
(req->flags & REQ_F_MUST_PUNT))) {
struct io_uring_sqe *sqe_copy;
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
- if (sqe_copy) {
- struct async_list *list;
-
- s->sqe = sqe_copy;
- memcpy(&req->submit, s, sizeof(*s));
- list = io_async_list_from_sqe(ctx, s->sqe);
- if (!io_add_to_prev_work(list, req)) {
- if (list)
- atomic_inc(&list->cnt);
- INIT_WORK(&req->work, io_sq_wq_submit_work);
- io_queue_async_work(ctx, req);
- }
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ if (!sqe_copy)
+ goto err;
- /*
- * Queued up for async execution, worker will release
- * submit reference when the iocb is actually submitted.
- */
- return 0;
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
+
+ if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+ ret = io_grab_files(req);
+ if (ret)
+ goto err;
}
+
+ /*
+ * Queued up for async execution, worker will release
+ * submit reference when the iocb is actually submitted.
+ */
+ io_queue_async_work(req);
+ return;
}
+err:
/* drop submission reference */
io_put_req(req);
+ if (linked_timeout) {
+ if (!ret)
+ io_queue_linked_timeout(linked_timeout);
+ else
+ io_put_req(linked_timeout);
+ }
+
/* and drop final reference, if we failed */
if (ret) {
- io_cqring_add_event(ctx, req->user_data, ret);
+ io_cqring_add_event(req, ret);
if (req->flags & REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
io_put_req(req);
}
-
- return ret;
}
-static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s)
+static void io_queue_sqe(struct io_kiocb *req)
{
int ret;
- ret = io_req_defer(ctx, req, s->sqe);
- if (ret) {
- if (ret != -EIOCBQUEUED) {
- io_free_req(req);
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
- }
- return 0;
+ if (unlikely(req->ctx->drain_next)) {
+ req->flags |= REQ_F_IO_DRAIN;
+ req->ctx->drain_next = false;
}
+ req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
- return __io_queue_sqe(ctx, req, s);
-}
-
-static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
- struct sqe_submit *s, struct io_kiocb *shadow)
-{
- int ret;
- int need_submit = false;
-
- if (!shadow)
- return io_queue_sqe(ctx, req, s);
-
- /*
- * Mark the first IO in link list as DRAIN, let all the following
- * IOs enter the defer list. all IO needs to be completed before link
- * list.
- */
- req->flags |= REQ_F_IO_DRAIN;
- ret = io_req_defer(ctx, req, s->sqe);
+ ret = io_req_defer(req);
if (ret) {
if (ret != -EIOCBQUEUED) {
- io_free_req(req);
- __io_free_req(shadow);
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
- return 0;
+ io_cqring_add_event(req, ret);
+ if (req->flags & REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+ io_double_put_req(req);
}
- } else {
- /*
- * If ret == 0 means that all IOs in front of link io are
- * running done. let's queue link head.
- */
- need_submit = true;
- }
-
- /* Insert shadow req to defer_list, blocking next IOs */
- spin_lock_irq(&ctx->completion_lock);
- list_add_tail(&shadow->list, &ctx->defer_list);
- spin_unlock_irq(&ctx->completion_lock);
-
- if (need_submit)
- return __io_queue_sqe(ctx, req, s);
+ } else
+ __io_queue_sqe(req);
+}
- return 0;
+static inline void io_queue_link_head(struct io_kiocb *req)
+{
+ if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+ io_cqring_add_event(req, -ECANCELED);
+ io_double_put_req(req);
+ } else
+ io_queue_sqe(req);
}
+
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
-static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
- struct io_submit_state *state, struct io_kiocb **link)
+static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
+ struct io_kiocb **link)
{
- struct io_uring_sqe *sqe_copy;
- struct io_kiocb *req;
+ struct io_ring_ctx *ctx = req->ctx;
int ret;
+ req->user_data = req->sqe->user_data;
+
/* enforce forwards compatibility on users */
- if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+ if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
- goto err;
+ goto err_req;
}
- req = io_get_req(ctx, state);
- if (unlikely(!req)) {
- ret = -EAGAIN;
- goto err;
- }
-
- ret = io_req_set_file(ctx, s, state, req);
+ ret = io_req_set_file(state, req);
if (unlikely(ret)) {
err_req:
- io_free_req(req);
-err:
- io_cqring_add_event(ctx, s->sqe->user_data, ret);
+ io_cqring_add_event(req, ret);
+ io_double_put_req(req);
return;
}
- req->user_data = s->sqe->user_data;
-
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
@@ -2478,24 +3061,39 @@ err:
*/
if (*link) {
struct io_kiocb *prev = *link;
+ struct io_uring_sqe *sqe_copy;
+
+ if (req->sqe->flags & IOSQE_IO_DRAIN)
+ (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
+ if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+ ret = io_timeout_setup(req);
+ /* common setup allows offset being set, we don't */
+ if (!ret && req->sqe->off)
+ ret = -EINVAL;
+ if (ret) {
+ prev->flags |= REQ_F_FAIL_LINK;
+ goto err_req;
+ }
+ }
- sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+ sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
if (!sqe_copy) {
ret = -EAGAIN;
goto err_req;
}
- s->sqe = sqe_copy;
- memcpy(&req->submit, s, sizeof(*s));
+ req->sqe = sqe_copy;
+ req->flags |= REQ_F_FREE_SQE;
+ trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
- } else if (s->sqe->flags & IOSQE_IO_LINK) {
+ } else if (req->sqe->flags & IOSQE_IO_LINK) {
req->flags |= REQ_F_LINK;
- memcpy(&req->submit, s, sizeof(*s));
INIT_LIST_HEAD(&req->link_list);
*link = req;
} else {
- io_queue_sqe(ctx, req, s);
+ io_queue_sqe(req);
}
}
@@ -2545,7 +3143,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
@@ -2561,14 +3159,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
*/
head = ctx->cached_sq_head;
/* make sure SQ entry isn't read before tail */
- if (head == smp_load_acquire(&rings->sq.tail))
+ if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
return false;
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
- if (head < ctx->sq_entries) {
- s->index = head;
- s->sqe = &ctx->sq_sqes[head];
- s->sequence = ctx->cached_sq_head;
+ if (likely(head < ctx->sq_entries)) {
+ /*
+		 * All IO needs to record the previous position; with
+		 * LINK vs DRAIN, it can be used to mark the position
+		 * of the first IO in the link list.
+ */
+ req->sequence = ctx->cached_sq_head;
+ req->sqe = &ctx->sq_sqes[head];
ctx->cached_sq_head++;
return true;
}
@@ -2581,13 +3183,18 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- bool has_user, bool mm_fault)
+ struct file *ring_file, int ring_fd,
+ struct mm_struct **mm, bool async)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
- struct io_kiocb *shadow_req = NULL;
- bool prev_was_link = false;
int i, submitted = 0;
+ bool mm_fault = false;
+
+ /* if we have a backlog and couldn't flush it all, return BUSY */
+ if (!list_empty(&ctx->cq_overflow_list) &&
+ !io_cqring_overflow_flush(ctx, false))
+ return -EBUSY;
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, ctx, nr);
@@ -2595,51 +3202,58 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
for (i = 0; i < nr; i++) {
- struct sqe_submit s;
+ struct io_kiocb *req;
+ unsigned int sqe_flags;
- if (!io_get_sqring(ctx, &s))
+ req = io_get_req(ctx, statep);
+ if (unlikely(!req)) {
+ if (!submitted)
+ submitted = -EAGAIN;
+ break;
+ }
+ if (!io_get_sqring(ctx, req)) {
+ __io_free_req(req);
break;
+ }
+
+ if (io_sqe_needs_user(req->sqe) && !*mm) {
+ mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
+ if (!mm_fault) {
+ use_mm(ctx->sqo_mm);
+ *mm = ctx->sqo_mm;
+ }
+ }
+
+ sqe_flags = req->sqe->flags;
+
+ req->ring_file = ring_file;
+ req->ring_fd = ring_fd;
+ req->has_user = *mm != NULL;
+ req->in_async = async;
+ req->needs_fixed_file = async;
+ trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
+ true, async);
+ io_submit_sqe(req, statep, &link);
+ submitted++;
/*
* If previous wasn't linked and we have a linked command,
* that's the end of the chain. Submit the previous link.
*/
- if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req);
+ if (!(sqe_flags & IOSQE_IO_LINK) && link) {
+ io_queue_link_head(link);
link = NULL;
- shadow_req = NULL;
- }
- prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
-
- if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
- if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
- goto out;
- shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
- refcount_dec(&shadow_req->refs);
- }
- shadow_req->sequence = s.sequence;
- }
-
-out:
- if (unlikely(mm_fault)) {
- io_cqring_add_event(ctx, s.sqe->user_data,
- -EFAULT);
- } else {
- s.has_user = has_user;
- s.needs_lock = true;
- s.needs_fixed_file = true;
- io_submit_sqe(ctx, &s, statep, &link);
- submitted++;
}
}
if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req);
+ io_queue_link_head(link);
if (statep)
io_submit_state_end(&state);
+ /* Commit SQ ring head once we've consumed and submitted all SQEs */
+ io_commit_sqring(ctx);
+
return submitted;
}
@@ -2647,19 +3261,21 @@ static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
+ const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned inflight;
unsigned long timeout;
+ int ret;
- complete(&ctx->sqo_thread_started);
+ complete(&ctx->completions[1]);
old_fs = get_fs();
set_fs(USER_DS);
+ old_cred = override_creds(ctx->creds);
- timeout = inflight = 0;
+ ret = timeout = inflight = 0;
while (!kthread_should_park()) {
- bool mm_fault = false;
unsigned int to_submit;
if (inflight) {
@@ -2694,13 +3310,21 @@ static int io_sq_thread(void *data)
}
to_submit = io_sqring_entries(ctx);
- if (!to_submit) {
+
+ /*
+ * If submit got -EBUSY, flag us as needing the application
+ * to enter the kernel to reap and flush events.
+ */
+ if (!to_submit || ret == -EBUSY) {
/*
* We're polling. If we're within the defined idle
* period, then let us spin without work before going
- * to sleep.
+			 * to sleep. The exception is if we got EBUSY doing
+			 * more IO; in that case we should wait for the
+			 * application to reap events and wake us up.
*/
- if (inflight || !time_after(jiffies, timeout)) {
+ if (inflight ||
+ (!time_after(jiffies, timeout) && ret != -EBUSY)) {
cond_resched();
continue;
}
@@ -2726,7 +3350,7 @@ static int io_sq_thread(void *data)
smp_mb();
to_submit = io_sqring_entries(ctx);
- if (!to_submit) {
+ if (!to_submit || ret == -EBUSY) {
if (kthread_should_park()) {
finish_wait(&ctx->sqo_wait, &wait);
break;
@@ -2744,21 +3368,10 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
- /* Unless all new commands are FIXED regions, grab mm */
- if (!cur_mm) {
- mm_fault = !mmget_not_zero(ctx->sqo_mm);
- if (!mm_fault) {
- use_mm(ctx->sqo_mm);
- cur_mm = ctx->sqo_mm;
- }
- }
-
to_submit = min(to_submit, ctx->sq_entries);
- inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
- mm_fault);
-
- /* Commit SQ ring head once we've consumed all SQEs */
- io_commit_sqring(ctx);
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ if (ret > 0)
+ inflight += ret;
}
set_fs(old_fs);
@@ -2766,71 +3379,13 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+ revert_creds(old_cred);
kthread_parkme();
return 0;
}
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
-{
- struct io_submit_state state, *statep = NULL;
- struct io_kiocb *link = NULL;
- struct io_kiocb *shadow_req = NULL;
- bool prev_was_link = false;
- int i, submit = 0;
-
- if (to_submit > IO_PLUG_THRESHOLD) {
- io_submit_state_start(&state, ctx, to_submit);
- statep = &state;
- }
-
- for (i = 0; i < to_submit; i++) {
- struct sqe_submit s;
-
- if (!io_get_sqring(ctx, &s))
- break;
-
- /*
- * If previous wasn't linked and we have a linked command,
- * that's the end of the chain. Submit the previous link.
- */
- if (!prev_was_link && link) {
- io_queue_link_head(ctx, link, &link->submit, shadow_req);
- link = NULL;
- shadow_req = NULL;
- }
- prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
-
- if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
- if (!shadow_req) {
- shadow_req = io_get_req(ctx, NULL);
- if (unlikely(!shadow_req))
- goto out;
- shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
- refcount_dec(&shadow_req->refs);
- }
- shadow_req->sequence = s.sequence;
- }
-
-out:
- s.has_user = true;
- s.needs_lock = false;
- s.needs_fixed_file = false;
- submit++;
- io_submit_sqe(ctx, &s, statep, &link);
- }
-
- if (link)
- io_queue_link_head(ctx, link, &link->submit, shadow_req);
- if (statep)
- io_submit_state_end(statep);
-
- io_commit_sqring(ctx);
-
- return submit;
-}
-
struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
@@ -2838,7 +3393,7 @@ struct io_wait_queue {
unsigned nr_timeouts;
};
-static inline bool io_should_wake(struct io_wait_queue *iowq)
+static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
{
struct io_ring_ctx *ctx = iowq->ctx;
@@ -2847,7 +3402,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
- return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+ return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
@@ -2857,7 +3412,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
wq);
- if (!io_should_wake(iowq))
+ /* use noflush == true, as we can't safely rely on locking context */
+ if (!io_should_wake(iowq, true))
return -1;
return autoremove_wake_function(curr, mode, wake_flags, key);
@@ -2880,9 +3436,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
- int ret;
+ int ret = 0;
- if (io_cqring_events(rings) >= min_events)
+ if (io_cqring_events(ctx, false) >= min_events)
return 0;
if (sig) {
@@ -2898,24 +3454,22 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
- ret = 0;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
+ trace_io_uring_cqring_wait(ctx, min_events);
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- if (io_should_wake(&iowq))
+ if (io_should_wake(&iowq, false))
break;
schedule();
if (signal_pending(current)) {
- ret = -ERESTARTSYS;
+ ret = -EINTR;
break;
}
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
- restore_saved_sigmask_unless(ret == -ERESTARTSYS);
- if (ret == -ERESTARTSYS)
- ret = -EINTR;
+ restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
@@ -2933,19 +3487,29 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#else
int i;
- for (i = 0; i < ctx->nr_user_files; i++)
- fput(ctx->user_files[i]);
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file;
+
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
#endif
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- if (!ctx->user_files)
+ unsigned nr_tables, i;
+
+ if (!ctx->file_table)
return -ENXIO;
__io_sqe_files_unregister(ctx);
- kfree(ctx->user_files);
- ctx->user_files = NULL;
+ nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
+ for (i = 0; i < nr_tables; i++)
+ kfree(ctx->file_table[i].files);
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
ctx->nr_user_files = 0;
return 0;
}
@@ -2953,7 +3517,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
static void io_sq_thread_stop(struct io_ring_ctx *ctx)
{
if (ctx->sqo_thread) {
- wait_for_completion(&ctx->sqo_thread_started);
+ wait_for_completion(&ctx->completions[1]);
/*
* The park is a bit of a work-around, without it we get
* warning spews on shutdown with SQPOLL set and affinity
@@ -2967,15 +3531,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx)
static void io_finish_async(struct io_ring_ctx *ctx)
{
- int i;
-
io_sq_thread_stop(ctx);
- for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) {
- if (ctx->sqo_wq[i]) {
- destroy_workqueue(ctx->sqo_wq[i]);
- ctx->sqo_wq[i] = NULL;
- }
+ if (ctx->io_wq) {
+ io_wq_destroy(ctx->io_wq);
+ ctx->io_wq = NULL;
}
}
@@ -2983,11 +3543,9 @@ static void io_finish_async(struct io_ring_ctx *ctx)
static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
- int i;
- for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
- if (ctx->sqo_wq[i])
- flush_workqueue(ctx->sqo_wq[i]);
+ if (ctx->io_wq)
+ io_wq_flush(ctx->io_wq);
unix_destruct_scm(skb);
}
@@ -3002,7 +3560,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sock *sk = ctx->ring_sock->sk;
struct scm_fp_list *fpl;
struct sk_buff *skb;
- int i;
+ int i, nr_files;
if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
unsigned long inflight = ctx->user->unix_inflight + nr;
@@ -3022,21 +3580,33 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
}
skb->sk = sk;
- skb->destructor = io_destruct_skb;
+ nr_files = 0;
fpl->user = get_uid(ctx->user);
for (i = 0; i < nr; i++) {
- fpl->fp[i] = get_file(ctx->user_files[i + offset]);
- unix_inflight(fpl->user, fpl->fp[i]);
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (!file)
+ continue;
+ fpl->fp[nr_files] = get_file(file);
+ unix_inflight(fpl->user, fpl->fp[nr_files]);
+ nr_files++;
}
- fpl->max = fpl->count = nr;
- UNIXCB(skb).fp = fpl;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
+ if (nr_files) {
+ fpl->max = SCM_MAX_FD;
+ fpl->count = nr_files;
+ UNIXCB(skb).fp = fpl;
+ skb->destructor = io_destruct_skb;
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr_files; i++)
+ fput(fpl->fp[i]);
+ } else {
+ kfree_skb(skb);
+ kfree(fpl);
+ }
return 0;
}
@@ -3067,7 +3637,10 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
return 0;
while (total < ctx->nr_user_files) {
- fput(ctx->user_files[total]);
+ struct file *file = io_file_from_index(ctx, total);
+
+ if (file)
+ fput(file);
total++;
}
@@ -3080,33 +3653,79 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
+static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
+ unsigned nr_files)
+{
+ int i;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_file_table *table = &ctx->file_table[i];
+ unsigned this_files;
+
+ this_files = min(nr_files, IORING_MAX_FILES_TABLE);
+ table->files = kcalloc(this_files, sizeof(struct file *),
+ GFP_KERNEL);
+ if (!table->files)
+ break;
+ nr_files -= this_files;
+ }
+
+ if (i == nr_tables)
+ return 0;
+
+ for (i = 0; i < nr_tables; i++) {
+ struct fixed_file_table *table = &ctx->file_table[i];
+ kfree(table->files);
+ }
+ return 1;
+}
+
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
__s32 __user *fds = (__s32 __user *) arg;
+ unsigned nr_tables;
int fd, ret = 0;
unsigned i;
- if (ctx->user_files)
+ if (ctx->file_table)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
- ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
- if (!ctx->user_files)
+ nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
+ ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
+ GFP_KERNEL);
+ if (!ctx->file_table)
return -ENOMEM;
- for (i = 0; i < nr_args; i++) {
+ if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+ struct fixed_file_table *table;
+ unsigned index;
+
ret = -EFAULT;
if (copy_from_user(&fd, &fds[i], sizeof(fd)))
break;
+ /* allow sparse sets */
+ if (fd == -1) {
+ ret = 0;
+ continue;
+ }
- ctx->user_files[i] = fget(fd);
+ table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
+ table->files[index] = fget(fd);
ret = -EBADF;
- if (!ctx->user_files[i])
+ if (!table->files[index])
break;
/*
* Don't allow io_uring instances to be registered. If UNIX
@@ -3115,20 +3734,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
- if (ctx->user_files[i]->f_op == &io_uring_fops) {
- fput(ctx->user_files[i]);
+ if (table->files[index]->f_op == &io_uring_fops) {
+ fput(table->files[index]);
break;
}
- ctx->nr_user_files++;
ret = 0;
}
if (ret) {
- for (i = 0; i < ctx->nr_user_files; i++)
- fput(ctx->user_files[i]);
+ for (i = 0; i < ctx->nr_user_files; i++) {
+ struct file *file;
- kfree(ctx->user_files);
- ctx->user_files = NULL;
+ file = io_file_from_index(ctx, i);
+ if (file)
+ fput(file);
+ }
+ for (i = 0; i < nr_tables; i++)
+ kfree(ctx->file_table[i].files);
+
+ kfree(ctx->file_table);
+ ctx->file_table = NULL;
ctx->nr_user_files = 0;
return ret;
}
@@ -3140,9 +3765,202 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
+{
+#if defined(CONFIG_UNIX)
+ struct file *file = io_file_from_index(ctx, index);
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff_head list, *head = &sock->sk_receive_queue;
+ struct sk_buff *skb;
+ int i;
+
+ __skb_queue_head_init(&list);
+
+ /*
+ * Find the skb that holds this file in its SCM_RIGHTS. When found,
+ * remove this entry and rearrange the file array.
+ */
+ skb = skb_dequeue(head);
+ while (skb) {
+ struct scm_fp_list *fp;
+
+ fp = UNIXCB(skb).fp;
+ for (i = 0; i < fp->count; i++) {
+ int left;
+
+ if (fp->fp[i] != file)
+ continue;
+
+ unix_notinflight(fp->user, fp->fp[i]);
+ left = fp->count - 1 - i;
+ if (left) {
+ memmove(&fp->fp[i], &fp->fp[i + 1],
+ left * sizeof(struct file *));
+ }
+ fp->count--;
+ if (!fp->count) {
+ kfree_skb(skb);
+ skb = NULL;
+ } else {
+ __skb_queue_tail(&list, skb);
+ }
+ fput(file);
+ file = NULL;
+ break;
+ }
+
+ if (!file)
+ break;
+
+ __skb_queue_tail(&list, skb);
+
+ skb = skb_dequeue(head);
+ }
+
+ if (skb_peek(&list)) {
+ spin_lock_irq(&head->lock);
+ while ((skb = __skb_dequeue(&list)) != NULL)
+ __skb_queue_tail(head, skb);
+ spin_unlock_irq(&head->lock);
+ }
+#else
+ fput(io_file_from_index(ctx, index));
+#endif
+}
+
+static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
+ int index)
+{
+#if defined(CONFIG_UNIX)
+ struct sock *sock = ctx->ring_sock->sk;
+ struct sk_buff_head *head = &sock->sk_receive_queue;
+ struct sk_buff *skb;
+
+ /*
+ * See if we can merge this file into an existing skb SCM_RIGHTS
+ * file set. If there's no room, fall back to allocating a new skb
+ * and filling it in.
+ */
+ spin_lock_irq(&head->lock);
+ skb = skb_peek(head);
+ if (skb) {
+ struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+ if (fpl->count < SCM_MAX_FD) {
+ __skb_unlink(skb, head);
+ spin_unlock_irq(&head->lock);
+ fpl->fp[fpl->count] = get_file(file);
+ unix_inflight(fpl->user, fpl->fp[fpl->count]);
+ fpl->count++;
+ spin_lock_irq(&head->lock);
+ __skb_queue_head(head, skb);
+ } else {
+ skb = NULL;
+ }
+ }
+ spin_unlock_irq(&head->lock);
+
+ if (skb) {
+ fput(file);
+ return 0;
+ }
+
+ return __io_sqe_files_scm(ctx, 1, index);
+#else
+ return 0;
+#endif
+}
+
+static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned nr_args)
+{
+ struct io_uring_files_update up;
+ __s32 __user *fds;
+ int fd, i, err;
+ __u32 done;
+
+ if (!ctx->file_table)
+ return -ENXIO;
+ if (!nr_args)
+ return -EINVAL;
+ if (copy_from_user(&up, arg, sizeof(up)))
+ return -EFAULT;
+ if (check_add_overflow(up.offset, nr_args, &done))
+ return -EOVERFLOW;
+ if (done > ctx->nr_user_files)
+ return -EINVAL;
+
+ done = 0;
+ fds = (__s32 __user *) up.fds;
+ while (nr_args) {
+ struct fixed_file_table *table;
+ unsigned index;
+
+ err = 0;
+ if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
+ err = -EFAULT;
+ break;
+ }
+ i = array_index_nospec(up.offset, ctx->nr_user_files);
+ table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
+ index = i & IORING_FILE_TABLE_MASK;
+ if (table->files[index]) {
+ io_sqe_file_unregister(ctx, i);
+ table->files[index] = NULL;
+ }
+ if (fd != -1) {
+ struct file *file;
+
+ file = fget(fd);
+ if (!file) {
+ err = -EBADF;
+ break;
+ }
+ /*
+ * Don't allow io_uring instances to be registered. If
+ * UNIX isn't enabled, then this causes a reference
+ * cycle and this instance can never get freed. If UNIX
+ * is enabled we'll handle it just fine, but there's
+ * still no point in allowing a ring fd as it doesn't
+ * support regular read/write anyway.
+ */
+ if (file->f_op == &io_uring_fops) {
+ fput(file);
+ err = -EBADF;
+ break;
+ }
+ table->files[index] = file;
+ err = io_sqe_file_register(ctx, file, i);
+ if (err)
+ break;
+ }
+ nr_args--;
+ done++;
+ up.offset++;
+ }
+
+ return done ? done : err;
+}
+
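From userspace, the new opcode is driven through io_uring_register(). A rough sketch follows (the exact uapi layout of struct io_uring_files_update is assumed here; ring_fd and new_fd are placeholders, and <linux/io_uring.h>, <sys/syscall.h>, <stdio.h> are needed):

	__s32 fds[3] = { new_fd, new_fd, -1 };	/* -1 keeps that fixed-file slot sparse */
	struct io_uring_files_update up = {
		.offset	= 2,			/* start updating at fixed file index 2 */
		.fds	= fds,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_FILES_UPDATE, &up, 3) < 0)
		perror("IORING_REGISTER_FILES_UPDATE");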
+static void io_put_work(struct io_wq_work *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ io_put_req(req);
+}
+
+static void io_get_work(struct io_wq_work *work)
+{
+ struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+
+ refcount_inc(&req->refs);
+}
+
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_wq_data data;
+ unsigned concurrency;
int ret;
init_waitqueue_head(&ctx->sqo_wait);
@@ -3186,26 +4004,18 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
- /* Do QD, or 2 * CPUS, whatever is smallest */
- ctx->sqo_wq[0] = alloc_workqueue("io_ring-wq",
- WQ_UNBOUND | WQ_FREEZABLE,
- min(ctx->sq_entries - 1, 2 * num_online_cpus()));
- if (!ctx->sqo_wq[0]) {
- ret = -ENOMEM;
- goto err;
- }
-
- /*
- * This is for buffered writes, where we want to limit the parallelism
- * due to file locking in file systems. As "normal" buffered writes
- * should parellelize on writeout quite nicely, limit us to having 2
- * pending. This avoids massive contention on the inode when doing
- * buffered async writes.
- */
- ctx->sqo_wq[1] = alloc_workqueue("io_ring-write-wq",
- WQ_UNBOUND | WQ_FREEZABLE, 2);
- if (!ctx->sqo_wq[1]) {
- ret = -ENOMEM;
+ data.mm = ctx->sqo_mm;
+ data.user = ctx->user;
+ data.creds = ctx->creds;
+ data.get_work = io_get_work;
+ data.put_work = io_put_work;
+
+ /* Do QD, or 4 * CPUS, whatever is smallest */
+ concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
+ ctx->io_wq = io_wq_create(concurrency, &data);
+ if (IS_ERR(ctx->io_wq)) {
+ ret = PTR_ERR(ctx->io_wq);
+ ctx->io_wq = NULL;
goto err;
}
@@ -3551,6 +4361,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_unaccount_mem(ctx->user,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
+ put_cred(ctx->creds);
+ kfree(ctx->completions);
+ kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx);
}
@@ -3589,8 +4402,15 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
io_kill_timeouts(ctx);
io_poll_remove_all(ctx);
+
+ if (ctx->io_wq)
+ io_wq_cancel_all(ctx->io_wq);
+
io_iopoll_reap_events(ctx);
- wait_for_completion(&ctx->ctx_done);
+ /* if we failed setting up the ctx, we might not have any rings */
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+ wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx);
}
@@ -3603,12 +4423,58 @@ static int io_uring_release(struct inode *inode, struct file *file)
return 0;
}
-static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+static void io_uring_cancel_files(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct io_kiocb *req;
+ DEFINE_WAIT(wait);
+
+ while (!list_empty_careful(&ctx->inflight_list)) {
+ struct io_kiocb *cancel_req = NULL;
+
+ spin_lock_irq(&ctx->inflight_lock);
+ list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
+ if (req->work.files != files)
+ continue;
+ /* req is being completed, ignore */
+ if (!refcount_inc_not_zero(&req->refs))
+ continue;
+ cancel_req = req;
+ break;
+ }
+ if (cancel_req)
+ prepare_to_wait(&ctx->inflight_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&ctx->inflight_lock);
+
+ /* We need to keep going until we don't find a matching req */
+ if (!cancel_req)
+ break;
+
+ io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ io_put_req(cancel_req);
+ schedule();
+ }
+ finish_wait(&ctx->inflight_wait, &wait);
+}
+
+static int io_uring_flush(struct file *file, void *data)
{
- loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT;
- unsigned long sz = vma->vm_end - vma->vm_start;
struct io_ring_ctx *ctx = file->private_data;
- unsigned long pfn;
+
+ io_uring_cancel_files(ctx, data);
+ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
+ io_cqring_overflow_flush(ctx, true);
+ io_wq_cancel_all(ctx->io_wq);
+ }
+ return 0;
+}
+
+static void *io_uring_validate_mmap_request(struct file *file,
+ loff_t pgoff, size_t sz)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+ loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;
@@ -3621,17 +4487,59 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
ptr = ctx->sq_sqes;
break;
default:
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
page = virt_to_head_page(ptr);
if (sz > page_size(page))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+
+ return ptr;
+}
+
+#ifdef CONFIG_MMU
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t sz = vma->vm_end - vma->vm_start;
+ unsigned long pfn;
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
+#else /* !CONFIG_MMU */
+
+static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
+}
+
+static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
+}
+
+static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ void *ptr;
+
+ ptr = io_uring_validate_mmap_request(file, pgoff, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ return (unsigned long) ptr;
+}
+
+#endif /* !CONFIG_MMU */
+
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const sigset_t __user *, sig,
size_t, sigsz)
@@ -3664,14 +4572,20 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
+ if (!list_empty_careful(&ctx->cq_overflow_list))
+ io_cqring_overflow_flush(ctx, false);
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- to_submit = min(to_submit, ctx->sq_entries);
+ struct mm_struct *cur_mm;
+ to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock);
- submitted = io_ring_submit(ctx, to_submit);
+ /* already have mm, so io_submit_sqes() won't try to grab it */
+ cur_mm = ctx->sqo_mm;
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
+ &cur_mm, false);
mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
@@ -3694,7 +4608,12 @@ out_fput:
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
+ .flush = io_uring_flush,
.mmap = io_uring_mmap,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = io_uring_nommu_get_unmapped_area,
+ .mmap_capabilities = io_uring_nommu_mmap_capabilities,
+#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
};
@@ -3725,12 +4644,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
- if (size == SIZE_MAX)
+ if (size == SIZE_MAX) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -EOVERFLOW;
+ }
ctx->sq_sqes = io_mem_alloc(size);
- if (!ctx->sq_sqes)
+ if (!ctx->sq_sqes) {
+ io_mem_free(ctx->rings);
+ ctx->rings = NULL;
return -ENOMEM;
+ }
return 0;
}
@@ -3793,10 +4718,23 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
* Use twice as many entries for the CQ ring. It's possible for the
* application to drive a higher depth than the size of the SQ ring,
* since the sqes are only used at submission time. This allows for
- * some flexibility in overcommitting a bit.
+ * some flexibility in overcommitting a bit. If the application has
+ * set IORING_SETUP_CQSIZE, it will have passed in the desired number
+ * of CQ ring entries manually.
*/
p->sq_entries = roundup_pow_of_two(entries);
- p->cq_entries = 2 * p->sq_entries;
+ if (p->flags & IORING_SETUP_CQSIZE) {
+ /*
+ * If IORING_SETUP_CQSIZE is set, we do the same roundup
+ * to a power-of-two, if it isn't already. We do NOT impose
+ * the usual 2x CQ vs SQ ring sizing, only that the CQ ring
+ * is at least as large as the SQ ring.
+ */
+ if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
+ return -EINVAL;
+ p->cq_entries = roundup_pow_of_two(p->cq_entries);
+ } else {
+ p->cq_entries = 2 * p->sq_entries;
+ }
user = get_uid(current_user());
account_mem = !capable(CAP_IPC_LOCK);
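For reference, the new IORING_SETUP_CQSIZE flag above is driven from userspace roughly as below (a hedged sketch using the raw syscall; the entry counts are arbitrary and <linux/io_uring.h>, <sys/syscall.h>, <string.h>, <stdio.h> are assumed):

	struct io_uring_params p;
	int ring_fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQSIZE;
	p.cq_entries = 8192;	/* must be >= the (rounded-up) SQ size */

	ring_fd = syscall(__NR_io_uring_setup, 1024, &p);
	if (ring_fd < 0)
		perror("io_uring_setup");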
@@ -3821,6 +4759,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
ctx->compat = in_compat_syscall();
ctx->account_mem = account_mem;
ctx->user = user;
+ ctx->creds = prepare_creds();
ret = io_allocate_scq_urings(ctx, p);
if (ret)
@@ -3855,7 +4794,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret < 0)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP;
+ p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
+ trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
io_ring_ctx_wait_and_kill(ctx);
@@ -3881,7 +4821,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
}
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
- IORING_SETUP_SQ_AFF))
+ IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
return -EINVAL;
ret = io_uring_create(entries, &p);
@@ -3925,7 +4865,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* no new references will come in after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
- wait_for_completion(&ctx->ctx_done);
+ wait_for_completion(&ctx->completions[0]);
mutex_lock(&ctx->uring_lock);
switch (opcode) {
@@ -3947,6 +4887,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_sqe_files_unregister(ctx);
break;
+ case IORING_REGISTER_FILES_UPDATE:
+ ret = io_sqe_files_update(ctx, arg, nr_args);
+ break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
@@ -3965,7 +4908,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
}
/* bring the ctx back to life */
- reinit_completion(&ctx->ctx_done);
+ reinit_completion(&ctx->completions[0]);
percpu_ref_reinit(&ctx->refs);
return ret;
}
@@ -3990,6 +4933,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
+ ctx->cq_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fef3a6bf7c78..2f5e4e5b97e1 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
+#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
@@ -174,10 +175,9 @@ static int fiemap_check_ranges(struct super_block *sb,
return 0;
}
-static int ioctl_fiemap(struct file *filp, unsigned long arg)
+static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
struct fiemap fiemap;
- struct fiemap __user *ufiemap = (struct fiemap __user *) arg;
struct fiemap_extent_info fieinfo = { 0, };
struct inode *inode = file_inode(filp);
struct super_block *sb = inode->i_sb;
@@ -244,7 +244,8 @@ fdput:
return ret;
}
-static long ioctl_file_clone_range(struct file *file, void __user *argp)
+static long ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
@@ -466,7 +467,7 @@ EXPORT_SYMBOL(generic_block_fiemap);
* Only the l_start, l_len and l_whence fields of the 'struct space_resv'
* are used here, rest are ignored.
*/
-int ioctl_preallocate(struct file *filp, void __user *argp)
+int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
struct inode *inode = file_inode(filp);
struct space_resv sr;
@@ -487,9 +488,39 @@ int ioctl_preallocate(struct file *filp, void __user *argp)
return -EINVAL;
}
- return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+ return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start,
+ sr.l_len);
}
+/* on ia32 l_start is on a 32-bit boundary */
+#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
+/* just account for different alignment */
+int compat_ioctl_preallocate(struct file *file, int mode,
+ struct space_resv_32 __user *argp)
+{
+ struct inode *inode = file_inode(file);
+ struct space_resv_32 sr;
+
+ if (copy_from_user(&sr, argp, sizeof(sr)))
+ return -EFAULT;
+
+ switch (sr.l_whence) {
+ case SEEK_SET:
+ break;
+ case SEEK_CUR:
+ sr.l_start += file->f_pos;
+ break;
+ case SEEK_END:
+ sr.l_start += i_size_read(inode);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
+}
+#endif
+
static int file_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -503,7 +534,12 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
return put_user(i_size_read(inode) - filp->f_pos, p);
case FS_IOC_RESVSP:
case FS_IOC_RESVSP64:
- return ioctl_preallocate(filp, p);
+ return ioctl_preallocate(filp, 0, p);
+ case FS_IOC_UNRESVSP:
+ case FS_IOC_UNRESVSP64:
+ return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p);
+ case FS_IOC_ZERO_RANGE:
+ return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p);
}
return vfs_ioctl(filp, cmd, arg);
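In userspace terms, the two new cases make the legacy XFS-style ioctls behave like fallocate(). A hedged sketch of punching a hole through that interface (fd and the offsets are placeholders; struct space_resv and the FS_IOC_* numbers are assumed to come from the filesystem uapi headers):

	struct space_resv sr = {
		.l_whence = SEEK_SET,
		.l_start  = 4 * 1024 * 1024,	/* hole starts at 4 MiB */
		.l_len    = 1 * 1024 * 1024,	/* and is 1 MiB long */
	};

	/* roughly fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, ...) */
	if (ioctl(fd, FS_IOC_UNRESVSP64, &sr) < 0)
		perror("FS_IOC_UNRESVSP64");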
@@ -584,9 +620,9 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
-static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
+static int ioctl_file_dedupe_range(struct file *file,
+ struct file_dedupe_range __user *argp)
{
- struct file_dedupe_range __user *argp = arg;
struct file_dedupe_range *same = NULL;
int ret;
unsigned long size;
@@ -635,7 +671,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
unsigned long arg)
{
int error = 0;
- int __user *argp = (int __user *)arg;
+ void __user *argp = (void __user *)arg;
struct inode *inode = file_inode(filp);
switch (cmd) {
@@ -674,13 +710,13 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
break;
case FS_IOC_FIEMAP:
- return ioctl_fiemap(filp, arg);
+ return ioctl_fiemap(filp, argp);
case FIGETBSZ:
/* anon_bdev filesystems may not have a block size */
if (!inode->i_sb->s_blocksize)
return -EINVAL;
- return put_user(inode->i_sb->s_blocksize, argp);
+ return put_user(inode->i_sb->s_blocksize, (int __user *)argp);
case FICLONE:
return ioctl_file_clone(filp, arg, 0, 0, 0);
@@ -719,3 +755,37 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
return ksys_ioctl(fd, cmd, arg);
}
+
+#ifdef CONFIG_COMPAT
+/**
+ * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ *
+ * This is not normally called as a function, but instead set in struct
+ * file_operations as
+ *
+ * .compat_ioctl = compat_ptr_ioctl,
+ *
+ * On most architectures, compat_ptr_ioctl() just passes all arguments
+ * to the corresponding ->ioctl handler. The exception is arch/s390, where
+ * compat_ptr() clears the top bit of a 32-bit pointer value, so user space
+ * pointers to the second 2GB alias the first 2GB, as is the case for
+ * native 32-bit s390 user space.
+ *
+ * The compat_ptr_ioctl() function must therefore be used only with ioctl
+ * functions that either ignore the argument or pass a pointer to a
+ * compatible data type.
+ *
+ * If any ioctl command handled by fops->unlocked_ioctl passes a plain
+ * integer instead of a pointer, or any of the passed data types
+ * is incompatible between 32-bit and 64-bit architectures, a proper
+ * handler is required instead of compat_ptr_ioctl.
+ */
+long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (!file->f_op->unlocked_ioctl)
+ return -ENOIOCTLCMD;
+
+ return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+EXPORT_SYMBOL(compat_ptr_ioctl);
+#endif
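As the kerneldoc above says, the helper is meant to be plugged straight into a driver's file_operations rather than called directly; a minimal hypothetical wiring (all foo_* names are invented for illustration):

	static long foo_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
	{
		/* every command takes a pointer to a layout-compatible struct */
		return 0;
	}

	static const struct file_operations foo_fops = {
		.owner		= THIS_MODULE,
		.unlocked_ioctl	= foo_ioctl,
		.compat_ioctl	= compat_ptr_ioctl,
	};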
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 93cd11938bf5..eef2722d93a1 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -3,13 +3,15 @@
# Copyright (c) 2019 Oracle.
# All Rights Reserved.
#
-obj-$(CONFIG_FS_IOMAP) += iomap.o
-iomap-y += \
- apply.o \
- buffered-io.o \
- direct-io.o \
- fiemap.o \
- seek.o
+ccflags-y += -I $(srctree)/$(src) # needed for trace events
+
+obj-$(CONFIG_FS_IOMAP) += iomap.o
+iomap-y += trace.o \
+ apply.o \
+ buffered-io.o \
+ direct-io.o \
+ fiemap.o \
+ seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 54c02aecf3cd..76925b40b5fd 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -7,6 +7,7 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
+#include "trace.h"
/*
* Execute a iomap write on a segment of the mapping that spans a
@@ -23,8 +24,12 @@ loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
- struct iomap iomap = { 0 };
+ struct iomap iomap = { .type = IOMAP_HOLE };
+ struct iomap srcmap = { .type = IOMAP_HOLE };
loff_t written = 0, ret;
+ u64 end;
+
+ trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);
/*
* Need to map a range from start position for length bytes. This can
@@ -38,7 +43,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* expose transient stale data. If the reserve fails, we can safely
* back out at this point as there is nothing to undo.
*/
- ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+ ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
if (ret)
return ret;
if (WARN_ON(iomap.offset > pos))
@@ -46,19 +51,34 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
if (WARN_ON(iomap.length == 0))
return -EIO;
+ trace_iomap_apply_dstmap(inode, &iomap);
+ if (srcmap.type != IOMAP_HOLE)
+ trace_iomap_apply_srcmap(inode, &srcmap);
+
/*
* Cut down the length to the one actually provided by the filesystem,
* as it might not be able to give us the whole size that we requested.
*/
- if (iomap.offset + iomap.length < pos + length)
- length = iomap.offset + iomap.length - pos;
+ end = iomap.offset + iomap.length;
+ if (srcmap.type != IOMAP_HOLE)
+ end = min(end, srcmap.offset + srcmap.length);
+ if (pos + length > end)
+ length = end - pos;
/*
- * Now that we have guaranteed that the space allocation will succeed.
+ * Now that we have guaranteed that the space allocation will succeed,
* we can do the copy-in page by page without having to worry about
* failures exposing transient data.
+ *
+ * To support COW operations, we read in data for partially written
+ * blocks from the srcmap if the file system filled it in. In that case
+ * the length needs to be limited to the earlier of the ends of the iomaps.
+ * If the file system did not provide a srcmap we pass in the normal
+ * iomap into the actors so that they don't need to have special
+ * handling for the two cases.
*/
- written = actor(inode, pos, length, data, &iomap);
+ written = actor(inode, pos, length, data, &iomap,
+ srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
/*
* Now the data has been copied, commit the range we've copied. This
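The dstmap/srcmap split gives ->iomap_begin a way to describe copy-on-write: the destination extent goes in @iomap and the data source in @srcmap. A hedged sketch of what a COW-capable iomap_begin instance might do (all foo_* helpers are hypothetical):

	static int foo_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			unsigned flags, struct iomap *iomap, struct iomap *srcmap)
	{
		if ((flags & IOMAP_WRITE) && foo_range_is_shared(inode, pos, length)) {
			/* read old data from the shared extent ... */
			foo_fill_iomap(inode, srcmap, pos, length);
			/* ... and write new data to a freshly allocated one */
			return foo_alloc_cow_extent(inode, iomap, pos, length);
		}

		/* no COW: leave srcmap as IOMAP_HOLE so the core uses iomap for both */
		return foo_fill_iomap(inode, iomap, pos, length);
	}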
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e25901ae3ff4..d33c7bc5ee92 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (C) 2016-2019 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,13 +12,34 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
+#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "trace.h"
#include "../internal.h"
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+ atomic_t read_count;
+ atomic_t write_count;
+ DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+ if (page_has_private(page))
+ return (struct iomap_page *)page_private(page);
+ return NULL;
+}
+
+static struct bio_set iomap_ioend_bioset;
+
static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
@@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
SetPageUptodate(page);
}
+static inline bool iomap_block_needs_zeroing(struct inode *inode,
+ struct iomap *iomap, loff_t pos)
+{
+ return iomap->type != IOMAP_MAPPED ||
+ (iomap->flags & IOMAP_F_NEW) ||
+ pos >= i_size_read(inode);
+}
+
static loff_t
iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
struct page *page = ctx->cur_page;
@@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (plen == 0)
goto done;
- if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+ if (iomap_block_needs_zeroing(inode, iomap, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
@@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
unsigned poff;
loff_t ret;
+ trace_iomap_readpage(page->mapping->host, 1);
+
for (poff = 0; poff < PAGE_SIZE; poff += ret) {
ret = iomap_apply(inode, page_offset(page) + poff,
PAGE_SIZE - poff, 0, ops, &ctx,
@@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
static loff_t
iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
loff_t done, ret;
@@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page_in_bio = false;
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
- ctx, iomap);
+ ctx, iomap, srcmap);
}
return done;
@@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
loff_t last = page_offset(list_entry(pages->next, struct page, lru));
loff_t length = last - pos + PAGE_SIZE, ret = 0;
+ trace_iomap_readpages(mapping->host, nr_pages);
+
while (length > 0) {
ret = iomap_apply(mapping->host, pos, length, 0, ops,
&ctx, iomap_readpages_actor);
@@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
+ trace_iomap_releasepage(page->mapping->host, page, 0, 0);
+
/*
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
@@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
+ trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
+
/*
* If we are invalidating the entire page, clear the dirty state from it
* and release it to avoid unnecessary buildup of the LRU.
@@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */
+enum {
+ IOMAP_WRITE_F_UNSHARE = (1 << 0),
+};
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
}
static int
-iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
- unsigned poff, unsigned plen, unsigned from, unsigned to,
- struct iomap *iomap)
+iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
+ unsigned plen, struct iomap *iomap)
{
struct bio_vec bvec;
struct bio bio;
- if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
- zero_user_segments(page, poff, from, to, poff + plen);
- iomap_set_range_uptodate(page, poff, plen);
- return 0;
- }
-
bio_init(&bio, &bvec, 1);
bio.bi_opf = REQ_OP_READ;
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
@@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
}
static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
- struct page *page, struct iomap *iomap)
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
+ struct page *page, struct iomap *srcmap)
{
struct iomap_page *iop = iomap_page_create(inode, page);
loff_t block_size = i_blocksize(inode);
loff_t block_start = pos & ~(block_size - 1);
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
- int status = 0;
+ int status;
if (PageUptodate(page))
return 0;
@@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
if (plen == 0)
break;
- if ((from > poff && from < poff + plen) ||
- (to > poff && to < poff + plen)) {
- status = iomap_read_page_sync(inode, block_start, page,
- poff, plen, from, to, iomap);
- if (status)
- break;
+ if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
+ (from <= poff || from >= poff + plen) &&
+ (to <= poff || to >= poff + plen))
+ continue;
+
+ if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
+ if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
+ return -EIO;
+ zero_user_segments(page, poff, from, to, poff + plen);
+ iomap_set_range_uptodate(page, poff, plen);
+ continue;
}
+ status = iomap_read_page_sync(block_start, page, poff, plen,
+ srcmap);
+ if (status)
+ return status;
} while ((block_start += plen) < block_end);
- return status;
+ return 0;
}
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, struct iomap *iomap)
+ struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
- pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status = 0;
BUG_ON(pos + len > iomap->offset + iomap->length);
+ if (srcmap != iomap)
+ BUG_ON(pos + len > srcmap->offset + srcmap->length);
if (fatal_signal_pending(current))
return -EINTR;
@@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
return status;
}
- page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+ page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+ AOP_FLAG_NOFS);
if (!page) {
status = -ENOMEM;
goto out_no_page;
}
- if (iomap->type == IOMAP_INLINE)
- iomap_read_inline_data(inode, page, iomap);
+ if (srcmap->type == IOMAP_INLINE)
+ iomap_read_inline_data(inode, page, srcmap);
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
- status = __block_write_begin_int(page, pos, len, NULL, iomap);
+ status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(inode, pos, len, page, iomap);
+ status = __iomap_write_begin(inode, pos, len, flags, page,
+ srcmap);
if (unlikely(status))
goto out_unlock;
@@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
static int
__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page, struct iomap *iomap)
+ unsigned copied, struct page *page)
{
flush_dcache_page(page);
@@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
}
static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page, struct iomap *iomap)
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
+ struct page *page, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
loff_t old_size = inode->i_size;
int ret;
- if (iomap->type == IOMAP_INLINE) {
+ if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
- } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+ } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
page, NULL);
} else {
- ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+ ret = __iomap_write_end(inode, pos, len, copied, page);
}
/*
@@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct iov_iter *i = data;
long status = 0;
ssize_t written = 0;
- unsigned int flags = AOP_FLAG_NOFS;
do {
struct page *page;
@@ -771,8 +816,8 @@ again:
break;
}
- status = iomap_write_begin(inode, pos, bytes, flags, &page,
- iomap);
+ status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
+ srcmap);
if (unlikely(status))
break;
@@ -783,8 +828,8 @@ again:
flush_dcache_page(page);
- status = iomap_write_end(inode, pos, bytes, copied, page,
- iomap);
+ status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+ srcmap);
if (unlikely(status < 0))
break;
copied = status;
@@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
-static struct page *
-__iomap_read_page(struct inode *inode, loff_t offset)
-{
- struct address_space *mapping = inode->i_mapping;
- struct page *page;
-
- page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
- if (IS_ERR(page))
- return page;
- if (!PageUptodate(page)) {
- put_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
-}
-
static loff_t
-iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+ struct iomap *iomap, struct iomap *srcmap)
{
long status = 0;
ssize_t written = 0;
- do {
- struct page *page, *rpage;
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
-
- offset = offset_in_page(pos);
- bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+ /* don't bother with blocks that are not shared to start with */
+ if (!(iomap->flags & IOMAP_F_SHARED))
+ return length;
+ /* don't bother with holes or unwritten extents */
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+ return length;
- rpage = __iomap_read_page(inode, pos);
- if (IS_ERR(rpage))
- return PTR_ERR(rpage);
+ do {
+ unsigned long offset = offset_in_page(pos);
+ unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+ struct page *page;
status = iomap_write_begin(inode, pos, bytes,
- AOP_FLAG_NOFS, &page, iomap);
- put_page(rpage);
+ IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
if (unlikely(status))
return status;
- WARN_ON_ONCE(!PageUptodate(page));
-
- status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+ status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
+ srcmap);
if (unlikely(status <= 0)) {
if (WARN_ON_ONCE(status == 0))
return -EIO;
@@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
int
-iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops)
{
loff_t ret;
while (len) {
ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
- iomap_dirty_actor);
+ iomap_unshare_actor);
if (ret <= 0)
return ret;
pos += ret;
@@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(iomap_file_dirty);
+EXPORT_SYMBOL_GPL(iomap_file_unshare);
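Filesystems would typically call the renamed helper from an fallocate(FALLOC_FL_UNSHARE_RANGE) path to break block sharing over a range; a hedged fragment (foo_iomap_ops and the labels are hypothetical):

	if (mode & FALLOC_FL_UNSHARE_RANGE) {
		error = iomap_file_unshare(inode, offset, len, &foo_iomap_ops);
		if (error)
			goto out_unlock;
	}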
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
- unsigned bytes, struct iomap *iomap)
+ unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page;
int status;
- status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
- iomap);
+ status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
if (status)
return status;
zero_user(page, offset, bytes);
mark_page_accessed(page);
- return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+ return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
bool *did_zero = data;
loff_t written = 0;
int status;
/* already zeroed? we're done. */
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return count;
do {
@@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);
else
- status = iomap_zero(inode, pos, offset, bytes, iomap);
+ status = iomap_zero(inode, pos, offset, bytes, iomap,
+ srcmap);
if (status < 0)
return status;
@@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page = data;
int ret;
@@ -1040,20 +1067,19 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
lock_page(page);
size = i_size_read(inode);
- if ((page->mapping != inode->i_mapping) ||
- (page_offset(page) > size)) {
+ offset = page_offset(page);
+ if (page->mapping != inode->i_mapping || offset > size) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
goto out_unlock;
}
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_SHIFT) > size)
+ if (offset > size - PAGE_SIZE)
length = offset_in_page(size);
else
length = PAGE_SIZE;
- offset = page_offset(page);
while (length > 0) {
ret = iomap_apply(inode, offset, length,
IOMAP_WRITE | IOMAP_FAULT, ops, page,
@@ -1071,3 +1097,551 @@ out_unlock:
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+static void
+iomap_finish_page_writeback(struct inode *inode, struct page *page,
+ int error)
+{
+ struct iomap_page *iop = to_iomap_page(page);
+
+ if (error) {
+ SetPageError(page);
+ mapping_set_error(inode->i_mapping, -EIO);
+ }
+
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
+
+ if (!iop || atomic_dec_and_test(&iop->write_count))
+ end_page_writeback(page);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+static void
+iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);
+
+ for (bio = &ioend->io_inline_bio; bio; bio = next) {
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
+
+ /*
+ * For the last bio, bi_private points to the ioend, so we
+ * need to explicitly end the iteration here.
+ */
+ if (bio == last)
+ next = NULL;
+ else
+ next = bio->bi_private;
+
+ /* walk each page on bio, ending page IO on them */
+ bio_for_each_segment_all(bv, bio, iter_all)
+ iomap_finish_page_writeback(inode, bv->bv_page, error);
+ bio_put(bio);
+ }
+
+ if (unlikely(error && !quiet)) {
+ printk_ratelimited(KERN_ERR
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino, ioend->io_offset,
+ start);
+ }
+}
+
+void
+iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+
+ list_replace_init(&ioend->io_list, &tmp);
+ iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
+{
+ if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ return false;
+ if ((ioend->io_flags & IOMAP_F_SHARED) ^
+ (next->io_flags & IOMAP_F_SHARED))
+ return false;
+ if ((ioend->io_type == IOMAP_UNWRITTEN) ^
+ (next->io_type == IOMAP_UNWRITTEN))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ return true;
+}
+
+void
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
+ void (*merge_private)(struct iomap_ioend *ioend,
+ struct iomap_ioend *next))
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ if (next->io_private && merge_private)
+ merge_private(ioend, next);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int
+iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void
+iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
+static void iomap_writepage_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = bio->bi_private;
+
+ iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+}
+
+/*
+ * Submit the final bio for an ioend.
+ *
+ * If @error is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the bio instead of
+ * submitting it. This typically only happens on a filesystem shutdown.
+ */
+static int
+iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
+ int error)
+{
+ ioend->io_bio->bi_private = ioend;
+ ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+
+ if (wpc->ops->prepare_ioend)
+ error = wpc->ops->prepare_ioend(ioend, error);
+ if (error) {
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ ioend->io_bio->bi_status = errno_to_blk_status(error);
+ bio_endio(ioend->io_bio);
+ return error;
+ }
+
+ submit_bio(ioend->io_bio);
+ return 0;
+}
+
+static struct iomap_ioend *
+iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
+ loff_t offset, sector_t sector, struct writeback_control *wbc)
+{
+ struct iomap_ioend *ioend;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
+ bio_set_dev(bio, wpc->iomap.bdev);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
+
+ ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_type = wpc->iomap.type;
+ ioend->io_flags = wpc->iomap.flags;
+ ioend->io_inode = inode;
+ ioend->io_size = 0;
+ ioend->io_offset = offset;
+ ioend->io_private = NULL;
+ ioend->io_bio = bio;
+ return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in iomap_finish_ioend().
+ */
+static struct bio *
+iomap_chain_bio(struct bio *prev)
+{
+ struct bio *new;
+
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+ bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new->bi_iter.bi_sector = bio_end_sector(prev);
+ new->bi_opf = prev->bi_opf;
+ new->bi_write_hint = prev->bi_write_hint;
+
+ bio_chain(prev, new);
+ bio_get(prev); /* for iomap_finish_ioend */
+ submit_bio(prev);
+ return new;
+}
+
+static bool
+iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
+ sector_t sector)
+{
+ if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
+ (wpc->ioend->io_flags & IOMAP_F_SHARED))
+ return false;
+ if (wpc->iomap.type != wpc->ioend->io_type)
+ return false;
+ if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+ return false;
+ if (sector != bio_end_sector(wpc->ioend->io_bio))
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first, otherwise finish off the current ioend and start another.
+ */
+static void
+iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
+ struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct list_head *iolist)
+{
+ sector_t sector = iomap_sector(&wpc->iomap, offset);
+ unsigned len = i_blocksize(inode);
+ unsigned poff = offset & (PAGE_SIZE - 1);
+ bool merged, same_page = false;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
+ if (wpc->ioend)
+ list_add(&wpc->ioend->io_list, iolist);
+ wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
+ }
+
+ merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+ &same_page);
+ if (iop && !same_page)
+ atomic_inc(&iop->write_count);
+
+ if (!merged) {
+ if (bio_full(wpc->ioend->io_bio, len)) {
+ wpc->ioend->io_bio =
+ iomap_chain_bio(wpc->ioend->io_bio);
+ }
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ }
+
+ wpc->ioend->io_size += len;
+ wbc_account_cgroup_owner(wbc, page, len);
+}
+
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding blocks to is cached on the writepage context, and if the new block
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct inode *inode,
+ struct page *page, u64 end_offset)
+{
+ struct iomap_page *iop = to_iomap_page(page);
+ struct iomap_ioend *ioend, *next;
+ unsigned len = i_blocksize(inode);
+ u64 file_offset; /* file offset of page */
+ int error = 0, count = 0, i;
+ LIST_HEAD(submit_list);
+
+ WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+ WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
+
+ /*
+ * Walk through the page to find areas to write back. If we run off the
+ * end of the current map or find the current map invalid, grab a new
+ * one.
+ */
+ for (i = 0, file_offset = page_offset(page);
+ i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
+ i++, file_offset += len) {
+ if (iop && !test_bit(i, iop->uptodate))
+ continue;
+
+ error = wpc->ops->map_blocks(wpc, inode, file_offset);
+ if (error)
+ break;
+ if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
+ continue;
+ if (wpc->iomap.type == IOMAP_HOLE)
+ continue;
+ iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
+ &submit_list);
+ count++;
+ }
+
+ WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
+ WARN_ON_ONCE(!PageLocked(page));
+ WARN_ON_ONCE(PageWriteback(page));
+
+ /*
+ * We cannot cancel the ioend directly here on error. We may have
+ * already set other pages under writeback and hence we have to run I/O
+ * completion to mark the error state of the pages under writeback
+ * appropriately.
+ */
+ if (unlikely(error)) {
+ if (!count) {
+ /*
+ * If the current page hasn't been added to ioend, it
+ * won't be affected by I/O completions and we must
+ * discard and unlock it right here.
+ */
+ if (wpc->ops->discard_page)
+ wpc->ops->discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ goto done;
+ }
+
+ /*
+ * If the page was not fully cleaned, we need to ensure that the
+ * higher layers come back to it correctly. That means we need
+ * to keep the page dirty, and for WB_SYNC_ALL writeback we need
+ * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
+ * so another attempt to write this page in this writeback sweep
+ * will be made.
+ */
+ set_page_writeback_keepwrite(page);
+ } else {
+ clear_page_dirty_for_io(page);
+ set_page_writeback(page);
+ }
+
+ unlock_page(page);
+
+ /*
+ * Preserve the original error if there was one, otherwise catch
+ * submission errors here and propagate into subsequent ioend
+ * submissions.
+ */
+ list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+ int error2;
+
+ list_del_init(&ioend->io_list);
+ error2 = iomap_submit_ioend(wpc, ioend, error);
+ if (error2 && !error)
+ error = error2;
+ }
+
+ /*
+ * We can end up here with no error and nothing to write only if we race
+ * with a partial page truncate on a sub-page block sized filesystem.
+ */
+ if (!count)
+ end_page_writeback(page);
+done:
+ mapping_set_error(page->mapping, error);
+ return error;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ */
+static int
+iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
+{
+ struct iomap_writepage_ctx *wpc = data;
+ struct inode *inode = page->mapping->host;
+ pgoff_t end_index;
+ u64 end_offset;
+ loff_t offset;
+
+ trace_iomap_writepage(inode, page, 0, 0);
+
+ /*
+ * Refuse to write the page out if we are called from reclaim context.
+ *
+ * This avoids stack overflows when called from deeply used stacks in
+ * random callers for direct reclaim or memcg reclaim. We explicitly
+ * allow reclaim from kswapd as the stack usage there is relatively low.
+ *
+ * This should never happen except in the case of a VM regression so
+ * warn about it.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ goto redirty;
+
+ /*
+ * Given that we do not allow direct reclaim to call us, we should
+ * never be called in a recursive filesystem reclaim context.
+ */
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
+ goto redirty;
+
+ /*
+ * Is this page beyond the end of the file?
+ *
+ * If the page index is less than end_index, adjust end_offset to the
+ * highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_SHIFT;
+ if (page->index < end_index)
+ end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out is entirely beyond i_size
+ * or straddles it.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
+ unsigned offset_into_page = offset & (PAGE_SIZE - 1);
+
+ /*
+ * Skip the page if it is fully outside i_size, e.g. due to a
+ * truncate operation that is in progress. We must redirty the
+ * page so that reclaim stops reclaiming it. Otherwise
+ * iomap_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that end_index is an unsigned long. If the given offset
+ * is greater than 16TB on a 32-bit system, it would overflow if
+ * we checked whether the page is fully outside i_size via
+ * "if (page->index >= end_index + 1)", as "end_index + 1" would
+ * evaluate to 0. The page would then be redirtied and written
+ * out repeatedly, resulting in an infinite loop, and the user
+ * program performing the operation would hang. Instead, we
+ * verify this situation by checking whether the page to write
+ * is totally beyond i_size or whether its offset is just equal
+ * to the EOF.
+ */
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
+ goto redirty;
+
+ /*
+ * The page straddles i_size. It must be zeroed out on each
+ * and every writepage invocation because it may be mmapped.
+ * "A file is mapped in multiples of the page size. For a file
+ * that is not a multiple of the page size, the remaining
+ * memory is zeroed when mapped, and writes to that region are
+ * not written out to the file."
+ */
+ zero_user_segment(page, offset_into_page, PAGE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
+ }
+
+ return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
+
+redirty:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
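To make the i_size handling above concrete, here is a small worked example using assumed values (4 KiB pages and a hypothetical 10000-byte file); the arithmetic simply follows the code in iomap_do_writepage().

/*
 * Worked example (illustrative values only):
 *   i_size = 10000, PAGE_SIZE = 4096   =>   end_index = 10000 >> 12 = 2
 *
 *   page->index == 1 (fully below EOF):
 *     end_offset = (1 + 1) << 12 = 8192
 *   page->index == 2 (straddles EOF):
 *     offset_into_page = 10000 & 4095 = 1808
 *     bytes 1808..4095 are zeroed, end_offset = 10000
 *   page->index == 3 (fully beyond EOF):
 *     redirtied and skipped; the same happens for index == end_index
 *     when offset_into_page == 0, i.e. when i_size is page aligned
 */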
+
+int
+iomap_writepage(struct page *page, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops)
+{
+ int ret;
+
+ wpc->ops = ops;
+ ret = iomap_do_writepage(page, wbc, wpc);
+ if (!wpc->ioend)
+ return ret;
+ return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepage);
+
+int
+iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops)
+{
+ int ret;
+
+ wpc->ops = ops;
+ ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
+ if (!wpc->ioend)
+ return ret;
+ return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepages);
+
+static int __init iomap_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_inline_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 1fc28c2da279..23837926c0c5 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -201,12 +201,12 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
unsigned int align = iov_iter_alignment(dio->submit.iter);
- struct iov_iter iter;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
int nr_pages, ret = 0;
size_t copied = 0;
+ size_t orig_count;
if ((pos | length | align) & ((1 << blkbits) - 1))
return -EINVAL;
@@ -236,15 +236,18 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
}
/*
- * Operate on a partial iter trimmed to the extent we were called for.
- * We'll update the iter in the dio once we're done with this extent.
+ * Save the original count and trim the iter to just the extent we
+ * are operating on right now. The iter will be re-expanded once
+ * we are done.
*/
- iter = *dio->submit.iter;
- iov_iter_truncate(&iter, length);
+ orig_count = iov_iter_count(dio->submit.iter);
+ iov_iter_truncate(dio->submit.iter, length);
- nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
- if (nr_pages <= 0)
- return nr_pages;
+ nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
+ if (nr_pages <= 0) {
+ ret = nr_pages;
+ goto out;
+ }
if (need_zeroout) {
/* zero out from the start of the block to the write offset */
@@ -257,7 +260,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
size_t n;
if (dio->error) {
iov_iter_revert(dio->submit.iter, copied);
- return 0;
+ copied = ret = 0;
+ goto out;
}
bio = bio_alloc(GFP_KERNEL, nr_pages);
@@ -268,7 +272,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- ret = bio_iov_iter_get_pages(bio, &iter);
+ ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
@@ -294,13 +298,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
bio_set_pages_dirty(bio);
}
- iov_iter_advance(dio->submit.iter, n);
-
dio->size += n;
pos += n;
copied += n;
- nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
+ nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES);
iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
@@ -318,7 +320,12 @@ zero_tail:
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
- return copied ? copied : ret;
+out:
+ /* Undo iter limitation to current extent */
+ iov_iter_reexpand(dio->submit.iter, orig_count - copied);
+ if (copied)
+ return copied;
+ return ret;
}
static loff_t
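The save/trim/re-expand pattern above can be illustrated with assumed numbers (1 MiB of user iter against a 256 KiB extent; the sizes are hypothetical):

/*
 * Illustrative numbers only:
 *   orig_count = 1 MiB remaining in dio->submit.iter
 *   current extent length = 256 KiB
 *
 *   iov_iter_truncate(iter, 256 KiB)  -> iter now exposes 256 KiB
 *   bios consume those pages          -> copied == 256 KiB (less on error)
 *   iov_iter_reexpand(iter, orig_count - copied)
 *                                     -> iter exposes the untouched
 *                                        768 KiB for the next extent
 */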
@@ -358,7 +365,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_dio *dio = data;
@@ -392,15 +399,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
*/
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
- const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
+ const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+ bool wait_for_completion)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
size_t count = iov_iter_count(iter);
- loff_t pos = iocb->ki_pos, start = pos;
+ loff_t pos = iocb->ki_pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
- bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
@@ -409,6 +416,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!count)
return 0;
+ if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
+ return -EIO;
+
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return -ENOMEM;
@@ -430,7 +440,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
- if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
+ if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
@@ -451,14 +461,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_has_page(mapping, start, end)) {
+ if (filemap_range_has_page(mapping, pos, end)) {
ret = -EAGAIN;
goto out_free_dio;
}
flags |= IOMAP_NOWAIT;
}
- ret = filemap_write_and_wait_range(mapping, start, end);
+ ret = filemap_write_and_wait_range(mapping, pos, end);
if (ret)
goto out_free_dio;
@@ -469,7 +479,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
* pretty crazy thing to do, so we don't support it 100%.
*/
ret = invalidate_inode_pages2_range(mapping,
- start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
if (ret)
dio_warn_stale_pagecache(iocb->ki_filp);
ret = 0;
@@ -497,8 +507,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
pos += ret;
- if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
+ /*
+ * We only report that we've read data up to i_size.
+ * Revert iter to a state corresponding to that as
+ * some callers (such as splice code) rely on it.
+ */
+ iov_iter_revert(iter, pos - dio->i_size);
break;
+ }
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index f26fdd36e383..bccf305ea9ce 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
- struct iomap *iomap)
+ struct iomap *iomap, struct iomap *srcmap)
{
struct fiemap_ctx *ctx = data;
loff_t ret = length;
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
static loff_t
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
sector_t *bno = data, addr;
@@ -133,12 +133,16 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
struct inode *inode = mapping->host;
loff_t pos = bno << inode->i_blkbits;
unsigned blocksize = i_blocksize(inode);
+ int ret;
if (filemap_write_and_wait(mapping))
return 0;
bno = 0;
- iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+ ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno,
+ iomap_bmap_actor);
+ if (ret)
+ return 0;
return bno;
}
EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index c04bad4b2b43..89f61d93c0bc 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -119,7 +119,7 @@ out:
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_UNWRITTEN:
@@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole);
static loff_t
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
- void *data, struct iomap *iomap)
+ void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_HOLE:
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index 152a230f668d..a648dbf6991e 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
* distinction between written and unwritten extents.
*/
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
- loff_t count, void *data, struct iomap *iomap)
+ loff_t count, void *data, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct iomap_swapfile_info *isi = data;
int error;
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
new file mode 100644
index 000000000000..da217246b1a9
--- /dev/null
+++ b/fs/iomap/trace.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig
+ */
+#include <linux/iomap.h>
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
new file mode 100644
index 000000000000..6dc227b8c47e
--- /dev/null
+++ b/fs/iomap/trace.h
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2009-2019 Christoph Hellwig
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iomap
+
+#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _IOMAP_TRACE_H
+
+#include <linux/tracepoint.h>
+
+struct inode;
+
+DECLARE_EVENT_CLASS(iomap_readpage_class,
+ TP_PROTO(struct inode *inode, int nr_pages),
+ TP_ARGS(inode, nr_pages),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(int, nr_pages)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->nr_pages = nr_pages;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name) \
+DEFINE_EVENT(iomap_readpage_class, name, \
+ TP_PROTO(struct inode *inode, int nr_pages), \
+ TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(iomap_readpage);
+DEFINE_READPAGE_EVENT(iomap_readpages);
+
+DECLARE_EVENT_CLASS(iomap_page_class,
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+ unsigned int len),
+ TP_ARGS(inode, page, off, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(pgoff_t, pgoff)
+ __field(loff_t, size)
+ __field(unsigned long, offset)
+ __field(unsigned int, length)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pgoff = page_offset(page);
+ __entry->size = i_size_read(inode);
+ __entry->offset = off;
+ __entry->length = len;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+ "length %x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pgoff,
+ __entry->size,
+ __entry->offset,
+ __entry->length)
+)
+
+#define DEFINE_PAGE_EVENT(name) \
+DEFINE_EVENT(iomap_page_class, name, \
+ TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+ unsigned int len), \
+ TP_ARGS(inode, page, off, len))
+DEFINE_PAGE_EVENT(iomap_writepage);
+DEFINE_PAGE_EVENT(iomap_releasepage);
+DEFINE_PAGE_EVENT(iomap_invalidatepage);
+
+#define IOMAP_TYPE_STRINGS \
+ { IOMAP_HOLE, "HOLE" }, \
+ { IOMAP_DELALLOC, "DELALLOC" }, \
+ { IOMAP_MAPPED, "MAPPED" }, \
+ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \
+ { IOMAP_INLINE, "INLINE" }
+
+#define IOMAP_FLAGS_STRINGS \
+ { IOMAP_WRITE, "WRITE" }, \
+ { IOMAP_ZERO, "ZERO" }, \
+ { IOMAP_REPORT, "REPORT" }, \
+ { IOMAP_FAULT, "FAULT" }, \
+ { IOMAP_DIRECT, "DIRECT" }, \
+ { IOMAP_NOWAIT, "NOWAIT" }
+
+#define IOMAP_F_FLAGS_STRINGS \
+ { IOMAP_F_NEW, "NEW" }, \
+ { IOMAP_F_DIRTY, "DIRTY" }, \
+ { IOMAP_F_SHARED, "SHARED" }, \
+ { IOMAP_F_MERGED, "MERGED" }, \
+ { IOMAP_F_BUFFER_HEAD, "BH" }, \
+ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }
+
+DECLARE_EVENT_CLASS(iomap_class,
+ TP_PROTO(struct inode *inode, struct iomap *iomap),
+ TP_ARGS(inode, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(dev_t, bdev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld "
+ "length %llu type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ MAJOR(__entry->bdev), MINOR(__entry->bdev),
+ __entry->addr,
+ __entry->offset,
+ __entry->length,
+ __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+)
+
+#define DEFINE_IOMAP_EVENT(name) \
+DEFINE_EVENT(iomap_class, name, \
+ TP_PROTO(struct inode *inode, struct iomap *iomap), \
+ TP_ARGS(inode, iomap))
+DEFINE_IOMAP_EVENT(iomap_apply_dstmap);
+DEFINE_IOMAP_EVENT(iomap_apply_srcmap);
+
+TRACE_EVENT(iomap_apply,
+ TP_PROTO(struct inode *inode, loff_t pos, loff_t length,
+ unsigned int flags, const void *ops, void *actor,
+ unsigned long caller),
+ TP_ARGS(inode, pos, length, flags, ops, actor, caller),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(loff_t, pos)
+ __field(loff_t, length)
+ __field(unsigned int, flags)
+ __field(const void *, ops)
+ __field(void *, actor)
+ __field(unsigned long, caller)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pos = pos;
+ __entry->length = length;
+ __entry->flags = flags;
+ __entry->ops = ops;
+ __entry->actor = actor;
+ __entry->caller = caller;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) "
+ "ops %ps caller %pS actor %ps",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->length,
+ __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
+ __entry->flags,
+ __entry->ops,
+ (void *)__entry->caller,
+ __entry->actor)
+);
+
+#endif /* _IOMAP_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index a1909066bde6..8fff6677a5da 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -110,7 +110,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd2_space_needed(journal);
+ nblocks = journal->j_max_transaction_buffers;
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
mutex_lock_io(&journal->j_checkpoint_mutex);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 132fb92098c7..7f0b362b3842 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
jbd2_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
jbd2_journal_refile_buffer(journal, jh);
}
@@ -560,8 +560,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_logging = jiffies;
stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
stats.run.rs_logging);
- stats.run.rs_blocks =
- atomic_read(&commit_transaction->t_outstanding_credits);
+ stats.run.rs_blocks = commit_transaction->t_nr_buffers;
stats.run.rs_blocks_logged = 0;
J_ASSERT(commit_transaction->t_nr_buffers <=
@@ -642,8 +641,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/*
* start_this_handle() uses t_outstanding_credits to determine
- * the free space in the log, but this counter is changed
- * by jbd2_journal_next_log_block() also.
+ * the free space in the log.
*/
atomic_dec(&commit_transaction->t_outstanding_credits);
@@ -727,7 +725,6 @@ start_journal_io:
submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
- stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -814,6 +811,7 @@ start_journal_io:
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
/*
* The list contains temporary buffer heads created by
@@ -859,6 +857,7 @@ start_journal_io:
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -880,6 +879,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
+ stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
@@ -888,6 +888,9 @@ start_journal_io:
if (err)
jbd2_journal_abort(journal, err);
+ WARN_ON_ONCE(
+ atomic_read(&commit_transaction->t_outstanding_credits) < 0);
+
/*
* Now disk caches for filesystem device are flushed so we are safe to
* erase checkpointed transactions from the log by updating journal
@@ -918,6 +921,7 @@ restart_loop:
transaction_t *cp_transaction;
struct buffer_head *bh;
int try_to_free = 0;
+ bool drop_ref;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
@@ -927,7 +931,7 @@ restart_loop:
* done with it.
*/
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
@@ -1022,8 +1026,10 @@ restart_loop:
try_to_free = 1;
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop_ref = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
+ if (drop_ref)
+ jbd2_journal_put_journal_head(jh);
if (try_to_free)
release_buffer_page(bh); /* Drops bh reference */
else
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1c58859aa592..5e408ee24a1a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
@@ -405,13 +405,13 @@ repeat:
if (need_copy_out && !done_copy_out) {
char *tmp;
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
return -ENOMEM;
}
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
goto repeat;
@@ -464,7 +464,7 @@ repeat:
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
return do_escape | (done_copy_out << 1);
}
@@ -840,6 +840,7 @@ jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
if (!bh)
return NULL;
+ atomic_dec(&transaction->t_outstanding_credits);
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
header = (journal_header_t *)bh->b_data;
@@ -1098,6 +1099,16 @@ static void jbd2_stats_proc_exit(journal_t *journal)
remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
+/* Minimum size of descriptor tag */
+static int jbd2_min_tag_size(void)
+{
+ /*
+ * A tag with 32-bit block numbers does not use the last four bytes of
+ * the structure.
+ */
+ return sizeof(journal_block_tag_t) - 4;
+}
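A quick worked example of how this minimum feeds the j_wbuf sizing further down in journal_init_common(); the 12-byte size of journal_block_tag_t is an assumption here, only the formula comes from the code.

/*
 * Illustrative arithmetic, assuming sizeof(journal_block_tag_t) == 12
 * and a 4 KiB journal block:
 *   jbd2_min_tag_size()            = 12 - 4 = 8
 *   n = j_blocksize / min tag size = 4096 / 8 = 512
 * so j_wbuf is sized for 512 buffer_heads, enough to describe a fully
 * packed descriptor block even with the smallest possible tags.
 */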
+
/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
@@ -1156,7 +1167,8 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_maxlen = len;
- n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ /* We need enough buffers to write out a full descriptor block. */
+ n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
GFP_KERNEL);
@@ -1488,6 +1500,21 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+}
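For a sense of scale, a worked example with assumed sizes; the 16-byte revoke header is an assumption, the formula itself is the one above.

/*
 * Illustrative arithmetic, assuming a 4 KiB journal block, a 16-byte
 * jbd2_journal_revoke_header_t, no 64-bit feature and no v2/v3 checksums:
 *   space       = 4096 - 16 = 4080
 *   record_size = 4
 *   revoke records per block = 4080 / 4 = 1020
 * With the 64-bit feature the record size doubles to 8, roughly halving
 * the count; the checksum tail shaves off a few more records.
 */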
+
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
@@ -1596,6 +1623,8 @@ static int journal_get_superblock(journal_t *journal)
sizeof(sb->s_uuid));
}
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
set_buffer_verified(bh);
return 0;
@@ -1916,6 +1945,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -1946,6 +1977,8 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
@@ -2410,6 +2443,8 @@ static struct journal_head *journal_alloc_journal_head(void)
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
+ if (ret)
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
@@ -2529,17 +2564,23 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
+
+ /* Unlink before dropping the lock */
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, bh->b_size);
+ jbd2_free(jh->b_frozen_data, b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, bh->b_size);
+ jbd2_free(jh->b_committed_data, b_size);
}
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
journal_free_journal_head(jh);
}
@@ -2557,9 +2598,11 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
+ journal_release_journal_head(jh, bh->b_size);
__brelse(bh);
- } else
+ } else {
jbd_unlock_bh_journal_head(bh);
+ }
}
/*
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f08073d7bbf5..fa608788b93d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -371,6 +371,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
}
#endif
+ if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) {
+ if (!bh_in)
+ brelse(bh);
+ return -EIO;
+ }
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
@@ -391,6 +396,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
__brelse(bh);
}
}
+ handle->h_revoke_credits--;
jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bee8498d7792..27b9f9dee434 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -63,6 +63,28 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
}
/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of the block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
+ tags_per_block);
+}
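Plugging assumed numbers into the formula above (the 12-byte journal_tag_bytes() value and the 8192-buffer transaction limit are assumptions, not values taken from this patch):

/*
 * Illustrative arithmetic, assuming a 4 KiB journal block, no checksums,
 * journal_tag_bytes() == 12 and j_max_transaction_buffers == 8192:
 *   tag_space      = 4096 - 12 (header) - 16 (UUID) = 4068
 *   tags_per_block = (4068 - 16) / 12 = 337
 *   reserved       = 1 + DIV_ROUND_UP(8192, 337) = 1 + 25 = 26
 * i.e. the commit block plus roughly two dozen descriptor blocks per
 * transaction.
 */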
+
+/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
* Simply initialise a new transaction. Initialize it in
@@ -88,7 +110,9 @@ static void jbd2_get_transaction(journal_t *journal,
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
+ jbd2_descriptor_blocks_per_trans(journal) +
atomic_read(&journal->j_reserved_credits));
+ atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -258,12 +282,13 @@ static int add_transaction_credits(journal_t *journal, int blocks,
* *before* starting to dirty potentially checkpointed buffers
* in the new transaction.
*/
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ if (jbd2_log_space_left(journal) < journal->j_max_transaction_buffers) {
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
- if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ if (jbd2_log_space_left(journal) <
+ journal->j_max_transaction_buffers)
__jbd2_log_wait_for_space(journal);
write_unlock(&journal->j_state_lock);
return 1;
@@ -299,12 +324,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- int blocks = handle->h_buffer_credits;
+ int blocks = handle->h_total_credits;
int rsv_blocks = 0;
unsigned long ts = jiffies;
if (handle->h_rsv_handle)
- rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+ rsv_blocks = handle->h_rsv_handle->h_total_credits;
/*
* Limit the number of reserved credits to 1/2 of maximum transaction
@@ -405,6 +430,7 @@ repeat:
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
handle->h_requested_credits = blocks;
+ handle->h_revoke_credits_requested = handle->h_revoke_credits;
handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
@@ -431,15 +457,15 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- handle->h_buffer_credits = nblocks;
+ handle->h_total_credits = nblocks;
handle->h_ref = 1;
return handle;
}
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
- gfp_t gfp_mask, unsigned int type,
- unsigned int line_no)
+ int revoke_records, gfp_t gfp_mask,
+ unsigned int type, unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -453,6 +479,8 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
return handle;
}
+ nblocks += DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
@@ -468,6 +496,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
rsv_handle->h_journal = journal;
handle->h_rsv_handle = rsv_handle;
}
+ handle->h_revoke_credits = revoke_records;
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
@@ -508,16 +537,21 @@ EXPORT_SYMBOL(jbd2__journal_start);
*/
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
+ return jbd2__journal_start(journal, nblocks, 0, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
-void jbd2_journal_free_reserved(handle_t *handle)
+static void __jbd2_journal_unreserve_handle(handle_t *handle)
{
journal_t *journal = handle->h_journal;
WARN_ON(!handle->h_reserved);
- sub_reserved_credits(journal, handle->h_buffer_credits);
+ sub_reserved_credits(journal, handle->h_total_credits);
+}
+
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ __jbd2_journal_unreserve_handle(handle);
jbd2_free_handle(handle);
}
EXPORT_SYMBOL(jbd2_journal_free_reserved);
@@ -571,7 +605,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
handle->h_line_no = line_no;
trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
handle->h_transaction->t_tid, type,
- line_no, handle->h_buffer_credits);
+ line_no, handle->h_total_credits);
return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
@@ -580,6 +614,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
* int jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
+ * @revoke_records: number of revoke records to try to extend by.
*
* Some transactions, such as large extends and truncates, can be done
* atomically all at once or in several stages. The operation requests
@@ -596,7 +631,7 @@ EXPORT_SYMBOL(jbd2_journal_start_reserved);
* return code < 0 implies an error
* return code > 0 implies normal transaction-full status.
*/
-int jbd2_journal_extend(handle_t *handle, int nblocks)
+int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
@@ -618,6 +653,12 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
goto error_out;
}
+ nblocks += DIV_ROUND_UP(
+ handle->h_revoke_credits_requested + revoke_records,
+ journal->j_revoke_records_per_block) -
+ DIV_ROUND_UP(
+ handle->h_revoke_credits_requested,
+ journal->j_revoke_records_per_block);
spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -629,22 +670,16 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
goto unlock;
}
- if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
- jbd2_log_space_left(journal)) {
- jbd_debug(3, "denied handle %p %d blocks: "
- "insufficient log space\n", handle, nblocks);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
- }
-
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
transaction->t_tid,
handle->h_type, handle->h_line_no,
- handle->h_buffer_credits,
+ handle->h_total_credits,
nblocks);
- handle->h_buffer_credits += nblocks;
+ handle->h_total_credits += nblocks;
handle->h_requested_credits += nblocks;
+ handle->h_revoke_credits += revoke_records;
+ handle->h_revoke_credits_requested += revoke_records;
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -655,11 +690,55 @@ error_out:
return result;
}
+static void stop_this_handle(handle_t *handle)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int revokes;
+
+ J_ASSERT(journal_current_handle() == handle);
+ J_ASSERT(atomic_read(&transaction->t_updates) > 0);
+ current->journal_info = NULL;
+ /*
+ * Subtract necessary revoke descriptor blocks from handle credits. We
+ * take care to account only for revoke descriptor blocks the
+ * transaction will really need as large sequences of transactions with
+ * small numbers of revokes are relatively common.
+ */
+ revokes = handle->h_revoke_credits_requested - handle->h_revoke_credits;
+ if (revokes) {
+ int t_revokes, revoke_descriptors;
+ int rr_per_blk = journal->j_revoke_records_per_block;
+
+ WARN_ON_ONCE(DIV_ROUND_UP(revokes, rr_per_blk)
+ > handle->h_total_credits);
+ t_revokes = atomic_add_return(revokes,
+ &transaction->t_outstanding_revokes);
+ revoke_descriptors =
+ DIV_ROUND_UP(t_revokes, rr_per_blk) -
+ DIV_ROUND_UP(t_revokes - revokes, rr_per_blk);
+ handle->h_total_credits -= revoke_descriptors;
+ }
+ atomic_sub(handle->h_total_credits,
+ &transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle)
+ __jbd2_journal_unreserve_handle(handle->h_rsv_handle);
+ if (atomic_dec_and_test(&transaction->t_updates))
+ wake_up(&journal->j_wait_updates);
+
+ rwsem_release(&journal->j_trans_commit_map, _THIS_IP_);
+ /*
+ * Scope of the GFP_NOFS context is over here and so we can restore the
+ * original alloc context.
+ */
+ memalloc_nofs_restore(handle->saved_alloc_context);
+}
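The revoke accounting above only keeps back the descriptor-block credits that this handle's revokes actually push the transaction into needing; a small worked example with an assumed per-block capacity:

/*
 * Illustrative arithmetic, assuming rr_per_blk == 1020:
 *   t_outstanding_revokes before this handle = 1000
 *   revokes used by this handle              = 50
 *   t_revokes          = 1000 + 50 = 1050
 *   revoke_descriptors = DIV_ROUND_UP(1050, 1020)
 *                        - DIV_ROUND_UP(1000, 1020) = 2 - 1 = 1
 * so one descriptor-block credit stays charged to the transaction rather
 * than being returned with the rest of h_total_credits; a handle whose
 * revokes fit in a block already started by earlier handles gives all of
 * its credits back.
 */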
/**
* int jbd2_journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
+ * @revoke_records: number of revoke record credits requested
* @gfp_mask: memory allocation flags (for start_this_handle)
*
* Restart a handle for a multi-transaction filesystem
@@ -672,56 +751,48 @@ error_out:
* credits. We preserve reserved handle if there's any attached to the
* passed in handle.
*/
-int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records,
+ gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal;
tid_t tid;
- int need_to_start, ret;
+ int need_to_start;
+ int ret;
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
journal = transaction->t_journal;
+ tid = transaction->t_tid;
/*
* First unlink the handle from its current transaction, and start the
* commit on that.
*/
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- J_ASSERT(journal_current_handle() == handle);
-
- read_lock(&journal->j_state_lock);
- spin_lock(&transaction->t_handle_lock);
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
- if (handle->h_rsv_handle) {
- sub_reserved_credits(journal,
- handle->h_rsv_handle->h_buffer_credits);
- }
- if (atomic_dec_and_test(&transaction->t_updates))
- wake_up(&journal->j_wait_updates);
- tid = transaction->t_tid;
- spin_unlock(&transaction->t_handle_lock);
+ jbd_debug(2, "restarting handle %p\n", handle);
+ stop_this_handle(handle);
handle->h_transaction = NULL;
- current->journal_info = NULL;
- jbd_debug(2, "restarting handle %p\n", handle);
+ /*
+ * TODO: If we use READ_ONCE / WRITE_ONCE for j_commit_request we can
+ * get rid of pointless j_state_lock traffic like this.
+ */
+ read_lock(&journal->j_state_lock);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
- handle->h_buffer_credits = nblocks;
- /*
- * Restore the original nofs context because the journal restart
- * is basically the same thing as journal stop and start.
- * start_this_handle will start a new nofs context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ handle->h_total_credits = nblocks +
+ DIV_ROUND_UP(revoke_records,
+ journal->j_revoke_records_per_block);
+ handle->h_revoke_credits = revoke_records;
ret = start_this_handle(journal, handle, gfp_mask);
+ trace_jbd2_handle_restart(journal->j_fs_dev->bd_dev,
+ ret ? 0 : handle->h_transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_total_credits);
return ret;
}
EXPORT_SYMBOL(jbd2__journal_restart);
@@ -729,7 +800,7 @@ EXPORT_SYMBOL(jbd2__journal_restart);
int jbd2_journal_restart(handle_t *handle, int nblocks)
{
- return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+ return jbd2__journal_restart(handle, nblocks, 0, GFP_NOFS);
}
EXPORT_SYMBOL(jbd2_journal_restart);
@@ -879,7 +950,7 @@ repeat:
start_lock = jiffies;
lock_buffer(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -929,7 +1000,7 @@ repeat:
error = -EROFS;
if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto out;
}
error = 0;
@@ -993,7 +1064,7 @@ repeat:
*/
if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
@@ -1014,7 +1085,7 @@ repeat:
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
@@ -1033,7 +1104,7 @@ attach_next:
jh->b_next_transaction = transaction;
done:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* If we are about to journal a buffer, then any revoke pending on it is
@@ -1172,7 +1243,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* that case: the transaction must have deleted the buffer for it to be
* reused here.
*/
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -1207,7 +1278,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
@@ -1275,13 +1346,13 @@ repeat:
committed_data = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto repeat;
}
@@ -1289,7 +1360,7 @@ repeat:
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
@@ -1390,16 +1461,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction != transaction &&
jh->b_next_transaction != transaction) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
if (jh->b_modified == 1) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)
pr_err("JBD2: assertion failure: h_type=%u "
@@ -1409,13 +1480,13 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
jh->b_jlist);
J_ASSERT_JH(jh, jh->b_transaction != transaction ||
jh->b_jlist == BJ_Metadata);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
goto out;
}
journal = transaction->t_journal;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_modified == 0) {
/*
@@ -1423,12 +1494,12 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* of the transaction. This needs to be done
* once a transaction -bzzz
*/
- if (handle->h_buffer_credits <= 0) {
+ if (WARN_ON_ONCE(jbd2_handle_buffer_credits(handle) <= 0)) {
ret = -ENOSPC;
goto out_unlock_bh;
}
jh->b_modified = 1;
- handle->h_buffer_credits--;
+ handle->h_total_credits--;
}
/*
@@ -1501,7 +1572,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
@@ -1539,18 +1610,20 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
BUFFER_TRACE(bh, "entry");
- jbd_lock_bh_state(bh);
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
+ __bforget(bh);
+ return 0;
+ }
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
+ spin_lock(&jh->b_state_lock);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
- goto not_jbd;
+ goto drop;
}
/* keep track of whether or not this transaction modified us */
@@ -1598,10 +1671,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- goto not_jbd;
- }
+ jbd2_journal_put_journal_head(jh);
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
@@ -1643,7 +1713,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (!jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "belongs to none transaction");
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1653,7 +1723,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (!buffer_dirty(bh)) {
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1666,20 +1736,15 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
spin_unlock(&journal->j_list_lock);
}
-
- jbd_unlock_bh_state(bh);
- __brelse(bh);
drop:
+ __brelse(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
- handle->h_buffer_credits++;
+ handle->h_total_credits++;
}
return err;
-
-not_jbd:
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
}
/**
@@ -1706,45 +1771,34 @@ int jbd2_journal_stop(handle_t *handle)
tid_t tid;
pid_t pid;
+ if (--handle->h_ref > 0) {
+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ if (is_handle_aborted(handle))
+ return -EIO;
+ return 0;
+ }
if (!transaction) {
/*
- * Handle is already detached from the transaction so
- * there is nothing to do other than decrease a refcount,
- * or free the handle if refcount drops to zero
+ * Handle is already detached from the transaction so there is
+ * nothing to do other than free the handle.
*/
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- } else {
- if (handle->h_rsv_handle)
- jbd2_free_handle(handle->h_rsv_handle);
- goto free_and_exit;
- }
+ memalloc_nofs_restore(handle->saved_alloc_context);
+ goto free_and_exit;
}
journal = transaction->t_journal;
-
- J_ASSERT(journal_current_handle() == handle);
+ tid = transaction->t_tid;
if (is_handle_aborted(handle))
err = -EIO;
- else
- J_ASSERT(atomic_read(&transaction->t_updates) > 0);
-
- if (--handle->h_ref > 0) {
- jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
- handle->h_ref);
- return err;
- }
jbd_debug(4, "Handle %p going down\n", handle);
trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
- transaction->t_tid,
- handle->h_type, handle->h_line_no,
+ tid, handle->h_type, handle->h_line_no,
jiffies - handle->h_start_jiffies,
handle->h_sync, handle->h_requested_credits,
(handle->h_requested_credits -
- handle->h_buffer_credits));
+ handle->h_total_credits));
/*
* Implement synchronous transaction batching. If the handle
@@ -1804,19 +1858,13 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
- current->journal_info = NULL;
- atomic_sub(handle->h_buffer_credits,
- &transaction->t_outstanding_credits);
/*
* If the handle is marked SYNC, we need to set another commit
- * going! We also want to force a commit if the current
- * transaction is occupying too much of the log, or if the
- * transaction is too old now.
+ * going! We also want to force a commit if the transaction is too
+ * old now.
*/
if (handle->h_sync ||
- (atomic_read(&transaction->t_outstanding_credits) >
- journal->j_max_transaction_buffers) ||
time_after_eq(jiffies, transaction->t_expires)) {
/* Do this even for aborted journals: an abort still
* completes the commit thread, it just doesn't write
@@ -1825,7 +1873,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- jbd2_log_start_commit(journal, transaction->t_tid);
+ jbd2_log_start_commit(journal, tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1836,31 +1884,19 @@ int jbd2_journal_stop(handle_t *handle)
}
/*
- * Once we drop t_updates, if it goes to zero the transaction
- * could start committing on us and eventually disappear. So
- * once we do this, we must not dereference transaction
- * pointer again.
+ * Once stop_this_handle() drops t_updates, the transaction could start
+ * committing on us and eventually disappear. So we must not
+ * dereference transaction pointer again after calling
+ * stop_this_handle().
*/
- tid = transaction->t_tid;
- if (atomic_dec_and_test(&transaction->t_updates)) {
- wake_up(&journal->j_wait_updates);
- if (journal->j_barrier_count)
- wake_up(&journal->j_wait_transaction_locked);
- }
-
- rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_);
+ stop_this_handle(handle);
if (wait_for_commit)
err = jbd2_log_wait_commit(journal, tid);
- if (handle->h_rsv_handle)
- jbd2_journal_free_reserved(handle->h_rsv_handle);
free_and_exit:
- /*
- * Scope of the GFP_NOFS context is over here and so we can restore the
- * original alloc context.
- */
- memalloc_nofs_restore(handle->saved_alloc_context);
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
return err;
}
@@ -1878,7 +1914,7 @@ free_and_exit:
*
* j_list_lock is held.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1902,7 +1938,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
*
* Called with j_list_lock held, and the journal may not be locked.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1934,7 +1970,7 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1971,17 +2007,15 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
}
/*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
*
* Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
- jbd2_journal_put_journal_head(jh);
}
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
@@ -1990,18 +2024,19 @@ void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
/*
* Called from jbd2_journal_try_to_free_buffers().
*
- * Called under jbd_lock_bh_state(bh)
+ * Called under jh->b_state_lock
*/
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@ -2088,10 +2123,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (!jh)
continue;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
__journal_try_to_free_buffer(journal, bh);
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
@@ -2112,7 +2147,7 @@ busy:
*
* Called under j_list_lock.
*
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
@@ -2133,6 +2168,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
} else {
JBUFFER_TRACE(jh, "on running transaction");
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
}
return may_free;
}
@@ -2199,18 +2235,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* holding the page lock. --sct
*/
- if (!buffer_jbd(bh))
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
goto zap_buffer_unlocked;
/* OK, we have data buffer in journaled mode */
write_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
/*
* We cannot remove the buffer from checkpoint lists until the
* transaction adding inode to orphan list (let's call it T)
@@ -2289,10 +2322,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
* for commit and try again.
*/
if (partial_page) {
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return -EBUSY;
}
/*
@@ -2304,10 +2337,10 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
@@ -2331,11 +2364,10 @@ zap_buffer:
* here.
*/
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2422,7 +2454,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2484,11 +2516,11 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
void jbd2_journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
- jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&jh->b_state_lock);
spin_lock(&transaction->t_journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ spin_unlock(&jh->b_state_lock);
}
/*
@@ -2498,23 +2530,25 @@ void jbd2_journal_file_buffer(struct journal_head *jh,
* buffer on that transaction's metadata list.
*
* Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
*
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop jh reference through
+ * jbd2_journal_put_journal_head().
*/
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__jbd2_journal_unfile_buffer(jh);
- return;
+ return true;
}
/*
@@ -2542,6 +2576,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
if (was_dirty)
set_buffer_jbddirty(bh);
+ return false;
}
/*
@@ -2552,16 +2587,15 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
*/
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
- struct buffer_head *bh = jh2bh(jh);
+ bool drop;
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
- __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ drop = __jbd2_journal_refile_buffer(jh);
+ spin_unlock(&jh->b_state_lock);
spin_unlock(&journal->j_list_lock);
- __brelse(bh);
+ if (drop)
+ jbd2_journal_put_journal_head(jh);
}
/*
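The jbd2 hunks above all follow one transformation: the jbd_lock_bh_state()/jbd_unlock_bh_state() bit-spinlock on the buffer_head gives way to a regular spinlock embedded in the journal_head, and the final jbd2_journal_put_journal_head() moves outside the locks because it may now free the jh. A minimal sketch of the resulting caller pattern (mirroring jbd2_journal_unfile_buffer() above, not a verbatim copy of it):

/* Sketch of the post-conversion locking/refcount order. */
static void unfile_and_put(journal_t *journal, struct journal_head *jh)
{
	spin_lock(&jh->b_state_lock);		/* was jbd_lock_bh_state(bh) */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_unfile_buffer(jh);	/* no longer drops the jh ref itself */
	spin_unlock(&journal->j_list_lock);
	spin_unlock(&jh->b_state_lock);		/* was jbd_unlock_bh_state(bh) */
	jbd2_journal_put_journal_head(jh);	/* may free jh, so done unlocked */
}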
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 021a4a2190ee..b86c78d178c6 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -226,7 +226,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r
lastend = this->ofs + this->size;
} else {
dbg_fragtree2("lookup gave no frag\n");
- return -EINVAL;
+ lastend = 0;
}
/* See if we ran off the end of the fragtree */
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 6ebae6bbe6a5..9d96e6871e1a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -438,7 +438,7 @@ void kernfs_put_active(struct kernfs_node *kn)
return;
if (kernfs_lockdep(kn))
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
v = atomic_dec_return(&kn->active);
if (likely(v != KN_DEACTIVATED_BIAS))
return;
@@ -476,7 +476,7 @@ static void kernfs_drain(struct kernfs_node *kn)
if (kernfs_lockdep(kn)) {
lock_acquired(&kn->dep_map, _RET_IP_);
- rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ rwsem_release(&kn->dep_map, _RET_IP_);
}
kernfs_drain_open_files(kn);
@@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn)
struct kernfs_node *parent;
struct kernfs_root *root;
- /*
- * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino
- * depends on this to filter reused stale node
- */
if (!kn || !atomic_dec_and_test(&kn->count))
return;
root = kernfs_root(kn);
@@ -536,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn)
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
spin_lock(&kernfs_idr_lock);
- idr_remove(&root->ino_idr, kn->id.ino);
+ idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
kmem_cache_free(kernfs_node_cache, kn);
@@ -621,8 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
unsigned flags)
{
struct kernfs_node *kn;
- u32 gen;
- int cursor;
+ u32 id_highbits;
int ret;
name = kstrdup_const(name, GFP_KERNEL);
@@ -635,23 +630,19 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock);
- cursor = idr_get_cursor(&root->ino_idr);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
- if (ret >= 0 && ret < cursor)
- root->next_generation++;
- gen = root->next_generation;
+ if (ret >= 0 && ret < root->last_id_lowbits)
+ root->id_highbits++;
+ id_highbits = root->id_highbits;
+ root->last_id_lowbits = ret;
spin_unlock(&kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
goto err_out2;
- kn->id.ino = ret;
- kn->id.generation = gen;
- /*
- * set ino first. This RELEASE is paired with atomic_inc_not_zero in
- * kernfs_find_and_get_node_by_ino
- */
- atomic_set_release(&kn->count, 1);
+ kn->id = (u64)id_highbits << 32 | ret;
+
+ atomic_set(&kn->count, 1);
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
@@ -680,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
return kn;
err_out3:
- idr_remove(&root->ino_idr, kn->id.ino);
+ idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
@@ -705,50 +696,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
}
/*
- * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number
+ * kernfs_find_and_get_node_by_id - get kernfs_node from node id
* @root: the kernfs root
- * @ino: inode number
+ * @id: the target node id
+ *
+ * @id's lower 32bits encode ino and upper gen. If the gen portion is
+ * zero, all generations are matched.
*
* RETURNS:
* NULL on failure. Return a kernfs node with reference counter incremented
*/
-struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root,
- unsigned int ino)
+struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
+ u64 id)
{
struct kernfs_node *kn;
+ ino_t ino = kernfs_id_ino(id);
+ u32 gen = kernfs_id_gen(id);
- rcu_read_lock();
- kn = idr_find(&root->ino_idr, ino);
+ spin_lock(&kernfs_idr_lock);
+
+ kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
- goto out;
+ goto err_unlock;
- /*
- * Since kernfs_node is freed in RCU, it's possible an old node for ino
- * is freed, but reused before RCU grace period. But a freed node (see
- * kernfs_put) or an incompletedly initialized node (see
- * __kernfs_new_node) should have 'count' 0. We can use this fact to
- * filter out such node.
- */
- if (!atomic_inc_not_zero(&kn->count)) {
- kn = NULL;
- goto out;
+ if (sizeof(ino_t) >= sizeof(u64)) {
+ /* we looked up with the low 32bits, compare the whole */
+ if (kernfs_ino(kn) != ino)
+ goto err_unlock;
+ } else {
+ /* 0 matches all generations */
+ if (unlikely(gen && kernfs_gen(kn) != gen))
+ goto err_unlock;
}
/*
- * The node could be a new node or a reused node. If it's a new node,
- * we are ok. If it's reused because of RCU (because of
- * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino'
- * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate,
- * hence we can use 'ino' to filter stale node.
+ * ACTIVATED is protected with kernfs_mutex but it was clear when
+ * @kn was added to idr and we just wanna see it set. No need to
+ * grab kernfs_mutex.
*/
- if (kn->id.ino != ino)
- goto out;
- rcu_read_unlock();
+ if (unlikely(!(kn->flags & KERNFS_ACTIVATED) ||
+ !atomic_inc_not_zero(&kn->count)))
+ goto err_unlock;
+ spin_unlock(&kernfs_idr_lock);
return kn;
-out:
- rcu_read_unlock();
- kernfs_put(kn);
+err_unlock:
+ spin_unlock(&kernfs_idr_lock);
return NULL;
}
@@ -962,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
idr_init(&root->ino_idr);
INIT_LIST_HEAD(&root->supers);
- root->next_generation = 1;
+
+ /*
+ * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino.
+ * High bits are the generation. The starting value for both ino and
+ * generation is 1. Initialize upper 32bit allocation
+ * accordingly.
+ */
+ if (sizeof(ino_t) >= sizeof(u64))
+ root->id_highbits = 0;
+ else
+ root->id_highbits = 1;
kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO,
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
@@ -1678,7 +1681,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
const char *name = pos->name;
unsigned int type = dt_type(pos);
int len = strlen(name);
- ino_t ino = pos->id.ino;
+ ino_t ino = kernfs_ino(pos);
ctx->pos = pos->hash;
file->private_data = pos;
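The kernfs_ino()/kernfs_gen() accessors used throughout these hunks are not part of this diff; based on how kn->id is assembled above (low 32 bits from the idr, high 32 bits from root->id_highbits, or the whole 64 bits acting as the ino when ino_t is 64-bit), they reduce to roughly the following sketch:

/* Sketch only -- the real helpers live in include/linux/kernfs.h. */
static inline ino_t kernfs_id_ino(u64 id)
{
	if (sizeof(ino_t) >= sizeof(u64))
		return id;		/* 64-bit ino: the id is the ino */
	return (u32)id;			/* 32-bit ino: low 32 bits */
}

static inline u32 kernfs_id_gen(u64 id)
{
	if (sizeof(ino_t) >= sizeof(u64))
		return 1;		/* generation effectively unused */
	return id >> 32;		/* 32-bit ino: high 32 bits */
}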
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e8c792b49616..34366db3620d 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -892,7 +892,7 @@ repeat:
* have the matching @file available. Look up the inodes
* and generate the events manually.
*/
- inode = ilookup(info->sb, kn->id.ino);
+ inode = ilookup(info->sb, kernfs_ino(kn));
if (!inode)
continue;
@@ -901,7 +901,7 @@ repeat:
if (parent) {
struct inode *p_inode;
- p_inode = ilookup(info->sb, parent->id.ino);
+ p_inode = ilookup(info->sb, kernfs_ino(parent));
if (p_inode) {
fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
inode, FSNOTIFY_EVENT_INODE, &name, 0);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index f3eaa8869f42..eac277c63d42 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
inode->i_private = kn;
inode->i_mapping->a_ops = &kernfs_aops;
inode->i_op = &kernfs_iops;
- inode->i_generation = kn->id.generation;
+ inode->i_generation = kernfs_gen(kn);
set_default_inode_attr(inode, kn->mode);
kernfs_refresh_inode(kn, inode);
@@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
struct inode *inode;
- inode = iget_locked(sb, kn->id.ino);
+ inode = iget_locked(sb, kernfs_ino(kn));
if (inode && (inode->i_state & I_NEW))
kernfs_init_inode(kn, inode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 02ce570a9a3c..2f3c51d55261 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
unsigned flags);
-struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root,
- unsigned int ino);
/*
* file.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 6c12fac2c287..4d31503abaee 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -53,63 +53,85 @@ const struct super_operations kernfs_sops = {
.show_path = kernfs_sop_show_path,
};
-/*
- * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode
- * number and generation
- */
-struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root,
- const union kernfs_node_id *id)
+static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+ struct inode *parent)
{
- struct kernfs_node *kn;
+ struct kernfs_node *kn = inode->i_private;
- kn = kernfs_find_and_get_node_by_ino(root, id->ino);
- if (!kn)
- return NULL;
- if (kn->id.generation != id->generation) {
- kernfs_put(kn);
- return NULL;
+ if (*max_len < 2) {
+ *max_len = 2;
+ return FILEID_INVALID;
}
- return kn;
+
+ *max_len = 2;
+ *(u64 *)fh = kn->id;
+ return FILEID_KERNFS;
}
-static struct inode *kernfs_fh_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len,
+ int fh_type, bool get_parent)
{
struct kernfs_super_info *info = kernfs_info(sb);
- struct inode *inode;
struct kernfs_node *kn;
+ struct inode *inode;
+ u64 id;
- if (ino == 0)
- return ERR_PTR(-ESTALE);
+ if (fh_len < 2)
+ return NULL;
+
+ switch (fh_type) {
+ case FILEID_KERNFS:
+ id = *(u64 *)fid;
+ break;
+ case FILEID_INO32_GEN:
+ case FILEID_INO32_GEN_PARENT:
+ /*
+ * blk_log_action() exposes "LOW32,HIGH32" pair without
+ * type and userland can call us with generic fid
+ * constructed from them. Combine it back to ID. See
+ * blk_log_action().
+ */
+ id = ((u64)fid->i32.gen << 32) | fid->i32.ino;
+ break;
+ default:
+ return NULL;
+ }
- kn = kernfs_find_and_get_node_by_ino(info->root, ino);
+ kn = kernfs_find_and_get_node_by_id(info->root, id);
if (!kn)
return ERR_PTR(-ESTALE);
+
+ if (get_parent) {
+ struct kernfs_node *parent;
+
+ parent = kernfs_get_parent(kn);
+ kernfs_put(kn);
+ kn = parent;
+ if (!kn)
+ return ERR_PTR(-ESTALE);
+ }
+
inode = kernfs_get_inode(sb, kn);
kernfs_put(kn);
if (!inode)
return ERR_PTR(-ESTALE);
- if (generation && inode->i_generation != generation) {
- /* we didn't find the right inode.. */
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
- return inode;
+ return d_obtain_alias(inode);
}
-static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *kernfs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len,
+ int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- kernfs_fh_get_inode);
+ return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false);
}
-static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *kernfs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len,
+ int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- kernfs_fh_get_inode);
+ return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true);
}
static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
@@ -120,6 +142,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
}
static const struct export_operations kernfs_export_ops = {
+ .encode_fh = kernfs_encode_fh,
.fh_to_dentry = kernfs_fh_to_dentry,
.fh_to_parent = kernfs_fh_to_parent,
.get_parent = kernfs_get_parent_dentry,
@@ -363,18 +386,9 @@ void kernfs_kill_sb(struct super_block *sb)
void __init kernfs_init(void)
{
-
- /*
- * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino
- * can access the slab lock free. This could introduce stale nodes,
- * please see how kernfs_find_and_get_node_by_ino filters out stale
- * nodes.
- */
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
sizeof(struct kernfs_node),
- 0,
- SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
- NULL);
+ 0, SLAB_PANIC, NULL);
/* Creates slab cache for kernfs inode attributes */
kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache",
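The FILEID_INO32_GEN branch in __kernfs_fh_to_dentry() exists because blk_log_action() prints the two 32-bit halves of a kernfs id separately, so userland may hand back a generic ino/gen fid rather than a FILEID_KERNFS one. The recombination is simply the reverse of the packing used when the id was built; a hypothetical helper, purely to illustrate it:

/* Illustration of the "LOW32,HIGH32" recombination performed above;
 * the function name is made up for the example. */
static u64 kernfs_id_from_halves(u32 low_ino, u32 high_gen)
{
	return ((u64)high_gen << 32) | low_ino;
}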
diff --git a/fs/namei.c b/fs/namei.c
index 671c3c1a3425..2dda552bcf7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -925,7 +925,7 @@ static inline int may_follow_link(struct nameidata *nd)
return -ECHILD;
audit_inode(nd->name, nd->stack[0].link.dentry, 0);
- audit_log_link_denied("follow_link");
+ audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
return -EACCES;
}
@@ -993,7 +993,7 @@ static int may_linkat(struct path *link)
if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
return 0;
- audit_log_link_denied("linkat");
+ audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
return -EPERM;
}
@@ -1031,6 +1031,10 @@ static int may_create_in_sticky(struct dentry * const dir,
(dir->d_inode->i_mode & 0020 &&
((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
+ const char *operation = S_ISFIFO(inode->i_mode) ?
+ "sticky_create_fifo" :
+ "sticky_create_regular";
+ audit_log_path_denied(AUDIT_ANOM_CREAT, operation);
return -EACCES;
}
return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index fe0e9e1410fe..2adfe7b166a3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2478,8 +2478,10 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
time64_to_tm(sb->s_time_max, 0, &tm);
- pr_warn("Mounted %s file system at %s supports timestamps until %04ld (0x%llx)\n",
- sb->s_type->name, mntpath,
+ pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n",
+ sb->s_type->name,
+ is_mounted(mnt) ? "remounted" : "mounted",
+ mntpath,
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
@@ -2764,14 +2766,11 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
- if (error < 0) {
- mntput(mnt);
- return error;
- }
-
mnt_warn_timestamp_expiry(mountpoint, mnt);
+ error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+ if (error < 0)
+ mntput(mnt);
return error;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 91b9dac6b2cc..4ba73dbf3e8d 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1354,6 +1354,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
+ case FITRIM:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8508ab575017..0aa362b88550 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -523,7 +523,7 @@ static const struct file_operations fanotify_fops = {
.fasync = NULL,
.release = fanotify_release,
.unlocked_ioctl = fanotify_ioctl,
- .compat_ioctl = fanotify_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
};
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 1e2bfd26b352..ef83f4020554 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -50,7 +50,7 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode)
f.handle.handle_bytes = sizeof(f.pad);
size = f.handle.handle_bytes >> 2;
- ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
+ ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, NULL);
if ((ret == FILEID_INVALID) || (ret < 0)) {
WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
return;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2ecef6155fc0..3e77b728a22b 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -381,8 +381,6 @@ out:
}
EXPORT_SYMBOL_GPL(fsnotify);
-extern struct kmem_cache *fsnotify_mark_connector_cachep;
-
static __init int fsnotify_init(void)
{
int ret;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index f3462828a0e2..ff2063ec6b0f 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -65,4 +65,6 @@ extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
#endif /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 3e7da392aa6f..bb981ec76456 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f9baefc76cf9..88534eb0e7c2 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2288,9 +2288,9 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
int ret = 0;
int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
- if (handle->h_buffer_credits < credits)
+ if (jbd2_handle_buffer_credits(handle) < credits)
ret = ocfs2_extend_trans(handle,
- credits - handle->h_buffer_credits);
+ credits - jbd2_handle_buffer_credits(handle));
return ret;
}
@@ -2367,7 +2367,7 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, start, orig_credits = handle->h_buffer_credits;
+ int ret, start, orig_credits = jbd2_handle_buffer_credits(handle);
u32 cpos;
struct ocfs2_path *left_path = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
@@ -3148,7 +3148,7 @@ static int ocfs2_rotate_tree_left(handle_t *handle,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
- int ret, orig_credits = handle->h_buffer_credits;
+ int ret, orig_credits = jbd2_handle_buffer_credits(handle);
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
@@ -3386,8 +3386,8 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- right_path);
+ jbd2_handle_buffer_credits(handle),
+ right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3548,8 +3548,8 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
right_path);
ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
- handle->h_buffer_credits,
- left_path);
+ jbd2_handle_buffer_credits(handle),
+ left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3623,7 +3623,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
le16_to_cpu(el->l_next_free_rec) == 1) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
right_path);
if (ret) {
mlog_errno(ret);
@@ -3669,7 +3669,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3725,7 +3725,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3755,7 +3755,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -3799,7 +3799,7 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
if (ctxt->c_split_covers_rec) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -5358,7 +5358,7 @@ static int ocfs2_truncate_rec(handle_t *handle,
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
/* extend credit for ocfs2_remove_rightmost_path */
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
+ jbd2_handle_buffer_credits(handle),
path);
if (ret) {
mlog_errno(ret);
@@ -5427,8 +5427,8 @@ static int ocfs2_truncate_rec(handle_t *handle,
}
ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
+ jbd2_handle_buffer_credits(handle),
+ path);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9cd0a6815933..3a67a6518ddf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -11,7 +11,6 @@
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#include <linux/swap.h>
-#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e774c5ea13b..1c4c51f3df60 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1687,7 +1687,7 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
spin_unlock_irqrestore(&lockres->l_lock, flags);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (lockres->l_lockdep_map.key != NULL)
- rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
+ rwsem_release(&lockres->l_lockdep_map, caller_ip);
#endif
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index efeea208fdeb..89984172fc4a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -985,6 +985,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return -EFAULT;
return ocfs2_info_handle(inode, &info, 1);
+ case FITRIM:
case OCFS2_IOC_MOVE_EXT:
break;
default:
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 699a560efbb0..1afe57f425a0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -420,14 +420,14 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
if (!nblocks)
return 0;
- old_nblocks = handle->h_buffer_credits;
+ old_nblocks = jbd2_handle_buffer_credits(handle);
trace_ocfs2_extend_trans(old_nblocks, nblocks);
#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
- status = jbd2_journal_extend(handle, nblocks);
+ status = jbd2_journal_extend(handle, nblocks, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -461,13 +461,13 @@ int ocfs2_allocate_extend_trans(handle_t *handle, int thresh)
BUG_ON(!handle);
- old_nblks = handle->h_buffer_credits;
+ old_nblks = jbd2_handle_buffer_credits(handle);
trace_ocfs2_allocate_extend_trans(old_nblks, thresh);
if (old_nblks < thresh)
return 0;
- status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA);
+ status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA, 0);
if (status < 0) {
mlog_errno(status);
goto bail;
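Every ocfs2 hunk above swaps a direct read of handle->h_buffer_credits for the jbd2_handle_buffer_credits() accessor, and jbd2_journal_extend() grows a third argument for reserved revoke-record credits (ocfs2 passes 0 since it reserves none). Using only the calls that appear in this diff, the extend-when-low idiom amounts to this sketch:

/* Sketch of the credit-extension idiom after the jbd2 API change. */
static int extend_if_needed(handle_t *handle, int want)
{
	int have = jbd2_handle_buffer_credits(handle);

	if (have >= want)
		return 0;
	/* third argument: extra revoke-record credits, none needed here */
	return jbd2_journal_extend(handle, want - have, 0);
}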
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7a922190a8c7..eda83487c9ec 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -728,7 +728,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out;
/* Running from downconvert thread? Postpone quota processing to wq */
if (current == osb->dc_task) {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 69c21a3843af..4180c3ef0a68 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ struct journal_head *jh;
int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
@@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
if (!buffer_jbd(bg_bh))
return 1;
- jbd_lock_bh_state(bg_bh);
- bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
if (bg)
ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
else
ret = 1;
- jbd_unlock_bh_state(bg_bh);
+ spin_unlock(&jh->b_state_lock);
return ret;
}
@@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
int status;
unsigned int tmp;
struct ocfs2_group_desc *undo_bg = NULL;
+ struct journal_head *jh;
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
goto bail;
}
+ jh = bh2jh(group_bh);
if (undo_fn) {
- jbd_lock_bh_state(group_bh);
- undo_bg = (struct ocfs2_group_desc *)
- bh2jh(group_bh)->b_committed_data;
+ spin_lock(&jh->b_state_lock);
+ undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
BUG_ON(!undo_bg);
}
@@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
@@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
ocfs2_journal_dirty(handle, group_bh);
bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c81e86c62380..05dd68ade293 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -926,8 +926,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
status = -ENOENT;
goto out_quota_off;
}
- status = dquot_enable(inode[type], type, QFMT_OCFS2,
- DQUOT_USAGE_ENABLED);
+ status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
if (status < 0)
goto out_quota_off;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d8507972ee13..90c830e3758e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,6 +1490,18 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
return loc->xl_ops->xlo_check_space(loc, xi);
}
+static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
+{
+ loc->xl_ops->xlo_add_entry(loc, name_hash);
+ loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
+ /*
+ * We can't leave the new entry's xe_name_offset at zero or
+ * add_namevalue() will go nuts. We set it to the size of our
+ * storage so that it can never be less than any other entry.
+ */
+ loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
+}
+
static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
struct ocfs2_xattr_info *xi)
{
@@ -2121,31 +2133,29 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
if (rc)
goto out;
- if (!loc->xl_entry) {
- rc = -EINVAL;
- goto out;
- }
-
- if (ocfs2_xa_can_reuse_entry(loc, xi)) {
- orig_value_size = loc->xl_entry->xe_value_size;
- rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
- if (rc)
- goto out;
- goto alloc_value;
- }
+ if (loc->xl_entry) {
+ if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+ orig_value_size = loc->xl_entry->xe_value_size;
+ rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+ if (rc)
+ goto out;
+ goto alloc_value;
+ }
- if (!ocfs2_xattr_is_local(loc->xl_entry)) {
- orig_clusters = ocfs2_xa_value_clusters(loc);
- rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
- if (rc) {
- mlog_errno(rc);
- ocfs2_xa_cleanup_value_truncate(loc,
- "overwriting",
- orig_clusters);
- goto out;
+ if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+ orig_clusters = ocfs2_xa_value_clusters(loc);
+ rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+ if (rc) {
+ mlog_errno(rc);
+ ocfs2_xa_cleanup_value_truncate(loc,
+ "overwriting",
+ orig_clusters);
+ goto out;
+ }
}
- }
- ocfs2_xa_wipe_namevalue(loc);
+ ocfs2_xa_wipe_namevalue(loc);
+ } else
+ ocfs2_xa_add_entry(loc, name_hash);
/*
* If we get here, we have a blank entry. Fill it. We grow our
diff --git a/fs/pipe.c b/fs/pipe.c
index 8a2ab2f974bd..648ce440ca85 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -43,10 +43,12 @@ unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
/*
- * We use a start+len construction, which provides full use of the
- * allocated memory.
- * -- Florian Coosmann (FGC)
- *
+ * We use head and tail indices that aren't masked off, except at the point of
+ * dereference, but rather they're allowed to wrap naturally. This means there
+ * isn't a dead spot in the buffer, but the ring has to be a power of two and
+ * <= 2^31.
+ * -- David Howells 2019-09-23.
+ *
* Reads with count = 0 should always return 0.
* -- Julian Bradfield 1999-06-07.
*
@@ -285,10 +287,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = 0;
__pipe_lock(pipe);
for (;;) {
- int bufs = pipe->nrbufs;
- if (bufs) {
- int curbuf = pipe->curbuf;
- struct pipe_buffer *buf = pipe->bufs + curbuf;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+
+ if (!pipe_empty(head, tail)) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
@@ -320,18 +324,27 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
}
if (!buf->len) {
+ bool wake;
pipe_buf_release(pipe, buf);
- curbuf = (curbuf + 1) & (pipe->buffers - 1);
- pipe->curbuf = curbuf;
- pipe->nrbufs = --bufs;
+ spin_lock_irq(&pipe->wait.lock);
+ tail++;
+ pipe->tail = tail;
do_wakeup = 1;
+ wake = head - (tail - 1) == pipe->max_usage / 2;
+ if (wake)
+ wake_up_locked_poll(
+ &pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ spin_unlock_irq(&pipe->wait.lock);
+ if (wake)
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
+ if (!pipe_empty(head, tail)) /* More to do? */
+ continue;
}
- if (bufs) /* More to do? */
- continue;
+
if (!pipe->writers)
break;
if (!pipe->waiting_writers) {
@@ -352,17 +365,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = -ERESTARTSYS;
break;
}
- if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
pipe_wait(pipe);
}
__pipe_unlock(pipe);
/* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ wake_up_interruptible_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
@@ -380,6 +389,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
+ unsigned int head, max_usage, mask;
ssize_t ret = 0;
int do_wakeup = 0;
size_t total_len = iov_iter_count(from);
@@ -397,12 +407,14 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ head = pipe->head;
+ max_usage = pipe->max_usage;
+ mask = pipe->ring_size - 1;
+
/* We try to merge small writes */
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
- if (pipe->nrbufs && chars != 0) {
- int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
- (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + lastbuf;
+ if (!pipe_empty(head, pipe->tail) && chars != 0) {
+ struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
@@ -423,18 +435,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
for (;;) {
- int bufs;
-
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
- bufs = pipe->nrbufs;
- if (bufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+ head = pipe->head;
+ if (!pipe_full(head, pipe->tail, max_usage)) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
@@ -446,38 +456,64 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
pipe->tmp_page = page;
}
+
+ /* Allocate a slot in the ring in advance and attach an
+ * empty buffer. If we fault or otherwise fail to use
+ * it, either the reader will consume it or it'll still
+ * be there for the next write.
+ */
+ spin_lock_irq(&pipe->wait.lock);
+
+ head = pipe->head;
+ if (pipe_full(head, pipe->tail, max_usage)) {
+ spin_unlock_irq(&pipe->wait.lock);
+ continue;
+ }
+
+ pipe->head = head + 1;
+
/* Always wake up, even if the copy fails. Otherwise
* we lock up (O_NONBLOCK-)readers that sleep due to
* syscall merging.
* FIXME! Is this really true?
*/
- do_wakeup = 1;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += copied;
+ wake_up_locked_poll(
+ &pipe->wait, EPOLLIN | EPOLLRDNORM);
+
+ spin_unlock_irq(&pipe->wait.lock);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
/* Insert it into the buffer array */
+ buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = copied;
+ buf->len = 0;
buf->flags = 0;
if (is_packetized(filp)) {
buf->ops = &packet_pipe_buf_ops;
buf->flags = PIPE_BUF_FLAG_PACKET;
}
- pipe->nrbufs = ++bufs;
pipe->tmp_page = NULL;
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ ret += copied;
+ buf->offset = 0;
+ buf->len = copied;
+
if (!iov_iter_count(from))
break;
}
- if (bufs < pipe->buffers)
+
+ if (!pipe_full(head, pipe->tail, max_usage))
continue;
+
+ /* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
@@ -488,11 +524,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
ret = -ERESTARTSYS;
break;
}
- if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- do_wakeup = 0;
- }
pipe->waiting_writers++;
pipe_wait(pipe);
pipe->waiting_writers--;
@@ -500,7 +531,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
out:
__pipe_unlock(pipe);
if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
+ wake_up_interruptible_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
@@ -515,17 +546,19 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, buf, nrbufs;
+ int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
__pipe_lock(pipe);
count = 0;
- buf = pipe->curbuf;
- nrbufs = pipe->nrbufs;
- while (--nrbufs >= 0) {
- count += pipe->bufs[buf].len;
- buf = (buf+1) & (pipe->buffers - 1);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+
+ while (tail != head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
}
__pipe_unlock(pipe);
@@ -541,21 +574,25 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- int nrbufs;
+ unsigned int head = READ_ONCE(pipe->head);
+ unsigned int tail = READ_ONCE(pipe->tail);
poll_wait(filp, &pipe->wait, wait);
+ BUG_ON(pipe_occupancy(head, tail) > pipe->ring_size);
+
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = pipe->nrbufs;
mask = 0;
if (filp->f_mode & FMODE_READ) {
- mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
+ if (!pipe_empty(head, tail))
+ mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
- mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
+ if (!pipe_full(head, tail, pipe->max_usage))
+ mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
@@ -679,7 +716,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = pipe_bufs;
+ pipe->max_usage = pipe_bufs;
+ pipe->ring_size = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -697,9 +735,9 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
+ (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
free_uid(pipe->user);
- for (i = 0; i < pipe->buffers; i++) {
+ for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -793,6 +831,8 @@ int create_pipe_files(struct file **res, int flags)
}
res[0]->private_data = inode->i_pipe;
res[1] = f;
+ stream_open(inode, res[0]);
+ stream_open(inode, res[1]);
return 0;
}
@@ -880,7 +920,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
- int cur = *cnt;
+ int cur = *cnt;
while (cur == *cnt) {
pipe_wait(pipe);
@@ -931,9 +971,9 @@ static int fifo_open(struct inode *inode, struct file *filp)
__pipe_lock(pipe);
/* We can only do regular read/write on fifos */
- filp->f_mode &= (FMODE_READ | FMODE_WRITE);
+ stream_open(inode, filp);
- switch (filp->f_mode) {
+ switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
case FMODE_READ:
/*
* O_RDONLY
@@ -955,7 +995,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
}
break;
-
+
case FMODE_WRITE:
/*
* O_WRONLY
@@ -975,7 +1015,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
goto err_wr;
}
break;
-
+
case FMODE_READ | FMODE_WRITE:
/*
* O_RDWR
@@ -1054,14 +1094,14 @@ unsigned int round_pipe_size(unsigned long size)
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_pages;
+ unsigned int size, nr_slots, head, tail, mask, n;
unsigned long user_bufs;
long ret = 0;
size = round_pipe_size(arg);
- nr_pages = size >> PAGE_SHIFT;
+ nr_slots = size >> PAGE_SHIFT;
- if (!nr_pages)
+ if (!nr_slots)
return -EINVAL;
/*
@@ -1071,13 +1111,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
* Decreasing the pipe capacity is always permitted, even
* if the user is currently over a limit.
*/
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
return -EPERM;
- user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+ user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
is_unprivileged_user()) {
@@ -1086,17 +1126,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
}
/*
- * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
- * expect a lot of shrink+grow operations, just free and allocate
- * again like we would do for growing. If the pipe currently
+ * We can shrink the pipe, if arg is greater than the ring occupancy.
+ * Since we don't expect a lot of shrink+grow operations, just free and
+ * allocate again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (nr_pages < pipe->nrbufs) {
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+ n = pipe_occupancy(pipe->head, pipe->tail);
+ if (nr_slots < n) {
ret = -EBUSY;
goto out_revert_acct;
}
- bufs = kcalloc(nr_pages, sizeof(*bufs),
+ bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs)) {
ret = -ENOMEM;
@@ -1105,33 +1149,37 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
/*
* The pipe array wraps around, so just start the new one at zero
- * and adjust the indexes.
+ * and adjust the indices.
*/
- if (pipe->nrbufs) {
- unsigned int tail;
- unsigned int head;
-
- tail = pipe->curbuf + pipe->nrbufs;
- if (tail < pipe->buffers)
- tail = 0;
- else
- tail &= (pipe->buffers - 1);
-
- head = pipe->nrbufs - tail;
- if (head)
- memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
- if (tail)
- memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+ if (n > 0) {
+ unsigned int h = head & mask;
+ unsigned int t = tail & mask;
+ if (h > t) {
+ memcpy(bufs, pipe->bufs + t,
+ n * sizeof(struct pipe_buffer));
+ } else {
+ unsigned int tsize = pipe->ring_size - t;
+ if (h > 0)
+ memcpy(bufs + tsize, pipe->bufs,
+ h * sizeof(struct pipe_buffer));
+ memcpy(bufs, pipe->bufs + t,
+ tsize * sizeof(struct pipe_buffer));
+ }
}
- pipe->curbuf = 0;
+ head = n;
+ tail = 0;
+
kfree(pipe->bufs);
pipe->bufs = bufs;
- pipe->buffers = nr_pages;
- return nr_pages * PAGE_SIZE;
+ pipe->ring_size = nr_slots;
+ pipe->max_usage = nr_slots;
+ pipe->tail = tail;
+ pipe->head = head;
+ return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
return ret;
}
@@ -1161,7 +1209,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
ret = pipe_set_size(pipe, arg);
break;
case F_GETPIPE_SZ:
- ret = pipe->buffers * PAGE_SIZE;
+ ret = pipe->max_usage * PAGE_SIZE;
break;
default:
ret = -EINVAL;
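The pipe conversion leans on three helpers that are not shown here (pipe_empty(), pipe_full(), pipe_occupancy()); given the design note at the top of fs/pipe.c -- head and tail wrap freely and the ring size is a power of two -- they come down to plain unsigned arithmetic. A sketch under that assumption:

/* Sketch of the ring-index helpers implied by the fs/pipe.c changes. */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
	return head - tail;		/* unsigned wrap keeps this correct */
}

static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
	return head == tail;
}

static inline bool pipe_full(unsigned int head, unsigned int tail,
			     unsigned int limit)
{
	return pipe_occupancy(head, tail) >= limit;
}

/* Slots are only dereferenced through the mask, e.g.
 *	buf = &pipe->bufs[tail & (pipe->ring_size - 1)];
 */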
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index cb5629bd5fff..733881a6387b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -42,8 +42,8 @@ config PROC_VMCORE
bool "/proc/vmcore support"
depends on PROC_FS && CRASH_DUMP
default y
- help
- Exports the dump image of crashed kernel in ELF format.
+ help
+ Exports the dump image of crashed kernel in ELF format.
config PROC_VMCORE_DEVICE_DUMP
bool "Device Hardware/Firmware Log Collection"
@@ -72,7 +72,7 @@ config PROC_SYSCTL
a recompile of the kernel or reboot of the system. The primary
interface is through /proc/sys. If you say Y here a tree of
modifiable sysctl entries will be generated beneath the
- /proc/sys directory. They are explained in the files
+ /proc/sys directory. They are explained in the files
in <file:Documentation/admin-guide/sysctl/>. Note that enabling this
option will enlarge the kernel by at least 8 KB.
@@ -88,7 +88,7 @@ config PROC_PAGE_MONITOR
Various /proc files exist to monitor process memory utilization:
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
- interfaces will reduce the size of the kernel by approximately 4kb.
+ interfaces will reduce the size of the kernel by approximately 4kb.
config PROC_CHILDREN
bool "Include /proc/<pid>/task/<tid>/children file"
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 46dcb6f0eccf..5efaf3708ec6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -533,7 +533,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
nice = task_nice(task);
/* convert nsec -> ticks */
- start_time = nsec_to_clock_t(task->real_start_time);
+ start_time = nsec_to_clock_t(task->start_boottime);
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 64e9ee1b129e..074e9585c699 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -138,8 +138,12 @@ static int proc_getattr(const struct path *path, struct kstat *stat,
{
struct inode *inode = d_inode(path->dentry);
struct proc_dir_entry *de = PDE(inode);
- if (de && de->nlink)
- set_nlink(inode, de->nlink);
+ if (de) {
+ nlink_t nlink = READ_ONCE(de->nlink);
+ if (nlink > 0) {
+ set_nlink(inode, nlink);
+ }
+ }
generic_fillattr(inode, stat);
return 0;
@@ -159,7 +163,6 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
const char *cp = name, *next;
struct proc_dir_entry *de;
- unsigned int len;
de = *ret;
if (!de)
@@ -170,13 +173,12 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
if (!next)
break;
- len = next - cp;
- de = pde_subdir_find(de, cp, len);
+ de = pde_subdir_find(de, cp, next - cp);
if (!de) {
WARN(1, "name '%s'\n", name);
return -ENOENT;
}
- cp += len + 1;
+ cp = next + 1;
}
*residual = cp;
*ret = de;
@@ -362,6 +364,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
write_unlock(&proc_subdir_lock);
goto out_free_inum;
}
+ dir->nlink++;
write_unlock(&proc_subdir_lock);
return dp;
@@ -472,10 +475,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent->data = data;
ent->proc_fops = &proc_dir_operations;
ent->proc_iops = &proc_dir_inode_operations;
- parent->nlink++;
ent = proc_register(parent, ent);
- if (!ent)
- parent->nlink--;
}
return ent;
}
@@ -505,10 +505,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
ent->data = NULL;
ent->proc_fops = NULL;
ent->proc_iops = NULL;
- parent->nlink++;
ent = proc_register(parent, ent);
- if (!ent)
- parent->nlink--;
}
return ent;
}
@@ -666,8 +663,12 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
len = strlen(fn);
de = pde_subdir_find(parent, fn, len);
- if (de)
+ if (de) {
rb_erase(&de->subdir_node, &parent->subdir);
+ if (S_ISDIR(de->mode)) {
+ parent->nlink--;
+ }
+ }
write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
@@ -676,9 +677,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
proc_entry_rundown(de);
- if (S_ISDIR(de->mode))
- parent->nlink--;
- de->nlink = 0;
WARN(pde_subdir_first(de),
"%s: removing non-empty directory '%s/%s', leaking at least '%s'\n",
__func__, de->parent->name, de->name, pde_subdir_first(de)->name);
@@ -714,13 +712,12 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- write_unlock(&proc_subdir_lock);
-
- proc_entry_rundown(de);
next = de->parent;
if (S_ISDIR(de->mode))
next->nlink--;
- de->nlink = 0;
+ write_unlock(&proc_subdir_lock);
+
+ proc_entry_rundown(de);
if (de == root)
break;
pde_put(de);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..0f3b557c9b77 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -197,8 +197,8 @@ extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, lof
* inode.c
*/
struct pde_opener {
- struct file *file;
struct list_head lh;
+ struct file *file;
bool closing;
struct completion *c;
} __randomize_layout;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 80c305f206bb..37bdbec5b402 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -120,20 +120,23 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
-
- user += kcs->cpustat[CPUTIME_USER];
- nice += kcs->cpustat[CPUTIME_NICE];
- system += kcs->cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(kcs, i);
- iowait += get_iowait_time(kcs, i);
- irq += kcs->cpustat[CPUTIME_IRQ];
- softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
- steal += kcs->cpustat[CPUTIME_STEAL];
- guest += kcs->cpustat[CPUTIME_GUEST];
- guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
- sum += kstat_cpu_irqs_sum(i);
- sum += arch_irq_stat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ nice += cpustat[CPUTIME_NICE];
+ system += cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(&kcpustat, i);
+ iowait += get_iowait_time(&kcpustat, i);
+ irq += cpustat[CPUTIME_IRQ];
+ softirq += cpustat[CPUTIME_SOFTIRQ];
+ steal += cpustat[CPUTIME_STEAL];
+ guest += cpustat[CPUTIME_GUEST];
+		guest_nice += cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
- struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcs->cpustat[CPUTIME_USER];
- nice = kcs->cpustat[CPUTIME_NICE];
- system = kcs->cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(kcs, i);
- iowait = get_iowait_time(kcs, i);
- irq = kcs->cpustat[CPUTIME_IRQ];
- softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
- steal = kcs->cpustat[CPUTIME_STEAL];
- guest = kcs->cpustat[CPUTIME_GUEST];
- guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
+ user = cpustat[CPUTIME_USER];
+ nice = cpustat[CPUTIME_NICE];
+ system = cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(&kcpustat, i);
+ iowait = get_iowait_time(&kcpustat, i);
+ irq = cpustat[CPUTIME_IRQ];
+ softirq = cpustat[CPUTIME_SOFTIRQ];
+ steal = cpustat[CPUTIME_STEAL];
+ guest = cpustat[CPUTIME_GUEST];
+		guest_nice = cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 3d7024662d29..d896457e7c11 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -793,7 +793,7 @@ static void pstore_timefunc(struct timer_list *unused)
jiffies + msecs_to_jiffies(pstore_update_ms));
}
-void __init pstore_choose_compression(void)
+static void __init pstore_choose_compression(void)
{
const struct pstore_zbackend *step;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6e826b454082..4639d53e96a3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -497,7 +497,7 @@ int dquot_release(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out_dqlock;
if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
@@ -595,7 +595,6 @@ int dquot_scan_active(struct super_block *sb,
/* Now we have active dquot so we can just increase use count */
atomic_inc(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
dqput(old_dquot);
old_dquot = dquot;
/*
@@ -623,7 +622,7 @@ EXPORT_SYMBOL(dquot_scan_active);
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
- struct list_head *dirty;
+ struct list_head dirty;
struct dquot *dquot;
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
@@ -637,9 +636,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
- dirty = &dqopt->info[cnt].dqi_dirty_list;
- while (!list_empty(dirty)) {
- dquot = list_first_entry(dirty, struct dquot,
+ /* Move list away to avoid livelock. */
+ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
+ while (!list_empty(&dirty)) {
+ dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
@@ -649,7 +649,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
* use count */
dqgrab(dquot);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
if (err) {
/*
@@ -2162,14 +2161,29 @@ int dquot_file_open(struct inode *inode, struct file *file)
}
EXPORT_SYMBOL(dquot_file_open);
+static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct inode *inode = dqopt->files[type];
+
+ if (!inode)
+ return;
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ inode_lock(inode);
+ inode->i_flags &= ~S_NOQUOTA;
+ inode_unlock(inode);
+ }
+ dqopt->files[type] = NULL;
+ iput(inode);
+}
+
/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
- int cnt, ret = 0;
+ int cnt;
struct quota_info *dqopt = sb_dqopt(sb);
- struct inode *toputinode[MAXQUOTAS];
/* s_umount should be held in exclusive mode */
if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
@@ -2191,7 +2205,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
if (!sb_has_quota_loaded(sb, cnt))
@@ -2211,8 +2224,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
dqopt->flags &= ~dquot_state_flag(
DQUOT_SUSPENDED, cnt);
spin_unlock(&dq_state_lock);
- iput(dqopt->files[cnt]);
- dqopt->files[cnt] = NULL;
+ vfs_cleanup_quota_inode(sb, cnt);
continue;
}
spin_unlock(&dq_state_lock);
@@ -2234,10 +2246,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
if (dqopt->ops[cnt]->free_file_info)
dqopt->ops[cnt]->free_file_info(sb, cnt);
put_quota_format(dqopt->info[cnt].dqi_format);
-
- toputinode[cnt] = dqopt->files[cnt];
- if (!sb_has_quota_loaded(sb, cnt))
- dqopt->files[cnt] = NULL;
dqopt->info[cnt].dqi_flags = 0;
dqopt->info[cnt].dqi_igrace = 0;
dqopt->info[cnt].dqi_bgrace = 0;
@@ -2259,32 +2267,22 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
* must also discard the blockdev buffers so that we see the
* changes done by userspace on the next quotaon() */
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- /* This can happen when suspending quotas on remount-ro... */
- if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
- inode_lock(toputinode[cnt]);
- toputinode[cnt]->i_flags &= ~S_NOQUOTA;
- truncate_inode_pages(&toputinode[cnt]->i_data, 0);
- inode_unlock(toputinode[cnt]);
- mark_inode_dirty_sync(toputinode[cnt]);
+ if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
+ inode_lock(dqopt->files[cnt]);
+ truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
+ inode_unlock(dqopt->files[cnt]);
}
if (sb->s_bdev)
invalidate_bdev(sb->s_bdev);
put_inodes:
+ /* We are done when suspending quotas */
+ if (flags & DQUOT_SUSPENDED)
+ return 0;
+
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if (toputinode[cnt]) {
- /* On remount RO, we keep the inode pointer so that we
- * can reenable quota on the subsequent remount RW. We
- * have to check 'flags' variable and not use sb_has_
- * function because another quotaon / quotaoff could
- * change global state before we got here. We refuse
- * to suspend quotas when there is pending delete on
- * the quota file... */
- if (!(flags & DQUOT_SUSPENDED))
- iput(toputinode[cnt]);
- else if (!toputinode[cnt]->i_nlink)
- ret = -EBUSY;
- }
- return ret;
+ if (!sb_has_quota_loaded(sb, cnt))
+ vfs_cleanup_quota_inode(sb, cnt);
+ return 0;
}
EXPORT_SYMBOL(dquot_disable);
@@ -2299,28 +2297,52 @@ EXPORT_SYMBOL(dquot_quota_off);
* Turn quotas on on a device
*/
-/*
- * Helper function to turn quotas on when we already have the inode of
- * quota file and no quota information is loaded.
- */
-static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+static int vfs_setup_quota_inode(struct inode *inode, int type)
+{
+ struct super_block *sb = inode->i_sb;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EACCES;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ if (sb_has_quota_loaded(sb, type))
+ return -EBUSY;
+
+ dqopt->files[type] = igrab(inode);
+ if (!dqopt->files[type])
+ return -EIO;
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* We don't want quota and atime on quota files (deadlocks
+ * possible). Also nobody should write to the file - we use
+ * special IO operations which ignore the immutable bit. */
+ inode_lock(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode_unlock(inode);
+ /*
+ * When S_NOQUOTA is set, remove dquot references as no more
+ * references can be added
+ */
+ __dquot_drop(inode);
+ }
+ return 0;
+}
+
+int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
struct quota_format_type *fmt = find_quota_format(format_id);
- struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
+ /* Just unsuspend quotas? */
+ BUG_ON(flags & DQUOT_SUSPENDED);
+ /* s_umount should be held in exclusive mode */
+ if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
+ up_read(&sb->s_umount);
+
if (!fmt)
return -ESRCH;
- if (!S_ISREG(inode->i_mode)) {
- error = -EACCES;
- goto out_fmt;
- }
- if (IS_RDONLY(inode)) {
- error = -EROFS;
- goto out_fmt;
- }
if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
(type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
@@ -2352,27 +2374,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
invalidate_bdev(sb->s_bdev);
}
- if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
- /* We don't want quota and atime on quota files (deadlocks
- * possible) Also nobody should write to the file - we use
- * special IO operations which ignore the immutable bit. */
- inode_lock(inode);
- inode->i_flags |= S_NOQUOTA;
- inode_unlock(inode);
- /*
- * When S_NOQUOTA is set, remove dquot references as no more
- * references can be added
- */
- __dquot_drop(inode);
- }
-
- error = -EIO;
- dqopt->files[type] = igrab(inode);
- if (!dqopt->files[type])
- goto out_file_flags;
error = -EINVAL;
if (!fmt->qf_ops->check_quota_file(sb, type))
- goto out_file_init;
+ goto out_fmt;
dqopt->ops[type] = fmt->qf_ops;
dqopt->info[type].dqi_format = fmt;
@@ -2380,7 +2384,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
error = dqopt->ops[type]->read_file_info(sb, type);
if (error < 0)
- goto out_file_init;
+ goto out_fmt;
if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
spin_lock(&dq_data_lock);
dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
@@ -2395,24 +2399,36 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
dquot_disable(sb, type, flags);
return error;
-out_file_init:
- dqopt->files[type] = NULL;
- iput(inode);
-out_file_flags:
- inode_lock(inode);
- inode->i_flags &= ~S_NOQUOTA;
- inode_unlock(inode);
out_fmt:
put_quota_format(fmt);
return error;
}
+EXPORT_SYMBOL(dquot_load_quota_sb);
+
+/*
+ * More powerful function for turning on quotas on a given quota inode,
+ * allowing setting of individual quota flags
+ */
+int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+
+ err = vfs_setup_quota_inode(inode, type);
+ if (err < 0)
+ return err;
+ err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags);
+ if (err < 0)
+ vfs_cleanup_quota_inode(inode->i_sb, type);
+ return err;
+}
+EXPORT_SYMBOL(dquot_load_quota_inode);
/* Reenable quotas on remount RW */
int dquot_resume(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
- struct inode *inode;
int ret = 0, cnt;
unsigned int flags;
@@ -2426,8 +2442,6 @@ int dquot_resume(struct super_block *sb, int type)
if (!sb_has_quota_suspended(sb, cnt))
continue;
- inode = dqopt->files[cnt];
- dqopt->files[cnt] = NULL;
spin_lock(&dq_state_lock);
flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED,
@@ -2436,9 +2450,10 @@ int dquot_resume(struct super_block *sb, int type)
spin_unlock(&dq_state_lock);
flags = dquot_generic_flag(flags, cnt);
- ret = vfs_load_quota_inode(inode, cnt,
- dqopt->info[cnt].dqi_fmt_id, flags);
- iput(inode);
+ ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
+ flags);
+ if (ret < 0)
+ vfs_cleanup_quota_inode(sb, cnt);
}
return ret;
@@ -2455,7 +2470,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
if (path->dentry->d_sb != sb)
error = -EXDEV;
else
- error = vfs_load_quota_inode(d_inode(path->dentry), type,
+ error = dquot_load_quota_inode(d_inode(path->dentry), type,
format_id, DQUOT_USAGE_ENABLED |
DQUOT_LIMITS_ENABLED);
return error;
@@ -2463,41 +2478,6 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id,
EXPORT_SYMBOL(dquot_quota_on);
/*
- * More powerful function for turning on quotas allowing setting
- * of individual quota flags
- */
-int dquot_enable(struct inode *inode, int type, int format_id,
- unsigned int flags)
-{
- struct super_block *sb = inode->i_sb;
-
- /* Just unsuspend quotas? */
- BUG_ON(flags & DQUOT_SUSPENDED);
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
-
- if (!flags)
- return 0;
- /* Just updating flags needed? */
- if (sb_has_quota_loaded(sb, type)) {
- if (flags & DQUOT_USAGE_ENABLED &&
- sb_has_quota_usage_enabled(sb, type))
- return -EBUSY;
- if (flags & DQUOT_LIMITS_ENABLED &&
- sb_has_quota_limits_enabled(sb, type))
- return -EBUSY;
- spin_lock(&dq_state_lock);
- sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
- spin_unlock(&dq_state_lock);
- return 0;
- }
-
- return vfs_load_quota_inode(inode, type, format_id, flags);
-}
-EXPORT_SYMBOL(dquot_enable);
-
-/*
* This function is used when filesystem needs to initialize quotas
* during mount time.
*/
@@ -2518,7 +2498,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
error = security_quota_on(dentry);
if (!error)
- error = vfs_load_quota_inode(d_inode(dentry), type, format_id,
+ error = dquot_load_quota_inode(d_inode(dentry), type, format_id,
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
@@ -2543,13 +2523,17 @@ static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
if (!(flags & qtype_enforce_flag(type)))
continue;
/* Can't enforce without accounting */
- if (!sb_has_quota_usage_enabled(sb, type))
- return -EINVAL;
- ret = dquot_enable(dqopt->files[type], type,
- dqopt->info[type].dqi_fmt_id,
- DQUOT_LIMITS_ENABLED);
- if (ret < 0)
+ if (!sb_has_quota_usage_enabled(sb, type)) {
+ ret = -EINVAL;
goto out_err;
+ }
+ if (sb_has_quota_limits_enabled(sb, type)) {
+ ret = -EBUSY;
+ goto out_err;
+ }
+ spin_lock(&dq_state_lock);
+ dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
+ spin_unlock(&dq_state_lock);
}
return 0;
out_err:
@@ -2599,10 +2583,12 @@ static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
out_err:
/* Backout enforcement disabling we already did */
for (type--; type >= 0; type--) {
- if (flags & qtype_enforce_flag(type))
- dquot_enable(dqopt->files[type], type,
- dqopt->info[type].dqi_fmt_id,
- DQUOT_LIMITS_ENABLED);
+ if (flags & qtype_enforce_flag(type)) {
+ spin_lock(&dq_state_lock);
+ dqopt->flags |=
+ dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
+ spin_unlock(&dq_state_lock);
+ }
}
return ret;
}
@@ -2800,8 +2786,10 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state)
tstate->flags |= QCI_LIMITS_ENFORCED;
tstate->spc_timelimit = mi->dqi_bgrace;
tstate->ino_timelimit = mi->dqi_igrace;
- tstate->ino = dqopt->files[type]->i_ino;
- tstate->blocks = dqopt->files[type]->i_blocks;
+ if (dqopt->files[type]) {
+ tstate->ino = dqopt->files[type]->i_ino;
+ tstate->blocks = dqopt->files[type]->i_blocks;
+ }
tstate->nextents = 1; /* We don't know... */
spin_unlock(&dq_data_lock);
}
@@ -2860,68 +2848,73 @@ EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
static int do_proc_dqstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- unsigned int type = (int *)table->data - dqstats.stat;
+ unsigned int type = (unsigned long *)table->data - dqstats.stat;
+ s64 value = percpu_counter_sum(&dqstats.counter[type]);
+
+ /* Filter negative values for non-monotonic counters */
+ if (value < 0 && (type == DQST_ALLOC_DQUOTS ||
+ type == DQST_FREE_DQUOTS))
+ value = 0;
/* Update global table */
- dqstats.stat[type] =
- percpu_counter_sum_positive(&dqstats.counter[type]);
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ dqstats.stat[type] = value;
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "drops",
.data = &dqstats.stat[DQST_DROPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "reads",
.data = &dqstats.stat[DQST_READS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "writes",
.data = &dqstats.stat[DQST_WRITES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "cache_hits",
.data = &dqstats.stat[DQST_CACHE_HITS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "allocated_dquots",
.data = &dqstats.stat[DQST_ALLOC_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "free_dquots",
.data = &dqstats.stat[DQST_FREE_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "syncs",
.data = &dqstats.stat[DQST_SYNCS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
@@ -2983,11 +2976,7 @@ static int __init dquot_init(void)
/* Find power-of-two hlist_heads which can fit into allocation */
nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
- dq_hash_bits = 0;
- do {
- dq_hash_bits++;
- } while (nr_hash >> dq_hash_bits);
- dq_hash_bits--;
+ dq_hash_bits = ilog2(nr_hash);
nr_hash = 1UL << dq_hash_bits;
dq_hash_mask = nr_hash - 1;
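
Several hunks above lean on the same idiom: dquot_writeback_dquots() detaches the whole dirty list with list_replace_init() under dq_list_lock and then drains its private copy, so writers that keep re-dirtying dquots cannot livelock the loop. A stand-alone sketch of that idiom, with hypothetical item and process() names:

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
	struct list_head node;
};

static void process(struct item *it) { /* hypothetical per-item work */ }

static void drain_dirty(struct list_head *shared, spinlock_t *lock)
{
	LIST_HEAD(local);
	struct item *it, *next;

	spin_lock(lock);
	list_replace_init(shared, &local);	/* steal the list, leave @shared empty */
	spin_unlock(lock);

	list_for_each_entry_safe(it, next, &local, node)
		process(it);
}
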
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index cb13fb76dbee..5444d3c4d93f 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -60,8 +60,6 @@ static int quota_sync_all(int type)
{
int ret;
- if (type >= MAXQUOTAS)
- return -EINVAL;
ret = security_quotactl(Q_SYNC, type, 0, NULL);
if (!ret)
iterate_supers(quota_sync_one, &type);
@@ -686,8 +684,6 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
{
int ret;
- if (type >= MAXQUOTAS)
- return -EINVAL;
type = array_index_nospec(type, MAXQUOTAS);
/*
* Quota not supported on this fs? Check this before s_quota_types
@@ -831,6 +827,9 @@ int kernel_quotactl(unsigned int cmd, const char __user *special,
cmds = cmd >> SUBCMDSHIFT;
type = cmd & SUBCMDMASK;
+ if (type >= MAXQUOTAS)
+ return -EINVAL;
+
/*
* As a special case Q_SYNC can be called without a specific device.
* It will iterate all superblocks that have quota enabled and call
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index c740e5572eb8..cd92e5fa0062 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -217,7 +217,6 @@ static const struct quota_format_ops v1_format_ops = {
.check_quota_file = v1_check_quota_file,
.read_file_info = v1_read_file_info,
.write_file_info = v1_write_file_info,
- .free_file_info = NULL,
.read_dqblk = v1_read_dqblk,
.commit_dqblk = v1_commit_dqblk,
};
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 843aadcc123c..84cf8bdbec9c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -38,16 +38,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
BUG_ON(!S_ISREG(inode->i_mode));
- if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
+ if (!atomic_dec_and_mutex_lock(&REISERFS_I(inode)->openers,
+ &REISERFS_I(inode)->tailpack))
return 0;
- mutex_lock(&REISERFS_I(inode)->tailpack);
-
- if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
- mutex_unlock(&REISERFS_I(inode)->tailpack);
- return 0;
- }
-
/* fast out for when nothing needs to be done */
if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
!tail_has_to_be_packed(inode)) &&
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 132ec4406ed0..6419e6dacc39 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2097,6 +2097,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_inserted_sd;
}
+ /*
+ * Mark it private if we're creating the privroot
+ * or something under it.
+ */
+ if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (reiserfs_posixacl(inode->i_sb)) {
reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
@@ -2111,8 +2120,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "jdm-13090",
"ACLs aren't enabled in the fs, "
"but vfs thinks they are!");
- } else if (IS_PRIVATE(dir))
- inode->i_flags |= S_PRIVATE;
+ }
if (security->name) {
reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 97f3fc4fdd79..959a066b7bb0 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
/*
* Propagate the private flag so we know we're
- * in the priv tree
+ * in the priv tree. Also clear IOP_XATTR
+ * since we don't have xattrs on xattr files.
*/
- if (IS_PRIVATE(dir))
+ if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
}
reiserfs_write_unlock(dir->i_sb);
if (retval == IO_ERROR) {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index e5ca9ed79e54..726580114d55 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
+extern const struct xattr_handler *reiserfs_xattr_handlers[];
+
/*
* this says about version of key of all items (but stat data) the
* object consists of
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index d69b4ac0ae2f..3244037b1286 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2049,6 +2049,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (replay_only(s))
goto error_unlocked;
+ s->s_xattr = reiserfs_xattr_handlers;
+
if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
SWARN(silent, s, "clm-7000",
"Detected readonly device, marking FS readonly");
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index b5b26d8a192c..62b40df36c98 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
struct dentry *xaroot;
if (d_really_is_negative(privroot))
- return ERR_PTR(-ENODATA);
+ return ERR_PTR(-EOPNOTSUPP);
inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
- xaroot = ERR_PTR(-ENODATA);
+ xaroot = ERR_PTR(-EOPNOTSUPP);
else if (d_really_is_negative(xaroot)) {
int err = -ENODATA;
@@ -619,6 +619,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
int error, error2;
size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+ /* Check up front so we don't start a transaction only to do nothing. */
+ if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
+ return -EOPNOTSUPP;
+
if (!(flags & XATTR_REPLACE))
jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
@@ -841,8 +845,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
if (d_really_is_negative(dentry))
return -EINVAL;
- if (!dentry->d_sb->s_xattr ||
- get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+ if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
@@ -882,6 +885,7 @@ static int create_privroot(struct dentry *dentry)
}
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
"storage.\n", PRIVROOT_NAME);
@@ -895,7 +899,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-static const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
@@ -966,8 +970,10 @@ int reiserfs_lookup_privroot(struct super_block *s)
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (d_really_is_positive(dentry))
+ if (d_really_is_positive(dentry)) {
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
+ }
} else
err = PTR_ERR(dentry);
inode_unlock(d_inode(s->s_root));
@@ -996,7 +1002,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
}
if (d_really_is_positive(privroot)) {
- s->s_xattr = reiserfs_xattr_handlers;
inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index aa9380bac196..05f666794561 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
* would be useless since permissions are ignored, and a pain because
* it introduces locking cycles
*/
- if (IS_PRIVATE(dir)) {
- inode->i_flags |= S_PRIVATE;
+ if (IS_PRIVATE(inode))
goto apply_umask;
- }
err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (err)
diff --git a/fs/select.c b/fs/select.c
index 53a0c149f528..11d0285d46b7 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -321,7 +321,7 @@ static int poll_select_finish(struct timespec64 *end_time,
switch (pt_type) {
case PT_TIMEVAL:
{
- struct timeval rtv;
+ struct __kernel_old_timeval rtv;
if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
memset(&rtv, 0, sizeof(rtv));
@@ -698,10 +698,10 @@ out_nofds:
}
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
- fd_set __user *exp, struct timeval __user *tvp)
+ fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
struct timespec64 end_time, *to = NULL;
- struct timeval tv;
+ struct __kernel_old_timeval tv;
int ret;
if (tvp) {
@@ -720,7 +720,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
}
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
- fd_set __user *, exp, struct timeval __user *, tvp)
+ fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
return kern_select(n, inp, outp, exp, tvp);
}
@@ -810,7 +810,7 @@ SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *,
struct sel_arg_struct {
unsigned long n;
fd_set __user *inp, *outp, *exp;
- struct timeval __user *tvp;
+ struct __kernel_old_timeval __user *tvp;
};
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
diff --git a/fs/splice.c b/fs/splice.c
index 98412721f056..f2400ce7d528 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
{
unsigned int spd_pages = spd->nr_pages;
+ unsigned int tail = pipe->tail;
+ unsigned int head = pipe->head;
+ unsigned int mask = pipe->ring_size - 1;
int ret = 0, page_nr = 0;
if (!spd_pages)
@@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
goto out;
}
- while (pipe->nrbufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+ while (!pipe_full(head, tail, pipe->max_usage)) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
buf->ops = spd->ops;
buf->flags = 0;
- pipe->nrbufs++;
+ head++;
+ pipe->head = head;
page_nr++;
ret += buf->len;
@@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
- } else if (pipe->nrbufs == pipe->buffers) {
+ } else if (pipe_full(head, tail, pipe->max_usage)) {
ret = -EAGAIN;
} else {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- pipe->bufs[newbuf] = *buf;
- pipe->nrbufs++;
+ pipe->bufs[head & mask] = *buf;
+ pipe->head = head + 1;
return buf->len;
}
pipe_buf_release(pipe, buf);
@@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe);
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
- unsigned int buffers = READ_ONCE(pipe->buffers);
+ unsigned int max_usage = READ_ONCE(pipe->max_usage);
- spd->nr_pages_max = buffers;
- if (buffers <= PIPE_DEF_BUFFERS)
+ spd->nr_pages_max = max_usage;
+ if (max_usage <= PIPE_DEF_BUFFERS)
return 0;
- spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
- spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
+ spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
GFP_KERNEL);
if (spd->pages && spd->partial)
@@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct iov_iter to;
struct kiocb kiocb;
- int idx, ret;
+ unsigned int i_head;
+ int ret;
iov_iter_pipe(&to, READ, pipe, len);
- idx = to.idx;
+ i_head = to.head;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
- to.idx = idx;
+ to.head = i_head;
to.iov_offset = 0;
iov_iter_advance(&to, 0); /* to free what was emitted */
/*
@@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct page **pages;
unsigned int nr_pages;
+ unsigned int mask;
size_t offset, base, copied = 0;
ssize_t res;
int i;
- if (pipe->nrbufs == pipe->buffers)
+ if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return -EAGAIN;
/*
@@ -400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
}
}
- pipe->bufs[to.idx].offset = offset;
- pipe->bufs[to.idx].len -= offset;
+ mask = pipe->ring_size - 1;
+ pipe->bufs[to.head & mask].offset = offset;
+ pipe->bufs[to.head & mask].len -= offset;
for (i = 0; i < nr_pages; i++) {
size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
@@ -443,7 +451,8 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
- if (sd->len < sd->total_len && pipe->nrbufs > 1)
+ if (sd->len < sd->total_len &&
+ pipe_occupancy(pipe->head, pipe->tail) > 1)
more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset,
@@ -481,10 +490,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
- while (pipe->nrbufs) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ while (!pipe_empty(tail, head)) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -511,8 +523,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
if (!buf->len) {
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
if (pipe->files)
sd->need_wakeup = true;
}
@@ -543,7 +555,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
- while (!pipe->nrbufs) {
+ while (pipe_empty(pipe->head, pipe->tail)) {
if (!pipe->writers)
return 0;
@@ -686,7 +698,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.pos = *ppos,
.u.file = out,
};
- int nbufs = pipe->buffers;
+ int nbufs = pipe->max_usage;
struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
ssize_t ret;
@@ -699,16 +711,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
struct iov_iter from;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
size_t left;
- int n, idx;
+ int n;
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- if (unlikely(nbufs < pipe->buffers)) {
+ if (unlikely(nbufs < pipe->max_usage)) {
kfree(array);
- nbufs = pipe->buffers;
+ nbufs = pipe->max_usage;
array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
if (!array) {
@@ -719,16 +734,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* build the vector */
left = sd.total_len;
- for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
- struct pipe_buffer *buf = pipe->bufs + idx;
+ for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
if (this_len > left)
this_len = left;
- if (idx == pipe->buffers - 1)
- idx = -1;
-
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
@@ -752,14 +764,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
*ppos = sd.pos;
/* dismiss the fully eaten buffers, adjust the partial one */
+ tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe->tail = tail;
if (pipe->files)
sd.need_wakeup = true;
} else {
@@ -942,15 +955,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
- WARN_ON_ONCE(pipe->nrbufs != 0);
+ WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
while (len) {
+ unsigned int p_space;
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
/* Don't try to read more the pipe has space for. */
- read_len = min_t(size_t, len,
- (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+ p_space = pipe->max_usage -
+ pipe_occupancy(pipe->head, pipe->tail);
+ read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
ret = do_splice_to(in, &pos, pipe, read_len, flags);
if (unlikely(ret <= 0))
goto out_release;
@@ -989,7 +1004,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
}
done:
- pipe->nrbufs = pipe->curbuf = 0;
+ pipe->tail = pipe->head = 0;
file_accessed(in);
return bytes;
@@ -998,8 +1013,8 @@ out_release:
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < pipe->buffers; i++) {
- struct pipe_buffer *buf = pipe->bufs + i;
+ for (i = 0; i < pipe->ring_size; i++) {
+ struct pipe_buffer *buf = &pipe->bufs[i];
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -1075,7 +1090,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (pipe->nrbufs != pipe->buffers)
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
@@ -1180,8 +1195,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
- if (!ret)
+ if (!ret) {
+ unsigned int p_space;
+
+ /* Don't try to read more than the pipe has space for. */
+ p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
+ len = min_t(size_t, len, p_space << PAGE_SHIFT);
+
ret = do_splice_to(in, &offset, opipe, len, flags);
+ }
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
@@ -1442,16 +1464,16 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
int ret;
/*
- * Check ->nrbufs without the inode lock first. This function
+ * Check the pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs)
+ if (!pipe_empty(pipe->head, pipe->tail))
return 0;
ret = 0;
pipe_lock(pipe);
- while (!pipe->nrbufs) {
+ while (pipe_empty(pipe->head, pipe->tail)) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -1480,16 +1502,16 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
int ret;
/*
- * Check ->nrbufs without the inode lock first. This function
+ * Check pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs < pipe->buffers)
+ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe->nrbufs >= pipe->buffers) {
+ while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1520,7 +1542,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0;
bool input_wakeup = false;
@@ -1540,7 +1565,14 @@ retry:
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
+ size_t o_len;
+
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
@@ -1548,14 +1580,18 @@ retry:
break;
}
- if (!ipipe->nrbufs && !ipipe->writers)
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
+ if (pipe_empty(i_head, i_tail) && !ipipe->writers)
break;
/*
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+ if (pipe_empty(i_head, i_tail) ||
+ pipe_full(o_head, o_tail, opipe->max_usage)) {
/* Already processed some buffers, break */
if (ret)
break;
@@ -1575,9 +1611,8 @@ retry:
goto retry;
}
- ibuf = ipipe->bufs + ipipe->curbuf;
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
- obuf = opipe->bufs + nbuf;
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
if (len >= ibuf->len) {
/*
@@ -1585,10 +1620,12 @@ retry:
*/
*obuf = *ibuf;
ibuf->ops = NULL;
- opipe->nrbufs++;
- ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
- ipipe->nrbufs--;
+ i_tail++;
+ ipipe->tail = i_tail;
input_wakeup = true;
+ o_len = obuf->len;
+ o_head++;
+ opipe->head = o_head;
} else {
/*
* Get a reference to this pipe buffer,
@@ -1610,12 +1647,14 @@ retry:
pipe_buf_mark_unmergeable(obuf);
obuf->len = len;
- opipe->nrbufs++;
- ibuf->offset += obuf->len;
- ibuf->len -= obuf->len;
+ ibuf->offset += len;
+ ibuf->len -= len;
+ o_len = len;
+ o_head++;
+ opipe->head = o_head;
}
- ret += obuf->len;
- len -= obuf->len;
+ ret += o_len;
+ len -= o_len;
} while (len);
pipe_unlock(ipipe);
@@ -1641,7 +1680,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, i = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1650,6 +1692,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -1658,15 +1705,19 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
/*
- * If we have iterated all input buffers or ran out of
+ * If we have iterated all input buffers or run out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
+ if (pipe_empty(i_head, i_tail) ||
+ pipe_full(o_head, o_tail, opipe->max_usage))
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
/*
* Get a reference to this pipe buffer,
@@ -1678,7 +1729,6 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
- obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
/*
@@ -1691,11 +1741,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
if (obuf->len > len)
obuf->len = len;
-
- opipe->nrbufs++;
ret += obuf->len;
len -= obuf->len;
- i++;
+
+ o_head++;
+ opipe->head = o_head;
+ i_tail++;
} while (len);
/*
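
All of the splice changes above rely on the new pipe ring convention: head and tail are free-running unsigned counters, the ring size is a power of two, and a buffer slot is reached by masking. The helper semantics can be restated in a small self-contained sketch (these are illustrative re-statements under that assumption, not the pipe_fs_i.h definitions):

#include <linux/types.h>

/* Occupancy is plain unsigned subtraction; it stays correct across wrap. */
static inline unsigned int ring_occupancy(unsigned int head, unsigned int tail)
{
	return head - tail;
}

static inline bool ring_empty(unsigned int head, unsigned int tail)
{
	return head == tail;
}

static inline bool ring_full(unsigned int head, unsigned int tail,
			     unsigned int limit)
{
	return ring_occupancy(head, tail) >= limit;
}

/* A slot is then bufs[head & (ring_size - 1)] for a power-of-two ring_size. */
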
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 48305ba41e3c..ac7f59a58f94 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -302,11 +302,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
static void timerfd_show(struct seq_file *m, struct file *file)
{
struct timerfd_ctx *ctx = file->private_data;
- struct itimerspec t;
+ struct timespec64 value, interval;
spin_lock_irq(&ctx->wqh.lock);
- t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
- t.it_interval = ktime_to_timespec(ctx->tintv);
+ value = ktime_to_timespec64(timerfd_get_remaining(ctx));
+ interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
seq_printf(m,
@@ -318,10 +318,10 @@ static void timerfd_show(struct seq_file *m, struct file *file)
ctx->clockid,
(unsigned long long)ctx->ticks,
ctx->settime_flags,
- (unsigned long long)t.it_value.tv_sec,
- (unsigned long long)t.it_value.tv_nsec,
- (unsigned long long)t.it_interval.tv_sec,
- (unsigned long long)t.it_interval.tv_nsec);
+ (unsigned long long)value.tv_sec,
+ (unsigned long long)value.tv_nsec,
+ (unsigned long long)interval.tv_sec,
+ (unsigned long long)interval.tv_nsec);
}
#else
#define timerfd_show NULL
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e4b52783819d..0f5a480fe264 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2737,18 +2737,6 @@ static ssize_t dfs_file_write(struct file *file, const char __user *u,
struct dentry *dent = file->f_path.dentry;
int val;
- /*
- * TODO: this is racy - the file-system might have already been
- * unmounted and we'd oops in this case. The plan is to fix it with
- * help of 'iterate_supers_type()' which we should have in v3.0: when
- * a debugfs opened, we rember FS's UUID in file->private_data. Then
- * whenever we access the FS via a debugfs file, we iterate all UBIFS
- * superblocks and fine the one with the same UUID, and take the
- * locking right.
- *
- * The other way to go suggested by Al Viro is to create a separate
- * 'ubifs-debug' file-system instead.
- */
if (file->f_path.dentry == d->dfs_dump_lprops) {
ubifs_dump_lprops(c);
return count;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 4fd9683b8245..388fe8f5dc51 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -503,7 +503,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
{
if (c->double_hash)
- dent->cookie = prandom_u32();
+ dent->cookie = (__force __le32) prandom_u32();
else
dent->cookie = 0;
}
@@ -899,7 +899,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
fname_name(&nm) = xent->name;
fname_len(&nm) = le16_to_cpu(xent->nlen);
- xino = ubifs_iget(c->vfs_sb, xent->inum);
+ xino = ubifs_iget(c->vfs_sb, le64_to_cpu(xent->inum));
if (IS_ERR(xino)) {
err = PTR_ERR(xino);
ubifs_err(c, "dead directory entry '%s', error %d",
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 3b4b4114f208..54d6db61106f 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -631,12 +631,17 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ino_t inum;
int i, n, err, first = 1;
+ ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
list_for_each_entry(snod, &sleb->nodes, list) {
if (snod->type != UBIFS_ORPH_NODE) {
ubifs_err(c, "invalid node type %d in orphan area at %d:%d",
snod->type, sleb->lnum, snod->offs);
ubifs_dump_node(c, snod->node);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_free;
}
orph = snod->node;
@@ -663,20 +668,18 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d",
cmt_no, sleb->lnum, snod->offs);
ubifs_dump_node(c, snod->node);
- return -EINVAL;
+ err = -EINVAL;
+ goto out_free;
}
dbg_rcvry("out of date LEB %d", sleb->lnum);
*outofdate = 1;
- return 0;
+ err = 0;
+ goto out_free;
}
if (first)
first = 0;
- ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
- if (!ino)
- return -ENOMEM;
-
n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
for (i = 0; i < n; i++) {
union ubifs_key key1, key2;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index a551eb3e9b89..2b7c04bf8983 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -184,7 +184,7 @@ static int create_default_filesystem(struct ubifs_info *c)
if (err)
goto out;
} else {
- sup->hash_algo = 0xffff;
+ sup->hash_algo = cpu_to_le16(0xffff);
}
sup->ch.node_type = UBIFS_SB_NODE;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 7d4547e5202d..5e1e8ec0589e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2267,10 +2267,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
}
} else {
err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
- if (err) {
- kfree(c);
+ if (err)
goto out_deact;
- }
/* We do not support atime */
sb->s_flags |= SB_ACTIVE;
if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index a384a0f9ff32..234be1c4dc87 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -212,7 +212,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
/**
* layout_leb_in_gaps - layout index nodes using in-the-gaps method.
* @c: UBIFS file-system description object
- * @p: return LEB number here
+ * @p: return LEB number in @c->gap_lebs[p]
*
* This function lays out new index nodes for dirty znodes using in-the-gaps
* method of TNC commit.
@@ -221,7 +221,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
* This function returns the number of index nodes written into the gaps, or a
* negative error code on failure.
*/
-static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+static int layout_leb_in_gaps(struct ubifs_info *c, int p)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
@@ -236,7 +236,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
* filled, however we do not check there at present.
*/
return lnum; /* Error code */
- *p = lnum;
+ c->gap_lebs[p] = lnum;
dbg_gc("LEB %d", lnum);
/*
* Scan the index LEB. We use the generic scan for this even though
@@ -355,7 +355,7 @@ static int get_leb_cnt(struct ubifs_info *c, int cnt)
*/
static int layout_in_gaps(struct ubifs_info *c, int cnt)
{
- int err, leb_needed_cnt, written, *p;
+ int err, leb_needed_cnt, written, p = 0, old_idx_lebs, *gap_lebs;
dbg_gc("%d znodes to write", cnt);
@@ -364,9 +364,9 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
if (!c->gap_lebs)
return -ENOMEM;
- p = c->gap_lebs;
+ old_idx_lebs = c->lst.idx_lebs;
do {
- ubifs_assert(c, p < c->gap_lebs + c->lst.idx_lebs);
+ ubifs_assert(c, p < c->lst.idx_lebs);
written = layout_leb_in_gaps(c, p);
if (written < 0) {
err = written;
@@ -392,9 +392,29 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
leb_needed_cnt = get_leb_cnt(c, cnt);
dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
leb_needed_cnt, c->ileb_cnt);
+ /*
+ * Dynamically change the size of @c->gap_lebs to prevent an
+ * out-of-bounds access, because @c->lst.idx_lebs could be
+ * increased by @get_idx_gc_leb (called via layout_leb_in_gaps
+ * -> ubifs_find_dirty_idx_leb) during the loop.
+ *
+ * Only enlarge @c->gap_lebs when needed.
+ */
+ if (leb_needed_cnt > c->ileb_cnt && p >= old_idx_lebs &&
+ old_idx_lebs < c->lst.idx_lebs) {
+ old_idx_lebs = c->lst.idx_lebs;
+ gap_lebs = krealloc(c->gap_lebs, sizeof(int) *
+ (old_idx_lebs + 1), GFP_NOFS);
+ if (!gap_lebs) {
+ kfree(c->gap_lebs);
+ c->gap_lebs = NULL;
+ return -ENOMEM;
+ }
+ c->gap_lebs = gap_lebs;
+ }
} while (leb_needed_cnt > c->ileb_cnt);
- *p = -1;
+ c->gap_lebs[p] = -1;
return 0;
}
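
The gap_lebs resizing above uses the standard krealloc() discipline: assign the result to a temporary first, because on failure krealloc() returns NULL and leaves the original allocation untouched, which must then be freed (as the patch does) or kept explicitly. A hedged, generic sketch of that discipline:

#include <linux/slab.h>

/* Grow an int array to @new_cnt entries; on failure the old buffer survives. */
static int grow_int_array(int **arr, int new_cnt)
{
	int *tmp;

	tmp = krealloc(*arr, sizeof(int) * new_cnt, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;		/* *arr is still valid and still owned */
	*arr = tmp;
	return 0;
}
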
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f9fd18670e22..37df7c9eedb1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
- new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ new_flags = (vma->vm_flags &
+ ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags;
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
@@ -1834,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
features = uffdio_api.features;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
- memset(&uffdio_api, 0, sizeof(uffdio_api));
- if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
- goto out;
- ret = -EINVAL;
- goto out;
- }
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
+ ret = -EPERM;
+ if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+ goto err_out;
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1853,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = 0;
out:
return ret;
+err_out:
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ ret = -EFAULT;
+ goto out;
}
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
@@ -1923,7 +1928,7 @@ static const struct file_operations userfaultfd_fops = {
.poll = userfaultfd_poll,
.read = userfaultfd_read,
.unlocked_ioctl = userfaultfd_ioctl,
- .compat_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.llseek = noop_llseek,
};
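
The userfaultfd_api() rework above keeps one reporting convention for every rejection path: zero the uffdio_api structure and copy it back before returning -EINVAL or -EPERM, so userspace can tell a refused handshake from a garbled one. A sketch of that convention with hypothetical names (struct api_reply, reply_refusal()):

#include <linux/uaccess.h>
#include <linux/types.h>

struct api_reply {
	__u64 features;
	__u64 ioctls;
};

/* Zero the reply so userspace sees an unambiguous refusal, then return @err. */
static int reply_refusal(struct api_reply __user *ubuf, int err)
{
	struct api_reply zero = {};

	if (copy_to_user(ubuf, &zero, sizeof(zero)))
		return -EFAULT;
	return err;
}
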
diff --git a/fs/utimes.c b/fs/utimes.c
index 1ba3f7883870..c952b6b3d8a0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -161,9 +161,9 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename,
* utimensat() instead.
*/
static long do_futimesat(int dfd, const char __user *filename,
- struct timeval __user *utimes)
+ struct __kernel_old_timeval __user *utimes)
{
- struct timeval times[2];
+ struct __kernel_old_timeval times[2];
struct timespec64 tstimes[2];
if (utimes) {
@@ -190,13 +190,13 @@ static long do_futimesat(int dfd, const char __user *filename,
SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename,
- struct timeval __user *, utimes)
+ struct __kernel_old_timeval __user *, utimes)
{
return do_futimesat(dfd, filename, utimes);
}
SYSCALL_DEFINE2(utimes, char __user *, filename,
- struct timeval __user *, utimes)
+ struct __kernel_old_timeval __user *, utimes)
{
return do_futimesat(AT_FDCWD, filename, utimes);
}
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 06b68b6115bc..aceca2f9a3db 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -27,7 +27,6 @@ xfs-y += $(addprefix libxfs/, \
xfs_bmap_btree.o \
xfs_btree.o \
xfs_da_btree.o \
- xfs_da_format.o \
xfs_defer.o \
xfs_dir2.o \
xfs_dir2_block.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index da031b93e182..1da94237a8cf 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -32,7 +32,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
/*
- * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * __vmalloc() will allocate data pages and auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence
* we need to tell memory reclaim that we are in such a context via
* PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8170d95cf930..6143117770e9 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -78,39 +78,9 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
* Zone interfaces
*/
-#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
-#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
-#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
-#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
-
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
-static inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
- return kmem_cache_create(zone_name, size, 0, 0, NULL);
-}
-
-static inline kmem_zone_t *
-kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags,
- void (*construct)(void *))
-{
- return kmem_cache_create(zone_name, size, 0, flags, construct);
-}
-
-static inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
- kmem_cache_free(zone, ptr);
-}
-
-static inline void
-kmem_zone_destroy(kmem_zone_t *zone)
-{
- kmem_cache_destroy(zone);
-}
-
extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
static inline void *
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 87a9747f1d36..fdfe6dc0d307 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -19,6 +19,8 @@
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_sb.h"
+#include "xfs_ag_resv.h"
/*
* Per-AG Block Reservations
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 533b04aaf6f6..c284e10af491 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -146,9 +146,13 @@ xfs_alloc_lookup_eq(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+ cur->bc_private.a.priv.abt.active = (*stat == 1);
+ return error;
}
/*
@@ -162,9 +166,13 @@ xfs_alloc_lookup_ge(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
+
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+ cur->bc_private.a.priv.abt.active = (*stat == 1);
+ return error;
}
/*
@@ -178,9 +186,19 @@ xfs_alloc_lookup_le(
xfs_extlen_t len, /* length of extent */
int *stat) /* success/failure */
{
+ int error;
cur->bc_rec.a.ar_startblock = bno;
cur->bc_rec.a.ar_blockcount = len;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_private.a.priv.abt.active = (*stat == 1);
+ return error;
+}
+
+static inline bool
+xfs_alloc_cur_active(
+ struct xfs_btree_cur *cur)
+{
+ return cur && cur->bc_private.a.priv.abt.active;
}
/*
@@ -313,7 +331,7 @@ xfs_alloc_compute_diff(
xfs_extlen_t newlen1=0; /* length with newbno1 */
xfs_extlen_t newlen2=0; /* length with newbno2 */
xfs_agblock_t wantend; /* end of target extent */
- bool userdata = xfs_alloc_is_userdata(datatype);
+ bool userdata = datatype & XFS_ALLOC_USERDATA;
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
@@ -433,13 +451,17 @@ xfs_alloc_fixup_trees(
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp,
- i == 1 && nfbno1 == fbno && nflen1 == flen);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
#endif
} else {
if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
/*
* Look up the record in the by-block tree if necessary.
@@ -448,13 +470,17 @@ xfs_alloc_fixup_trees(
#ifdef DEBUG
if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp,
- i == 1 && nfbno1 == fbno && nflen1 == flen);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ nfbno1 != fbno ||
+ nflen1 != flen))
+ return -EFSCORRUPTED;
#endif
} else {
if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
#ifdef DEBUG
@@ -465,8 +491,10 @@ xfs_alloc_fixup_trees(
bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
- XFS_WANT_CORRUPTED_RETURN(mp,
- bnoblock->bb_numrecs == cntblock->bb_numrecs);
+ if (XFS_IS_CORRUPT(mp,
+ bnoblock->bb_numrecs !=
+ cntblock->bb_numrecs))
+ return -EFSCORRUPTED;
}
#endif
@@ -496,25 +524,30 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/*
* Add new by-size btree entry(s).
*/
if (nfbno1 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
if (nfbno2 != NULLAGBLOCK) {
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
/*
* Fix up the by-block btree entry(s).
@@ -525,7 +558,8 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
} else {
/*
* Update the by-block entry to start later|be shorter.
@@ -539,10 +573,12 @@ xfs_alloc_fixup_trees(
*/
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+ if (XFS_IS_CORRUPT(mp, i != 0))
+ return -EFSCORRUPTED;
if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -684,16 +720,298 @@ xfs_alloc_update_counters(
xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
- be32_to_cpu(agf->agf_length)))
+ be32_to_cpu(agf->agf_length))) {
+ xfs_buf_corruption_error(agbp);
return -EFSCORRUPTED;
+ }
xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
return 0;
}
/*
- * Allocation group level functions.
+ * Block allocation algorithm and data structures.
+ */
+struct xfs_alloc_cur {
+ struct xfs_btree_cur *cnt; /* btree cursors */
+ struct xfs_btree_cur *bnolt;
+ struct xfs_btree_cur *bnogt;
+ xfs_extlen_t cur_len;/* current search length */
+ xfs_agblock_t rec_bno;/* extent startblock */
+ xfs_extlen_t rec_len;/* extent length */
+ xfs_agblock_t bno; /* alloc bno */
+ xfs_extlen_t len; /* alloc len */
+ xfs_extlen_t diff; /* diff from search bno */
+ unsigned int busy_gen;/* busy state */
+ bool busy;
+};
+
+/*
+ * Set up cursors, etc. in the extent allocation cursor. This function can be
+ * called multiple times to reset an initialized structure without having to
+ * reallocate cursors.
+ */
+static int
+xfs_alloc_cur_setup(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+ int i;
+
+ ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
+
+ acur->cur_len = args->maxlen;
+ acur->rec_bno = 0;
+ acur->rec_len = 0;
+ acur->bno = 0;
+ acur->len = 0;
+ acur->diff = -1;
+ acur->busy = false;
+ acur->busy_gen = 0;
+
+ /*
+ * Perform an initial cntbt lookup to check for availability of maxlen
+ * extents. If this fails, we'll return -ENOSPC to signal the caller to
+ * attempt a small allocation.
+ */
+ if (!acur->cnt)
+ acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_CNT);
+ error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
+ if (error)
+ return error;
+
+ /*
+ * Allocate the bnobt left and right search cursors.
+ */
+ if (!acur->bnolt)
+ acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ if (!acur->bnogt)
+ acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
+ return i == 1 ? 0 : -ENOSPC;
+}
+
+static void
+xfs_alloc_cur_close(
+ struct xfs_alloc_cur *acur,
+ bool error)
+{
+ int cur_error = XFS_BTREE_NOERROR;
+
+ if (error)
+ cur_error = XFS_BTREE_ERROR;
+
+ if (acur->cnt)
+ xfs_btree_del_cursor(acur->cnt, cur_error);
+ if (acur->bnolt)
+ xfs_btree_del_cursor(acur->bnolt, cur_error);
+ if (acur->bnogt)
+ xfs_btree_del_cursor(acur->bnogt, cur_error);
+ acur->cnt = acur->bnolt = acur->bnogt = NULL;
+}
+
+/*
+ * Check an extent for allocation and track the best available candidate in the
+ * allocation structure. The cursor is deactivated if it has entered an out of
+ * range state based on allocation arguments. Optionally return the extent
+ * geometry and allocation status if requested by the caller.
+ */
+static int
+xfs_alloc_cur_check(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ int *new)
+{
+ int error, i;
+ xfs_agblock_t bno, bnoa, bnew;
+ xfs_extlen_t len, lena, diff = -1;
+ bool busy;
+ unsigned busy_gen = 0;
+ bool deactivate = false;
+ bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+
+ *new = 0;
+
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+
+ /*
+ * Check minlen and deactivate a cntbt cursor if out of acceptable size
+ * range (i.e., walking backwards looking for a minlen extent).
+ */
+ if (len < args->minlen) {
+ deactivate = !isbnobt;
+ goto out;
+ }
+
+ busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
+ &busy_gen);
+ acur->busy |= busy;
+ if (busy)
+ acur->busy_gen = busy_gen;
+ /* deactivate a bnobt cursor outside of locality range */
+ if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
+ deactivate = isbnobt;
+ goto out;
+ }
+ if (lena < args->minlen)
+ goto out;
+
+ args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
+ xfs_alloc_fix_len(args);
+ ASSERT(args->len >= args->minlen);
+ if (args->len < acur->len)
+ goto out;
+
+ /*
+ * We have an aligned record that satisfies minlen and beats or matches
+ * the candidate extent size. Compare locality for near allocation mode.
+ */
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+ diff = xfs_alloc_compute_diff(args->agbno, args->len,
+ args->alignment, args->datatype,
+ bnoa, lena, &bnew);
+ if (bnew == NULLAGBLOCK)
+ goto out;
+
+ /*
+ * Deactivate a bnobt cursor with worse locality than the current best.
+ */
+ if (diff > acur->diff) {
+ deactivate = isbnobt;
+ goto out;
+ }
+
+ ASSERT(args->len > acur->len ||
+ (args->len == acur->len && diff <= acur->diff));
+ acur->rec_bno = bno;
+ acur->rec_len = len;
+ acur->bno = bnew;
+ acur->len = args->len;
+ acur->diff = diff;
+ *new = 1;
+
+ /*
+ * We're done if we found a perfect allocation. This only deactivates
+ * the current cursor, but this is just an optimization to terminate a
+ * cntbt search that otherwise runs to the edge of the tree.
+ */
+ if (acur->diff == 0 && acur->len == args->maxlen)
+ deactivate = true;
+out:
+ if (deactivate)
+ cur->bc_private.a.priv.abt.active = false;
+ trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
+ *new);
+ return 0;
+}
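A minimal standalone sketch of the candidate-ordering rule enforced above, with plain integer types standing in for xfs_extlen_t: a newly scanned extent only replaces the tracked best when it is at least as long and no farther from the locality hint.

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch only: mirrors the acceptance test in xfs_alloc_cur_check(). The
 * initial best starts with len == 0 and diff == (uint32_t)-1, so the first
 * valid candidate is always accepted.
 */
static bool candidate_replaces_best(uint32_t new_len, uint32_t new_diff,
				    uint32_t best_len, uint32_t best_diff)
{
	/* must be at least as long and at least as close to the hint */
	return new_len >= best_len && new_diff <= best_diff;
}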
+
+/*
+ * Complete an allocation of a candidate extent. Remove the extent from both
+ * trees and update the args structure.
*/
+STATIC int
+xfs_alloc_cur_finish(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ int error;
+
+ ASSERT(acur->cnt && acur->bnolt);
+ ASSERT(acur->bno >= acur->rec_bno);
+ ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
+ ASSERT(acur->rec_bno + acur->rec_len <=
+ be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+
+ error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
+ acur->rec_len, acur->bno, acur->len, 0);
+ if (error)
+ return error;
+
+ args->agbno = acur->bno;
+ args->len = acur->len;
+ args->wasfromfl = 0;
+
+ trace_xfs_alloc_cur(args);
+ return 0;
+}
+
+/*
+ * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
+ * bno optimized lookup to search for extents with ideal size and locality.
+ */
+STATIC int
+xfs_alloc_cntbt_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur)
+{
+ struct xfs_btree_cur *cur = acur->cnt;
+ xfs_agblock_t bno;
+ xfs_extlen_t len, cur_len;
+ int error;
+ int i;
+
+ if (!xfs_alloc_cur_active(cur))
+ return 0;
+
+ /* locality optimized lookup */
+ cur_len = acur->cur_len;
+ error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ return 0;
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (error)
+ return error;
+
+ /* check the current record and update search length from it */
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
+ if (error)
+ return error;
+ ASSERT(len >= acur->cur_len);
+ acur->cur_len = len;
+
+ /*
+ * We looked up the first record >= [agbno, len] above. The agbno is a
+ * secondary key and so the current record may lie just before or after
+ * agbno. If it is past agbno, check the previous record too so long as
+ * the length matches as it may be closer. Don't check a smaller record
+ * because that could deactivate our cursor.
+ */
+ if (bno > args->agbno) {
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (!error && i) {
+ error = xfs_alloc_get_rec(cur, &bno, &len, &i);
+ if (!error && i && len == acur->cur_len)
+ error = xfs_alloc_cur_check(args, acur, cur,
+ &i);
+ }
+ if (error)
+ return error;
+ }
+
+ /*
+ * Increment the search key by one if we have not yet found an allocation
+ * candidate or if the extent we just found is already at least as large as
+ * the doubled key. Otherwise, double the search key to optimize the search.
+ * Efficiency is more important here than absolute best locality.
+ */
+ cur_len <<= 1;
+ if (!acur->len || acur->cur_len >= cur_len)
+ acur->cur_len++;
+ else
+ acur->cur_len = cur_len;
+
+ return error;
+}
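A hedged sketch of the search-key update at the end of xfs_alloc_cntbt_iter() above, again using plain integer types in place of xfs_extlen_t: the key creeps up by one until a candidate exists (or the record found is already past the doubled key), and otherwise jumps by doubling.

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch only: cur_len is the key used for this lookup, found_len is the
 * length of the record the lookup landed on (found_len >= cur_len).
 */
static uint32_t next_cntbt_search_key(uint32_t cur_len, uint32_t found_len,
				      bool have_candidate)
{
	uint32_t doubled = cur_len << 1;

	if (!have_candidate || found_len >= doubled)
		return found_len + 1;	/* small step up from the found size */
	return doubled;			/* skip ahead to larger size classes */
}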
/*
* Deal with the case where only small freespaces remain. Either return the
@@ -727,7 +1045,10 @@ xfs_alloc_ag_vextent_small(
error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
goto out;
}
@@ -744,13 +1065,13 @@ xfs_alloc_ag_vextent_small(
goto out;
xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
+ (args->datatype & XFS_ALLOC_NOBUSY));
- if (xfs_alloc_is_userdata(args->datatype)) {
+ if (args->datatype & XFS_ALLOC_USERDATA) {
struct xfs_buf *bp;
bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
- if (!bp) {
+ if (XFS_IS_CORRUPT(args->mp, !bp)) {
error = -EFSCORRUPTED;
goto error;
}
@@ -758,9 +1079,12 @@ xfs_alloc_ag_vextent_small(
}
*fbnop = args->agbno = fbno;
*flenp = args->len = 1;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error);
+ if (XFS_IS_CORRUPT(args->mp,
+ fbno >= be32_to_cpu(
+ XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
args->wasfromfl = 1;
trace_xfs_alloc_small_freelist(args);
@@ -915,7 +1239,10 @@ xfs_alloc_ag_vextent_exact(
error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
ASSERT(fbno <= args->agbno);
/*
@@ -984,98 +1311,243 @@ error0:
}
/*
- * Search the btree in a given direction via the search cursor and compare
- * the records found against the good extent we've already found.
+ * Search a given number of btree records in a given direction. Check each
+ * record against the good extent we've already found.
*/
STATIC int
-xfs_alloc_find_best_extent(
- struct xfs_alloc_arg *args, /* allocation argument structure */
- struct xfs_btree_cur **gcur, /* good cursor */
- struct xfs_btree_cur **scur, /* searching cursor */
- xfs_agblock_t gdiff, /* difference for search comparison */
- xfs_agblock_t *sbno, /* extent found by search */
- xfs_extlen_t *slen, /* extent length */
- xfs_agblock_t *sbnoa, /* aligned extent found by search */
- xfs_extlen_t *slena, /* aligned extent length */
- int dir) /* 0 = search right, 1 = search left */
+xfs_alloc_walk_iter(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ struct xfs_btree_cur *cur,
+ bool increment,
+ bool find_one, /* quit on first candidate */
+ int count, /* rec count (-1 for infinite) */
+ int *stat)
{
- xfs_agblock_t new;
- xfs_agblock_t sdiff;
int error;
int i;
- unsigned busy_gen;
- /* The good extent is perfect, no need to search. */
- if (!gdiff)
- goto out_use_good;
+ *stat = 0;
/*
- * Look until we find a better one, run out of space or run off the end.
+ * Search so long as the cursor is active or we find a better extent.
+ * The cursor is deactivated if it extends beyond the range of the
+ * current allocation candidate.
*/
- do {
- error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+ while (xfs_alloc_cur_active(cur) && count) {
+ error = xfs_alloc_cur_check(args, acur, cur, &i);
if (error)
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- xfs_alloc_compute_aligned(args, *sbno, *slen,
- sbnoa, slena, &busy_gen);
+ return error;
+ if (i == 1) {
+ *stat = 1;
+ if (find_one)
+ break;
+ }
+ if (!xfs_alloc_cur_active(cur))
+ break;
+
+ if (increment)
+ error = xfs_btree_increment(cur, 0, &i);
+ else
+ error = xfs_btree_decrement(cur, 0, &i);
+ if (error)
+ return error;
+ if (i == 0)
+ cur->bc_private.a.priv.abt.active = false;
+
+ if (count > 0)
+ count--;
+ }
+
+ return 0;
+}
+
+/*
+ * Search the by-bno and by-size btrees in parallel in search of an extent with
+ * ideal locality based on the NEAR mode ->agbno locality hint.
+ */
+STATIC int
+xfs_alloc_ag_vextent_locality(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ int *stat)
+{
+ struct xfs_btree_cur *fbcur = NULL;
+ int error;
+ int i;
+ bool fbinc;
+
+ ASSERT(acur->len == 0);
+ ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
+
+ *stat = 0;
+
+ error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
+ if (error)
+ return error;
+ error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
+ if (error)
+ return error;
+
+ /*
+ * Search the bnobt and cntbt in parallel. Search the bnobt left and
+ * right and lookup the closest extent to the locality hint for each
+ * extent size key in the cntbt. The entire search terminates
+ * immediately on a bnobt hit because that means we've found best case
+ * locality. Otherwise the search continues until the cntbt cursor runs
+ * off the end of the tree. If no allocation candidate is found at this
+ * point, give up on locality, walk backwards from the end of the cntbt
+ * and take the first available extent.
+ *
+ * The parallel tree searches balance each other out to provide fairly
+ * consistent performance for various situations. The bnobt search can
+ * have pathological behavior in the worst case scenario of larger
+ * allocation requests and fragmented free space. On the other hand, the
+ * bnobt is able to satisfy most smaller allocation requests much more
+ * quickly than the cntbt. The cntbt search can sift through fragmented
+ * free space and sets of free extents for larger allocation requests
+ * more quickly than the bnobt. Since the locality hint is just a hint
+ * and we don't want to scan the entire bnobt for perfect locality, the
+ * cntbt search essentially bounds the bnobt search such that we can
+ * find good enough locality at reasonable performance in most cases.
+ */
+ while (xfs_alloc_cur_active(acur->bnolt) ||
+ xfs_alloc_cur_active(acur->bnogt) ||
+ xfs_alloc_cur_active(acur->cnt)) {
+
+ trace_xfs_alloc_cur_lookup(args);
/*
- * The good extent is closer than this one.
+ * Search the bnobt left and right. In the case of a hit, finish
+ * the search in the opposite direction and we're done.
*/
- if (!dir) {
- if (*sbnoa > args->max_agbno)
- goto out_use_good;
- if (*sbnoa >= args->agbno + gdiff)
- goto out_use_good;
- } else {
- if (*sbnoa < args->min_agbno)
- goto out_use_good;
- if (*sbnoa <= args->agbno - gdiff)
- goto out_use_good;
+ error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
+ true, 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_left(args);
+ fbcur = acur->bnogt;
+ fbinc = true;
+ break;
+ }
+ error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
+ 1, &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ trace_xfs_alloc_cur_right(args);
+ fbcur = acur->bnolt;
+ fbinc = false;
+ break;
}
/*
- * Same distance, compare length and pick the best.
+ * Check the extent with best locality based on the current
+ * extent size search key and keep track of the best candidate.
*/
- if (*slena >= args->minlen) {
- args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
- xfs_alloc_fix_len(args);
-
- sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment,
- args->datatype, *sbnoa,
- *slena, &new);
+ error = xfs_alloc_cntbt_iter(args, acur);
+ if (error)
+ return error;
+ if (!xfs_alloc_cur_active(acur->cnt)) {
+ trace_xfs_alloc_cur_lookup_done(args);
+ break;
+ }
+ }
- /*
- * Choose closer size and invalidate other cursor.
- */
- if (sdiff < gdiff)
- goto out_use_search;
- goto out_use_good;
+ /*
+ * If we failed to find anything due to busy extents, return empty
+ * handed so the caller can flush and retry. If no busy extents were
+ * found, walk backwards from the end of the cntbt as a last resort.
+ */
+ if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
+ error = xfs_btree_decrement(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ if (i) {
+ acur->cnt->bc_private.a.priv.abt.active = true;
+ fbcur = acur->cnt;
+ fbinc = false;
}
+ }
- if (!dir)
- error = xfs_btree_increment(*scur, 0, &i);
- else
- error = xfs_btree_decrement(*scur, 0, &i);
+ /*
+ * Search in the opposite direction for a better entry in the case of
+ * a bnobt hit or walk backwards from the end of the cntbt.
+ */
+ if (fbcur) {
+ error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
+ &i);
if (error)
- goto error0;
- } while (i);
+ return error;
+ }
-out_use_good:
- xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
- *scur = NULL;
- return 0;
+ if (acur->len)
+ *stat = 1;
-out_use_search:
- xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
- *gcur = NULL;
return 0;
+}
-error0:
- /* caller invalidates cursors */
- return error;
+/* Check the last block of the cnt btree for allocations. */
+static int
+xfs_alloc_ag_vextent_lastblock(
+ struct xfs_alloc_arg *args,
+ struct xfs_alloc_cur *acur,
+ xfs_agblock_t *bno,
+ xfs_extlen_t *len,
+ bool *allocated)
+{
+ int error;
+ int i;
+
+#ifdef DEBUG
+ /* Randomly don't execute the first algorithm. */
+ if (prandom_u32() & 1)
+ return 0;
+#endif
+
+ /*
+ * Start from the entry that lookup found, sequence through all larger
+ * free blocks. If we're actually pointing at a record smaller than
+ * maxlen, go to the start of this block, and skip all those smaller
+ * than minlen.
+ */
+ if (*len || args->alignment > 1) {
+ acur->cnt->bc_ptrs[0] = 1;
+ do {
+ error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(args->mp, i != 1))
+ return -EFSCORRUPTED;
+ if (*len >= args->minlen)
+ break;
+ error = xfs_btree_increment(acur->cnt, 0, &i);
+ if (error)
+ return error;
+ } while (i);
+ ASSERT(*len >= args->minlen);
+ if (!i)
+ return 0;
+ }
+
+ error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
+ if (error)
+ return error;
+
+ /*
+ * It didn't work. We COULD be in a case where there's a good record
+ * somewhere, so try again.
+ */
+ if (acur->len == 0)
+ return 0;
+
+ trace_xfs_alloc_near_first(args);
+ *allocated = true;
+ return 0;
}
/*
@@ -1084,41 +1556,17 @@ error0:
* and of the form k * prod + mod unless there's nothing that large.
* Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
*/
-STATIC int /* error */
+STATIC int
xfs_alloc_ag_vextent_near(
- xfs_alloc_arg_t *args) /* allocation argument structure */
+ struct xfs_alloc_arg *args)
{
- xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
- xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
- xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
- xfs_agblock_t gtbno; /* start bno of right side entry */
- xfs_agblock_t gtbnoa; /* aligned ... */
- xfs_extlen_t gtdiff; /* difference to right side entry */
- xfs_extlen_t gtlen; /* length of right side entry */
- xfs_extlen_t gtlena; /* aligned ... */
- xfs_agblock_t gtnew; /* useful start bno of right side */
- int error; /* error code */
- int i; /* result code, temporary */
- int j; /* result code, temporary */
- xfs_agblock_t ltbno; /* start bno of left side entry */
- xfs_agblock_t ltbnoa; /* aligned ... */
- xfs_extlen_t ltdiff; /* difference to left side entry */
- xfs_extlen_t ltlen; /* length of left side entry */
- xfs_extlen_t ltlena; /* aligned ... */
- xfs_agblock_t ltnew; /* useful start bno of left side */
- xfs_extlen_t rlen; /* length of returned extent */
- bool busy;
- unsigned busy_gen;
-#ifdef DEBUG
- /*
- * Randomly don't execute the first algorithm.
- */
- int dofirst; /* set to do first algorithm */
-
- dofirst = prandom_u32() & 1;
-#endif
+ struct xfs_alloc_cur acur = {};
+ int error; /* error code */
+ int i; /* result code, temporary */
+ xfs_agblock_t bno;
+ xfs_extlen_t len;
- /* handle unitialized agbno range so caller doesn't have to */
+ /* handle uninitialized agbno range so caller doesn't have to */
if (!args->min_agbno && !args->max_agbno)
args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
ASSERT(args->min_agbno <= args->max_agbno);
@@ -1130,40 +1578,27 @@ xfs_alloc_ag_vextent_near(
args->agbno = args->max_agbno;
restart:
- bno_cur_lt = NULL;
- bno_cur_gt = NULL;
- ltlen = 0;
- gtlena = 0;
- ltlena = 0;
- busy = false;
-
- /*
- * Get a cursor for the by-size btree.
- */
- cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT);
+ len = 0;
/*
- * See if there are any free extents as big as maxlen.
+ * Set up cursors and see if there are any free extents as big as
+ * maxlen. If not, pick the last entry in the tree unless the tree is
+ * empty.
*/
- if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
- goto error0;
- /*
- * If none, then pick up the last entry in the tree unless the
- * tree is empty.
- */
- if (!i) {
- if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- if (i == 0 || ltlen == 0) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ error = xfs_alloc_cur_setup(args, &acur);
+ if (error == -ENOSPC) {
+ error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
+ &len, &i);
+ if (error)
+ goto out;
+ if (i == 0 || len == 0) {
trace_xfs_alloc_near_noentry(args);
- return 0;
+ goto out;
}
ASSERT(i == 1);
+ } else if (error) {
+ goto out;
}
- args->wasfromfl = 0;
/*
* First algorithm.
@@ -1172,311 +1607,47 @@ restart:
* near the right edge of the tree. If it's in the last btree leaf
* block, then we just examine all the entries in that block
* that are big enough, and pick the best one.
- * This is written as a while loop so we can break out of it,
- * but we never loop back to the top.
*/
- while (xfs_btree_islastblock(cnt_cur, 0)) {
- xfs_extlen_t bdiff;
- int besti=0;
- xfs_extlen_t blen=0;
- xfs_agblock_t bnew=0;
-
-#ifdef DEBUG
- if (dofirst)
- break;
-#endif
- /*
- * Start from the entry that lookup found, sequence through
- * all larger free blocks. If we're actually pointing at a
- * record smaller than maxlen, go to the start of this block,
- * and skip all those smaller than minlen.
- */
- if (ltlen || args->alignment > 1) {
- cnt_cur->bc_ptrs[0] = 1;
- do {
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
- &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- if (ltlen >= args->minlen)
- break;
- if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
- goto error0;
- } while (i);
- ASSERT(ltlen >= args->minlen);
- if (!i)
- break;
- }
- i = cnt_cur->bc_ptrs[0];
- for (j = 1, blen = 0, bdiff = 0;
- !error && j && (blen < args->maxlen || bdiff > 0);
- error = xfs_btree_increment(cnt_cur, 0, &j)) {
- /*
- * For each entry, decide if it's better than
- * the previous best entry.
- */
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena < args->minlen)
- continue;
- if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
- continue;
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ASSERT(args->len >= args->minlen);
- if (args->len < blen)
- continue;
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
- if (ltnew != NULLAGBLOCK &&
- (args->len > blen || ltdiff < bdiff)) {
- bdiff = ltdiff;
- bnew = ltnew;
- blen = args->len;
- besti = cnt_cur->bc_ptrs[0];
- }
- }
- /*
- * It didn't work. We COULD be in a case where
- * there's a good record somewhere, so try again.
- */
- if (blen == 0)
- break;
- /*
- * Point at the best entry, and retrieve it again.
- */
- cnt_cur->bc_ptrs[0] = besti;
- if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- args->len = blen;
-
- /*
- * We are allocating starting at bnew for blen blocks.
- */
- args->agbno = bnew;
- ASSERT(bnew >= ltbno);
- ASSERT(bnew + blen <= ltbno + ltlen);
- /*
- * Set up a cursor for the by-bno tree.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO);
- /*
- * Fix up the btree entries.
- */
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
- ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
- goto error0;
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+ if (xfs_btree_islastblock(acur.cnt, 0)) {
+ bool allocated = false;
- trace_xfs_alloc_near_first(args);
- return 0;
- }
- /*
- * Second algorithm.
- * Search in the by-bno tree to the left and to the right
- * simultaneously, until in each case we find a space big enough,
- * or run into the edge of the tree. When we run into the edge,
- * we deallocate that cursor.
- * If both searches succeed, we compare the two spaces and pick
- * the better one.
- * With alignment, it's possible for both to fail; the upper
- * level algorithm that picks allocation groups for allocations
- * is not supposed to do this.
- */
- /*
- * Allocate and initialize the cursor for the leftward search.
- */
- bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO);
- /*
- * Lookup <= bno to find the leftward search's starting point.
- */
- if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
- goto error0;
- if (!i) {
- /*
- * Didn't find anything; use this cursor for the rightward
- * search.
- */
- bno_cur_gt = bno_cur_lt;
- bno_cur_lt = NULL;
- }
- /*
- * Found something. Duplicate the cursor for the rightward search.
- */
- else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
- goto error0;
- /*
- * Increment the cursor, so we will point at the entry just right
- * of the leftward entry if any, or to the leftmost entry.
- */
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i) {
- /*
- * It failed, there are no rightward entries.
- */
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
+ error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
+ &allocated);
+ if (error)
+ goto out;
+ if (allocated)
+ goto alloc_finish;
}
- /*
- * Loop going left with the leftward cursor, right with the
- * rightward cursor, until either both directions give up or
- * we find an entry at least as big as minlen.
- */
- do {
- if (bno_cur_lt) {
- if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
- &ltbnoa, &ltlena, &busy_gen);
- if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
- break;
- if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
- goto error0;
- if (!i || ltbnoa < args->min_agbno) {
- xfs_btree_del_cursor(bno_cur_lt,
- XFS_BTREE_NOERROR);
- bno_cur_lt = NULL;
- }
- }
- if (bno_cur_gt) {
- if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
- &gtbnoa, &gtlena, &busy_gen);
- if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
- break;
- if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
- goto error0;
- if (!i || gtbnoa > args->max_agbno) {
- xfs_btree_del_cursor(bno_cur_gt,
- XFS_BTREE_NOERROR);
- bno_cur_gt = NULL;
- }
- }
- } while (bno_cur_lt || bno_cur_gt);
/*
- * Got both cursors still active, need to find better entry.
+ * Second algorithm. Combined cntbt and bnobt search to find ideal
+ * locality.
*/
- if (bno_cur_lt && bno_cur_gt) {
- if (ltlena >= args->minlen) {
- /*
- * Left side is good, look for a right side entry.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, ltbnoa,
- ltlena, &ltnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_lt, &bno_cur_gt,
- ltdiff, &gtbno, &gtlen,
- &gtbnoa, &gtlena,
- 0 /* search right */);
- } else {
- ASSERT(gtlena >= args->minlen);
-
- /*
- * Right side is good, look for a left side entry.
- */
- args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
- xfs_alloc_fix_len(args);
- gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, args->datatype, gtbnoa,
- gtlena, &gtnew);
-
- error = xfs_alloc_find_best_extent(args,
- &bno_cur_gt, &bno_cur_lt,
- gtdiff, &ltbno, &ltlen,
- &ltbnoa, &ltlena,
- 1 /* search left */);
- }
-
- if (error)
- goto error0;
- }
+ error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
+ if (error)
+ goto out;
/*
* If we couldn't get anything, give up.
*/
- if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-
- if (busy) {
+ if (!acur.len) {
+ if (acur.busy) {
trace_xfs_alloc_near_busy(args);
- xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
+ xfs_extent_busy_flush(args->mp, args->pag,
+ acur.busy_gen);
goto restart;
}
trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
- return 0;
+ goto out;
}
- /*
- * At this point we have selected a freespace entry, either to the
- * left or to the right. If it's on the right, copy all the
- * useful variables to the "left" set so we only have one
- * copy of this code.
- */
- if (bno_cur_gt) {
- bno_cur_lt = bno_cur_gt;
- bno_cur_gt = NULL;
- ltbno = gtbno;
- ltbnoa = gtbnoa;
- ltlen = gtlen;
- ltlena = gtlena;
- j = 1;
- } else
- j = 0;
-
- /*
- * Fix up the length and compute the useful address.
- */
- args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
- xfs_alloc_fix_len(args);
- rlen = args->len;
- (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- args->datatype, ltbnoa, ltlena, &ltnew);
- ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltbnoa + ltlena);
- ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
- ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno);
- args->agbno = ltnew;
-
- if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
- ltnew, rlen, XFSA_FIXUP_BNO_OK)))
- goto error0;
-
- if (j)
- trace_xfs_alloc_near_greater(args);
- else
- trace_xfs_alloc_near_lesser(args);
-
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
- return 0;
+alloc_finish:
+ /* fix up btrees on a successful allocation */
+ error = xfs_alloc_cur_finish(args, &acur);
- error0:
- trace_xfs_alloc_near_error(args);
- if (cnt_cur != NULL)
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
- if (bno_cur_lt != NULL)
- xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
- if (bno_cur_gt != NULL)
- xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+out:
+ xfs_alloc_cur_close(&acur, error);
return error;
}
@@ -1545,7 +1716,10 @@ restart:
error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
@@ -1579,8 +1753,13 @@ restart:
* This can't happen in the second case above.
*/
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen), error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rlen < args->maxlen) {
xfs_agblock_t bestfbno;
xfs_extlen_t bestflen;
@@ -1599,15 +1778,22 @@ restart:
if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (flen < bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
- (rlen <= flen && rbno + rlen <= fbno + flen),
- error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ rlen != 0 &&
+ (rlen > flen ||
+ rbno + rlen > fbno + flen))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rlen > bestrlen) {
bestrlen = rlen;
bestrbno = rbno;
@@ -1620,7 +1806,10 @@ restart:
if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
rlen = bestrlen;
rbno = bestrbno;
flen = bestflen;
@@ -1643,7 +1832,10 @@ restart:
xfs_alloc_fix_len(args);
rlen = args->len;
- XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
+ if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Allocate and initialize a cursor for the by-block tree.
*/
@@ -1657,10 +1849,13 @@ restart:
cnt_cur = bno_cur = NULL;
args->len = rlen;
args->agbno = rbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
+ if (XFS_IS_CORRUPT(args->mp,
+ args->agbno + args->len >
+ be32_to_cpu(
+ XFS_BUF_TO_AGF(args->agbp)->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
trace_xfs_alloc_size_done(args);
return 0;
@@ -1732,7 +1927,10 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* It's not contiguous, though.
*/
@@ -1744,8 +1942,10 @@ xfs_free_ag_extent(
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltbno + ltlen <= bno, error0);
+ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
/*
@@ -1760,7 +1960,10 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* It's not contiguous, though.
*/
@@ -1772,7 +1975,10 @@ xfs_free_ag_extent(
* space was invalid, it's (partly) already free.
* Very bad.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
+ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
/*
@@ -1789,31 +1995,49 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Delete the old by-size entry on the right.
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Delete the old by-block entry for the right block.
*/
if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Move the by-block cursor back to the left neighbor.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
#ifdef DEBUG
/*
* Check that this is the right record: delete didn't
@@ -1826,9 +2050,13 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
&i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp,
- i == 1 && xxbno == ltbno && xxlen == ltlen,
- error0);
+ if (XFS_IS_CORRUPT(mp,
+ i != 1 ||
+ xxbno != ltbno ||
+ xxlen != ltlen)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
#endif
/*
@@ -1849,17 +2077,26 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Back up the by-block cursor to the left neighbor, and
* update its length.
*/
if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
nbno = ltbno;
nlen = len + ltlen;
if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1875,10 +2112,16 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Update the starting block and length of the right
* neighbor in the by-block tree.
@@ -1897,7 +2140,10 @@ xfs_free_ag_extent(
nlen = len;
if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
bno_cur = NULL;
@@ -1906,10 +2152,16 @@ xfs_free_ag_extent(
*/
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
cnt_cur = NULL;
@@ -1989,7 +2241,8 @@ xfs_alloc_longest_free_extent(
* reservations and AGFL rules in place, we can return this extent.
*/
if (pag->pagf_longest > delta)
- return pag->pagf_longest - delta;
+ return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
+ pag->pagf_longest - delta);
/* Otherwise, let the caller try for 1 block if there's space. */
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
@@ -2087,7 +2340,7 @@ xfs_free_agfl_block(
return error;
bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
- if (!bp)
+ if (XFS_IS_CORRUPT(tp->t_mountp, !bp))
return -EFSCORRUPTED;
xfs_trans_binval(tp, bp);
@@ -2253,7 +2506,7 @@ xfs_alloc_fix_freelist(
* somewhere else if we are not being asked to try harder at this
* point
*/
- if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
+ if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
(flags & XFS_ALLOC_FLAG_TRYLOCK)) {
ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
goto out_agbp_relse;
@@ -2956,13 +3209,6 @@ xfs_alloc_vextent(
args->len);
#endif
- /* Zero the extent if we were asked to do so */
- if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
- error = xfs_zero_extent(args->ip, args->fsbno, args->len);
- if (error)
- goto error0;
- }
-
}
xfs_perag_put(args->pag);
return 0;
@@ -3038,12 +3284,18 @@ __xfs_free_extent(
if (error)
return error;
- XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
+ if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+ error = -EFSCORRUPTED;
+ goto err;
+ }
/* validate the extent size is legal now we have the agf locked */
- XFS_WANT_CORRUPTED_GOTO(mp,
- agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
- err);
+ if (XFS_IS_CORRUPT(mp,
+ agbno + len >
+ be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) {
+ error = -EFSCORRUPTED;
+ goto err;
+ }
error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
if (error)
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d6ed5d2c07c2..7380fbe4a3ff 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -54,7 +54,6 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
- struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -83,20 +82,7 @@ typedef struct xfs_alloc_arg {
*/
#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
-#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
-#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */
-
-static inline bool
-xfs_alloc_is_userdata(int datatype)
-{
- return (datatype & ~XFS_ALLOC_NOBUSY) != 0;
-}
-
-static inline bool
-xfs_alloc_allow_busy_reuse(int datatype)
-{
- return (datatype & XFS_ALLOC_NOBUSY) == 0;
-}
+#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */
/* freespace limit calculations */
#define XFS_ALLOC_AGFL_RESERVE 4
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 2a94543857a1..279694d73e4e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -507,6 +507,7 @@ xfs_allocbt_init_cursor(
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
+ cur->bc_private.a.priv.abt.active = false;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 510ca6974604..0d7fcc983b3d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -589,7 +589,7 @@ xfs_attr_leaf_addname(
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -715,7 +715,7 @@ xfs_attr_leaf_addname(
* remove the "old" attr from that block (neat, huh!)
*/
error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- -1, &bp);
+ &bp);
if (error)
return error;
@@ -769,7 +769,7 @@ xfs_attr_leaf_removename(
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -813,7 +813,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
trace_xfs_attr_leaf_get(args);
args->blkno = 0;
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -1173,7 +1173,7 @@ xfs_attr_node_removename(
ASSERT(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
if (error)
goto out;
@@ -1266,10 +1266,9 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da3_node_read(state->args->trans,
- state->args->dp,
- blk->blkno, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
if (error)
return error;
} else {
@@ -1285,10 +1284,9 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da3_node_read(state->args->trans,
- state->args->dp,
- blk->blkno, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
if (error)
return error;
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index f0089e862216..08d4b10ae2d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -233,6 +233,61 @@ xfs_attr3_leaf_hdr_to_disk(
}
static xfs_failaddr_t
+xfs_attr3_leaf_verify_entry(
+ struct xfs_mount *mp,
+ char *buf_end,
+ struct xfs_attr_leafblock *leaf,
+ struct xfs_attr3_icleaf_hdr *leafhdr,
+ struct xfs_attr_leaf_entry *ent,
+ int idx,
+ __u32 *last_hashval)
+{
+ struct xfs_attr_leaf_name_local *lentry;
+ struct xfs_attr_leaf_name_remote *rentry;
+ char *name_end;
+ unsigned int nameidx;
+ unsigned int namesize;
+ __u32 hashval;
+
+ /* hash order check */
+ hashval = be32_to_cpu(ent->hashval);
+ if (hashval < *last_hashval)
+ return __this_address;
+ *last_hashval = hashval;
+
+ nameidx = be16_to_cpu(ent->nameidx);
+ if (nameidx < leafhdr->firstused || nameidx >= mp->m_attr_geo->blksize)
+ return __this_address;
+
+ /*
+ * Check the name information. The namelen fields are u8 so we can't
+ * possibly exceed the maximum name length of 255 bytes.
+ */
+ if (ent->flags & XFS_ATTR_LOCAL) {
+ lentry = xfs_attr3_leaf_name_local(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_local(lentry->namelen,
+ be16_to_cpu(lentry->valuelen));
+ name_end = (char *)lentry + namesize;
+ if (lentry->namelen == 0)
+ return __this_address;
+ } else {
+ rentry = xfs_attr3_leaf_name_remote(leaf, idx);
+ namesize = xfs_attr_leaf_entsize_remote(rentry->namelen);
+ name_end = (char *)rentry + namesize;
+ if (rentry->namelen == 0)
+ return __this_address;
+ if (!(ent->flags & XFS_ATTR_INCOMPLETE) &&
+ rentry->valueblk == 0)
+ return __this_address;
+ }
+
+ if (name_end > buf_end)
+ return __this_address;
+
+ return NULL;
+}
+
+static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
{
@@ -240,7 +295,10 @@ xfs_attr3_leaf_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *entries;
+ struct xfs_attr_leaf_entry *ent;
+ char *buf_end;
uint32_t end; /* must be 32bit - see below */
+ __u32 last_hashval = 0;
int i;
xfs_failaddr_t fa;
@@ -273,8 +331,13 @@ xfs_attr3_leaf_verify(
(char *)bp->b_addr + ichdr.firstused)
return __this_address;
- /* XXX: need to range check rest of attr header values */
- /* XXX: hash order check? */
+ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize;
+ for (i = 0, ent = entries; i < ichdr.count; ent++, i++) {
+ fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr,
+ ent, i, &last_hashval);
+ if (fa)
+ return fa;
+ }
/*
* Quickly check the freemap information. Attribute data has to be
@@ -367,13 +430,12 @@ xfs_attr3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+ err = xfs_da_read_buf(tp, dp, bno, 0, bpp, XFS_ATTR_FORK,
+ &xfs_attr3_leaf_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
@@ -453,13 +515,15 @@ xfs_attr_copy_value(
* special case for dev/uuid inodes, they have fixed size data forks.
*/
int
-xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+xfs_attr_shortform_bytesfit(
+ struct xfs_inode *dp,
+ int bytes)
{
- int offset;
- int minforkoff; /* lower limit on valid forkoff locations */
- int maxforkoff; /* upper limit on valid forkoff locations */
- int dsize;
- xfs_mount_t *mp = dp->i_mount;
+ struct xfs_mount *mp = dp->i_mount;
+ int64_t dsize;
+ int minforkoff;
+ int maxforkoff;
+ int offset;
/* rounded down */
offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
@@ -525,7 +589,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
* A data fork btree root must have space for at least
* MINDBTPTRS key/ptr pairs if the data fork is small or empty.
*/
- minforkoff = max(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+ minforkoff = max_t(int64_t, dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
@@ -764,7 +828,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
}
/*
- * Retreive the attribute value and length.
+ * Retrieve the attribute value and length.
*
* If ATTR_KERNOVAL is specified, only the length needs to be returned.
* Unlike a lookup, we only return an error if the attribute does not
@@ -924,7 +988,7 @@ xfs_attr_shortform_verify(
char *endp;
struct xfs_ifork *ifp;
int i;
- int size;
+ int64_t size;
ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL);
ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK);
@@ -1080,7 +1144,6 @@ xfs_attr3_leaf_to_node(
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr icleafhdr;
struct xfs_attr_leaf_entry *entries;
- struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr icnodehdr;
struct xfs_da_intnode *node;
struct xfs_inode *dp = args->dp;
@@ -1095,11 +1158,11 @@ xfs_attr3_leaf_to_node(
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, dp, 0, &bp1);
if (error)
goto out;
- error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK);
+ error = xfs_da_get_buf(args->trans, dp, blkno, &bp2, XFS_ATTR_FORK);
if (error)
goto out;
@@ -1120,18 +1183,17 @@ xfs_attr3_leaf_to_node(
if (error)
goto out;
node = bp1->b_addr;
- dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(mp, &icnodehdr, node);
leaf = bp2->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
/* both on-disk, don't endian-flip twice */
- btree[0].hashval = entries[icleafhdr.count - 1].hashval;
- btree[0].before = cpu_to_be32(blkno);
+ icnodehdr.btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+ icnodehdr.btree[0].before = cpu_to_be32(blkno);
icnodehdr.count = 1;
- dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &icnodehdr);
xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
@@ -1161,7 +1223,7 @@ xfs_attr3_leaf_create(
trace_xfs_attr_leaf_create(args);
- error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+ error = xfs_da_get_buf(args->trans, args->dp, blkno, &bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -1447,7 +1509,9 @@ xfs_attr3_leaf_add_work(
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
if (ichdr->freemap[i].base == tmp) {
ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
- ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+ ichdr->freemap[i].size -=
+ min_t(uint16_t, ichdr->freemap[i].size,
+ sizeof(xfs_attr_leaf_entry_t));
}
}
ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
@@ -1931,7 +1995,7 @@ xfs_attr3_leaf_toosmall(
if (blkno == 0)
continue;
error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
- blkno, -1, &bp);
+ blkno, &bp);
if (error)
return error;
@@ -2281,8 +2345,10 @@ xfs_attr3_leaf_lookup_int(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- if (ichdr.count >= args->geo->blksize / 8)
+ if (ichdr.count >= args->geo->blksize / 8) {
+ xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
+ }
/*
* Binary search. (note: small blocks will skip this loop)
@@ -2298,10 +2364,14 @@ xfs_attr3_leaf_lookup_int(
else
break;
}
- if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count)))
+ if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
+ xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
- if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval))
+ }
+ if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
+ xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
+ }
/*
* Since we may have duplicate hashval's, find the first matching
@@ -2661,7 +2731,7 @@ xfs_attr3_leaf_clearflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -2728,7 +2798,7 @@ xfs_attr3_leaf_setflag(
/*
* Set up the operation.
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp);
if (error)
return error;
@@ -2790,7 +2860,7 @@ xfs_attr3_leaf_flipflags(
/*
* Read the block containing the "old" attr
*/
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, &bp1);
if (error)
return error;
@@ -2799,7 +2869,7 @@ xfs_attr3_leaf_flipflags(
*/
if (args->blkno2 != args->blkno) {
error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
- -1, &bp2);
+ &bp2);
if (error)
return error;
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 7b74e18becff..f4a188e28b7b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -17,6 +17,29 @@ struct xfs_inode;
struct xfs_trans;
/*
+ * Incore version of the attribute leaf header.
+ */
+struct xfs_attr3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t usedbytes;
+ /*
+ * Firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ uint32_t firstused;
+ __u8 holes;
+ struct {
+ uint16_t base;
+ uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
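A short, hypothetical illustration of the overflow case the firstused comment above describes, using plain C types; the sentinel value and helper name are assumptions for the sketch, not the kernel's actual conversion routines.

#include <assert.h>
#include <stdint.h>

/*
 * Sketch only: with a 64k attr block, an empty leaf has firstused equal to
 * the block size (65536), which cannot be represented in the on-disk u16,
 * so the incore field is widened and the edge case handled at conversion.
 */
static uint32_t firstused_from_disk(uint16_t disk_firstused, uint32_t blksize)
{
	if (disk_firstused == 0)	/* assumed sentinel for "empty leaf" */
		return blksize;
	return disk_firstused;
}

int main(void)
{
	assert(firstused_from_disk(0, 65536) == 65536);
	assert(firstused_from_disk(64, 4096) == 64);
	return 0;
}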
+
+/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
typedef struct xfs_attr_inactive_list {
@@ -67,8 +90,8 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer,
struct xfs_da_args *args);
-void xfs_attr3_leaf_list_int(struct xfs_buf *bp,
- struct xfs_attr_list_context *context);
+int xfs_attr3_leaf_list_int(struct xfs_buf *bp,
+ struct xfs_attr_list_context *context);
/*
* Routines used for shrinking the Btree.
@@ -85,8 +108,7 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp);
+ xfs_dablk_t bno, struct xfs_buf **bpp);
void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
struct xfs_attr3_icleaf_hdr *to,
struct xfs_attr_leafblock *from);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 3e39b7d40f25..a6ef5df42669 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -19,6 +19,7 @@
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
#include "xfs_trace.h"
#include "xfs_error.h"
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 7071ff98fdbc..40ce5f3094d1 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include "xfs_log_format.h"
+#include "xfs_bit.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 02469d59c787..4acc6e37c31d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,6 +34,7 @@
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
+#include "xfs_iomap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -383,8 +384,10 @@ xfs_bmap_check_leaf_extents(
xfs_check_block(block, mp, 0, 0);
pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(mp,
- xfs_verify_fsbno(mp, bno), error0);
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (bp_release) {
bp_release = 0;
xfs_trans_brelse(NULL, bp);
@@ -611,8 +614,8 @@ xfs_bmap_btree_to_extents(
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
#ifdef DEBUG
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
- xfs_btree_check_lptr(cur, cbno, 1));
+ if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+ return -EFSCORRUPTED;
#endif
error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
@@ -728,7 +731,7 @@ xfs_bmap_extents_to_btree(
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
- if (!abp) {
+ if (XFS_IS_CORRUPT(mp, !abp)) {
error = -EFSCORRUPTED;
goto out_unreserve_dquot;
}
@@ -936,7 +939,10 @@ xfs_bmap_add_attrfork_btree(
if (error)
goto error0;
/* must be at least one entry */
- XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
+ if (XFS_IS_CORRUPT(mp, stat != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
@@ -1083,7 +1089,7 @@ xfs_bmap_add_attrfork(
goto trans_cancel;
if (XFS_IFORK_Q(ip))
goto trans_cancel;
- if (ip->i_d.di_anextents != 0) {
+ if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) {
error = -EFSCORRUPTED;
goto trans_cancel;
}
@@ -1154,6 +1160,65 @@ trans_cancel:
* Internal and external extent tree search functions.
*/
+struct xfs_iread_state {
+ struct xfs_iext_cursor icur;
+ xfs_extnum_t loaded;
+};
+
+/* Stuff every bmbt record from this block into the incore extent map. */
+static int
+xfs_iread_bmbt_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xfs_iread_state *ir = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_rec *frp;
+ xfs_extnum_t num_recs;
+ xfs_extnum_t j;
+ int whichfork = cur->bc_private.b.whichfork;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+
+ /* Abort if we find more records than nextents. */
+ num_recs = xfs_btree_get_numrecs(block);
+ if (unlikely(ir->loaded + num_recs >
+ XFS_IFORK_NEXTENTS(ip, whichfork))) {
+ xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).",
+ (unsigned long long)ip->i_ino);
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
+ sizeof(*block), __this_address);
+ return -EFSCORRUPTED;
+ }
+
+ /* Copy records into the incore cache. */
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
+ for (j = 0; j < num_recs; j++, frp++, ir->loaded++) {
+ struct xfs_bmbt_irec new;
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(frp, &new);
+ fa = xfs_bmap_validate_extent(ip, whichfork, &new);
+ if (fa) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED,
+ "xfs_iread_extents(2)", frp,
+ sizeof(*frp), fa);
+ return -EFSCORRUPTED;
+ }
+ xfs_iext_insert(ip, &ir->icur, &new,
+ xfs_bmap_fork_to_state(whichfork));
+ trace_xfs_read_extent(ip, &ir->icur,
+ xfs_bmap_fork_to_state(whichfork), _THIS_IP_);
+ xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur);
+ }
+
+ return 0;
+}
+
/*
* Read in extents from a btree-format inode.
*/
@@ -1163,134 +1228,39 @@ xfs_iread_extents(
struct xfs_inode *ip,
int whichfork)
{
- struct xfs_mount *mp = ip->i_mount;
- int state = xfs_bmap_fork_to_state(whichfork);
+ struct xfs_iread_state ir;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_extnum_t nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
- struct xfs_btree_block *block = ifp->if_broot;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec new;
- xfs_fsblock_t bno;
- struct xfs_buf *bp;
- xfs_extnum_t i, j;
- int level;
- __be64 *pp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_cur *cur;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
-
- /*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
- */
- level = be16_to_cpu(block->bb_level);
- if (unlikely(level == 0)) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
-
- /*
- * Go down the tree until leaf level is reached, following the first
- * pointer (leftmost) at each level.
- */
- while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- goto out;
- block = XFS_BUF_TO_BLOCK(bp);
- if (level == 0)
- break;
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- XFS_WANT_CORRUPTED_GOTO(mp,
- xfs_verify_fsbno(mp, bno), out_brelse);
- xfs_trans_brelse(tp, bp);
+ if (XFS_IS_CORRUPT(mp,
+ XFS_IFORK_FORMAT(ip, whichfork) !=
+ XFS_DINODE_FMT_BTREE)) {
+ error = -EFSCORRUPTED;
+ goto out;
}
- /*
- * Here with bp and block set to the leftmost leaf node in the tree.
- */
- i = 0;
- xfs_iext_first(ifp, &icur);
-
- /*
- * Loop over all leaf nodes. Copy information to the extent records.
- */
- for (;;) {
- xfs_bmbt_rec_t *frp;
- xfs_fsblock_t nextbno;
- xfs_extnum_t num_recs;
-
- num_recs = xfs_btree_get_numrecs(block);
- if (unlikely(i + num_recs > nextents)) {
- xfs_warn(ip->i_mount,
- "corrupt dinode %Lu, (btree extents).",
- (unsigned long long) ip->i_ino);
- xfs_inode_verifier_error(ip, -EFSCORRUPTED,
- __func__, block, sizeof(*block),
- __this_address);
- error = -EFSCORRUPTED;
- goto out_brelse;
- }
- /*
- * Read-ahead the next leaf block, if any.
- */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1,
- &xfs_bmbt_buf_ops);
- /*
- * Copy records into the extent records.
- */
- frp = XFS_BMBT_REC_ADDR(mp, block, 1);
- for (j = 0; j < num_recs; j++, frp++, i++) {
- xfs_failaddr_t fa;
-
- xfs_bmbt_disk_get_all(frp, &new);
- fa = xfs_bmap_validate_extent(ip, whichfork, &new);
- if (fa) {
- error = -EFSCORRUPTED;
- xfs_inode_verifier_error(ip, error,
- "xfs_iread_extents(2)",
- frp, sizeof(*frp), fa);
- goto out_brelse;
- }
- xfs_iext_insert(ip, &icur, &new, state);
- trace_xfs_read_extent(ip, &icur, state, _THIS_IP_);
- xfs_iext_next(ifp, &icur);
- }
- xfs_trans_brelse(tp, bp);
- bno = nextbno;
- /*
- * If we've reached the end, stop.
- */
- if (bno == NULLFSBLOCK)
- break;
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
- if (error)
- goto out;
- block = XFS_BUF_TO_BLOCK(bp);
- }
+ ir.loaded = 0;
+ xfs_iext_first(ifp, &ir.icur);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_visit_blocks(cur, xfs_iread_bmbt_block,
+ XFS_BTREE_VISIT_RECORDS, &ir);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out;
- if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) {
+ if (XFS_IS_CORRUPT(mp,
+ ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) {
error = -EFSCORRUPTED;
goto out;
}
- ASSERT(i == xfs_iext_count(ifp));
+ ASSERT(ir.loaded == xfs_iext_count(ifp));
ifp->if_flags |= XFS_IFEXTENTS;
return 0;
-
-out_brelse:
- xfs_trans_brelse(tp, bp);
out:
xfs_iext_destroy(ifp);
return error;
@@ -1317,8 +1287,7 @@ xfs_bmap_first_unused(
xfs_fileoff_t lowest, max;
int error;
- ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+ ASSERT(xfs_ifork_has_extents(ip, whichfork) ||
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
@@ -1374,7 +1343,8 @@ xfs_bmap_last_before(
case XFS_DINODE_FMT_EXTENTS:
break;
default:
- return -EIO;
+ ASSERT(0);
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -1473,9 +1443,8 @@ xfs_bmap_last_offset(
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL)
return 0;
- if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return -EIO;
+ if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork)))
+ return -EFSCORRUPTED;
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -1652,15 +1621,24 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(bma->cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1686,7 +1664,10 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1716,7 +1697,10 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &PREV);
if (error)
goto done;
@@ -1741,11 +1725,17 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -1776,7 +1766,10 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &LEFT);
if (error)
goto done;
@@ -1797,11 +1790,17 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1842,7 +1841,10 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(bma->cur, &RIGHT);
if (error)
goto done;
@@ -1874,11 +1876,17 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -1954,11 +1962,17 @@ xfs_bmap_add_extent_delay_real(
error = xfs_bmbt_lookup_eq(bma->cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(bma->cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2152,19 +2166,34 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
@@ -2190,13 +2219,22 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &LEFT);
if (error)
goto done;
@@ -2225,13 +2263,22 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2254,7 +2301,10 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2284,7 +2334,10 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2318,14 +2371,20 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
cur->bc_rec.b = *new;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2352,7 +2411,10 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
@@ -2386,17 +2448,26 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &PREV);
if (error)
goto done;
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2430,7 +2501,10 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new right extent - oldext */
error = xfs_bmbt_update(cur, &r[1]);
if (error)
@@ -2439,7 +2513,10 @@ xfs_bmap_add_extent_unwritten_real(
cur->bc_rec.b = PREV;
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Reset the cursor to the position of the new extent
* we are about to insert as we can't trust it after
@@ -2448,11 +2525,17 @@ xfs_bmap_add_extent_unwritten_real(
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new middle extent - newext */
if ((error = xfs_btree_insert(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
@@ -2735,15 +2818,24 @@ xfs_bmap_add_extent_hole_real(
error = xfs_bmbt_lookup_eq(cur, &right, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
@@ -2769,7 +2861,10 @@ xfs_bmap_add_extent_hole_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &left);
if (error)
goto done;
@@ -2796,7 +2891,10 @@ xfs_bmap_add_extent_hole_real(
error = xfs_bmbt_lookup_eq(cur, &old, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_bmbt_update(cur, &right);
if (error)
goto done;
@@ -2819,11 +2917,17 @@ xfs_bmap_add_extent_hole_real(
error = xfs_bmbt_lookup_eq(cur, new, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
break;
}
@@ -3058,7 +3162,7 @@ xfs_bmap_adjacent(
mp = ap->ip->i_mount;
nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
rt = XFS_IS_REALTIME_INODE(ap->ip) &&
- xfs_alloc_is_userdata(ap->datatype);
+ (ap->datatype & XFS_ALLOC_USERDATA);
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
ap->tp->t_firstblock);
/*
@@ -3411,7 +3515,7 @@ xfs_bmap_btalloc(
if (ap->flags & XFS_BMAPI_COWFORK)
align = xfs_get_cowextsz_hint(ap->ip);
- else if (xfs_alloc_is_userdata(ap->datatype))
+ else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3426,7 +3530,7 @@ xfs_bmap_btalloc(
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp,
ap->tp->t_firstblock);
if (nullfb) {
- if (xfs_alloc_is_userdata(ap->datatype) &&
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip)) {
ag = xfs_filestream_lookup_ag(ap->ip);
ag = (ag != NULLAGNUMBER) ? ag : 0;
@@ -3466,7 +3570,7 @@ xfs_bmap_btalloc(
* enough for the request. If one isn't found, then adjust
* the minimum allocation size to the largest space found.
*/
- if (xfs_alloc_is_userdata(ap->datatype) &&
+ if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
else
@@ -3500,13 +3604,11 @@ xfs_bmap_btalloc(
args.mod = args.prod - args.mod;
}
/*
- * If we are not low on available data blocks, and the
- * underlying logical volume manager is a stripe, and
- * the file offset is zero then try to allocate data
- * blocks on stripe unit boundary.
- * NOTE: ap->aeof is only set if the allocation length
- * is >= the stripe unit and the allocation offset is
- * at the end of file.
+ * If we are not low on available data blocks, and the underlying
+ * logical volume manager is a stripe, and the file offset is zero then
+ * try to allocate data blocks on stripe unit boundary. NOTE: ap->aeof
+ * is only set if the allocation length is >= the stripe unit and the
+ * allocation offset is at the end of file.
*/
if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
if (!ap->offset) {
@@ -3514,9 +3616,11 @@ xfs_bmap_btalloc(
atype = args.type;
isaligned = 1;
/*
- * Adjust for alignment
+ * Adjust minlen to try and preserve alignment if we
+ * can't guarantee an aligned maxlen extent.
*/
- if (blen > args.alignment && blen <= args.maxlen)
+ if (blen > args.alignment &&
+ blen <= args.maxlen + args.alignment)
args.minlen = blen - args.alignment;
args.minalignslop = 0;
} else {
@@ -3554,8 +3658,6 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.resv = XFS_AG_RESV_NONE;
args.datatype = ap->datatype;
- if (ap->datatype & XFS_ALLOC_USERDATA_ZERO)
- args.ip = ap->ip;
error = xfs_alloc_vextent(&args);
if (error)
@@ -3640,20 +3742,6 @@ xfs_bmap_btalloc(
return 0;
}
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
- struct xfs_bmalloca *ap) /* bmap alloc argument struct */
-{
- if (XFS_IS_REALTIME_INODE(ap->ip) &&
- xfs_alloc_is_userdata(ap->datatype))
- return xfs_bmap_rtalloc(ap);
- return xfs_bmap_btalloc(ap);
-}
-
/* Trim extent to fit a logical block range. */
void
xfs_trim_extent(
@@ -3815,11 +3903,8 @@ xfs_bmapi_read(
XFS_BMAPI_COWFORK)));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -4010,6 +4095,39 @@ out_unreserve_quota:
}
static int
+xfs_bmap_alloc_userdata(
+ struct xfs_bmalloca *bma)
+{
+ struct xfs_mount *mp = bma->ip->i_mount;
+ int whichfork = xfs_bmapi_whichfork(bma->flags);
+ int error;
+
+ /*
+ * Set the data type being allocated. For the data fork, the first data
+ * in the file is treated differently to all other allocations. For the
+ * attribute fork, we only need to ensure the allocated range is not on
+ * the busy list.
+ */
+ bma->datatype = XFS_ALLOC_NOBUSY;
+ if (whichfork == XFS_DATA_FORK) {
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
+
+ if (mp->m_dalign && bma->length >= mp->m_dalign) {
+ error = xfs_bmap_isaeof(bma, whichfork);
+ if (error)
+ return error;
+ }
+
+ if (XFS_IS_REALTIME_INODE(bma->ip))
+ return xfs_bmap_rtalloc(bma);
+ }
+
+ return xfs_bmap_btalloc(bma);
+}
+
+static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
@@ -4028,7 +4146,8 @@ xfs_bmapi_allocate(
if (bma->wasdel) {
bma->length = (xfs_extlen_t)bma->got.br_blockcount;
bma->offset = bma->got.br_startoff;
- xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev);
+ if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
+ bma->prev.br_startoff = NULLFILEOFF;
} else {
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
if (!bma->eof)
@@ -4036,43 +4155,24 @@ xfs_bmapi_allocate(
bma->got.br_startoff - bma->offset);
}
- /*
- * Set the data type being allocated. For the data fork, the first data
- * in the file is treated differently to all other allocations. For the
- * attribute fork, we only need to ensure the allocated range is not on
- * the busy list.
- */
- if (!(bma->flags & XFS_BMAPI_METADATA)) {
- bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
- if (bma->offset == 0)
- bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
- else
- bma->datatype |= XFS_ALLOC_USERDATA;
- }
- if (bma->flags & XFS_BMAPI_ZERO)
- bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
- }
+ if (bma->flags & XFS_BMAPI_CONTIG)
+ bma->minlen = bma->length;
+ else
+ bma->minlen = 1;
- bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+ if (bma->flags & XFS_BMAPI_METADATA)
+ error = xfs_bmap_btalloc(bma);
+ else
+ error = xfs_bmap_alloc_userdata(bma);
+ if (error || bma->blkno == NULLFSBLOCK)
+ return error;
- /*
- * Only want to do the alignment at the eof if it is userdata and
- * allocation length is larger than a stripe unit.
- */
- if (mp->m_dalign && bma->length >= mp->m_dalign &&
- !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
- error = xfs_bmap_isaeof(bma, whichfork);
+ if (bma->flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
if (error)
return error;
}
- error = xfs_bmap_alloc(bma);
- if (error)
- return error;
-
- if (bma->blkno == NULLFSBLOCK)
- return 0;
if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur)
bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork);
/*
@@ -4312,11 +4412,8 @@ xfs_bmapi_write(
ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -4456,16 +4553,21 @@ int
xfs_bmapi_convert_delalloc(
struct xfs_inode *ip,
int whichfork,
- xfs_fileoff_t offset_fsb,
- struct xfs_bmbt_irec *imap,
+ xfs_off_t offset,
+ struct iomap *iomap,
unsigned int *seq)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
+ u16 flags = 0;
struct xfs_trans *tp;
int error;
+ if (whichfork == XFS_COW_FORK)
+ flags |= IOMAP_F_SHARED;
+
/*
* Space for the extent and indirect blocks was reserved when the
* delalloc extent was created so there's no need to do so here.
@@ -4495,7 +4597,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4505,7 +4607,6 @@ xfs_bmapi_convert_delalloc(
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
- bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
if (whichfork == XFS_COW_FORK)
bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
@@ -4528,7 +4629,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- *imap = bma.got;
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
@@ -4578,11 +4679,8 @@ xfs_bmapi_remap(
ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5013,7 +5111,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
}
if (got.br_startoff == del->br_startoff)
@@ -5037,7 +5138,10 @@ xfs_bmap_del_extent_real(
}
if ((error = xfs_btree_delete(cur, &i)))
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case BMAP_LEFT_FILLING:
/*
@@ -5108,7 +5212,10 @@ xfs_bmap_del_extent_real(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Update the btree record back
* to the original value.
@@ -5125,7 +5232,10 @@ xfs_bmap_del_extent_real(
error = -ENOSPC;
goto done;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
} else
flags |= xfs_ilog_fext(whichfork);
XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5192,7 +5302,7 @@ __xfs_bunmapi(
int isrt; /* freeing in rt area */
int logflags; /* transaction logging flags */
xfs_extlen_t mod; /* rt extent offset */
- struct xfs_mount *mp; /* mount structure */
+ struct xfs_mount *mp = ip->i_mount;
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
@@ -5209,14 +5319,8 @@ __xfs_bunmapi(
whichfork = xfs_bmapi_whichfork(flags);
ASSERT(whichfork != XFS_COW_FORK);
ifp = XFS_IFORK_PTR(ip, whichfork);
- if (unlikely(
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
- XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
- ip->i_mount);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)))
return -EFSCORRUPTED;
- }
- mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -5610,18 +5714,21 @@ xfs_bmse_merge(
error = xfs_bmbt_lookup_eq(cur, got, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_btree_delete(cur, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/* lookup and update size of the previous extent */
error = xfs_bmbt_lookup_eq(cur, left, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_bmbt_update(cur, &new);
if (error)
@@ -5669,7 +5776,8 @@ xfs_bmap_shift_update_extent(
error = xfs_bmbt_lookup_eq(cur, &prev, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_bmbt_update(cur, got);
if (error)
@@ -5705,11 +5813,8 @@ xfs_bmap_collapse_extents(
int error = 0;
int logflags = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5733,8 +5838,10 @@ xfs_bmap_collapse_extents(
*done = true;
goto del_cursor;
}
- XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
- del_cursor);
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
new_startoff = got.br_startoff - offset_shift_fsb;
if (xfs_iext_peek_prev_extent(ifp, &icur, &prev)) {
@@ -5823,11 +5930,8 @@ xfs_bmap_insert_extents(
int error = 0;
int logflags = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5860,11 +5964,14 @@ xfs_bmap_insert_extents(
goto del_cursor;
}
}
- XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
- del_cursor);
+ if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
- if (stop_fsb >= got.br_startoff + got.br_blockcount) {
- error = -EIO;
+ if (XFS_IS_CORRUPT(mp,
+ stop_fsb >= got.br_startoff + got.br_blockcount)) {
+ error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5929,12 +6036,8 @@ xfs_bmap_split_extent_at(
int logflags = 0;
int i = 0;
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
return -EFSCORRUPTED;
}
@@ -5968,7 +6071,10 @@ xfs_bmap_split_extent_at(
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
}
got.br_blockcount = gotblkcnt;
@@ -5993,11 +6099,17 @@ xfs_bmap_split_extent_at(
error = xfs_bmbt_lookup_eq(cur, &new, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
error = xfs_btree_insert(cur, &i);
if (error)
goto del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto del_cursor;
+ }
}
/*
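
For orientation, a minimal before/after sketch (not part of the patch) of the corruption-check conversion applied throughout xfs_bmap.c above: the XFS_WANT_CORRUPTED_GOTO()/XFS_WANT_CORRUPTED_RETURN() macros are open-coded as XFS_IS_CORRUPT() tests that assign -EFSCORRUPTED and branch explicitly, exactly as the hunks show.

	/* Before: the macro hides the error assignment and the jump. */
	error = xfs_bmbt_lookup_eq(cur, &got, &i);
	if (error)
		goto done;
	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);

	/* After: the corruption test, error code and jump are explicit. */
	error = xfs_bmbt_lookup_eq(cur, &got, &i);
	if (error)
		goto done;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto done;
	}
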
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e2798c6f3a5f..14d25e0b7d9c 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
- unsigned int *seq);
+ xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
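
A hypothetical caller sketch for the reworked prototype above (only xfs_bmapi_convert_delalloc, struct iomap and XFS_DATA_FORK come from the patch; the wrapper and its arguments are illustrative): the function now takes a byte offset and fills an iomap directly instead of taking a block offset and returning a raw xfs_bmbt_irec.

	/* Convert the delalloc extent backing a byte offset in one call. */
	static int
	my_convert_delalloc(
		struct xfs_inode	*ip,
		xfs_off_t		offset)
	{
		struct iomap		iomap;
		unsigned int		seq;

		return xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
				&iomap, &seq);
	}
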
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 71de937f9e64..e2cc98931552 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -105,11 +105,10 @@ xfs_btree_check_lblock(
xfs_failaddr_t fa;
fa = __xfs_btree_check_lblock(cur, block, level, bp);
- if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
- XFS_ERRTAG_BTREE_CHECK_LBLOCK))) {
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -169,11 +168,10 @@ xfs_btree_check_sblock(
xfs_failaddr_t fa;
fa = __xfs_btree_check_sblock(cur, block, level, bp);
- if (unlikely(XFS_TEST_ERROR(fa != NULL, mp,
- XFS_ERRTAG_BTREE_CHECK_SBLOCK))) {
+ if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -384,7 +382,7 @@ xfs_btree_del_cursor(
/*
* Free the cursor.
*/
- kmem_zone_free(xfs_btree_cur_zone, cur);
+ kmem_cache_free(xfs_btree_cur_zone, cur);
}
/*
@@ -717,25 +715,6 @@ xfs_btree_get_bufs(
}
/*
- * Check for the cursor referring to the last block at the given level.
- */
-int /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level) /* level to check */
-{
- struct xfs_btree_block *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
-
- block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
- return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
- else
- return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
-}
-
-/*
* Change the cursor to point to the first record at the given level.
* Other levels are unaffected.
*/
@@ -1820,6 +1799,7 @@ xfs_btree_lookup_get_block(
out_bad:
*blkp = NULL;
+ xfs_buf_corruption_error(bp);
xfs_trans_brelse(cur->bc_tp, bp);
return -EFSCORRUPTED;
}
@@ -1867,7 +1847,7 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (cur->bc_nlevels == 0)
+ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
return -EFSCORRUPTED;
block = NULL;
@@ -1987,7 +1967,8 @@ xfs_btree_lookup(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
*stat = 1;
return 0;
}
@@ -2442,7 +2423,10 @@ xfs_btree_lshift(
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
@@ -2609,7 +2593,10 @@ xfs_btree_rshift(
if (error)
goto error0;
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(tcur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_increment(tcur, level, &i);
if (error)
@@ -3463,7 +3450,10 @@ xfs_btree_insert(
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
level++;
/*
@@ -3867,15 +3857,24 @@ xfs_btree_delrec(
* Actually any entry but the first would suffice.
*/
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_increment(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/* Grab a pointer to the block. */
right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3919,12 +3918,18 @@ xfs_btree_delrec(
rrecs = xfs_btree_get_numrecs(right);
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
}
@@ -3938,13 +3943,19 @@ xfs_btree_delrec(
* previous block.
*/
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_btree_decrement(tcur, level, &i);
if (error)
goto error0;
i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/* Grab a pointer to the block. */
left = xfs_btree_get_block(tcur, level, &lbp);
@@ -4286,6 +4297,7 @@ int
xfs_btree_visit_blocks(
struct xfs_btree_cur *cur,
xfs_btree_visit_blocks_fn fn,
+ unsigned int flags,
void *data)
{
union xfs_btree_ptr lptr;
@@ -4311,6 +4323,11 @@ xfs_btree_visit_blocks(
/* save for the next iteration of the loop */
xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
+
+ if (!(flags & XFS_BTREE_VISIT_LEAVES))
+ continue;
+ } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
+ continue;
}
/* for each buffer in the level */
@@ -4413,7 +4430,7 @@ xfs_btree_change_owner(
bbcoi.buffer_list = buffer_list;
return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
- &bbcoi);
+ XFS_BTREE_VISIT_ALL, &bbcoi);
}
/* Verify the v5 fields of a long-format btree block. */
@@ -4865,7 +4882,7 @@ xfs_btree_count_blocks(
{
*blocks = 0;
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
- blocks);
+ XFS_BTREE_VISIT_ALL, blocks);
}
/* Compare two btree pointers. */
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index ced1e65d1483..fb9b2121c628 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -183,6 +183,9 @@ union xfs_btree_cur_private {
unsigned long nr_ops; /* # record updates */
int shape_changes; /* # of extent splits */
} refc;
+ struct {
+ bool active; /* allocation cursor state */
+ } abt;
};
/*
@@ -315,14 +318,6 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno); /* allocation group block number */
/*
- * Check for the cursor referring to the last block at the given level.
- */
-int /* 1=is last block, 0=not last block */
-xfs_btree_islastblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to check */
-
-/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
@@ -482,8 +477,15 @@ int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
void *data);
+/* Visit record blocks. */
+#define XFS_BTREE_VISIT_RECORDS (1 << 0)
+/* Visit leaf blocks. */
+#define XFS_BTREE_VISIT_LEAVES (1 << 1)
+/* Visit all blocks. */
+#define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \
+ XFS_BTREE_VISIT_LEAVES)
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
- xfs_btree_visit_blocks_fn fn, void *data);
+ xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks);
@@ -514,4 +516,21 @@ int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
+/* Does this cursor point to the last block in the given level? */
+static inline bool
+xfs_btree_islastblock(
+ xfs_btree_cur_t *cur,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+
+ block = xfs_btree_get_block(cur, level, &bp);
+ ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
+ return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
+}
+
#endif /* __XFS_BTREE_H__ */
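
A brief usage sketch for the new xfs_btree_visit_blocks() flags argument (struct my_ctx, my_visit_block and my_walk are illustrative; the typedef, the function and the XFS_BTREE_VISIT_* flags come from the patch): callers now choose between walking only the record-bearing blocks, as the reworked xfs_iread_extents does, or every block, as xfs_btree_count_blocks and xfs_btree_change_owner do.

	struct my_ctx {
		unsigned long		seen;	/* illustrative counter */
	};

	/* Illustrative callback matching the xfs_btree_visit_blocks_fn typedef. */
	static int
	my_visit_block(
		struct xfs_btree_cur	*cur,
		int			level,
		void			*data)
	{
		struct my_ctx		*ctx = data;

		ctx->seen++;	/* e.g. inspect xfs_btree_get_block() here */
		return 0;
	}

	static int
	my_walk(
		struct xfs_btree_cur	*cur)
	{
		struct my_ctx		ctx = { 0 };
		int			error;

		/* Record blocks only, as the reworked xfs_iread_extents requests. */
		error = xfs_btree_visit_blocks(cur, my_visit_block,
				XFS_BTREE_VISIT_RECORDS, &ctx);
		if (error)
			return error;

		/* Every block in the tree, as xfs_btree_count_blocks requests. */
		return xfs_btree_visit_blocks(cur, my_visit_block,
				XFS_BTREE_VISIT_ALL, &ctx);
	}
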
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 4fd1223c1bd5..8c3eafe280ed 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -12,9 +12,9 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_attr_leaf.h"
@@ -107,7 +107,66 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef DEBUG
memset((char *)state, 0, sizeof(*state));
#endif /* DEBUG */
- kmem_zone_free(xfs_da_state_zone, state);
+ kmem_cache_free(xfs_da_state_zone, state);
+}
+
+static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork)
+{
+ if (whichfork == XFS_DATA_FORK)
+ return mp->m_dir_geo->fsbcount;
+ return mp->m_attr_geo->fsbcount;
+}
+
+void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.__count);
+ to->level = be16_to_cpu(from3->hdr.__level);
+ to->btree = from3->__btree;
+ ASSERT(to->magic == XFS_DA3_NODE_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+ to->btree = from->__btree;
+ ASSERT(to->magic == XFS_DA_NODE_MAGIC);
+ }
+}
+
+void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.__count = cpu_to_be16(from->count);
+ to3->hdr.__level = cpu_to_be16(from->level);
+ } else {
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+ }
}
/*
@@ -145,12 +204,9 @@ xfs_da3_node_verify(
struct xfs_mount *mp = bp->b_mount;
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
- const struct xfs_dir_ops *ops;
xfs_failaddr_t fa;
- ops = xfs_dir_get_ops(mp, NULL);
-
- ops->node_hdr_from_disk(&ichdr, hdr);
+ xfs_da3_node_hdr_from_disk(mp, &ichdr, hdr);
fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
if (fa)
@@ -275,46 +331,76 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.verify_struct = xfs_da3_node_verify_struct,
};
+static int
+xfs_da3_node_set_type(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
+ return 0;
+ case XFS_ATTR_LEAF_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_ATTR_LEAF_BUF);
+ return 0;
+ case XFS_DIR2_LEAFN_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+ return 0;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
+ info, sizeof(*info));
+ xfs_trans_brelse(tp, bp);
+ return -EFSCORRUPTED;
+ }
+}
+
int
xfs_da3_node_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
+ struct xfs_buf **bpp,
+ int whichfork)
+{
+ int error;
+
+ error = xfs_da_read_buf(tp, dp, bno, 0, bpp, whichfork,
+ &xfs_da3_node_buf_ops);
+ if (error || !*bpp || !tp)
+ return error;
+ return xfs_da3_node_set_type(tp, *bpp);
+}
+
+int
+xfs_da3_node_read_mapped(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp,
- int which_fork)
+ int whichfork)
{
- int err;
+ struct xfs_mount *mp = dp->i_mount;
+ int error;
- err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
- which_fork, &xfs_da3_node_buf_ops);
- if (!err && tp && *bpp) {
- struct xfs_da_blkinfo *info = (*bpp)->b_addr;
- int type;
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
+ XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
+ bpp, &xfs_da3_node_buf_ops);
+ if (error || !*bpp)
+ return error;
- switch (be16_to_cpu(info->magic)) {
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- type = XFS_BLFT_DA_NODE_BUF;
- break;
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- type = XFS_BLFT_ATTR_LEAF_BUF;
- break;
- case XFS_DIR2_LEAFN_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- type = XFS_BLFT_DIR_LEAFN_BUF;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- tp->t_mountp, info, sizeof(*info));
- xfs_trans_brelse(tp, *bpp);
- *bpp = NULL;
- return -EFSCORRUPTED;
- }
- xfs_trans_buf_set_type(tp, *bpp, type);
- }
- return err;
+ if (whichfork == XFS_ATTR_FORK)
+ xfs_buf_set_ref(*bpp, XFS_ATTR_BTREE_REF);
+ else
+ xfs_buf_set_ref(*bpp, XFS_DIR_BTREE_REF);
+
+ if (!tp)
+ return 0;
+ return xfs_da3_node_set_type(tp, *bpp);
}
/*========================================================================
@@ -343,7 +429,7 @@ xfs_da3_node_create(
trace_xfs_da_node_create(args);
ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
- error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, whichfork);
if (error)
return error;
bp->b_ops = &xfs_da3_node_buf_ops;
@@ -363,9 +449,9 @@ xfs_da3_node_create(
}
ichdr.level = level;
- dp->d_ops->node_hdr_to_disk(node, &ichdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &ichdr);
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr, args->geo->node_hdr_size));
*bpp = bp;
return 0;
@@ -504,6 +590,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ xfs_buf_corruption_error(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -516,6 +603,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ xfs_buf_corruption_error(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -568,7 +656,7 @@ xfs_da3_root_split(
dp = args->dp;
tp = args->trans;
- error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, &bp, args->whichfork);
if (error)
return error;
node = bp->b_addr;
@@ -577,8 +665,8 @@ xfs_da3_root_split(
oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
struct xfs_da3_icnode_hdr icnodehdr;
- dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
- btree = dp->d_ops->node_tree_p(oldroot);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &icnodehdr, oldroot);
+ btree = icnodehdr.btree;
size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
level = icnodehdr.level;
@@ -589,15 +677,14 @@ xfs_da3_root_split(
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
leaf = (xfs_dir2_leaf_t *)oldroot;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
- size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
+ size = (int)((char *)&leafhdr.ents[leafhdr.count] -
+ (char *)leaf);
level = 0;
/*
@@ -637,14 +724,14 @@ xfs_da3_root_split(
return error;
node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
btree[0].hashval = cpu_to_be32(blk1->hashval);
btree[0].before = cpu_to_be32(blk1->blkno);
btree[1].hashval = cpu_to_be32(blk2->hashval);
btree[1].before = cpu_to_be32(blk2->blkno);
nodehdr.count = 2;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
#ifdef DEBUG
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
@@ -686,7 +773,7 @@ xfs_da3_node_split(
trace_xfs_da_node_split(state->args);
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
/*
* With V2 dirs the extra block is data or freespace.
@@ -733,7 +820,7 @@ xfs_da3_node_split(
* If we had double-split op below us, then add the extra block too.
*/
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
if (oldblk->index <= nodehdr.count) {
oldblk->index++;
xfs_da3_node_add(state, oldblk, addblk);
@@ -788,10 +875,10 @@ xfs_da3_node_rebalance(
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
/*
* Figure out how many entries need to move, and in which direction.
@@ -804,10 +891,10 @@ xfs_da3_node_rebalance(
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
swap = 1;
}
@@ -869,14 +956,15 @@ xfs_da3_node_rebalance(
/*
* Log header of node 1 and all current bits of node 2.
*/
- dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node1, &nodehdr1);
xfs_trans_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node1, &node1->hdr,
+ state->args->geo->node_hdr_size));
- dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node2, &nodehdr2);
xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- dp->d_ops->node_hdr_size +
+ state->args->geo->node_hdr_size +
(sizeof(btree2[0]) * nodehdr2.count)));
/*
@@ -886,10 +974,10 @@ xfs_da3_node_rebalance(
if (swap) {
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
- dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr1, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr2, node2);
+ btree1 = nodehdr1.btree;
+ btree2 = nodehdr2.btree;
}
blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
@@ -921,8 +1009,8 @@ xfs_da3_node_add(
trace_xfs_da_node_add(state->args);
node = oldblk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
@@ -945,9 +1033,10 @@ xfs_da3_node_add(
tmp + sizeof(*btree)));
nodehdr.count += 1;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr,
+ state->args->geo->node_hdr_size));
/*
* Copy the last hash value from the oldblk to propagate upwards.
@@ -1082,7 +1171,6 @@ xfs_da3_root_join(
xfs_dablk_t child;
struct xfs_buf *bp;
struct xfs_da3_icnode_hdr oldroothdr;
- struct xfs_da_node_entry *btree;
int error;
struct xfs_inode *dp = state->args->dp;
@@ -1092,7 +1180,7 @@ xfs_da3_root_join(
args = state->args;
oldroot = root_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &oldroothdr, oldroot);
ASSERT(oldroothdr.forw == 0);
ASSERT(oldroothdr.back == 0);
@@ -1106,11 +1194,9 @@ xfs_da3_root_join(
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- btree = dp->d_ops->node_tree_p(oldroot);
- child = be32_to_cpu(btree[0].before);
+ child = be32_to_cpu(oldroothdr.btree[0].before);
ASSERT(child != 0);
- error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
- args->whichfork);
+ error = xfs_da3_node_read(args->trans, dp, child, &bp, args->whichfork);
if (error)
return error;
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
@@ -1172,7 +1258,7 @@ xfs_da3_node_toosmall(
blk = &state->path.blk[ state->path.active-1 ];
info = blk->bp->b_addr;
node = (xfs_da_intnode_t *)info;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return 0; /* blk over 50%, don't try to join */
@@ -1224,13 +1310,13 @@ xfs_da3_node_toosmall(
blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da3_node_read(state->args->trans, dp,
- blkno, -1, &bp, state->args->whichfork);
+ error = xfs_da3_node_read(state->args->trans, dp, blkno, &bp,
+ state->args->whichfork);
if (error)
return error;
node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&thdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &thdr, node);
xfs_trans_brelse(state->args->trans, bp);
if (count - thdr.count >= 0)
@@ -1272,18 +1358,14 @@ xfs_da3_node_lasthash(
struct xfs_buf *bp,
int *count)
{
- struct xfs_da_intnode *node;
- struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
- node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, bp->b_addr);
if (count)
*count = nodehdr.count;
if (!nodehdr.count)
return 0;
- btree = dp->d_ops->node_tree_p(node);
- return be32_to_cpu(btree[nodehdr.count - 1].hashval);
+ return be32_to_cpu(nodehdr.btree[nodehdr.count - 1].hashval);
}
/*
@@ -1328,8 +1410,8 @@ xfs_da3_fixhashpath(
struct xfs_da3_icnode_hdr nodehdr;
node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
break;
blk->hashval = lasthash;
@@ -1360,7 +1442,7 @@ xfs_da3_node_remove(
trace_xfs_da_node_remove(state->args);
node = drop_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
@@ -1368,7 +1450,7 @@ xfs_da3_node_remove(
* Copy over the offending entry, or just zero it out.
*/
index = drop_blk->index;
- btree = dp->d_ops->node_tree_p(node);
+ btree = nodehdr.btree;
if (index < nodehdr.count - 1) {
tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
@@ -1381,9 +1463,9 @@ xfs_da3_node_remove(
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
nodehdr.count -= 1;
- dp->d_ops->node_hdr_to_disk(node, &nodehdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, node, &nodehdr);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
+ XFS_DA_LOGRANGE(node, &node->hdr, state->args->geo->node_hdr_size));
/*
* Copy the last hash value from the block to propagate upwards.
@@ -1416,10 +1498,10 @@ xfs_da3_node_unbalance(
drop_node = drop_blk->bp->b_addr;
save_node = save_blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
- dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
- drop_btree = dp->d_ops->node_tree_p(drop_node);
- save_btree = dp->d_ops->node_tree_p(save_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &drop_hdr, drop_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &save_hdr, save_node);
+ drop_btree = drop_hdr.btree;
+ save_btree = save_hdr.btree;
tp = state->args->trans;
/*
@@ -1453,10 +1535,10 @@ xfs_da3_node_unbalance(
memcpy(&save_btree[sindex], &drop_btree[0], tmp);
save_hdr.count += drop_hdr.count;
- dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
+ xfs_da3_node_hdr_to_disk(dp->i_mount, save_node, &save_hdr);
xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- dp->d_ops->node_hdr_size));
+ state->args->geo->node_hdr_size));
/*
* Save the last hashval in the remaining block for upward propagation.
@@ -1517,7 +1599,7 @@ xfs_da3_node_lookup_int(
*/
blk->blkno = blkno;
error = xfs_da3_node_read(args->trans, args->dp, blkno,
- -1, &blk->bp, args->whichfork);
+ &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
state->path.active--;
@@ -1541,8 +1623,10 @@ xfs_da3_node_lookup_int(
break;
}
- if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC)
+ if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
+ xfs_buf_corruption_error(blk->bp);
return -EFSCORRUPTED;
+ }
blk->magic = XFS_DA_NODE_MAGIC;
@@ -1550,19 +1634,22 @@ xfs_da3_node_lookup_int(
* Search an intermediate node for a match.
*/
node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
/* Tree taller than we can handle; bail out! */
- if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_corruption_error(blk->bp);
return -EFSCORRUPTED;
+ }
/* Check the level from the root. */
if (blkno == args->geo->leafblk)
expected_level = nodehdr.level - 1;
- else if (expected_level != nodehdr.level)
+ else if (expected_level != nodehdr.level) {
+ xfs_buf_corruption_error(blk->bp);
return -EFSCORRUPTED;
- else
+ } else
expected_level--;
max = nodehdr.count;
@@ -1612,11 +1699,11 @@ xfs_da3_node_lookup_int(
}
/* We can't point back to the root. */
- if (blkno == args->geo->leafblk)
+ if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
return -EFSCORRUPTED;
}
- if (expected_level != 0)
+ if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
return -EFSCORRUPTED;
/*
@@ -1678,10 +1765,10 @@ xfs_da3_node_order(
node1 = node1_bp->b_addr;
node2 = node2_bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
- dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
- btree1 = dp->d_ops->node_tree_p(node1);
- btree2 = dp->d_ops->node_tree_p(node2);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node1hdr, node1);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &node2hdr, node2);
+ btree1 = node1hdr.btree;
+ btree2 = node2hdr.btree;
if (node1hdr.count > 0 && node2hdr.count > 0 &&
((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
@@ -1746,7 +1833,7 @@ xfs_da3_blk_link(
if (old_info->back) {
error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->back),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1767,7 +1854,7 @@ xfs_da3_blk_link(
if (old_info->forw) {
error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->forw),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1826,7 +1913,7 @@ xfs_da3_blk_unlink(
if (drop_info->back) {
error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1843,7 +1930,7 @@ xfs_da3_blk_unlink(
if (drop_info->forw) {
error = xfs_da3_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
- -1, &bp, args->whichfork);
+ &bp, args->whichfork);
if (error)
return error;
ASSERT(bp != NULL);
@@ -1878,7 +1965,6 @@ xfs_da3_path_shift(
{
struct xfs_da_state_blk *blk;
struct xfs_da_blkinfo *info;
- struct xfs_da_intnode *node;
struct xfs_da_args *args;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
@@ -1901,17 +1987,16 @@ xfs_da3_path_shift(
ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
- node = blk->bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ blk->bp->b_addr);
if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
- blkno = be32_to_cpu(btree[blk->index].before);
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
break;
} else if (!forward && (blk->index > 0)) {
blk->index--;
- blkno = be32_to_cpu(btree[blk->index].before);
+ blkno = be32_to_cpu(nodehdr.btree[blk->index].before);
break;
}
}
@@ -1929,7 +2014,7 @@ xfs_da3_path_shift(
/*
* Read the next child block into a local buffer.
*/
- error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+ error = xfs_da3_node_read(args->trans, dp, blkno, &bp,
args->whichfork);
if (error)
return error;
@@ -1962,9 +2047,9 @@ xfs_da3_path_shift(
case XFS_DA_NODE_MAGIC:
case XFS_DA3_NODE_MAGIC:
blk->magic = XFS_DA_NODE_MAGIC;
- node = (xfs_da_intnode_t *)info;
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = dp->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr,
+ bp->b_addr);
+ btree = nodehdr.btree;
blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
@@ -2044,18 +2129,6 @@ xfs_da_compname(
XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
}
-static xfs_dahash_t
-xfs_default_hashname(
- struct xfs_name *name)
-{
- return xfs_da_hashname(name->name, name->len);
-}
-
-const struct xfs_nameops xfs_default_nameops = {
- .hashname = xfs_default_hashname,
- .compname = xfs_da_compname
-};
-
int
xfs_da_grow_inode_int(
struct xfs_da_args *args,
@@ -2213,16 +2286,13 @@ xfs_da3_swap_lastblock(
error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
- if (unlikely(lastoff == 0)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
- mp);
+ if (XFS_IS_CORRUPT(mp, lastoff == 0))
return -EFSCORRUPTED;
- }
/*
* Read the last block in the btree space.
*/
last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
- error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+ error = xfs_da3_node_read(tp, dp, last_blkno, &last_buf, w);
if (error)
return error;
/*
@@ -2240,16 +2310,17 @@ xfs_da3_swap_lastblock(
struct xfs_dir2_leaf_entry *ents;
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
- ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr,
+ dead_leaf2);
+ ents = leafhdr.ents;
dead_level = 0;
dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
struct xfs_da3_icnode_hdr deadhdr;
dead_node = (xfs_da_intnode_t *)dead_info;
- dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
- btree = dp->d_ops->node_tree_p(dead_node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &deadhdr, dead_node);
+ btree = deadhdr.btree;
dead_level = deadhdr.level;
dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
@@ -2258,15 +2329,13 @@ xfs_da3_swap_lastblock(
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
- if (unlikely(
- be32_to_cpu(sib_info->forw) != last_blkno ||
- sib_info->magic != dead_info->magic)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->forw) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
error = -EFSCORRUPTED;
goto done;
}
@@ -2280,15 +2349,13 @@ xfs_da3_swap_lastblock(
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
- if (unlikely(
- be32_to_cpu(sib_info->back) != last_blkno ||
- sib_info->magic != dead_info->magic)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp,
+ be32_to_cpu(sib_info->back) != last_blkno ||
+ sib_info->magic != dead_info->magic)) {
error = -EFSCORRUPTED;
goto done;
}
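
The error-handling changes in xfs_da3_swap_lastblock() follow one pattern: each open-coded XFS_ERROR_REPORT() plus return is folded into an XFS_IS_CORRUPT() check, which emits the corruption report itself when the condition is true. A before/after sketch of the pattern, condensed from the hunks above:

	/* old style */
	if (unlikely(lastoff == 0)) {
		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, mp);
		return -EFSCORRUPTED;
	}

	/* new style: the macro logs the corruption report itself */
	if (XFS_IS_CORRUPT(mp, lastoff == 0))
		return -EFSCORRUPTED;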
@@ -2304,27 +2371,24 @@ xfs_da3_swap_lastblock(
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
- if (level >= 0 && level != par_hdr.level + 1) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp,
+ level >= 0 && level != par_hdr.level + 1)) {
error = -EFSCORRUPTED;
goto done;
}
level = par_hdr.level;
- btree = dp->d_ops->node_tree_p(par_node);
+ btree = par_hdr.btree;
for (entno = 0;
entno < par_hdr.count &&
be32_to_cpu(btree[entno].hashval) < dead_hash;
entno++)
continue;
- if (entno == par_hdr.count) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
error = -EFSCORRUPTED;
goto done;
}
@@ -2349,24 +2413,20 @@ xfs_da3_swap_lastblock(
par_blkno = par_hdr.forw;
xfs_trans_brelse(tp, par_buf);
par_buf = NULL;
- if (unlikely(par_blkno == 0)) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
- XFS_ERRLEVEL_LOW, mp);
+ if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
error = -EFSCORRUPTED;
goto done;
}
- error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
- if (par_hdr.level != level) {
- XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
+ if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
error = -EFSCORRUPTED;
goto done;
}
- btree = dp->d_ops->node_tree_p(par_node);
+ btree = par_hdr.btree;
entno = 0;
}
/*
@@ -2429,159 +2489,84 @@ xfs_da_shrink_inode(
return error;
}
-/*
- * See if the mapping(s) for this btree block are valid, i.e.
- * don't contain holes, are logically contiguous, and cover the whole range.
- */
-STATIC int
-xfs_da_map_covers_blocks(
- int nmap,
- xfs_bmbt_irec_t *mapp,
- xfs_dablk_t bno,
- int count)
-{
- int i;
- xfs_fileoff_t off;
-
- for (i = 0, off = bno; i < nmap; i++) {
- if (mapp[i].br_startblock == HOLESTARTBLOCK ||
- mapp[i].br_startblock == DELAYSTARTBLOCK) {
- return 0;
- }
- if (off != mapp[i].br_startoff) {
- return 0;
- }
- off += mapp[i].br_blockcount;
- }
- return off == bno + count;
-}
-
-/*
- * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
- *
- * For the single map case, it is assumed that the caller has provided a pointer
- * to a valid xfs_buf_map. For the multiple map case, this function will
- * allocate the xfs_buf_map to hold all the maps and replace the caller's single
- * map pointer with the allocated map.
- */
static int
-xfs_buf_map_from_irec(
- struct xfs_mount *mp,
+xfs_dabuf_map(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ unsigned int flags,
+ int whichfork,
struct xfs_buf_map **mapp,
- int *nmaps,
- struct xfs_bmbt_irec *irecs,
- int nirecs)
+ int *nmaps)
{
- struct xfs_buf_map *map;
- int i;
-
- ASSERT(*nmaps == 1);
- ASSERT(nirecs >= 1);
+ struct xfs_mount *mp = dp->i_mount;
+ int nfsb = xfs_dabuf_nfsb(mp, whichfork);
+ struct xfs_bmbt_irec irec, *irecs = &irec;
+ struct xfs_buf_map *map = *mapp;
+ xfs_fileoff_t off = bno;
+ int error = 0, nirecs, i;
+
+ if (nfsb > 1)
+ irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+
+ nirecs = nfsb;
+ error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
+ xfs_bmapi_aflag(whichfork));
+ if (error)
+ goto out_free_irecs;
+ /*
+ * Use the caller provided map for the single map case, else allocate a
+ * larger one that needs to be freed by the caller.
+ */
if (nirecs > 1) {
- map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
- KM_NOFS);
+ map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
if (!map)
- return -ENOMEM;
+ goto out_free_irecs;
*mapp = map;
}
- *nmaps = nirecs;
- map = *mapp;
- for (i = 0; i < *nmaps; i++) {
- ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
- irecs[i].br_startblock != HOLESTARTBLOCK);
+ for (i = 0; i < nirecs; i++) {
+ if (irecs[i].br_startblock == HOLESTARTBLOCK ||
+ irecs[i].br_startblock == DELAYSTARTBLOCK)
+ goto invalid_mapping;
+ if (off != irecs[i].br_startoff)
+ goto invalid_mapping;
+
map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
+ off += irecs[i].br_blockcount;
}
- return 0;
-}
-
-/*
- * Map the block we are given ready for reading. There are three possible return
- * values:
- * -1 - will be returned if we land in a hole and mappedbno == -2 so the
- * caller knows not to execute a subsequent read.
- * 0 - if we mapped the block successfully
- * >0 - positive error number if there was an error.
- */
-static int
-xfs_dabuf_map(
- struct xfs_inode *dp,
- xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
- int whichfork,
- struct xfs_buf_map **map,
- int *nmaps)
-{
- struct xfs_mount *mp = dp->i_mount;
- int nfsb;
- int error = 0;
- struct xfs_bmbt_irec irec;
- struct xfs_bmbt_irec *irecs = &irec;
- int nirecs;
- ASSERT(map && *map);
- ASSERT(*nmaps == 1);
+ if (off != bno + nfsb)
+ goto invalid_mapping;
- if (whichfork == XFS_DATA_FORK)
- nfsb = mp->m_dir_geo->fsbcount;
- else
- nfsb = mp->m_attr_geo->fsbcount;
-
- /*
- * Caller doesn't have a mapping. -2 means don't complain
- * if we land in a hole.
- */
- if (mappedbno == -1 || mappedbno == -2) {
- /*
- * Optimize the one-block case.
- */
- if (nfsb != 1)
- irecs = kmem_zalloc(sizeof(irec) * nfsb,
- KM_NOFS);
+ *nmaps = nirecs;
+out_free_irecs:
+ if (irecs != &irec)
+ kmem_free(irecs);
+ return error;
- nirecs = nfsb;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
- &nirecs, xfs_bmapi_aflag(whichfork));
- if (error)
- goto out;
- } else {
- irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
- irecs->br_startoff = (xfs_fileoff_t)bno;
- irecs->br_blockcount = nfsb;
- irecs->br_state = 0;
- nirecs = 1;
- }
+invalid_mapping:
+ /* Caller ok with no mapping. */
+ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+ error = -EFSCORRUPTED;
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "%s: bno %u inode %llu",
+ __func__, bno, dp->i_ino);
- if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
- error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
- if (unlikely(error == -EFSCORRUPTED)) {
- if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
- int i;
- xfs_alert(mp, "%s: bno %lld dir: inode %lld",
- __func__, (long long)bno,
- (long long)dp->i_ino);
- for (i = 0; i < *nmaps; i++) {
- xfs_alert(mp,
+ for (i = 0; i < nirecs; i++) {
+ xfs_alert(mp,
"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
- i,
- (long long)irecs[i].br_startoff,
- (long long)irecs[i].br_startblock,
- (long long)irecs[i].br_blockcount,
- irecs[i].br_state);
- }
+ i, irecs[i].br_startoff,
+ irecs[i].br_startblock,
+ irecs[i].br_blockcount,
+ irecs[i].br_state);
}
- XFS_ERROR_REPORT("xfs_da_do_buf(1)",
- XFS_ERRLEVEL_LOW, mp);
}
- goto out;
+ } else {
+ *nmaps = 0;
}
- error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
-out:
- if (irecs != &irec)
- kmem_free(irecs);
- return error;
+ goto out_free_irecs;
}
/*
@@ -2589,37 +2574,28 @@ out:
*/
int
xfs_da_get_buf(
- struct xfs_trans *trans,
+ struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp,
int whichfork)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
- struct xfs_buf_map map;
- struct xfs_buf_map *mapp;
- int nmap;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
int error;
*bpp = NULL;
- mapp = &map;
- nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, 0, whichfork, &mapp, &nmap);
+ if (error || nmap == 0)
goto out_free;
- }
- bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
- mapp, nmap, 0);
+ bp = xfs_trans_get_buf_map(tp, mp->m_ddev_targp, mapp, nmap, 0);
error = bp ? bp->b_error : -EIO;
if (error) {
if (bp)
- xfs_trans_brelse(trans, bp);
+ xfs_trans_brelse(tp, bp);
goto out_free;
}
@@ -2637,35 +2613,27 @@ out_free:
*/
int
xfs_da_read_buf(
- struct xfs_trans *trans,
+ struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
struct xfs_buf **bpp,
int whichfork,
const struct xfs_buf_ops *ops)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
- struct xfs_buf_map map;
- struct xfs_buf_map *mapp;
- int nmap;
+ struct xfs_buf_map map, *mapp = &map;
+ int nmap = 1;
int error;
*bpp = NULL;
- mapp = &map;
- nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
goto out_free;
- }
- error = xfs_trans_read_buf_map(dp->i_mount, trans,
- dp->i_mount->m_ddev_targp,
- mapp, nmap, 0, &bp, ops);
+ error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
+ &bp, ops);
if (error)
goto out_free;
@@ -2688,7 +2656,7 @@ int
xfs_da_reada_buf(
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
int whichfork,
const struct xfs_buf_ops *ops)
{
@@ -2699,16 +2667,10 @@ xfs_da_reada_buf(
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
- &mapp, &nmap);
- if (error) {
- /* mapping a hole is not an error, but we don't continue */
- if (error == -1)
- error = 0;
+ error = xfs_dabuf_map(dp, bno, flags, whichfork, &mapp, &nmap);
+ if (error || !nmap)
goto out_free;
- }
- mappedbno = mapp[0].bm_bn;
xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
out_free:
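
With the mappedbno argument gone, a hole is no longer signalled by a magic -1 return; xfs_dabuf_map() simply reports zero mappings when the caller passes XFS_DABUF_MAP_HOLE_OK, and the read helpers then leave *bpp NULL. A sketch of the resulting caller pattern (hypothetical caller, assuming the xfs_da_read_buf() behaviour shown above):

	struct xfs_buf	*bp;
	int		error;

	error = xfs_da_read_buf(tp, dp, bno, XFS_DABUF_MAP_HOLE_OK, &bp,
				XFS_DATA_FORK, ops);
	if (error)
		return error;
	if (!bp)
		return 0;	/* landed in a hole, nothing to do */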
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ae0bbd20d9ca..e16610d1c14f 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -10,7 +10,6 @@
struct xfs_inode;
struct xfs_trans;
struct zone;
-struct xfs_dir_ops;
/*
* Directory/attribute geometry information. There will be one of these for each
@@ -18,15 +17,23 @@ struct xfs_dir_ops;
* structures will be attached to the xfs_mount.
*/
struct xfs_da_geometry {
- int blksize; /* da block size in bytes */
- int fsbcount; /* da block size in filesystem blocks */
+ unsigned int blksize; /* da block size in bytes */
+ unsigned int fsbcount; /* da block size in filesystem blocks */
uint8_t fsblog; /* log2 of _filesystem_ block size */
uint8_t blklog; /* log2 of da block size */
- uint node_ents; /* # of entries in a danode */
- int magicpct; /* 37% of block size in bytes */
+ unsigned int node_hdr_size; /* danode header size in bytes */
+ unsigned int node_ents; /* # of entries in a danode */
+ unsigned int magicpct; /* 37% of block size in bytes */
xfs_dablk_t datablk; /* blockno of dir data v2 */
+ unsigned int leaf_hdr_size; /* dir2 leaf header size */
+ unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ unsigned int free_hdr_size; /* dir2 free header size */
+ unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
};
/*========================================================================
@@ -125,6 +132,25 @@ typedef struct xfs_da_state {
} xfs_da_state_t;
/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_da_node_entry *btree;
+};
+
+/*
* Utility macros to aid in logging changed structure fields.
*/
#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
@@ -132,16 +158,6 @@ typedef struct xfs_da_state {
(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
-/*
- * Name ops for directory and/or attr name operations
- */
-struct xfs_nameops {
- xfs_dahash_t (*hashname)(struct xfs_name *);
- enum xfs_dacmp (*compname)(struct xfs_da_args *,
- const unsigned char *, int);
-};
-
-
/*========================================================================
* Function prototypes.
*========================================================================*/
@@ -172,25 +188,28 @@ int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp, int which_fork);
+ xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
+int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_daddr_t mappedbno, struct xfs_buf **bpp,
+ int whichfork);
/*
* Utility routines.
*/
+
+#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
int count);
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bp, int whichfork);
+ xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp, int whichfork,
- const struct xfs_buf_ops *ops);
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
+ int whichfork, const struct xfs_buf_ops *ops);
int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
- xfs_daddr_t mapped_bno, int whichfork,
- const struct xfs_buf_ops *ops);
+ unsigned int flags, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
@@ -202,7 +221,11 @@ enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
xfs_da_state_t *xfs_da_state_alloc(void);
void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
+void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
+ struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
+
extern struct kmem_zone *xfs_da_state_zone;
-extern const struct xfs_nameops xfs_default_nameops;
#endif /* __XFS_DA_BTREE_H__ */
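
The read interface is also split by addressing mode: xfs_da3_node_read() takes a logical da block number and maps it internally, while the new xfs_da3_node_read_mapped() takes an already-resolved disk address. An illustrative pair of calls (call sites hypothetical):

	/* common case: logical da block, mapping done internally */
	error = xfs_da3_node_read(tp, dp, blkno, &bp, XFS_ATTR_FORK);

	/* block whose daddr is already known, e.g. from a readahead hint */
	error = xfs_da3_node_read_mapped(tp, dp, mappedbno, &bp, XFS_ATTR_FORK);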
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
deleted file mode 100644
index b1ae572496b6..000000000000
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ /dev/null
@@ -1,888 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
- * Copyright (c) 2013 Red Hat, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_inode.h"
-#include "xfs_dir2.h"
-
-/*
- * Shortform directory ops
- */
-static int
-xfs_dir2_sf_entsize(
- struct xfs_dir2_sf_hdr *hdr,
- int len)
-{
- int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
-
- count += len; /* name */
- count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
- return count;
-}
-
-static int
-xfs_dir3_sf_entsize(
- struct xfs_dir2_sf_hdr *hdr,
- int len)
-{
- return xfs_dir2_sf_entsize(hdr, len) + sizeof(uint8_t);
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir2_sf_nextentry(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
-}
-
-static struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
-}
-
-
-/*
- * For filetype enabled shortform directories, the file type field is stored at
- * the end of the name. Because it's only a single byte, endian conversion is
- * not necessary. For non-filetype enable directories, the type is always
- * unknown and we never store the value.
- */
-static uint8_t
-xfs_dir2_sfe_get_ftype(
- struct xfs_dir2_sf_entry *sfep)
-{
- return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_sfe_put_ftype(
- struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static uint8_t
-xfs_dir3_sfe_get_ftype(
- struct xfs_dir2_sf_entry *sfep)
-{
- uint8_t ftype;
-
- ftype = sfep->name[sfep->namelen];
- if (ftype >= XFS_DIR3_FT_MAX)
- return XFS_DIR3_FT_UNKNOWN;
- return ftype;
-}
-
-static void
-xfs_dir3_sfe_put_ftype(
- struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-
- sfep->name[sfep->namelen] = ftype;
-}
-
-/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide. These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- uint8_t *from)
-{
- if (hdr->i8count)
- return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
- else
- return get_unaligned_be32(from);
-}
-
-static void
-xfs_dir2_sf_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- uint8_t *to,
- xfs_ino_t ino)
-{
- ASSERT((ino & 0xff00000000000000ULL) == 0);
-
- if (hdr->i8count)
- put_unaligned_be64(ino, to);
- else
- put_unaligned_be32(ino, to);
-}
-
-static xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
- struct xfs_dir2_sf_hdr *hdr)
-{
- return xfs_dir2_sf_get_ino(hdr, hdr->parent);
-}
-
-static void
-xfs_dir2_sf_put_parent_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_ino_t
-xfs_dir2_sfe_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
-}
-
-static void
-xfs_dir2_sfe_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
-}
-
-static xfs_ino_t
-xfs_dir3_sfe_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
-}
-
-static void
-xfs_dir3_sfe_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
-}
-
-
-/*
- * Directory data block operations
- */
-
-/*
- * For special situations, the dirent size ends up fixed because we always know
- * what the size of the entry is. That's true for the "." and "..", and
- * therefore we know that they are a fixed size and hence their offsets are
- * constant, as is the first entry.
- *
- * Hence, this calculation is written as a macro to be able to be calculated at
- * compile time and so certain offsets can be calculated directly in the
- * structure initaliser via the macro. There are two macros - one for dirents
- * with ftype and without so there are no unresolvable conditionals in the
- * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
- * of 2 and the compiler doesn't reject it (unlike roundup()).
- */
-#define XFS_DIR2_DATA_ENTSIZE(n) \
- round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
- sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
-
-#define XFS_DIR3_DATA_ENTSIZE(n) \
- round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
- sizeof(xfs_dir2_data_off_t) + sizeof(uint8_t)), \
- XFS_DIR2_DATA_ALIGN)
-
-static int
-xfs_dir2_data_entsize(
- int n)
-{
- return XFS_DIR2_DATA_ENTSIZE(n);
-}
-
-static int
-xfs_dir3_data_entsize(
- int n)
-{
- return XFS_DIR3_DATA_ENTSIZE(n);
-}
-
-static uint8_t
-xfs_dir2_data_get_ftype(
- struct xfs_dir2_data_entry *dep)
-{
- return XFS_DIR3_FT_UNKNOWN;
-}
-
-static void
-xfs_dir2_data_put_ftype(
- struct xfs_dir2_data_entry *dep,
- uint8_t ftype)
-{
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-}
-
-static uint8_t
-xfs_dir3_data_get_ftype(
- struct xfs_dir2_data_entry *dep)
-{
- uint8_t ftype = dep->name[dep->namelen];
-
- if (ftype >= XFS_DIR3_FT_MAX)
- return XFS_DIR3_FT_UNKNOWN;
- return ftype;
-}
-
-static void
-xfs_dir3_data_put_ftype(
- struct xfs_dir2_data_entry *dep,
- uint8_t type)
-{
- ASSERT(type < XFS_DIR3_FT_MAX);
- ASSERT(dep->namelen != 0);
-
- dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static __be16 *
-xfs_dir2_data_entry_tag_p(
- struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-static __be16 *
-xfs_dir3_data_entry_tag_p(
- struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1) +
- XFS_DIR2_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_ftype_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2));
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
- return hdr->bestfree;
-}
-
-static struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
- return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_unused *)
- ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
-}
-
-static struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-static struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_unused *)
- ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
-}
-
-
-/*
- * Directory Leaf block operations
- */
-static int
-xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
- return lp->__ents;
-}
-
-static int
-xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-static struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
- return ((struct xfs_dir3_leaf *)lp)->__ents;
-}
-
-static void
-xfs_dir2_leaf_hdr_from_disk(
- struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from)
-{
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.count);
- to->stale = be16_to_cpu(from->hdr.stale);
-
- ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
- to->magic == XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir2_leaf_hdr_to_disk(
- struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
- from->magic == XFS_DIR2_LEAFN_MAGIC);
-
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.count = cpu_to_be16(from->count);
- to->hdr.stale = cpu_to_be16(from->stale);
-}
-
-static void
-xfs_dir3_leaf_hdr_from_disk(
- struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from)
-{
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->count);
- to->stale = be16_to_cpu(hdr3->stale);
-
- ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
- to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leaf_hdr_to_disk(
- struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from)
-{
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
- ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
- from->magic == XFS_DIR3_LEAFN_MAGIC);
-
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->count = cpu_to_be16(from->count);
- hdr3->stale = cpu_to_be16(from->stale);
-}
-
-
-/*
- * Directory/Attribute Node block operations
- */
-static struct xfs_da_node_entry *
-xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
-{
- return dap->__btree;
-}
-
-static struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
- return ((struct xfs_da3_intnode *)dap)->__btree;
-}
-
-static void
-xfs_da2_node_hdr_from_disk(
- struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from)
-{
- ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.__count);
- to->level = be16_to_cpu(from->hdr.__level);
-}
-
-static void
-xfs_da2_node_hdr_to_disk(
- struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from)
-{
- ASSERT(from->magic == XFS_DA_NODE_MAGIC);
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.__count = cpu_to_be16(from->count);
- to->hdr.__level = cpu_to_be16(from->level);
-}
-
-static void
-xfs_da3_node_hdr_from_disk(
- struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from)
-{
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
- ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->__count);
- to->level = be16_to_cpu(hdr3->__level);
-}
-
-static void
-xfs_da3_node_hdr_to_disk(
- struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from)
-{
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
- ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->__count = cpu_to_be16(from->count);
- hdr3->__level = cpu_to_be16(from->level);
-}
-
-
-/*
- * Directory free space block operations
- */
-static int
-xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
-{
- return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
- (db / xfs_dir2_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return db % xfs_dir2_free_max_bests(geo);
-}
-
-static int
-xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
-{
- return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-static __be16 *
-xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
-{
- return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static xfs_dir2_db_t
-xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
- (db / xfs_dir3_free_max_bests(geo));
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static int
-xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
-{
- return db % xfs_dir3_free_max_bests(geo);
-}
-
-static void
-xfs_dir2_free_hdr_from_disk(
- struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from)
-{
- to->magic = be32_to_cpu(from->hdr.magic);
- to->firstdb = be32_to_cpu(from->hdr.firstdb);
- to->nvalid = be32_to_cpu(from->hdr.nvalid);
- to->nused = be32_to_cpu(from->hdr.nused);
- ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
-}
-
-static void
-xfs_dir2_free_hdr_to_disk(
- struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
-
- to->hdr.magic = cpu_to_be32(from->magic);
- to->hdr.firstdb = cpu_to_be32(from->firstdb);
- to->hdr.nvalid = cpu_to_be32(from->nvalid);
- to->hdr.nused = cpu_to_be32(from->nused);
-}
-
-static void
-xfs_dir3_free_hdr_from_disk(
- struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from)
-{
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
- to->magic = be32_to_cpu(hdr3->hdr.magic);
- to->firstdb = be32_to_cpu(hdr3->firstdb);
- to->nvalid = be32_to_cpu(hdr3->nvalid);
- to->nused = be32_to_cpu(hdr3->nused);
-
- ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
- struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from)
-{
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
- ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
-
- hdr3->hdr.magic = cpu_to_be32(from->magic);
- hdr3->firstdb = cpu_to_be32(from->firstdb);
- hdr3->nvalid = cpu_to_be32(from->nvalid);
- hdr3->nused = cpu_to_be32(from->nused);
-}
-
-static const struct xfs_dir_ops xfs_dir2_ops = {
- .sf_entsize = xfs_dir2_sf_entsize,
- .sf_nextentry = xfs_dir2_sf_nextentry,
- .sf_get_ftype = xfs_dir2_sfe_get_ftype,
- .sf_put_ftype = xfs_dir2_sfe_put_ftype,
- .sf_get_ino = xfs_dir2_sfe_get_ino,
- .sf_put_ino = xfs_dir2_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir2_data_entsize,
- .data_get_ftype = xfs_dir2_data_get_ftype,
- .data_put_ftype = xfs_dir2_data_put_ftype,
- .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
- .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR2_DATA_ENTSIZE(1) +
- XFS_DIR2_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
- .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir2_data_first_entry_p,
- .data_entry_p = xfs_dir2_data_entry_p,
- .data_unused_p = xfs_dir2_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir2_max_leaf_ents,
- .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
- .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
- .free_max_bests = xfs_dir2_free_max_bests,
- .free_bests_p = xfs_dir2_free_bests_p,
- .db_to_fdb = xfs_dir2_db_to_fdb,
- .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
- .sf_entsize = xfs_dir3_sf_entsize,
- .sf_nextentry = xfs_dir3_sf_nextentry,
- .sf_get_ftype = xfs_dir3_sfe_get_ftype,
- .sf_put_ftype = xfs_dir3_sfe_put_ftype,
- .sf_get_ino = xfs_dir3_sfe_get_ino,
- .sf_put_ino = xfs_dir3_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir3_data_entsize,
- .data_get_ftype = xfs_dir3_data_get_ftype,
- .data_put_ftype = xfs_dir3_data_put_ftype,
- .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
- .data_bestfree_p = xfs_dir2_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
-
- .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
- .data_entry_p = xfs_dir2_data_entry_p,
- .data_unused_p = xfs_dir2_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir2_max_leaf_ents,
- .leaf_ents_p = xfs_dir2_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
- .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
- .free_max_bests = xfs_dir2_free_max_bests,
- .free_bests_p = xfs_dir2_free_bests_p,
- .db_to_fdb = xfs_dir2_db_to_fdb,
- .db_to_fdindex = xfs_dir2_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir3_ops = {
- .sf_entsize = xfs_dir3_sf_entsize,
- .sf_nextentry = xfs_dir3_sf_nextentry,
- .sf_get_ftype = xfs_dir3_sfe_get_ftype,
- .sf_put_ftype = xfs_dir3_sfe_put_ftype,
- .sf_get_ino = xfs_dir3_sfe_get_ino,
- .sf_put_ino = xfs_dir3_sfe_put_ino,
- .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
- .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
-
- .data_entsize = xfs_dir3_data_entsize,
- .data_get_ftype = xfs_dir3_data_get_ftype,
- .data_put_ftype = xfs_dir3_data_put_ftype,
- .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
- .data_bestfree_p = xfs_dir3_data_bestfree_p,
-
- .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
- .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1),
- .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
- XFS_DIR3_DATA_ENTSIZE(1) +
- XFS_DIR3_DATA_ENTSIZE(2),
- .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
-
- .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
- .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
- .data_first_entry_p = xfs_dir3_data_first_entry_p,
- .data_entry_p = xfs_dir3_data_entry_p,
- .data_unused_p = xfs_dir3_data_unused_p,
-
- .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
- .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
- .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
- .leaf_max_ents = xfs_dir3_max_leaf_ents,
- .leaf_ents_p = xfs_dir3_leaf_ents_p,
-
- .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
- .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
- .node_tree_p = xfs_da3_node_tree_p,
-
- .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
- .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
- .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
- .free_max_bests = xfs_dir3_free_max_bests,
- .free_bests_p = xfs_dir3_free_bests_p,
- .db_to_fdb = xfs_dir3_db_to_fdb,
- .db_to_fdindex = xfs_dir3_db_to_fdindex,
-};
-
-static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
- .node_hdr_size = sizeof(struct xfs_da_node_hdr),
- .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
- .node_tree_p = xfs_da2_node_tree_p,
-};
-
-static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
- .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
- .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
- .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
- .node_tree_p = xfs_da3_node_tree_p,
-};
-
-/*
- * Return the ops structure according to the current config. If we are passed
- * an inode, then that overrides the default config we use which is based on
- * feature bits.
- */
-const struct xfs_dir_ops *
-xfs_dir_get_ops(
- struct xfs_mount *mp,
- struct xfs_inode *dp)
-{
- if (dp)
- return dp->d_ops;
- if (mp->m_dir_inode_ops)
- return mp->m_dir_inode_ops;
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return &xfs_dir3_ops;
- if (xfs_sb_version_hasftype(&mp->m_sb))
- return &xfs_dir2_ftype_ops;
- return &xfs_dir2_ops;
-}
-
-const struct xfs_dir_ops *
-xfs_nondir_get_ops(
- struct xfs_mount *mp,
- struct xfs_inode *dp)
-{
- if (dp)
- return dp->d_ops;
- if (mp->m_nondir_inode_ops)
- return mp->m_nondir_inode_ops;
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return &xfs_dir3_nondir_ops;
- return &xfs_dir2_nondir_ops;
-}
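
Deleting xfs_da_format.c removes the last of the per-format function-pointer tables; the v4/v5 differences that mattered here reduce to header sizes, which xfs_da_mount() (below) now records directly in the geometry. A hedged sketch of the idea, mirroring the hunk that follows rather than quoting it:

	/* v5 (CRC) filesystems carry the larger block headers */
	if (xfs_sb_version_hascrc(&mp->m_sb))
		geo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
	else
		geo->node_hdr_size = sizeof(struct xfs_da_node_hdr);

	/* everything else derives from the geometry, not from d_ops */
	geo->node_ents = (geo->blksize - geo->node_hdr_size) /
			 sizeof(struct xfs_da_node_entry);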
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index ae654e06b2fb..3dee33043e09 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -94,19 +94,6 @@ struct xfs_da3_intnode {
};
/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t level;
-};
-
-/*
* Directory version 2.
*
* There are 4 possible formats:
@@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr {
__be32 pad; /* 64 bit alignment */
};
-struct xfs_dir3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t stale;
-};
-
/*
* Leaf block entry.
*/
@@ -482,7 +461,7 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
}
/*
- * Free space block defintions for the node format.
+ * Free space block definitions for the node format.
*/
/*
@@ -521,19 +500,6 @@ struct xfs_dir3_free {
#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
/*
- * In core version of the free block header, abstracted away from on-disk format
- * differences. Use this in the code, and convert to/from the disk version using
- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
- */
-struct xfs_dir3_icfree_hdr {
- uint32_t magic;
- uint32_t firstdb;
- uint32_t nvalid;
- uint32_t nused;
-
-};
-
-/*
* Single block format.
*
* The single block format looks like the following drawing on disk:
@@ -710,29 +676,6 @@ struct xfs_attr3_leafblock {
};
/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t usedbytes;
- /*
- * firstused is 32-bit here instead of 16-bit like the on-disk variant
- * to support maximum fsb size of 64k without overflow issues throughout
- * the attr code. Instead, the overflow condition is handled on
- * conversion to/from disk.
- */
- uint32_t firstused;
- __u8 holes;
- struct {
- uint16_t base;
- uint16_t size;
- } freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
* Special value to represent fs block size in the leaf header firstused field.
* Only used when block size overflows the 2-bytes available on disk.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 867c5dee0751..0aa87cbde49e 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -52,7 +52,7 @@ xfs_mode_to_ftype(
* ASCII case-insensitive (ie. A-Z) support for directories that was
* used in IRIX.
*/
-STATIC xfs_dahash_t
+xfs_dahash_t
xfs_ascii_ci_hashname(
struct xfs_name *name)
{
@@ -65,14 +65,14 @@ xfs_ascii_ci_hashname(
return hash;
}
-STATIC enum xfs_dacmp
+enum xfs_dacmp
xfs_ascii_ci_compname(
- struct xfs_da_args *args,
- const unsigned char *name,
- int len)
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
{
- enum xfs_dacmp result;
- int i;
+ enum xfs_dacmp result;
+ int i;
if (args->namelen != len)
return XFS_CMP_DIFFERENT;
@@ -89,26 +89,16 @@ xfs_ascii_ci_compname(
return result;
}
-static const struct xfs_nameops xfs_ascii_ci_nameops = {
- .hashname = xfs_ascii_ci_hashname,
- .compname = xfs_ascii_ci_compname,
-};
-
int
xfs_da_mount(
struct xfs_mount *mp)
{
struct xfs_da_geometry *dageo;
- int nodehdr_size;
ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
- mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
- mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
-
- nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
KM_MAYFAIL);
mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
@@ -125,6 +115,27 @@ xfs_da_mount(
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb);
dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir3_data_hdr);
+ } else {
+ dageo->node_hdr_size = sizeof(struct xfs_da_node_hdr);
+ dageo->leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr);
+ dageo->free_hdr_size = sizeof(struct xfs_dir2_free_hdr);
+ dageo->data_entry_offset =
+ sizeof(struct xfs_dir2_data_hdr);
+ }
+ dageo->leaf_max_ents = (dageo->blksize - dageo->leaf_hdr_size) /
+ sizeof(struct xfs_dir2_leaf_entry);
+ dageo->free_max_bests = (dageo->blksize - dageo->free_hdr_size) /
+ sizeof(xfs_dir2_data_off_t);
+
+ dageo->data_first_offset = dageo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, 1) +
+ xfs_dir2_data_entsize(mp, 2);
/*
* Now we've set up the block conversion variables, we can calculate the
@@ -133,7 +144,7 @@ xfs_da_mount(
dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
- dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->magicpct = (dageo->blksize * 37) / 100;
@@ -143,15 +154,10 @@ xfs_da_mount(
dageo->fsblog = mp->m_sb.sb_blocklog;
dageo->blksize = 1 << dageo->blklog;
dageo->fsbcount = 1;
- dageo->node_ents = (dageo->blksize - nodehdr_size) /
+ dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
+ dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
dageo->magicpct = (dageo->blksize * 37) / 100;
-
- if (xfs_sb_version_hasasciici(&mp->m_sb))
- mp->m_dirnameops = &xfs_ascii_ci_nameops;
- else
- mp->m_dirnameops = &xfs_default_nameops;
-
return 0;
}
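
A rough worked example of the derived fields, assuming the usual on-disk sizes (64-byte v5 node header, 8-byte node entries; these numbers are not stated in the patch): for 4k directory blocks on a CRC filesystem,

	node_hdr_size = 64;
	node_ents     = (4096 - 64) / 8;	/* 504 entries per node block */
	magicpct      = (4096 * 37) / 100;	/* 1515 bytes */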
@@ -191,10 +197,10 @@ xfs_dir_ino_validate(
{
bool ino_ok = xfs_verify_dir_ino(mp, ino);
- if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE))) {
+ if (XFS_IS_CORRUPT(mp, !ino_ok) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
- XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -262,7 +268,7 @@ xfs_dir_createname(
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = inum;
args->dp = dp;
args->total = total;
@@ -358,7 +364,7 @@ xfs_dir_lookup(
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->dp = dp;
args->whichfork = XFS_DATA_FORK;
args->trans = tp;
@@ -430,7 +436,7 @@ xfs_dir_removename(
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = ino;
args->dp = dp;
args->total = total;
@@ -491,7 +497,7 @@ xfs_dir_replace(
args->name = name->name;
args->namelen = name->len;
args->filetype = name->type;
- args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->hashval = xfs_dir2_hashname(dp->i_mount, name);
args->inumber = inum;
args->dp = dp;
args->total = total;
@@ -600,7 +606,9 @@ xfs_dir2_isblock(
if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize)
+ if (XFS_IS_CORRUPT(args->dp->i_mount,
+ rval != 0 &&
+ args->dp->i_d.di_size != args->geo->blksize))
return -EFSCORRUPTED;
*vp = rval;
return 0;
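
For reference, the hashname calls above all funnel through a single helper now that the per-mount name-ops vector is gone (its selection between xfs_ascii_ci_nameops and xfs_default_nameops is dropped from xfs_da_mount earlier in this hunk). The helper's definition is not part of this diff; the following is only a sketch of what such a dispatch presumably looks like, keyed off the ASCII-CI superblock feature bit, with xfs_ascii_ci_hashname() assumed to be the case-insensitive hash previously reached through the removed ops vector.

    /* Sketch only, not the authoritative implementation. */
    static inline xfs_dahash_t
    xfs_dir2_hashname(
            struct xfs_mount        *mp,
            struct xfs_name         *name)
    {
            /* ASCII-CI filesystems hash the case-folded name. */
            if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
                    return xfs_ascii_ci_hashname(name);
            return xfs_da_hashname(name->name, name->len);
    }
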
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index f54244779492..033777e282f2 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry;
struct xfs_dir2_data_hdr;
struct xfs_dir2_data_entry;
struct xfs_dir2_data_unused;
+struct xfs_dir3_icfree_hdr;
+struct xfs_dir3_icleaf_hdr;
extern struct xfs_name xfs_name_dotdot;
@@ -27,85 +29,6 @@ extern struct xfs_name xfs_name_dotdot;
extern unsigned char xfs_mode_to_ftype(int mode);
/*
- * directory operations vector for encode/decode routines
- */
-struct xfs_dir_ops {
- int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
- struct xfs_dir2_sf_entry *
- (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep);
- uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
- void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
- uint8_t ftype);
- xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep);
- void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino);
- xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
- void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino);
-
- int (*data_entsize)(int len);
- uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
- void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
- uint8_t ftype);
- __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
- struct xfs_dir2_data_free *
- (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
-
- xfs_dir2_data_aoff_t data_dot_offset;
- xfs_dir2_data_aoff_t data_dotdot_offset;
- xfs_dir2_data_aoff_t data_first_offset;
- size_t data_entry_offset;
-
- struct xfs_dir2_data_entry *
- (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_entry *
- (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
- struct xfs_dir2_data_unused *
- (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
-
- int leaf_hdr_size;
- void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from);
- void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from);
- int (*leaf_max_ents)(struct xfs_da_geometry *geo);
- struct xfs_dir2_leaf_entry *
- (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
-
- int node_hdr_size;
- void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from);
- void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from);
- struct xfs_da_node_entry *
- (*node_tree_p)(struct xfs_da_intnode *dap);
-
- int free_hdr_size;
- void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from);
- void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from);
- int (*free_max_bests)(struct xfs_da_geometry *geo);
- __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
- xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
- xfs_dir2_db_t db);
- int (*db_to_fdindex)(struct xfs_da_geometry *geo,
- xfs_dir2_db_t db);
-};
-
-extern const struct xfs_dir_ops *
- xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-extern const struct xfs_dir_ops *
- xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
-
-/*
* Generic directory interface routines
*/
extern void xfs_dir_startup(void);
@@ -124,6 +47,8 @@ extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
+extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
+ xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
@@ -143,10 +68,7 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
-extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo,
- const struct xfs_dir_ops *ops,
- struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
@@ -324,7 +246,7 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
#define XFS_READDIR_BUFSIZE (32768)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
-void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
+unsigned int xfs_dir3_data_end_offset(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
bool xfs_dir2_namecheck(const void *name, size_t length);
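
With struct xfs_dir_ops removed from this header, entry-size and ftype handling collapses into mount-based helpers that test superblock feature bits directly. The xfs_dir2_data_entsize() helper used throughout the rest of the series is not shown in this file; a hedged sketch of the shape such a helper takes, assuming the usual dir2 on-disk entry layout (inode number, name, optional file type byte, 2-byte tag, 8-byte alignment):

    /* Sketch only; exact definition lives elsewhere in the series. */
    static inline unsigned int
    xfs_dir2_data_entsize(
            struct xfs_mount        *mp,
            unsigned int            namelen)
    {
            unsigned int            len;

            len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
                            sizeof(xfs_dir2_data_off_t);    /* trailing tag */
            if (xfs_sb_version_hasftype(&mp->m_sb))
                    len += sizeof(uint8_t);                 /* file type byte */
            return round_up(len, XFS_DIR2_DATA_ALIGN);
    }
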
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 49e4bc39e7bb..d6ced59b9567 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -123,7 +123,7 @@ xfs_dir3_block_read(
struct xfs_mount *mp = dp->i_mount;
int err;
- err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
@@ -172,7 +172,7 @@ xfs_dir2_block_need_space(
struct xfs_dir2_data_unused *enddup = NULL;
*compact = 0;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
/*
* If there are stale entries we'll use one for the leaf.
@@ -311,7 +311,7 @@ xfs_dir2_block_compact(
* This needs to happen before the next call to use_free.
*/
if (needscan)
- xfs_dir2_data_freescan(args->dp, hdr, needlog);
+ xfs_dir2_data_freescan(args->dp->i_mount, hdr, needlog);
}
/*
@@ -355,7 +355,7 @@ xfs_dir2_block_addname(
if (error)
return error;
- len = dp->d_ops->data_entsize(args->namelen);
+ len = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
/*
* Set up pointers to parts of the block.
@@ -458,7 +458,7 @@ xfs_dir2_block_addname(
* This needs to happen before the next call to use_free.
*/
if (needscan) {
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
needscan = 0;
}
/*
@@ -541,14 +541,14 @@ xfs_dir2_block_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, args->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Clean up the bestfree array and log the header, tail, and entry.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
@@ -633,7 +633,7 @@ xfs_dir2_block_lookup(
* Fill in inode number, CI name if appropriate, release the block.
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(args->trans, bp);
return error;
@@ -660,13 +660,11 @@ xfs_dir2_block_lookup_int(
int high; /* binary search high index */
int low; /* binary search low index */
int mid; /* binary search current idx */
- xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
dp = args->dp;
tp = args->trans;
- mp = dp->i_mount;
error = xfs_dir3_block_read(tp, dp, &bp);
if (error)
@@ -718,7 +716,7 @@ xfs_dir2_block_lookup_int(
* and buffer. If it's the first case-insensitive match, store
* the index and buffer and continue looking for an exact match.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
*bpp = bp;
@@ -791,7 +789,8 @@ xfs_dir2_block_removename(
needlog = needscan = 0;
xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* Fix up the block tail.
*/
@@ -806,7 +805,7 @@ xfs_dir2_block_removename(
* Fix up bestfree, log the header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, bp);
xfs_dir3_data_check(dp, bp);
@@ -864,7 +863,7 @@ xfs_dir2_block_replace(
* Change the inode number to the new value.
*/
dep->inumber = cpu_to_be64(args->inumber);
- dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
@@ -914,7 +913,6 @@ xfs_dir2_leaf_to_block(
__be16 *tagp; /* end of entry (tag) */
int to; /* block/leaf to index */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_to_block(args);
@@ -923,8 +921,7 @@ xfs_dir2_leaf_to_block(
tp = args->trans;
mp = dp->i_mount;
leaf = lbp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
@@ -938,7 +935,7 @@ xfs_dir2_leaf_to_block(
while (dp->i_d.di_size > args->geo->blksize) {
int hdrsz;
- hdrsz = dp->d_ops->data_entry_offset;
+ hdrsz = args->geo->data_entry_offset;
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
args->geo->blksize - hdrsz) {
@@ -953,7 +950,7 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, 0, &dbp);
if (error)
return error;
}
@@ -1004,9 +1001,10 @@ xfs_dir2_leaf_to_block(
*/
lep = xfs_dir2_block_leaf_p(btp);
for (from = to = 0; from < leafhdr.count; from++) {
- if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr.ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
- lep[to++] = ents[from];
+ lep[to++] = leafhdr.ents[from];
}
ASSERT(to == be32_to_cpu(btp->count));
xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
@@ -1014,7 +1012,7 @@ xfs_dir2_leaf_to_block(
* Scan the bestfree if we need it and log the data block header.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/*
@@ -1039,47 +1037,38 @@ xfs_dir2_leaf_to_block(
*/
int /* error */
xfs_dir2_sf_to_block(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args)
{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+ struct xfs_da_geometry *geo = args->geo;
xfs_dir2_db_t blkno; /* dir-relative block # (0) */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
struct xfs_buf *bp; /* block buffer */
xfs_dir2_block_tail_t *btp; /* block tail pointer */
xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
int dummy; /* trash */
xfs_dir2_data_unused_t *dup; /* unused entry pointer */
int endoffset; /* end of data objects */
int error; /* error return value */
int i; /* index */
- xfs_mount_t *mp; /* filesystem mount point */
int needlog; /* need to log block header */
int needscan; /* need to scan block freespc */
int newoffset; /* offset from current entry */
- int offset; /* target block offset */
+ unsigned int offset = geo->data_entry_offset;
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
xfs_dir2_sf_hdr_t *sfp; /* shortform header */
__be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
struct xfs_name name;
- struct xfs_ifork *ifp;
trace_xfs_dir2_sf_to_block(args);
- dp = args->dp;
- tp = args->trans;
- mp = dp->i_mount;
- ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
ASSERT(ifp->if_flags & XFS_IFINLINE);
- /*
- * Bomb out if the shortform directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
@@ -1123,7 +1112,7 @@ xfs_dir2_sf_to_block(
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = dp->d_ops->data_unused_p(hdr);
+ dup = bp->b_addr + offset;
needlog = needscan = 0;
error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
i, &needlog, &needscan);
@@ -1146,35 +1135,37 @@ xfs_dir2_sf_to_block(
be16_to_cpu(dup->length), &needlog, &needscan);
if (error)
goto out_free;
+
/*
* Create entry for .
*/
- dep = dp->d_ops->data_dot_entry_p(hdr);
+ dep = bp->b_addr + offset;
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
- dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
xfs_dir2_data_log_entry(args, bp, dep);
blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
- blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
/*
* Create entry for ..
*/
- dep = dp->d_ops->data_dotdot_entry_p(hdr);
- dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+ dep = bp->b_addr + offset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
- dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ xfs_dir2_data_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(offset);
xfs_dir2_data_log_entry(args, bp, dep);
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
- blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
- offset = dp->d_ops->data_first_offset;
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(offset));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
+
/*
* Loop over existing entries, stuff them in.
*/
@@ -1183,6 +1174,7 @@ xfs_dir2_sf_to_block(
sfep = NULL;
else
sfep = xfs_dir2_sf_firstentry(sfp);
+
/*
* Need to preserve the existing offset values in the sf directory.
* Insert holes (unused entries) where necessary.
@@ -1199,40 +1191,39 @@ xfs_dir2_sf_to_block(
* There should be a hole here, make one.
*/
if (offset < newoffset) {
- dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+ dup = bp->b_addr + offset;
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
dup->length = cpu_to_be16(newoffset - offset);
- *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
- ((char *)dup - (char *)hdr));
+ *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(offset);
xfs_dir2_data_log_unused(args, bp, dup);
xfs_dir2_data_freeinsert(hdr,
- dp->d_ops->data_bestfree_p(hdr),
- dup, &dummy);
+ xfs_dir2_data_bestfree_p(mp, hdr),
+ dup, &dummy);
offset += be16_to_cpu(dup->length);
continue;
}
/*
* Copy a real entry.
*/
- dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
- dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+ dep = bp->b_addr + newoffset;
+ dep->inumber = cpu_to_be64(xfs_dir2_sf_get_ino(mp, sfp, sfep));
dep->namelen = sfep->namelen;
- dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+ xfs_dir2_data_put_ftype(mp, dep,
+ xfs_dir2_sf_get_ftype(mp, sfep));
memcpy(dep->name, sfep->name, dep->namelen);
- tagp = dp->d_ops->data_entry_tag_p(dep);
- *tagp = cpu_to_be16((char *)dep - (char *)hdr);
+ tagp = xfs_dir2_data_entry_tag_p(mp, dep);
+ *tagp = cpu_to_be16(newoffset);
xfs_dir2_data_log_entry(args, bp, dep);
name.name = sfep->name;
name.len = sfep->namelen;
- blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
- hashname(&name));
- blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
- (char *)dep - (char *)hdr));
+ blp[2 + i].hashval = cpu_to_be32(xfs_dir2_hashname(mp, &name));
+ blp[2 + i].address =
+ cpu_to_be32(xfs_dir2_byte_to_dataptr(newoffset));
offset = (int)((char *)(tagp + 1) - (char *)hdr);
if (++i == sfp->count)
sfep = NULL;
else
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/* Done with the temporary buffer */
kmem_free(sfp);
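
The *tagp = cpu_to_be16(offset) assignments above rely on an invariant that the verifier in xfs_dir2_data.c (next file) checks from the other side: the tag word at the end of every live entry records that entry's byte offset from the start of the block. A tiny sketch of the invariant, written as a hypothetical helper using only functions visible in this patch:

    /* Sketch: set an entry's tag and assert the round trip. */
    static void
    set_and_check_entry_tag(
            struct xfs_mount                *mp,
            struct xfs_dir2_data_hdr        *hdr,
            struct xfs_dir2_data_entry      *dep)
    {
            unsigned int    offset = (char *)dep - (char *)hdr;
            __be16          *tagp = xfs_dir2_data_entry_tag_p(mp, dep);

            *tagp = cpu_to_be16(offset);
            ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) == offset);
    }
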
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 2c79be4c3153..b9eba8213180 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -13,6 +13,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -23,6 +24,71 @@ static xfs_failaddr_t xfs_dir2_data_freefind_verify(
struct xfs_dir2_data_unused *dup,
struct xfs_dir2_data_free **bf_ent);
+struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+ return hdr->bestfree;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+__be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(mp, dep->namelen) - sizeof(__be16));
+}
+
+uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep)
+{
+ if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ uint8_t ftype = dep->name[dep->namelen];
+
+ if (likely(ftype < XFS_DIR3_FT_MAX))
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+void
+xfs_dir2_data_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ dep->name[dep->namelen] = ftype;
+}
+
+/*
+ * The number of leaf entries is limited by the size of the block and the amount
+ * of space used by the data entries. We don't know how much space is used by
+ * the data entries yet, so just ensure that the count falls somewhere inside
+ * the block right now.
+ */
+static inline unsigned int
+xfs_dir2_data_max_leaf_entries(
+ struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_block_tail) -
+ geo->data_entry_offset) /
+ sizeof(struct xfs_dir2_leaf_entry);
+}
+
/*
* Check the consistency of the data block.
* The input can also be a block-format directory.
@@ -38,40 +104,27 @@ __xfs_dir3_data_check(
xfs_dir2_block_tail_t *btp=NULL; /* block tail */
int count; /* count of entries found */
xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_free_t *dfp; /* bestfree entry */
- xfs_dir2_data_unused_t *dup; /* unused entry */
- char *endp; /* end of useful data */
int freeseen; /* mask of bestfrees seen */
xfs_dahash_t hash; /* hash of current name */
int i; /* leaf index */
int lastfree; /* last entry was unused */
xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
struct xfs_mount *mp = bp->b_mount;
- char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
- const struct xfs_dir_ops *ops;
- struct xfs_da_geometry *geo;
-
- geo = mp->m_dir_geo;
+ unsigned int offset;
+ unsigned int end;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
/*
- * We can be passed a null dp here from a verifier, so we need to go the
- * hard way to get them.
+ * If this isn't a directory, something is seriously wrong. Bail out.
*/
- ops = xfs_dir_get_ops(mp, dp);
-
- /*
- * If this isn't a directory, or we don't get handed the dir ops,
- * something is seriously wrong. Bail out.
- */
- if ((dp && !S_ISDIR(VFS_I(dp)->i_mode)) ||
- ops != xfs_dir_get_ops(mp, NULL))
+ if (dp && !S_ISDIR(VFS_I(dp)->i_mode))
return __this_address;
hdr = bp->b_addr;
- p = (char *)ops->data_entry_p(hdr);
+ offset = geo->data_entry_offset;
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
@@ -79,15 +132,8 @@ __xfs_dir3_data_check(
btp = xfs_dir2_block_tail_p(geo, hdr);
lep = xfs_dir2_block_leaf_p(btp);
- /*
- * The number of leaf entries is limited by the size of the
- * block and the amount of space used by the data entries.
- * We don't know how much space is used by the data entries yet,
- * so just ensure that the count falls somewhere inside the
- * block right now.
- */
if (be32_to_cpu(btp->count) >=
- ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry))
+ xfs_dir2_data_max_leaf_entries(geo))
return __this_address;
break;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -96,14 +142,14 @@ __xfs_dir3_data_check(
default:
return __this_address;
}
- endp = xfs_dir3_data_endp(geo, hdr);
- if (!endp)
+ end = xfs_dir3_data_end_offset(geo, hdr);
+ if (!end)
return __this_address;
/*
* Account for zero bestfree entries.
*/
- bf = ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
count = lastfree = freeseen = 0;
if (!bf[0].length) {
if (bf[0].offset)
@@ -128,8 +174,10 @@ __xfs_dir3_data_check(
/*
* Loop over the data/unused entries.
*/
- while (p < endp) {
- dup = (xfs_dir2_data_unused_t *)p;
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
/*
* If it's unused, look for the space in the bestfree table.
* If we find it, account for that, else make sure it
@@ -140,10 +188,10 @@ __xfs_dir3_data_check(
if (lastfree != 0)
return __this_address;
- if (endp < p + be16_to_cpu(dup->length))
+ if (offset + be16_to_cpu(dup->length) > end)
return __this_address;
if (be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) !=
- (char *)dup - (char *)hdr)
+ offset)
return __this_address;
fa = xfs_dir2_data_freefind_verify(hdr, bf, dup, &dfp);
if (fa)
@@ -158,7 +206,7 @@ __xfs_dir3_data_check(
be16_to_cpu(bf[2].length))
return __this_address;
}
- p += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
lastfree = 1;
continue;
}
@@ -168,17 +216,15 @@ __xfs_dir3_data_check(
* in the leaf section of the block.
* The linear search is crude but this is DEBUG code.
*/
- dep = (xfs_dir2_data_entry_t *)p;
if (dep->namelen == 0)
return __this_address;
if (xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)))
return __this_address;
- if (endp < p + ops->data_entsize(dep->namelen))
+ if (offset + xfs_dir2_data_entsize(mp, dep->namelen) > end)
return __this_address;
- if (be16_to_cpu(*ops->data_entry_tag_p(dep)) !=
- (char *)dep - (char *)hdr)
+ if (be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)) != offset)
return __this_address;
- if (ops->data_get_ftype(dep) >= XFS_DIR3_FT_MAX)
+ if (xfs_dir2_data_get_ftype(mp, dep) >= XFS_DIR3_FT_MAX)
return __this_address;
count++;
lastfree = 0;
@@ -189,7 +235,7 @@ __xfs_dir3_data_check(
((char *)dep - (char *)hdr));
name.name = dep->name;
name.len = dep->namelen;
- hash = mp->m_dirnameops->hashname(&name);
+ hash = xfs_dir2_hashname(mp, &name);
for (i = 0; i < be32_to_cpu(btp->count); i++) {
if (be32_to_cpu(lep[i].address) == addr &&
be32_to_cpu(lep[i].hashval) == hash)
@@ -198,7 +244,7 @@ __xfs_dir3_data_check(
if (i >= be32_to_cpu(btp->count))
return __this_address;
}
- p += ops->data_entsize(dep->namelen);
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
@@ -354,13 +400,13 @@ xfs_dir3_data_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mapped_bno,
+ unsigned int flags,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
- XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
+ err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_data_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
@@ -370,10 +416,10 @@ int
xfs_dir3_data_readahead(
struct xfs_inode *dp,
xfs_dablk_t bno,
- xfs_daddr_t mapped_bno)
+ unsigned int flags)
{
- return xfs_da_reada_buf(dp, bno, mapped_bno,
- XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
+ return xfs_da_reada_buf(dp, bno, flags, XFS_DATA_FORK,
+ &xfs_dir3_data_reada_buf_ops);
}
/*
@@ -561,17 +607,16 @@ xfs_dir2_data_freeremove(
* Given a data block, reconstruct its bestfree map.
*/
void
-xfs_dir2_data_freescan_int(
- struct xfs_da_geometry *geo,
- const struct xfs_dir_ops *ops,
- struct xfs_dir2_data_hdr *hdr,
- int *loghead)
+xfs_dir2_data_freescan(
+ struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
{
- xfs_dir2_data_entry_t *dep; /* active data entry */
- xfs_dir2_data_unused_t *dup; /* unused data entry */
- struct xfs_dir2_data_free *bf;
- char *endp; /* end of block's data */
- char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ struct xfs_dir2_data_free *bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ void *addr = hdr;
+ unsigned int offset = geo->data_entry_offset;
+ unsigned int end;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -581,79 +626,60 @@ xfs_dir2_data_freescan_int(
/*
* Start by clearing the table.
*/
- bf = ops->data_bestfree_p(hdr);
memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
- /*
- * Set up pointers.
- */
- p = (char *)ops->data_entry_p(hdr);
- endp = xfs_dir3_data_endp(geo, hdr);
- /*
- * Loop over the block's entries.
- */
- while (p < endp) {
- dup = (xfs_dir2_data_unused_t *)p;
+
+ end = xfs_dir3_data_end_offset(geo, addr);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = addr + offset;
+ struct xfs_dir2_data_entry *dep = addr + offset;
+
/*
* If it's a free entry, insert it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT((char *)dup - (char *)hdr ==
+ ASSERT(offset ==
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
- p += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
+ continue;
}
+
/*
* For active entries, check their tags and skip them.
*/
- else {
- dep = (xfs_dir2_data_entry_t *)p;
- ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*ops->data_entry_tag_p(dep)));
- p += ops->data_entsize(dep->namelen);
- }
+ ASSERT(offset ==
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(mp, dep)));
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
}
-void
-xfs_dir2_data_freescan(
- struct xfs_inode *dp,
- struct xfs_dir2_data_hdr *hdr,
- int *loghead)
-{
- return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops,
- hdr, loghead);
-}
-
/*
* Initialize a data block at the given block number in the directory.
* Give back the buffer for the created block.
*/
int /* error */
xfs_dir3_data_init(
- xfs_da_args_t *args, /* directory operation args */
- xfs_dir2_db_t blkno, /* logical dir block number */
- struct xfs_buf **bpp) /* output block buffer */
+ struct xfs_da_args *args, /* directory operation args */
+ xfs_dir2_db_t blkno, /* logical dir block number */
+ struct xfs_buf **bpp) /* output block buffer */
{
- struct xfs_buf *bp; /* block buffer */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* unused entry pointer */
- struct xfs_dir2_data_free *bf;
- int error; /* error return value */
- int i; /* bestfree index */
- xfs_mount_t *mp; /* filesystem mount point */
- xfs_trans_t *tp; /* transaction pointer */
- int t; /* temp */
-
- dp = args->dp;
- mp = dp->i_mount;
- tp = args->trans;
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
+ struct xfs_buf *bp;
+ struct xfs_dir2_data_hdr *hdr;
+ struct xfs_dir2_data_unused *dup;
+ struct xfs_dir2_data_free *bf;
+ int error;
+ int i;
+
/*
* Get the buffer set up for the block.
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -675,8 +701,9 @@ xfs_dir3_data_init(
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- bf = dp->d_ops->data_bestfree_p(hdr);
- bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
+ bf = xfs_dir2_data_bestfree_p(mp, hdr);
+ bf[0].offset = cpu_to_be16(geo->data_entry_offset);
+ bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset);
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
bf[i].length = 0;
bf[i].offset = 0;
@@ -685,13 +712,11 @@ xfs_dir3_data_init(
/*
* Set up an unused entry for the block's body.
*/
- dup = dp->d_ops->data_unused_p(hdr);
+ dup = bp->b_addr + geo->data_entry_offset;
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
-
- t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
- bf[0].length = cpu_to_be16(t);
- dup->length = cpu_to_be16(t);
+ dup->length = bf[0].length;
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+
/*
* Log it and return it.
*/
@@ -710,6 +735,7 @@ xfs_dir2_data_log_entry(
struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_data_hdr *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
@@ -718,7 +744,7 @@ xfs_dir2_data_log_entry(
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
- (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+ (uint)((char *)(xfs_dir2_data_entry_tag_p(mp, dep) + 1) -
(char *)hdr - 1));
}
@@ -739,8 +765,7 @@ xfs_dir2_data_log_header(
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
#endif
- xfs_trans_log_buf(args->trans, bp, 0,
- args->dp->d_ops->data_entry_offset - 1);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->data_entry_offset - 1);
}
/*
@@ -789,11 +814,11 @@ xfs_dir2_data_make_free(
{
xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
- char *endptr; /* end of data area */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
+ unsigned int end;
struct xfs_dir2_data_free *bf;
hdr = bp->b_addr;
@@ -801,14 +826,14 @@ xfs_dir2_data_make_free(
/*
* Figure out where the end of the data area is.
*/
- endptr = xfs_dir3_data_endp(args->geo, hdr);
- ASSERT(endptr != NULL);
+ end = xfs_dir3_data_end_offset(args->geo, hdr);
+ ASSERT(end != 0);
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > args->dp->d_ops->data_entry_offset) {
+ if (offset > args->geo->data_entry_offset) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -821,7 +846,7 @@ xfs_dir2_data_make_free(
* If this isn't the end of the block, see if the entry after
* us is free.
*/
- if ((char *)hdr + offset + len < endptr) {
+ if (offset + len < end) {
postdup =
(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
@@ -834,7 +859,7 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
- bf = args->dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
@@ -1025,7 +1050,7 @@ xfs_dir2_data_use_free(
* Look up the entry in the bestfree table.
*/
oldlen = be16_to_cpu(dup->length);
- bf = args->dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(args->dp->i_mount, hdr);
dfp = xfs_dir2_data_freefind(hdr, bf, dup);
ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
@@ -1149,19 +1174,22 @@ corrupt:
}
/* Find the end of the entry data in a data/block format dir block. */
-void *
-xfs_dir3_data_endp(
+unsigned int
+xfs_dir3_data_end_offset(
struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr)
{
+ void *p;
+
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- return xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr));
+ p = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(geo, hdr));
+ return p - (void *)hdr;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- return (char *)hdr + geo->blksize;
+ return geo->blksize;
default:
- return NULL;
+ return 0;
}
}
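
Both __xfs_dir3_data_check() and xfs_dir2_data_freescan() now walk the block by byte offset rather than char pointers. A condensed sketch of that walk pattern, as a hypothetical standalone helper built from the functions introduced above (xfs_dir3_data_end_offset, xfs_dir2_data_entsize):

    /* Sketch of the offset-based entry walk used by the scanners above. */
    static void
    walk_data_entries(
            struct xfs_mount                *mp,
            struct xfs_da_geometry          *geo,
            struct xfs_dir2_data_hdr        *hdr)
    {
            unsigned int    offset = geo->data_entry_offset;
            unsigned int    end = xfs_dir3_data_end_offset(geo, hdr);

            while (offset < end) {
                    struct xfs_dir2_data_unused     *dup = (void *)hdr + offset;
                    struct xfs_dir2_data_entry      *dep = (void *)hdr + offset;

                    if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
                            /* free space: advance by its recorded length */
                            offset += be16_to_cpu(dup->length);
                            continue;
                    }
                    /* live entry: advance by its on-disk size */
                    offset += xfs_dir2_data_entsize(mp, dep->namelen);
            }
    }
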
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a53e4585a2f3..a131b520aac7 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -24,12 +24,73 @@
* Local function declarations.
*/
static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
- int *indexp, struct xfs_buf **dbpp);
+ int *indexp, struct xfs_buf **dbpp,
+ struct xfs_dir3_icleaf_hdr *leafhdr);
static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
struct xfs_buf *bp, int first, int last);
static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
struct xfs_buf *bp);
+void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from;
+
+ to->forw = be32_to_cpu(from3->hdr.info.hdr.forw);
+ to->back = be32_to_cpu(from3->hdr.info.hdr.back);
+ to->magic = be16_to_cpu(from3->hdr.info.hdr.magic);
+ to->count = be16_to_cpu(from3->hdr.count);
+ to->stale = be16_to_cpu(from3->hdr.stale);
+ to->ents = from3->__ents;
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+ } else {
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+ to->ents = from->__ents;
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+ }
+}
+
+void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ to3->hdr.info.hdr.forw = cpu_to_be32(from->forw);
+ to3->hdr.info.hdr.back = cpu_to_be32(from->back);
+ to3->hdr.info.hdr.magic = cpu_to_be16(from->magic);
+ to3->hdr.count = cpu_to_be16(from->count);
+ to3->hdr.stale = cpu_to_be16(from->stale);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+ }
+}
+
/*
* Check the internal consistency of a leaf1 block.
* Pop an assert if something is wrong.
@@ -43,7 +104,7 @@ xfs_dir3_leaf1_check(
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -52,7 +113,7 @@ xfs_dir3_leaf1_check(
} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
}
static inline void
@@ -76,31 +137,15 @@ xfs_dir3_leaf_check(
xfs_failaddr_t
xfs_dir3_leaf_check_int(
- struct xfs_mount *mp,
- struct xfs_inode *dp,
- struct xfs_dir3_icleaf_hdr *hdr,
- struct xfs_dir2_leaf *leaf)
+ struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *hdr,
+ struct xfs_dir2_leaf *leaf)
{
- struct xfs_dir2_leaf_entry *ents;
- xfs_dir2_leaf_tail_t *ltp;
- int stale;
- int i;
- const struct xfs_dir_ops *ops;
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_da_geometry *geo = mp->m_dir_geo;
-
- /*
- * we can be passed a null dp here from a verifier, so we need to go the
- * hard way to get them.
- */
- ops = xfs_dir_get_ops(mp, dp);
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+ xfs_dir2_leaf_tail_t *ltp;
+ int stale;
+ int i;
- if (!hdr) {
- ops->leaf_hdr_from_disk(&leafhdr, leaf);
- hdr = &leafhdr;
- }
-
- ents = ops->leaf_ents_p(leaf);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
/*
@@ -108,23 +153,23 @@ xfs_dir3_leaf_check_int(
* Should factor in the size of the bests table as well.
* We can deduce a value for that from di_size.
*/
- if (hdr->count > ops->leaf_max_ents(geo))
+ if (hdr->count > geo->leaf_max_ents)
return __this_address;
/* Leaves and bests don't overlap in leaf format. */
if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC ||
hdr->magic == XFS_DIR3_LEAF1_MAGIC) &&
- (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
+ (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp))
return __this_address;
/* Check hash value order, count stale entries. */
for (i = stale = 0; i < hdr->count; i++) {
if (i + 1 < hdr->count) {
- if (be32_to_cpu(ents[i].hashval) >
- be32_to_cpu(ents[i + 1].hashval))
+ if (be32_to_cpu(hdr->ents[i].hashval) >
+ be32_to_cpu(hdr->ents[i + 1].hashval))
return __this_address;
}
- if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
if (hdr->stale != stale)
@@ -139,17 +184,18 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp)
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_mount;
- struct xfs_dir2_leaf *leaf = bp->b_addr;
- xfs_failaddr_t fa;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ xfs_failaddr_t fa;
fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
if (fa)
return fa;
- return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, bp->b_addr);
+ return xfs_dir3_leaf_check_int(mp, &leafhdr, bp->b_addr);
}
static void
@@ -216,13 +262,12 @@ xfs_dir3_leaf_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leaf1_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
@@ -233,13 +278,12 @@ xfs_dir3_leafn_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
struct xfs_buf **bpp)
{
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, 0, bpp, XFS_DATA_FORK,
+ &xfs_dir3_leafn_buf_ops);
if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
@@ -311,7 +355,7 @@ xfs_dir3_leaf_get_buf(
bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -346,7 +390,6 @@ xfs_dir2_block_to_leaf(
int needscan; /* need to rescan bestfree */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_block_to_leaf(args);
@@ -375,24 +418,24 @@ xfs_dir2_block_to_leaf(
xfs_dir3_data_check(dp, dbp);
btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
- bf = dp->d_ops->data_bestfree_p(hdr);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
/*
* Set the counts in the leaf header.
*/
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
leafhdr.count = be32_to_cpu(btp->count);
leafhdr.stale = be32_to_cpu(btp->stale);
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
- memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+ memcpy(leafhdr.ents, blp,
+ be32_to_cpu(btp->count) * sizeof(struct xfs_dir2_leaf_entry));
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
@@ -415,7 +458,7 @@ xfs_dir2_block_to_leaf(
hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
/*
* Set up leaf tail and bests table.
*/
@@ -594,7 +637,7 @@ xfs_dir2_leaf_addname(
trace_xfs_dir2_leaf_addname(args);
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
if (error)
return error;
@@ -607,10 +650,10 @@ xfs_dir2_leaf_addname(
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
bestsp = xfs_dir2_leaf_bests_p(ltp);
- length = dp->d_ops->data_entsize(args->namelen);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
/*
* See if there are any entries with the same hash value
@@ -773,7 +816,7 @@ xfs_dir2_leaf_addname(
else
xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
bestsp[use_block] = bf[0].length;
grown = 1;
} else {
@@ -783,13 +826,13 @@ xfs_dir2_leaf_addname(
*/
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, use_block),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
grown = 0;
}
/*
@@ -815,14 +858,14 @@ xfs_dir2_leaf_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
	 * Need to rescan and fix up the bestfree table.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
/*
* Need to log the data block's header.
*/
@@ -852,9 +895,9 @@ xfs_dir2_leaf_addname(
/*
* Log the leaf fields and give up the buffers.
*/
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
- xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
return 0;
@@ -874,7 +917,6 @@ xfs_dir3_leaf_compact(
xfs_dir2_leaf_t *leaf; /* leaf structure */
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_inode *dp = args->dp;
leaf = bp->b_addr;
@@ -884,9 +926,9 @@ xfs_dir3_leaf_compact(
/*
* Compress out the stale entries in place.
*/
- ents = dp->d_ops->leaf_ents_p(leaf);
for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
- if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr->ents[from].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
/*
* Only actually copy the entries that are different.
@@ -894,7 +936,7 @@ xfs_dir3_leaf_compact(
if (from > to) {
if (loglow == -1)
loglow = to;
- ents[to] = ents[from];
+ leafhdr->ents[to] = leafhdr->ents[from];
}
to++;
}
@@ -905,10 +947,10 @@ xfs_dir3_leaf_compact(
leafhdr->count -= leafhdr->stale;
leafhdr->stale = 0;
- dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, leafhdr);
xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, leafhdr, bp, loglow, to - 1);
}
/*
@@ -1037,6 +1079,7 @@ xfs_dir3_leaf_log_bests(
void
xfs_dir3_leaf_log_ents(
struct xfs_da_args *args,
+ struct xfs_dir3_icleaf_hdr *hdr,
struct xfs_buf *bp,
int first,
int last)
@@ -1044,16 +1087,14 @@ xfs_dir3_leaf_log_ents(
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
struct xfs_dir2_leaf *leaf = bp->b_addr;
- struct xfs_dir2_leaf_entry *ents;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- firstlep = &ents[first];
- lastlep = &ents[last];
+ firstlep = &hdr->ents[first];
+ lastlep = &hdr->ents[last];
xfs_trans_log_buf(args->trans, bp,
(uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
@@ -1076,7 +1117,7 @@ xfs_dir3_leaf_log_header(
xfs_trans_log_buf(args->trans, bp,
(uint)((char *)&leaf->hdr - (char *)leaf),
- args->dp->d_ops->leaf_hdr_size - 1);
+ args->geo->leaf_hdr_size - 1);
}
/*
@@ -1115,28 +1156,27 @@ xfs_dir2_leaf_lookup(
int error; /* error return code */
int index; /* found entry index */
struct xfs_buf *lbp; /* leaf buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_lookup(args);
/*
* Look up name in the leaf block, returning both buffers and index.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
tp = args->trans;
dp = args->dp;
xfs_dir3_leaf_check(dp, lbp);
- leaf = lbp->b_addr;
- ents = dp->d_ops->leaf_ents_p(leaf);
+
/*
* Get to the leaf entry and contained data entry address.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Point to the data entry.
@@ -1148,7 +1188,7 @@ xfs_dir2_leaf_lookup(
* Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
@@ -1166,7 +1206,8 @@ xfs_dir2_leaf_lookup_int(
xfs_da_args_t *args, /* operation arguments */
struct xfs_buf **lbpp, /* out: leaf buffer */
int *indexp, /* out: index in leaf block */
- struct xfs_buf **dbpp) /* out: data buffer */
+ struct xfs_buf **dbpp, /* out: data buffer */
+ struct xfs_dir3_icleaf_hdr *leafhdr)
{
xfs_dir2_db_t curdb = -1; /* current data block number */
struct xfs_buf *dbp = NULL; /* data buffer */
@@ -1182,22 +1223,19 @@ xfs_dir2_leaf_lookup_int(
xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t cidb = -1; /* case match data block no. */
enum xfs_dacmp cmp; /* name compare result */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, &lbp);
if (error)
return error;
*lbpp = lbp;
leaf = lbp->b_addr;
xfs_dir3_leaf_check(dp, lbp);
- ents = dp->d_ops->leaf_ents_p(leaf);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, leafhdr, leaf);
/*
* Look for the first leaf entry with our hash value.
@@ -1207,8 +1245,9 @@ xfs_dir2_leaf_lookup_int(
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &ents[index];
- index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
+ for (lep = &leafhdr->ents[index];
+ index < leafhdr->count &&
+ be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
* Skip over stale leaf entries.
@@ -1229,7 +1268,7 @@ xfs_dir2_leaf_lookup_int(
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, newdb),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1247,7 +1286,7 @@ xfs_dir2_leaf_lookup_int(
* and buffer. If it's the first case-insensitive match, store
* the index and buffer and continue looking for an exact match.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
*indexp = index;
@@ -1271,7 +1310,7 @@ xfs_dir2_leaf_lookup_int(
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, cidb),
- -1, &dbp);
+ 0, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1297,6 +1336,7 @@ int /* error */
xfs_dir2_leaf_removename(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_da_geometry *geo = args->geo;
__be16 *bestsp; /* leaf block best freespace */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
@@ -1314,7 +1354,6 @@ xfs_dir2_leaf_removename(
int needscan; /* need to rescan data frees */
xfs_dir2_data_off_t oldbest; /* old value of best free */
struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_removename(args);
@@ -1322,51 +1361,54 @@ xfs_dir2_leaf_removename(
/*
* Lookup the leaf entry, get the leaf and data blocks read in.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
dp = args->dp;
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
- bf = dp->d_ops->data_bestfree_p(hdr);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
+
/*
* Point to the leaf entry, use that to point to the data entry.
*/
- lep = &ents[index];
- db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ lep = &leafhdr.ents[index];
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
dep = (xfs_dir2_data_entry_t *)((char *)hdr +
- xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
oldbest = be16_to_cpu(bf[0].length);
- ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- if (be16_to_cpu(bestsp[db]) != oldbest)
+ if (be16_to_cpu(bestsp[db]) != oldbest) {
+ xfs_buf_corruption_error(lbp);
return -EFSCORRUPTED;
+ }
/*
* Mark the former data entry unused.
*/
xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* We just mark the leaf entry stale by putting a null in it.
*/
leafhdr.stale++;
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(args, lbp, index, index);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, lbp, index, index);
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/*
@@ -1382,8 +1424,8 @@ xfs_dir2_leaf_removename(
* If the data block is now empty then get rid of the data block.
*/
if (be16_to_cpu(bf[0].length) ==
- args->geo->blksize - dp->d_ops->data_entry_offset) {
- ASSERT(db != args->geo->datablk);
+ geo->blksize - geo->data_entry_offset) {
+ ASSERT(db != geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1425,7 +1467,7 @@ xfs_dir2_leaf_removename(
/*
* If the data block was not the first one, drop it.
*/
- else if (db != args->geo->datablk)
+ else if (db != geo->datablk)
dbp = NULL;
xfs_dir3_leaf_check(dp, lbp);
@@ -1448,26 +1490,24 @@ xfs_dir2_leaf_replace(
int error; /* error return code */
int index; /* index of leaf entry */
struct xfs_buf *lbp; /* leaf buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_replace(args);
/*
* Look up the entry.
*/
- if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+ error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp, &leafhdr);
+ if (error)
return error;
- }
+
dp = args->dp;
- leaf = lbp->b_addr;
- ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Point to the data entry.
*/
@@ -1479,7 +1519,7 @@ xfs_dir2_leaf_replace(
* Put the new inode number in, log it.
*/
dep->inumber = cpu_to_be64(args->inumber);
- dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
tp = args->trans;
xfs_dir2_data_log_entry(args, dbp, dep);
xfs_dir3_leaf_check(dp, lbp);
@@ -1501,21 +1541,17 @@ xfs_dir2_leaf_search_hash(
xfs_dahash_t hashwant; /* hash value looking for */
int high; /* high leaf index */
int low; /* low leaf index */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
int mid=0; /* current leaf index */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
- leaf = lbp->b_addr;
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(args->dp->i_mount, &leafhdr, lbp->b_addr);
/*
* Note, the table cannot be empty, so we have to go through the loop.
* Binary search the leaf entries looking for our hash value.
*/
- for (lep = ents, low = 0, high = leafhdr.count - 1,
+ for (lep = leafhdr.ents, low = 0, high = leafhdr.count - 1,
hashwant = args->hashval;
low <= high; ) {
mid = (low + high) >> 1;
@@ -1552,6 +1588,7 @@ xfs_dir2_leaf_trim_data(
struct xfs_buf *lbp, /* leaf buffer */
xfs_dir2_db_t db) /* data block number */
{
+ struct xfs_da_geometry *geo = args->geo;
__be16 *bestsp; /* leaf bests table */
struct xfs_buf *dbp; /* data block buffer */
xfs_inode_t *dp; /* incore directory inode */
@@ -1565,23 +1602,23 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
- -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(geo, db), 0, &dbp);
if (error)
return error;
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
#ifdef DEBUG
{
struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
- struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
+ struct xfs_dir2_data_free *bf =
+ xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
ASSERT(be16_to_cpu(bf[0].length) ==
- args->geo->blksize - dp->d_ops->data_entry_offset);
+ geo->blksize - geo->data_entry_offset);
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1639,7 +1676,6 @@ xfs_dir2_node_to_leaf(
int error; /* error return code */
struct xfs_buf *fbp; /* buffer for freespace block */
xfs_fileoff_t fo; /* freespace file offset */
- xfs_dir2_free_t *free; /* freespace structure */
struct xfs_buf *lbp; /* buffer for leaf block */
xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
xfs_dir2_leaf_t *leaf; /* leaf structure */
@@ -1697,7 +1733,7 @@ xfs_dir2_node_to_leaf(
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -1708,8 +1744,7 @@ xfs_dir2_node_to_leaf(
error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
if (error)
return error;
- free = fbp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, fbp->b_addr);
ASSERT(!freehdr.firstdb);
@@ -1743,10 +1778,10 @@ xfs_dir2_node_to_leaf(
/*
* Set up the leaf bests table.
*/
- memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
+ memcpy(xfs_dir2_leaf_bests_p(ltp), freehdr.bests,
freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(mp, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, lbp);
xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
xfs_dir3_leaf_log_tail(args, lbp);
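
The xfs_dir2_leaf.c hunks above all follow one pattern: the on-disk leaf header is decoded once into struct xfs_dir3_icleaf_hdr, which now also carries an ents pointer, so callers index leafhdr.ents[] directly instead of re-deriving the entry array through dp->d_ops->leaf_ents_p(). As a rough, standalone illustration of that decode-then-index shape (simplified native-endian types, not the kernel structs; the real hash fields are big-endian on disk), the lookup in xfs_dir2_leaf_search_hash() is essentially a binary search over that single decoded array:

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-ins for xfs_dir2_leaf_entry / xfs_dir3_icleaf_hdr. */
struct leaf_entry { uint32_t hashval; uint32_t address; };

struct icleaf_hdr {
	uint16_t count;
	uint16_t stale;
	struct leaf_entry *ents;	/* points into the leaf buffer, past the header */
};

/* Binary-search the decoded entry array for the wanted hash value. */
static int leaf_search_hash(const struct icleaf_hdr *hdr, uint32_t hashwant)
{
	int low = 0, high = hdr->count - 1, mid = 0;

	while (low <= high) {
		mid = (low + high) >> 1;
		if (hdr->ents[mid].hashval == hashwant)
			break;
		if (hdr->ents[mid].hashval < hashwant)
			low = mid + 1;
		else
			high = mid - 1;
	}
	return mid;
}

int main(void)
{
	struct leaf_entry e[] = { {10, 0}, {20, 1}, {20, 2}, {30, 3} };
	struct icleaf_hdr hdr = { .count = 4, .stale = 0, .ents = e };

	printf("index=%d\n", leaf_search_hash(&hdr, 20));	/* prints index=1 */
	return 0;
}

The real function additionally backs up to the first entry sharing the wanted hash value; the sketch only shows why one decoded ents pointer is enough for every later access.
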
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 705c4f562758..a0cc5e240306 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -34,6 +34,25 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
int *rval);
/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / geo->free_max_bests);
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % geo->free_max_bests;
+}
+
+/*
* Check internal consistency of a leafn block.
*/
#ifdef DEBUG
@@ -45,7 +64,7 @@ xfs_dir3_leafn_check(
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -54,7 +73,7 @@ xfs_dir3_leafn_check(
} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
return __this_address;
- return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, &leafhdr, leaf);
}
static inline void
@@ -160,10 +179,9 @@ xfs_dir3_free_header_check(
struct xfs_buf *bp)
{
struct xfs_mount *mp = dp->i_mount;
+ int maxbests = mp->m_dir_geo->free_max_bests;
unsigned int firstdb;
- int maxbests;
- maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
maxbests;
@@ -194,14 +212,14 @@ __xfs_dir3_free_read(
struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t fbno,
- xfs_daddr_t mappedbno,
+ unsigned int flags,
struct xfs_buf **bpp)
{
xfs_failaddr_t fa;
int err;
- err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
- XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+ err = xfs_da_read_buf(tp, dp, fbno, flags, bpp, XFS_DATA_FORK,
+ &xfs_dir3_free_buf_ops);
if (err || !*bpp)
return err;
@@ -220,6 +238,58 @@ __xfs_dir3_free_read(
return 0;
}
+void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from;
+
+ to->magic = be32_to_cpu(from3->hdr.hdr.magic);
+ to->firstdb = be32_to_cpu(from3->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from3->hdr.nvalid);
+ to->nused = be32_to_cpu(from3->hdr.nused);
+ to->bests = from3->bests;
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+ } else {
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ to->bests = from->bests;
+
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+ }
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_mount *mp,
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ to3->hdr.hdr.magic = cpu_to_be32(from->magic);
+ to3->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to3->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to3->hdr.nused = cpu_to_be32(from->nused);
+ } else {
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+ }
+}
+
int
xfs_dir2_free_read(
struct xfs_trans *tp,
@@ -227,7 +297,7 @@ xfs_dir2_free_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, 0, bpp);
}
static int
@@ -237,7 +307,7 @@ xfs_dir2_free_try_read(
xfs_dablk_t fbno,
struct xfs_buf **bpp)
{
- return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
+ return __xfs_dir3_free_read(tp, dp, fbno, XFS_DABUF_MAP_HOLE_OK, bpp);
}
static int
@@ -254,7 +324,7 @@ xfs_dir3_free_get_buf(
struct xfs_dir3_icfree_hdr hdr;
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
- -1, &bp, XFS_DATA_FORK);
+ &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -278,7 +348,7 @@ xfs_dir3_free_get_buf(
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
- dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
+ xfs_dir2_free_hdr_to_disk(mp, bp->b_addr, &hdr);
*bpp = bp;
return 0;
}
@@ -289,21 +359,19 @@ xfs_dir3_free_get_buf(
STATIC void
xfs_dir2_free_log_bests(
struct xfs_da_args *args,
+ struct xfs_dir3_icfree_hdr *hdr,
struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
{
- xfs_dir2_free_t *free; /* freespace structure */
- __be16 *bests;
+ struct xfs_dir2_free *free = bp->b_addr;
- free = bp->b_addr;
- bests = args->dp->d_ops->free_bests_p(free);
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
xfs_trans_log_buf(args->trans, bp,
- (uint)((char *)&bests[first] - (char *)free),
- (uint)((char *)&bests[last] - (char *)free +
- sizeof(bests[0]) - 1));
+ (char *)&hdr->bests[first] - (char *)free,
+ (char *)&hdr->bests[last] - (char *)free +
+ sizeof(hdr->bests[0]) - 1);
}
/*
@@ -322,7 +390,7 @@ xfs_dir2_free_log_header(
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
#endif
xfs_trans_log_buf(args->trans, bp, 0,
- args->dp->d_ops->free_hdr_size - 1);
+ args->geo->free_hdr_size - 1);
}
/*
@@ -339,14 +407,12 @@ xfs_dir2_leaf_to_node(
int error; /* error return value */
struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_db_t fdb; /* freespace block number */
- xfs_dir2_free_t *free; /* freespace structure */
__be16 *from; /* pointer to freespace entry */
int i; /* leaf freespace index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
int n; /* count of live freespc ents */
xfs_dir2_data_off_t off; /* freespace entry value */
- __be16 *to; /* pointer to freespace entry */
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir3_icfree_hdr freehdr;
@@ -368,24 +434,25 @@ xfs_dir2_leaf_to_node(
if (error)
return error;
- free = fbp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, fbp->b_addr);
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
if (be32_to_cpu(ltp->bestcount) >
- (uint)dp->i_d.di_size / args->geo->blksize)
+ (uint)dp->i_d.di_size / args->geo->blksize) {
+ xfs_buf_corruption_error(lbp);
return -EFSCORRUPTED;
+ }
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
from = xfs_dir2_leaf_bests_p(ltp);
- to = dp->d_ops->free_bests_p(free);
- for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
- if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
+ for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++) {
+ off = be16_to_cpu(*from);
+ if (off != NULLDATAOFF)
n++;
- *to = cpu_to_be16(off);
+ freehdr.bests[i] = cpu_to_be16(off);
}
/*
@@ -394,8 +461,8 @@ xfs_dir2_leaf_to_node(
freehdr.nused = n;
freehdr.nvalid = be32_to_cpu(ltp->bestcount);
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, 0, freehdr.nvalid - 1);
xfs_dir2_free_log_header(args, fbp);
/*
@@ -438,15 +505,17 @@ xfs_dir2_leafn_add(
trace_xfs_dir2_leafn_add(args, index);
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
/*
* Quick check just to make sure we are not going to index
 * into other people's memory
*/
- if (index < 0)
+ if (index < 0) {
+ xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
+ }
/*
* If there are already the maximum number of leaf entries in
@@ -455,7 +524,7 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
+ if (leafhdr.count == args->geo->leaf_max_ents) {
if (!leafhdr.stale)
return -ENOSPC;
compact = leafhdr.stale > 1;
@@ -493,9 +562,9 @@ xfs_dir2_leafn_add(
lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
args->blkno, args->index));
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, bp);
- xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, lfloglow, lfloghigh);
xfs_dir3_leaf_check(dp, bp);
return 0;
}
@@ -509,10 +578,9 @@ xfs_dir2_free_hdr_check(
{
struct xfs_dir3_icfree_hdr hdr;
- dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &hdr, bp->b_addr);
- ASSERT((hdr.firstdb %
- dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
+ ASSERT((hdr.firstdb % dp->i_mount->m_dir_geo->free_max_bests) == 0);
ASSERT(hdr.firstdb <= db);
ASSERT(db < hdr.firstdb + hdr.nvalid);
}
@@ -530,11 +598,9 @@ xfs_dir2_leaf_lasthash(
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
- struct xfs_dir2_leaf *leaf = bp->b_addr;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, bp->b_addr);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC ||
@@ -545,9 +611,7 @@ xfs_dir2_leaf_lasthash(
*count = leafhdr.count;
if (!leafhdr.count)
return 0;
-
- ents = dp->d_ops->leaf_ents_p(leaf);
- return be32_to_cpu(ents[leafhdr.count - 1].hashval);
+ return be32_to_cpu(leafhdr.ents[leafhdr.count - 1].hashval);
}
/*
@@ -576,15 +640,13 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_dir2_db_t newdb; /* new data block number */
xfs_dir2_db_t newfdb; /* new free block number */
xfs_trans_t *tp; /* transaction pointer */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
xfs_dir3_leaf_check(dp, bp);
ASSERT(leafhdr.count > 0);
@@ -604,11 +666,11 @@ xfs_dir2_leafn_lookup_for_addname(
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
- length = dp->d_ops->data_entsize(args->namelen);
+ length = xfs_dir2_data_entsize(mp, args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &ents[index];
+ for (lep = &leafhdr.ents[index];
index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
@@ -630,14 +692,14 @@ xfs_dir2_leafn_lookup_for_addname(
* in hand, take a look at it.
*/
if (newdb != curdb) {
- __be16 *bests;
+ struct xfs_dir3_icfree_hdr freehdr;
curdb = newdb;
/*
* Convert the data block to the free block
* holding its freespace information.
*/
- newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
+ newfdb = xfs_dir2_db_to_fdb(args->geo, newdb);
/*
* If it's not the one we have in hand, read it in.
*/
@@ -661,20 +723,20 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* Get the index for our entry.
*/
- fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
+ fi = xfs_dir2_db_to_fdindex(args->geo, curdb);
/*
* If it has room, return it.
*/
- bests = dp->d_ops->free_bests_p(free);
- if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
- XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
- XFS_ERRLEVEL_LOW, mp);
+ xfs_dir2_free_hdr_from_disk(mp, &freehdr, free);
+ if (XFS_IS_CORRUPT(mp,
+ freehdr.bests[fi] ==
+ cpu_to_be16(NULLDATAOFF))) {
if (curfdb != newfdb)
xfs_trans_brelse(tp, curbp);
return -EFSCORRUPTED;
}
curfdb = newfdb;
- if (be16_to_cpu(bests[fi]) >= length)
+ if (be16_to_cpu(freehdr.bests[fi]) >= length)
goto out;
}
}
@@ -728,19 +790,19 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
enum xfs_dacmp cmp; /* comparison result */
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(mp, &leafhdr, leaf);
xfs_dir3_leaf_check(dp, bp);
- if (leafhdr.count <= 0)
+ if (leafhdr.count <= 0) {
+ xfs_buf_corruption_error(bp);
return -EFSCORRUPTED;
+ }
/*
* Look up the hash value in the leaf entries.
@@ -756,7 +818,7 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &ents[index];
+ for (lep = &leafhdr.ents[index];
index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
lep++, index++) {
/*
@@ -795,7 +857,7 @@ xfs_dir2_leafn_lookup_for_entry(
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo,
newdb),
- -1, &curbp);
+ 0, &curbp);
if (error)
return error;
}
@@ -813,7 +875,7 @@ xfs_dir2_leafn_lookup_for_entry(
* EEXIST immediately. If it's the first case-insensitive
* match, store the block & inode number and continue looking.
*/
- cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ cmp = xfs_dir2_compname(args, dep->name, dep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
/* If there is a CI match block, drop it */
if (args->cmpresult != XFS_CMP_DIFFERENT &&
@@ -821,7 +883,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_trans_brelse(tp, state->extrablk.bp);
args->cmpresult = cmp;
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = dp->d_ops->data_get_ftype(dep);
+ args->filetype = xfs_dir2_data_get_ftype(mp, dep);
*indexp = index;
state->extravalid = 1;
state->extrablk.bp = curbp;
@@ -911,7 +973,7 @@ xfs_dir3_leafn_moveents(
if (start_d < dhdr->count) {
memmove(&dents[start_d + count], &dents[start_d],
(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d + count,
count + dhdr->count - 1);
}
/*
@@ -933,7 +995,7 @@ xfs_dir3_leafn_moveents(
*/
memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, dhdr, bp_d, start_d, start_d + count - 1);
/*
* If there are source entries after the ones we copied,
@@ -942,7 +1004,8 @@ xfs_dir3_leafn_moveents(
if (start_s + count < shdr->count) {
memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, shdr, bp_s, start_s,
+ start_s + count - 1);
}
/*
@@ -971,10 +1034,10 @@ xfs_dir2_leafn_order(
struct xfs_dir3_icleaf_hdr hdr1;
struct xfs_dir3_icleaf_hdr hdr2;
- dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = dp->d_ops->leaf_ents_p(leaf1);
- ents2 = dp->d_ops->leaf_ents_p(leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
if (hdr1.count > 0 && hdr2.count > 0 &&
(be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
@@ -1024,10 +1087,10 @@ xfs_dir2_leafn_rebalance(
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = dp->d_ops->leaf_ents_p(leaf1);
- ents2 = dp->d_ops->leaf_ents_p(leaf2);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr1, leaf1);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf2);
+ ents1 = hdr1.ents;
+ ents2 = hdr2.ents;
oldsum = hdr1.count + hdr2.count;
#if defined(DEBUG) || defined(XFS_WARN)
@@ -1073,8 +1136,8 @@ xfs_dir2_leafn_rebalance(
ASSERT(hdr1.stale + hdr2.stale == oldstale);
/* log the changes made when moving the entries */
- dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
- dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf1, &hdr1);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf2, &hdr2);
xfs_dir3_leaf_log_header(args, blk1->bp);
xfs_dir3_leaf_log_header(args, blk2->bp);
@@ -1120,19 +1183,17 @@ xfs_dir3_data_block_free(
int longest)
{
int logfree = 0;
- __be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
struct xfs_inode *dp = args->dp;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- bests = dp->d_ops->free_bests_p(free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
if (hdr) {
/*
* Data block is not empty, just set the free entry to the new
* value.
*/
- bests[findex] = cpu_to_be16(longest);
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ freehdr.bests[findex] = cpu_to_be16(longest);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
return 0;
}
@@ -1148,18 +1209,18 @@ xfs_dir3_data_block_free(
int i; /* free entry index */
for (i = findex - 1; i >= 0; i--) {
- if (bests[i] != cpu_to_be16(NULLDATAOFF))
+ if (freehdr.bests[i] != cpu_to_be16(NULLDATAOFF))
break;
}
freehdr.nvalid = i + 1;
logfree = 0;
} else {
/* Not the last entry, just punch it out. */
- bests[findex] = cpu_to_be16(NULLDATAOFF);
+ freehdr.bests[findex] = cpu_to_be16(NULLDATAOFF);
logfree = 1;
}
- dp->d_ops->free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_hdr_to_disk(dp->i_mount, free, &freehdr);
xfs_dir2_free_log_header(args, fbp);
/*
@@ -1184,7 +1245,7 @@ xfs_dir3_data_block_free(
/* Log the free entry that changed, unless we got rid of it. */
if (logfree)
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
return 0;
}
@@ -1201,6 +1262,7 @@ xfs_dir2_leafn_remove(
xfs_da_state_blk_t *dblk, /* data block */
int *rval) /* resulting block needs join */
{
+ struct xfs_da_geometry *geo = args->geo;
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_db_t db; /* data block number */
struct xfs_buf *dbp; /* data block buffer */
@@ -1215,27 +1277,25 @@ xfs_dir2_leafn_remove(
xfs_trans_t *tp; /* transaction pointer */
struct xfs_dir2_data_free *bf; /* bestfree table */
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
trace_xfs_dir2_leafn_remove(args, index);
dp = args->dp;
tp = args->trans;
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
/*
* Point to the entry we're removing.
*/
- lep = &ents[index];
+ lep = &leafhdr.ents[index];
/*
* Extract the data block and offset from the entry.
*/
- db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ db = xfs_dir2_dataptr_to_db(geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
+ off = xfs_dir2_dataptr_to_off(geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
/*
@@ -1243,11 +1303,11 @@ xfs_dir2_leafn_remove(
* Log the leaf block changes.
*/
leafhdr.stale++;
- dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, leaf, &leafhdr);
xfs_dir3_leaf_log_header(args, bp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(args, bp, index, index);
+ xfs_dir3_leaf_log_ents(args, &leafhdr, bp, index, index);
/*
* Make the data entry free. Keep track of the longest freespace
@@ -1256,17 +1316,18 @@ xfs_dir2_leafn_remove(
dbp = dblk->bp;
hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
xfs_dir2_data_make_free(args, dbp, off,
- dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_entsize(dp->i_mount, dep->namelen), &needlog,
+ &needscan);
/*
* Rescan the data block freespaces for bestfree.
* Log the data block header if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
xfs_dir3_data_check(dp, dbp);
@@ -1285,9 +1346,8 @@ xfs_dir2_leafn_remove(
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = dp->d_ops->db_to_fdb(args->geo, db);
- error = xfs_dir2_free_read(tp, dp,
- xfs_dir2_db_to_da(args->geo, fdb),
+ fdb = xfs_dir2_db_to_fdb(geo, db);
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(geo, fdb),
&fbp);
if (error)
return error;
@@ -1295,23 +1355,22 @@ xfs_dir2_leafn_remove(
#ifdef DEBUG
{
struct xfs_dir3_icfree_hdr freehdr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
- ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
- (fdb - xfs_dir2_byte_to_db(args->geo,
- XFS_DIR2_FREE_OFFSET)));
+
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
+ ASSERT(freehdr.firstdb == geo->free_max_bests *
+ (fdb - xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET)));
}
#endif
/*
* Calculate which entry we need to fix.
*/
- findex = dp->d_ops->db_to_fdindex(args->geo, db);
+ findex = xfs_dir2_db_to_fdindex(geo, db);
longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == args->geo->blksize -
- dp->d_ops->data_entry_offset) {
+ if (longest == geo->blksize - geo->data_entry_offset) {
/*
* Try to punch out the data block.
*/
@@ -1343,9 +1402,9 @@ xfs_dir2_leafn_remove(
* Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval = (dp->d_ops->leaf_hdr_size +
- (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
- args->geo->magicpct;
+ *rval = (geo->leaf_hdr_size +
+ (uint)sizeof(leafhdr.ents) * (leafhdr.count - leafhdr.stale)) <
+ geo->magicpct;
return 0;
}
@@ -1444,12 +1503,12 @@ xfs_dir2_leafn_toosmall(
*/
blk = &state->path.blk[state->path.active - 1];
leaf = blk->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &leafhdr, leaf);
+ ents = leafhdr.ents;
xfs_dir3_leaf_check(dp, blk->bp);
count = leafhdr.count - leafhdr.stale;
- bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+ bytes = state->args->geo->leaf_hdr_size + count * sizeof(ents[0]);
if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
@@ -1494,8 +1553,7 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- error = xfs_dir3_leafn_read(state->args->trans, dp,
- blkno, -1, &bp);
+ error = xfs_dir3_leafn_read(state->args->trans, dp, blkno, &bp);
if (error)
return error;
@@ -1507,8 +1565,8 @@ xfs_dir2_leafn_toosmall(
(state->args->geo->blksize >> 2);
leaf = bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
- ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &hdr2, leaf);
+ ents = hdr2.ents;
count += hdr2.count - hdr2.stale;
bytes -= count * sizeof(ents[0]);
@@ -1570,10 +1628,10 @@ xfs_dir2_leafn_unbalance(
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
- dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
- sents = dp->d_ops->leaf_ents_p(save_leaf);
- dents = dp->d_ops->leaf_ents_p(drop_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &savehdr, save_leaf);
+ xfs_dir2_leaf_hdr_from_disk(dp->i_mount, &drophdr, drop_leaf);
+ sents = savehdr.ents;
+ dents = drophdr.ents;
/*
* If there are any stale leaf entries, take this opportunity
@@ -1599,8 +1657,8 @@ xfs_dir2_leafn_unbalance(
save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
/* log the changes made when moving the entries */
- dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
- dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, save_leaf, &savehdr);
+ xfs_dir2_leaf_hdr_to_disk(dp->i_mount, drop_leaf, &drophdr);
xfs_dir3_leaf_log_header(args, save_blk->bp);
xfs_dir3_leaf_log_header(args, drop_blk->bp);
@@ -1619,19 +1677,16 @@ xfs_dir2_node_add_datablk(
xfs_dir2_db_t *dbno,
struct xfs_buf **dbpp,
struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
int *findex)
{
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
struct xfs_mount *mp = dp->i_mount;
- struct xfs_dir3_icfree_hdr freehdr;
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_data_hdr *hdr;
- struct xfs_dir2_free *free = NULL;
xfs_dir2_db_t fbno;
struct xfs_buf *fbp;
struct xfs_buf *dbp;
- __be16 *bests = NULL;
int error;
/* Not allowed to allocate, return failure. */
@@ -1650,7 +1705,7 @@ xfs_dir2_node_add_datablk(
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = dp->d_ops->db_to_fdb(args->geo, *dbno);
+ fbno = xfs_dir2_db_to_fdb(args->geo, *dbno);
error = xfs_dir2_free_try_read(tp, dp,
xfs_dir2_db_to_da(args->geo, fbno), &fbp);
if (error)
@@ -1665,11 +1720,13 @@ xfs_dir2_node_add_datablk(
if (error)
return error;
- if (dp->d_ops->db_to_fdb(args->geo, *dbno) != fbno) {
+ if (XFS_IS_CORRUPT(mp,
+ xfs_dir2_db_to_fdb(args->geo, *dbno) !=
+ fbno)) {
xfs_alert(mp,
"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld",
__func__, (unsigned long long)dp->i_ino,
- (long long)dp->d_ops->db_to_fdb(args->geo, *dbno),
+ (long long)xfs_dir2_db_to_fdb(args->geo, *dbno),
(long long)*dbno, (long long)fbno);
if (fblk) {
xfs_alert(mp,
@@ -1679,7 +1736,6 @@ xfs_dir2_node_add_datablk(
} else {
xfs_alert(mp, " ... fblk is NULL");
}
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -1687,44 +1743,39 @@ xfs_dir2_node_add_datablk(
error = xfs_dir3_free_get_buf(args, fbno, &fbp);
if (error)
return error;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
/* Remember the first slot as our empty slot. */
- freehdr.firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
+ hdr->firstdb = (fbno - xfs_dir2_byte_to_db(args->geo,
XFS_DIR2_FREE_OFFSET)) *
- dp->d_ops->free_max_bests(args->geo);
+ args->geo->free_max_bests;
} else {
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(mp, hdr, fbp->b_addr);
}
/* Set the freespace block index from the data block number. */
- *findex = dp->d_ops->db_to_fdindex(args->geo, *dbno);
+ *findex = xfs_dir2_db_to_fdindex(args->geo, *dbno);
/* Extend the freespace table if the new data block is off the end. */
- if (*findex >= freehdr.nvalid) {
- ASSERT(*findex < dp->d_ops->free_max_bests(args->geo));
- freehdr.nvalid = *findex + 1;
- bests[*findex] = cpu_to_be16(NULLDATAOFF);
+ if (*findex >= hdr->nvalid) {
+ ASSERT(*findex < args->geo->free_max_bests);
+ hdr->nvalid = *findex + 1;
+ hdr->bests[*findex] = cpu_to_be16(NULLDATAOFF);
}
/*
* If this entry was for an empty data block (this should always be
* true) then update the header.
*/
- if (bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
- freehdr.nused++;
- dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ if (hdr->bests[*findex] == cpu_to_be16(NULLDATAOFF)) {
+ hdr->nused++;
+ xfs_dir2_free_hdr_to_disk(mp, fbp->b_addr, hdr);
xfs_dir2_free_log_header(args, fbp);
}
/* Update the freespace value for the new block in the table. */
- hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
- bests[*findex] = bf[0].length;
+ bf = xfs_dir2_data_bestfree_p(mp, dbp->b_addr);
+ hdr->bests[*findex] = bf[0].length;
*dbpp = dbp;
*fbpp = fbp;
@@ -1737,11 +1788,10 @@ xfs_dir2_node_find_freeblk(
struct xfs_da_state_blk *fblk,
xfs_dir2_db_t *dbnop,
struct xfs_buf **fbpp,
+ struct xfs_dir3_icfree_hdr *hdr,
int *findexp,
int length)
{
- struct xfs_dir3_icfree_hdr freehdr;
- struct xfs_dir2_free *free = NULL;
struct xfs_inode *dp = args->dp;
struct xfs_trans *tp = args->trans;
struct xfs_buf *fbp = NULL;
@@ -1751,7 +1801,6 @@ xfs_dir2_node_find_freeblk(
xfs_dir2_db_t dbno = -1;
xfs_dir2_db_t fbno;
xfs_fileoff_t fo;
- __be16 *bests = NULL;
int findex = 0;
int error;
@@ -1762,17 +1811,14 @@ xfs_dir2_node_find_freeblk(
*/
if (fblk) {
fbp = fblk->bp;
- free = fbp->b_addr;
findex = fblk->index;
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
if (findex >= 0) {
/* caller already found the freespace for us. */
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
-
- ASSERT(findex < freehdr.nvalid);
- ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF);
- ASSERT(be16_to_cpu(bests[findex]) >= length);
- dbno = freehdr.firstdb + findex;
+ ASSERT(findex < hdr->nvalid);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF);
+ ASSERT(be16_to_cpu(hdr->bests[findex]) >= length);
+ dbno = hdr->firstdb + findex;
goto found_block;
}
@@ -1814,15 +1860,13 @@ xfs_dir2_node_find_freeblk(
if (!fbp)
continue;
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, hdr, fbp->b_addr);
/* Scan the free entry array for a large enough free space. */
- for (findex = freehdr.nvalid - 1; findex >= 0; findex--) {
- if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
- be16_to_cpu(bests[findex]) >= length) {
- dbno = freehdr.firstdb + findex;
+ for (findex = hdr->nvalid - 1; findex >= 0; findex--) {
+ if (be16_to_cpu(hdr->bests[findex]) != NULLDATAOFF &&
+ be16_to_cpu(hdr->bests[findex]) >= length) {
+ dbno = hdr->firstdb + findex;
goto found_block;
}
}
@@ -1838,7 +1882,6 @@ found_block:
return 0;
}
-
/*
* Add the data entry for a node-format directory name addition.
* The leaf entry is added in xfs_dir2_leafn_add.
@@ -1853,9 +1896,9 @@ xfs_dir2_node_addname_int(
struct xfs_dir2_data_entry *dep; /* data entry pointer */
struct xfs_dir2_data_hdr *hdr; /* data block header */
struct xfs_dir2_data_free *bf;
- struct xfs_dir2_free *free = NULL; /* freespace block structure */
struct xfs_trans *tp = args->trans;
struct xfs_inode *dp = args->dp;
+ struct xfs_dir3_icfree_hdr freehdr;
struct xfs_buf *dbp; /* data block buffer */
struct xfs_buf *fbp; /* freespace buffer */
xfs_dir2_data_aoff_t aoff;
@@ -1867,11 +1910,10 @@ xfs_dir2_node_addname_int(
int needlog = 0; /* need to log data header */
int needscan = 0; /* need to rescan data frees */
__be16 *tagp; /* data entry tag pointer */
- __be16 *bests;
- length = dp->d_ops->data_entsize(args->namelen);
- error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &findex,
- length);
+ length = xfs_dir2_data_entsize(dp->i_mount, args->namelen);
+ error = xfs_dir2_node_find_freeblk(args, fblk, &dbno, &fbp, &freehdr,
+ &findex, length);
if (error)
return error;
@@ -1893,19 +1935,19 @@ xfs_dir2_node_addname_int(
/* we're going to have to log the free block index later */
logfree = 1;
error = xfs_dir2_node_add_datablk(args, fblk, &dbno, &dbp, &fbp,
- &findex);
+ &freehdr, &findex);
} else {
/* Read the data block in. */
error = xfs_dir3_data_read(tp, dp,
xfs_dir2_db_to_da(args->geo, dbno),
- -1, &dbp);
+ 0, &dbp);
}
if (error)
return error;
/* setup for data block up now */
hdr = dbp->b_addr;
- bf = dp->d_ops->data_bestfree_p(hdr);
+ bf = xfs_dir2_data_bestfree_p(dp->i_mount, hdr);
ASSERT(be16_to_cpu(bf[0].length) >= length);
/* Point to the existing unused space. */
@@ -1926,28 +1968,26 @@ xfs_dir2_node_addname_int(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- dp->d_ops->data_put_ftype(dep, args->filetype);
- tagp = dp->d_ops->data_entry_tag_p(dep);
+ xfs_dir2_data_put_ftype(dp->i_mount, dep, args->filetype);
+ tagp = xfs_dir2_data_entry_tag_p(dp->i_mount, dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
xfs_dir2_data_log_entry(args, dbp, dep);
/* Rescan the freespace and log the data block if needed. */
if (needscan)
- xfs_dir2_data_freescan(dp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp->i_mount, hdr, &needlog);
if (needlog)
xfs_dir2_data_log_header(args, dbp);
/* If the freespace block entry is now wrong, update it. */
- free = fbp->b_addr;
- bests = dp->d_ops->free_bests_p(free);
- if (bests[findex] != bf[0].length) {
- bests[findex] = bf[0].length;
+ if (freehdr.bests[findex] != bf[0].length) {
+ freehdr.bests[findex] = bf[0].length;
logfree = 1;
}
/* Log the freespace entry if needed. */
if (logfree)
- xfs_dir2_free_log_bests(args, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, &freehdr, fbp, findex, findex);
/* Return the data block and offset in args. */
args->blkno = (xfs_dablk_t)dbno;
@@ -2155,8 +2195,6 @@ xfs_dir2_node_replace(
int i; /* btree level */
xfs_ino_t inum; /* new inode number */
int ftype; /* new file type */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
int rval; /* internal return value */
xfs_da_state_t *state; /* btree cursor */
@@ -2188,16 +2226,17 @@ xfs_dir2_node_replace(
* and locked it. But paranoia is good.
*/
if (rval == -EEXIST) {
- struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+
/*
* Find the leaf entry.
*/
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
- leaf = blk->bp->b_addr;
- ents = args->dp->d_ops->leaf_ents_p(leaf);
- lep = &ents[blk->index];
ASSERT(state->extravalid);
+
+ xfs_dir2_leaf_hdr_from_disk(state->mp, &leafhdr,
+ blk->bp->b_addr);
/*
* Point to the data entry.
*/
@@ -2207,13 +2246,13 @@ xfs_dir2_node_replace(
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
xfs_dir2_dataptr_to_off(args->geo,
- be32_to_cpu(lep->address)));
+ be32_to_cpu(leafhdr.ents[blk->index].address)));
ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- args->dp->d_ops->data_put_ftype(dep, ftype);
+ xfs_dir2_data_put_ftype(state->mp, dep, ftype);
xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
@@ -2270,7 +2309,7 @@ xfs_dir2_node_trim_free(
if (!bp)
return 0;
free = bp->b_addr;
- dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ xfs_dir2_free_hdr_from_disk(dp->i_mount, &freehdr, free);
/*
* If there are used entries, there's nothing to do.
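
The two helpers added at the top of xfs_dir2_node.c replace the per-format dp->d_ops->db_to_fdb()/db_to_fdindex() methods with plain arithmetic on geo->free_max_bests: each free block carries free_max_bests bestfree slots, so a data block number maps to free block first_free_db + db / free_max_bests at slot db % free_max_bests. A self-contained sketch of that mapping with made-up numbers (the real free_max_bests and first free block number depend on the directory block size and free-block header size):

#include <stdio.h>

#define FREE_MAX_BESTS	200	/* hypothetical geo->free_max_bests */
#define FIRST_FREE_DB	4096	/* hypothetical xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) */

static int db_to_fdb(int db)     { return FIRST_FREE_DB + db / FREE_MAX_BESTS; }
static int db_to_fdindex(int db) { return db % FREE_MAX_BESTS; }

int main(void)
{
	/* data block 437 -> free block 4096 + 2, slot 37 */
	printf("fdb=%d fdindex=%d\n", db_to_fdb(437), db_to_fdindex(437));
	return 0;
}

xfs_dir2_leafn_remove() and xfs_dir2_node_add_datablk() above use exactly this pair to locate (or extend to) the bestfree slot that tracks a data block's longest free space.
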
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 59f9fb2241a5..c031c53d0f0d 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -8,7 +8,41 @@
struct dir_context;
+/*
+ * In-core version of the leaf and free block headers to abstract the
+ * differences in the v2 and v3 disk format of the headers.
+ */
+struct xfs_dir3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ struct xfs_dir2_leaf_entry *ents;
+};
+
+struct xfs_dir3_icfree_hdr {
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
+
+ /*
+ * Pointer to the on-disk format entries, which are behind the
+ * variable size (v4 vs v5) header in the on-disk block.
+ */
+ __be16 *bests;
+};
+
/* xfs_dir2.c */
+xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
+ const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
@@ -26,6 +60,15 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
struct xfs_buf *lbp, struct xfs_buf *dbp);
/* xfs_dir2_data.c */
+struct xfs_dir2_data_free *xfs_dir2_data_bestfree_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_hdr *hdr);
+__be16 *xfs_dir2_data_entry_tag_p(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+uint8_t xfs_dir2_data_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep);
+void xfs_dir2_data_put_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_data_entry *dep, uint8_t ftype);
+
#ifdef DEBUG
extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
#else
@@ -34,10 +77,10 @@ extern void xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
extern xfs_failaddr_t __xfs_dir3_data_check(struct xfs_inode *dp,
struct xfs_buf *bp);
-extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
- xfs_daddr_t mapped_bno);
+int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp);
+int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ unsigned int flags);
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
@@ -47,10 +90,14 @@ extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
struct xfs_buf **bpp);
/* xfs_dir2_leaf.c */
-extern int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
-extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
+void xfs_dir2_leaf_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from);
+void xfs_dir2_leaf_hdr_to_disk(struct xfs_mount *mp, struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+int xfs_dir3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
+int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -62,7 +109,8 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
struct xfs_buf **bpp, uint16_t magic);
extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
- struct xfs_buf *bp, int first, int last);
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_buf *bp, int first,
+ int last);
extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
@@ -79,10 +127,11 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
extern xfs_failaddr_t xfs_dir3_leaf_check_int(struct xfs_mount *mp,
- struct xfs_inode *dp, struct xfs_dir3_icleaf_hdr *hdr,
- struct xfs_dir2_leaf *leaf);
+ struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
/* xfs_dir2_node.c */
+void xfs_dir2_free_hdr_from_disk(struct xfs_mount *mp,
+ struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from);
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
extern xfs_dahash_t xfs_dir2_leaf_lasthash(struct xfs_inode *dp,
@@ -108,6 +157,14 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
+xfs_ino_t xfs_dir2_sf_get_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *hdr);
+void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino);
+uint8_t xfs_dir2_sf_get_ftype(struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep);
+struct xfs_dir2_sf_entry *xfs_dir2_sf_nextentry(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep);
extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
@@ -123,4 +180,39 @@ extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
struct dir_context *ctx, size_t bufsize);
+static inline unsigned int
+xfs_dir2_data_entsize(
+ struct xfs_mount *mp,
+ unsigned int namelen)
+{
+ unsigned int len;
+
+ len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen +
+ sizeof(xfs_dir2_data_off_t) /* tag */;
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ len += sizeof(uint8_t);
+ return round_up(len, XFS_DIR2_DATA_ALIGN);
+}
+
+static inline xfs_dahash_t
+xfs_dir2_hashname(
+ struct xfs_mount *mp,
+ struct xfs_name *name)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb)))
+ return xfs_ascii_ci_hashname(name);
+ return xfs_da_hashname(name->name, name->len);
+}
+
+static inline enum xfs_dacmp
+xfs_dir2_compname(
+ struct xfs_da_args *args,
+ const unsigned char *name,
+ int len)
+{
+ if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb)))
+ return xfs_ascii_ci_compname(args, name, len);
+ return xfs_da_compname(args, name, len);
+}
+
#endif /* __XFS_DIR2_PRIV_H__ */
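
The new inline xfs_dir2_data_entsize() above gathers the whole per-entry size calculation in one place: 9 fixed bytes (8-byte inode number plus 1-byte name length), the name itself, a 2-byte tag, one extra byte when the filesystem stores file types, rounded up to the 8-byte XFS_DIR2_DATA_ALIGN. The same arithmetic, spelled out standalone with the constants as explicit assumptions rather than pulled from the kernel headers:

#include <stdio.h>

#define DATA_ALIGN	8	/* XFS_DIR2_DATA_ALIGN */
#define NAME_OFFSET	9	/* offsetof(struct xfs_dir2_data_entry, name[0]) */
#define TAG_SIZE	2	/* trailing xfs_dir2_data_off_t tag */

static unsigned int data_entsize(unsigned int namelen, int has_ftype)
{
	unsigned int len = NAME_OFFSET + namelen + TAG_SIZE + (has_ftype ? 1 : 0);

	return (len + DATA_ALIGN - 1) & ~(DATA_ALIGN - 1);	/* round_up to 8 */
}

int main(void)
{
	/* an 11-byte name with ftype enabled: 9 + 11 + 2 + 1 = 23 -> 24 bytes */
	printf("%u\n", data_entsize(11, 1));
	return 0;
}

The xfs_dir2_hashname() and xfs_dir2_compname() inlines in the same header are the matching replacements for the old mp->m_dirnameops indirection, selecting the ASCII case-insensitive variants only when the superblock's asciici feature bit is set.
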
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 85f14fc2a8da..8b94d33d232f 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -37,6 +37,126 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+static int
+xfs_dir2_sf_entsize(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = len;
+
+ count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ count += sizeof(uint8_t);
+ return count;
+}
+
+struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (void *)sfep + xfs_dir2_sf_entsize(mp, hdr, sfep->namelen);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. The actual inode numbers can
+ * come in two formats as well, either 4 bytes or 8 bytes wide.
+ */
+xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ uint8_t *from = sfep->name + sfep->namelen;
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ from++;
+
+ if (!hdr->i8count)
+ return get_unaligned_be32(from);
+ return get_unaligned_be64(from) & XFS_MAXINUMBER;
+}
+
+static void
+xfs_dir2_sf_put_ino(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ uint8_t *to = sfep->name + sfep->namelen;
+
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ to++;
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, to);
+ else
+ put_unaligned_be32(ino, to);
+}
+
+xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ if (!hdr->i8count)
+ return get_unaligned_be32(hdr->parent);
+ return get_unaligned_be64(hdr->parent) & XFS_MAXINUMBER;
+}
+
+void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ ASSERT(ino <= XFS_MAXINUMBER);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, hdr->parent);
+ else
+ put_unaligned_be32(ino, hdr->parent);
+}
+
+/*
+ * The file type field is stored at the end of the name for filetype enabled
+ * shortform directories, or not at all otherwise.
+ */
+uint8_t
+xfs_dir2_sf_get_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ if (xfs_sb_version_hasftype(&mp->m_sb)) {
+ uint8_t ftype = sfep->name[sfep->namelen];
+
+ if (ftype < XFS_DIR3_FT_MAX)
+ return ftype;
+ }
+
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sf_put_ftype(
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_entry *sfep,
+ uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ sfep->name[sfep->namelen] = ftype;
+}
+
/*
* Given a block directory (dp/block), calculate its size as a shortform (sf)
 * directory and a header for the sf directory, if it will fit in the
@@ -125,7 +245,7 @@ xfs_dir2_block_sfsize(
*/
sfhp->count = count;
sfhp->i8count = i8count;
- dp->d_ops->sf_put_parent_ino(sfhp, parent);
+ xfs_dir2_sf_put_parent_ino(sfhp, parent);
return size;
}
@@ -135,64 +255,48 @@ xfs_dir2_block_sfsize(
*/
int /* error */
xfs_dir2_block_to_sf(
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
struct xfs_buf *bp,
int size, /* shortform directory size */
- xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
+ struct xfs_dir2_sf_hdr *sfhp) /* shortform directory hdr */
{
- xfs_dir2_data_hdr_t *hdr; /* block header */
- xfs_dir2_data_entry_t *dep; /* data entry pointer */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* unused data pointer */
- char *endptr; /* end of data entries */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int error; /* error return value */
int logflags; /* inode logging flags */
- xfs_mount_t *mp; /* filesystem mount point */
- char *ptr; /* current data pointer */
- xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
- xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
+ struct xfs_dir2_sf_entry *sfep; /* shortform entry */
+ struct xfs_dir2_sf_hdr *sfp; /* shortform directory header */
+ unsigned int offset = args->geo->data_entry_offset;
+ unsigned int end;
trace_xfs_dir2_block_to_sf(args);
- dp = args->dp;
- mp = dp->i_mount;
-
- /*
- * allocate a temporary destination buffer the size of the inode
- * to format the data into. Once we have formatted the data, we
- * can free the block and copy the formatted data into the inode literal
- * area.
- */
- dst = kmem_alloc(mp->m_sb.sb_inodesize, 0);
- hdr = bp->b_addr;
-
/*
- * Copy the header into the newly allocate local space.
+ * Allocate a temporary destination buffer the size of the inode to
+ * format the data into. Once we have formatted the data, we can free
+ * the block and copy the formatted data into the inode literal area.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dst;
+ sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
/*
- * Set up to loop over the block's entries.
+ * Loop over the active and unused entries. Stop when we reach the
+ * leaf/tail portion of the block.
*/
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
- endptr = xfs_dir3_data_endp(args->geo, hdr);
+ end = xfs_dir3_data_end_offset(args->geo, bp->b_addr);
sfep = xfs_dir2_sf_firstentry(sfp);
- /*
- * Loop over the active and unused entries.
- * Stop when we reach the leaf/tail portion of the block.
- */
- while (ptr < endptr) {
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
+
/*
* If it's unused, just skip over it.
*/
- dup = (xfs_dir2_data_unused_t *)ptr;
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
+
/*
* Skip .
*/
@@ -204,24 +308,22 @@ xfs_dir2_block_to_sf(
else if (dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.')
ASSERT(be64_to_cpu(dep->inumber) ==
- dp->d_ops->sf_get_parent_ino(sfp));
+ xfs_dir2_sf_get_parent_ino(sfp));
/*
* Normal entry, copy it into shortform.
*/
else {
sfep->namelen = dep->namelen;
- xfs_dir2_sf_put_offset(sfep,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)hdr));
+ xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, dep->name, dep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
be64_to_cpu(dep->inumber));
- dp->d_ops->sf_put_ftype(sfep,
- dp->d_ops->data_get_ftype(dep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_data_get_ftype(mp, dep));
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
- ptr += dp->d_ops->data_entsize(dep->namelen);
+ offset += xfs_dir2_data_entsize(mp, dep->namelen);
}
ASSERT((char *)sfep - (char *)sfp == size);
@@ -240,7 +342,7 @@ xfs_dir2_block_to_sf(
* Convert the inode to local format and copy the data in.
*/
ASSERT(dp->i_df.if_bytes == 0);
- xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size);
+ xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size);
dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
dp->i_d.di_size = size;
@@ -248,7 +350,7 @@ xfs_dir2_block_to_sf(
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(dst);
+ kmem_free(sfp);
return error;
}
@@ -277,13 +379,7 @@ xfs_dir2_sf_addname(
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Make sure the shortform value has some of its header.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -291,7 +387,7 @@ xfs_dir2_sf_addname(
/*
* Compute entry (and change in) size.
*/
- incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
+ incr_isize = xfs_dir2_sf_entsize(dp->i_mount, sfp, args->namelen);
objchange = 0;
/*
@@ -364,18 +460,17 @@ xfs_dir2_sf_addname_easy(
xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
int new_isize) /* new directory size */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int byteoff; /* byte offset in sf dir */
- xfs_inode_t *dp; /* incore directory inode */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
+ xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
@@ -388,8 +483,8 @@ xfs_dir2_sf_addname_easy(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
/*
* Update the header and inode.
@@ -416,9 +511,10 @@ xfs_dir2_sf_addname_hard(
int objchange, /* changing inode number size */
int new_isize) /* new directory size */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int add_datasize; /* data size need for new ent */
char *buf; /* buffer for old */
- xfs_inode_t *dp; /* incore directory inode */
int eof; /* reached end of old dir */
int nbytes; /* temp for byte copies */
xfs_dir2_data_aoff_t new_offset; /* next offset value */
@@ -432,8 +528,6 @@ xfs_dir2_sf_addname_hard(
/*
* Copy the old directory to the stack buffer.
*/
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_d.di_size;
buf = kmem_alloc(old_isize, 0);
@@ -444,13 +538,13 @@ xfs_dir2_sf_addname_hard(
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = dp->d_ops->data_first_offset,
+ for (offset = args->geo->data_first_offset,
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
- add_datasize = dp->d_ops->data_entsize(args->namelen),
+ add_datasize = xfs_dir2_data_entsize(mp, args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
!eof;
- offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
+ offset = new_offset + xfs_dir2_data_entsize(mp, oldsfep->namelen),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep),
eof = (char *)oldsfep == &buf[old_isize]) {
new_offset = xfs_dir2_sf_get_offset(oldsfep);
if (offset + add_datasize <= new_offset)
@@ -479,8 +573,8 @@ xfs_dir2_sf_addname_hard(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep, args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
sfp->count++;
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
sfp->i8count++;
@@ -488,7 +582,7 @@ xfs_dir2_sf_addname_hard(
* If there's more left to copy, do that.
*/
if (!eof) {
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
kmem_free(buf);
@@ -510,7 +604,8 @@ xfs_dir2_sf_addname_pick(
xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int holefit; /* found hole it will fit in */
int i; /* entry number */
xfs_dir2_data_aoff_t offset; /* data block offset */
@@ -519,11 +614,9 @@ xfs_dir2_sf_addname_pick(
int size; /* entry's data size */
int used; /* data bytes used */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- size = dp->d_ops->data_entsize(args->namelen);
- offset = dp->d_ops->data_first_offset;
+ size = xfs_dir2_data_entsize(mp, args->namelen);
+ offset = args->geo->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -535,8 +628,8 @@ xfs_dir2_sf_addname_pick(
if (!holefit)
holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
offset = xfs_dir2_sf_get_offset(sfep) +
- dp->d_ops->data_entsize(sfep->namelen);
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
/*
* Calculate data bytes used excluding the new entry, if this
@@ -578,7 +671,8 @@ static void
xfs_dir2_sf_check(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry number */
int i8count; /* number of big inode#s */
xfs_ino_t ino; /* entry inode number */
@@ -586,23 +680,21 @@ xfs_dir2_sf_check(
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- dp = args->dp;
-
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = dp->d_ops->data_first_offset;
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ offset = args->geo->data_first_offset;
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
offset =
xfs_dir2_sf_get_offset(sfep) +
- dp->d_ops->data_entsize(sfep->namelen);
- ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
+ ASSERT(xfs_dir2_sf_get_ftype(mp, sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
@@ -622,22 +714,16 @@ xfs_dir2_sf_verify(
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
- const struct xfs_dir_ops *dops;
struct xfs_ifork *ifp;
xfs_ino_t ino;
int i;
int i8count;
int offset;
- int size;
+ int64_t size;
int error;
uint8_t filetype;
ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
- /*
- * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops,
- * so we can only trust the mountpoint to have the right pointer.
- */
- dops = xfs_dir_get_ops(mp, NULL);
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
@@ -653,12 +739,12 @@ xfs_dir2_sf_verify(
endp = (char *)sfp + size;
/* Check .. entry */
- ino = dops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
error = xfs_dir_ino_validate(mp, ino);
if (error)
return __this_address;
- offset = dops->data_first_offset;
+ offset = mp->m_dir_geo->data_first_offset;
/* Check all reported entries */
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -680,7 +766,7 @@ xfs_dir2_sf_verify(
* within the data buffer. The next entry starts after the
* name component, so nextentry is an acceptable test.
*/
- next_sfep = dops->sf_nextentry(sfp, sfep);
+ next_sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
if (endp < (char *)next_sfep)
return __this_address;
@@ -689,19 +775,19 @@ xfs_dir2_sf_verify(
return __this_address;
/* Check the inode number. */
- ino = dops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
error = xfs_dir_ino_validate(mp, ino);
if (error)
return __this_address;
/* Check the file type. */
- filetype = dops->sf_get_ftype(sfep);
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
if (filetype >= XFS_DIR3_FT_MAX)
return __this_address;
offset = xfs_dir2_sf_get_offset(sfep) +
- dops->data_entsize(sfep->namelen);
+ xfs_dir2_data_entsize(mp, sfep->namelen);
sfep = next_sfep;
}
@@ -763,7 +849,7 @@ xfs_dir2_sf_create(
/*
* Now can put in the inode number, since i8count is set.
*/
- dp->d_ops->sf_put_parent_ino(sfp, pino);
+ xfs_dir2_sf_put_parent_ino(sfp, pino);
sfp->count = 0;
dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
@@ -779,7 +865,8 @@ int /* error */
xfs_dir2_sf_lookup(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
@@ -790,16 +877,9 @@ xfs_dir2_sf_lookup(
trace_xfs_dir2_sf_lookup(args);
xfs_dir2_sf_check(args);
- dp = args->dp;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Bail out if the directory is way too short.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -818,7 +898,7 @@ xfs_dir2_sf_lookup(
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
+ args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
args->cmpresult = XFS_CMP_EXACT;
args->filetype = XFS_DIR3_FT_DIR;
return -EEXIST;
@@ -828,18 +908,17 @@ xfs_dir2_sf_lookup(
*/
ci_sfep = NULL;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
/*
* Compare name and if it's an exact match, return the inode
* number. If it's the first case-insensitive match, store the
* inode number and continue looking for an exact match.
*/
- cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
- sfep->namelen);
+ cmp = xfs_dir2_compname(args, sfep->name, sfep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
- args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
- args->filetype = dp->d_ops->sf_get_ftype(sfep);
+ args->inumber = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ args->filetype = xfs_dir2_sf_get_ftype(mp, sfep);
if (cmp == XFS_CMP_EXACT)
return -EEXIST;
ci_sfep = sfep;
@@ -864,8 +943,9 @@ int /* error */
xfs_dir2_sf_removename(
xfs_da_args_t *args)
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int byteoff; /* offset of removed entry */
- xfs_inode_t *dp; /* incore directory inode */
int entsize; /* this entry's size */
int i; /* shortform entry index */
int newsize; /* new inode size */
@@ -875,17 +955,9 @@ xfs_dir2_sf_removename(
trace_xfs_dir2_sf_removename(args);
- dp = args->dp;
-
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
oldsize = (int)dp->i_d.di_size;
- /*
- * Bail out if the directory is way too short.
- */
- if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -895,10 +967,10 @@ xfs_dir2_sf_removename(
* Find the one we're deleting.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
+ ASSERT(xfs_dir2_sf_get_ino(mp, sfp, sfep) ==
args->inumber);
break;
}
@@ -912,7 +984,7 @@ xfs_dir2_sf_removename(
* Calculate sizes.
*/
byteoff = (int)((char *)sfep - (char *)sfp);
- entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
+ entsize = xfs_dir2_sf_entsize(mp, sfp, args->namelen);
newsize = oldsize - entsize;
/*
* Copy the part if any after the removed entry, sliding it down.
@@ -945,13 +1017,35 @@ xfs_dir2_sf_removename(
}
/*
+ * Check whether the sf dir replace operation needs more blocks.
+ */
+bool
+xfs_dir2_sf_replace_needblock(
+ struct xfs_inode *dp,
+ xfs_ino_t inum)
+{
+ int newsize;
+ struct xfs_dir2_sf_hdr *sfp;
+
+ if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ return false;
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
+
+ return inum > XFS_DIR2_MAX_SHORT_INUM &&
+ sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+}
+
+/*
* Replace the inode number of an entry in a shortform directory.
*/
int /* error */
xfs_dir2_sf_replace(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
int i; /* entry index */
xfs_ino_t ino=0; /* entry old inode number */
int i8elevated; /* sf_toino8 set i8count=1 */
@@ -960,16 +1054,8 @@ xfs_dir2_sf_replace(
trace_xfs_dir2_sf_replace(args);
- dp = args->dp;
-
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
- /*
- * Bail out if the shortform directory is way too small.
- */
- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
- return -EIO;
- }
+ ASSERT(dp->i_d.di_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
@@ -980,17 +1066,14 @@ xfs_dir2_sf_replace(
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
int error; /* error return value */
- int newsize; /* new inode size */
- newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
/*
* Won't fit as shortform, convert to block then do replace.
*/
- if (newsize > XFS_IFORK_DSIZE(dp)) {
+ if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) {
error = xfs_dir2_sf_to_block(args);
- if (error) {
+ if (error)
return error;
- }
return xfs_dir2_block_replace(args);
}
/*
@@ -1008,22 +1091,23 @@ xfs_dir2_sf_replace(
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
- dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
+ xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
}
/*
* Normal entry, look for the name.
*/
else {
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
ASSERT(args->inumber != ino);
- dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
- dp->d_ops->sf_put_ftype(sfep, args->filetype);
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ args->inumber);
+ xfs_dir2_sf_put_ftype(mp, sfep, args->filetype);
break;
}
}
@@ -1076,8 +1160,9 @@ static void
xfs_dir2_sf_toino4(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
char *buf; /* old dir's buffer */
- xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
@@ -1088,8 +1173,6 @@ xfs_dir2_sf_toino4(
trace_xfs_dir2_sf_toino4(args);
- dp = args->dp;
-
/*
* Copy the old directory to the buffer.
* Then nuke it from the inode, and add the new buffer to the inode.
@@ -1116,21 +1199,22 @@ xfs_dir2_sf_toino4(
*/
sfp->count = oldsfp->count;
sfp->i8count = 0;
- dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
- dp->d_ops->sf_get_ino(oldsfp, oldsfep));
- dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
}
/*
* Clean up the inode.
@@ -1149,8 +1233,9 @@ static void
xfs_dir2_sf_toino8(
xfs_da_args_t *args) /* operation arguments */
{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
char *buf; /* old dir's buffer */
- xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
@@ -1161,8 +1246,6 @@ xfs_dir2_sf_toino8(
trace_xfs_dir2_sf_toino8(args);
- dp = args->dp;
-
/*
* Copy the old directory to the buffer.
* Then nuke it from the inode, and add the new buffer to the inode.
@@ -1189,21 +1272,22 @@ xfs_dir2_sf_toino8(
*/
sfp->count = oldsfp->count;
sfp->i8count = 1;
- dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
+ xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
- oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
+ i++, sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep),
+ oldsfep = xfs_dir2_sf_nextentry(mp, oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- dp->d_ops->sf_put_ino(sfp, sfep,
- dp->d_ops->sf_get_ino(oldsfp, oldsfep));
- dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
+ xfs_dir2_sf_put_ino(mp, sfp, sfep,
+ xfs_dir2_sf_get_ino(mp, oldsfp, oldsfep));
+ xfs_dir2_sf_put_ftype(mp, sfep,
+ xfs_dir2_sf_get_ftype(mp, oldsfep));
}
/*
* Clean up the inode.
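The xfs_dir2_sf.c hunks above replace the per-inode dp->d_ops->sf_* indirection with direct xfs_dir2_sf_* helpers that take the mount, but the underlying walk is unchanged: variable-size entries packed after a small header, with each step advancing by the previous entry's size. A minimal standalone sketch of that packed-entry walk, using simplified stand-in types rather than the real xfs_dir2_sf_* structures:

/*
 * Standalone sketch (not kernel code): models the shortform-directory walk
 * used above, where variable-size entries are packed after a small header
 * and each iteration advances by that entry's size.  Names and layout here
 * are simplified illustrations, not the real xfs_dir2_sf_* structures.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sf_hdr {                 /* hypothetical header: entry count only */
        uint8_t count;
        char    data[64];       /* packed entries live here */
};

struct sf_entry {               /* hypothetical entry: namelen + name bytes */
        uint8_t namelen;
        char    name[];
};

static struct sf_entry *sf_first(struct sf_hdr *hdr)
{
        return (struct sf_entry *)hdr->data;
}

static struct sf_entry *sf_next(struct sf_entry *ep)
{
        /* the next entry starts right after this entry's name bytes */
        return (struct sf_entry *)((char *)ep + 1 + ep->namelen);
}

int main(void)
{
        struct sf_hdr hdr = { .count = 2 };
        char *p = hdr.data;
        struct sf_entry *ep;
        int i;

        /* pack two entries: "a" and "bc" */
        *p++ = 1; memcpy(p, "a", 1);  p += 1;
        *p++ = 2; memcpy(p, "bc", 2); p += 2;

        /* walk them the same way the loops above do */
        for (i = 0, ep = sf_first(&hdr); i < hdr.count; i++, ep = sf_next(ep))
                printf("entry %d: %.*s\n", i, ep->namelen, ep->name);
        return 0;
}

Compiled on its own, this prints the two packed names in order, mirroring the loops above that pair xfs_dir2_sf_firstentry() with xfs_dir2_sf_nextentry().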
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index e8bd688a4073..bedc1e752b60 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,10 +35,10 @@ xfs_calc_dquots_per_chunk(
xfs_failaddr_t
xfs_dquot_verify(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq,
+ xfs_dqid_t id,
+ uint type) /* used only during quotacheck */
{
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c968b60cee15..1b7dcbae051c 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -920,13 +920,13 @@ static inline uint xfs_dinode_size(int version)
* This enum is used in string mapping in xfs_trace.h; please keep the
* TRACE_DEFINE_ENUMs for it up to date.
*/
-typedef enum xfs_dinode_fmt {
+enum xfs_dinode_fmt {
XFS_DINODE_FMT_DEV, /* xfs_dev_t */
XFS_DINODE_FMT_LOCAL, /* bulk data */
XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
XFS_DINODE_FMT_UUID /* added long ago, but never used */
-} xfs_dinode_fmt_t;
+};
#define XFS_INODE_FORMAT_STR \
{ XFS_DINODE_FMT_DEV, "dev" }, \
@@ -1144,11 +1144,11 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
/*
* This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
+ * information for a user. This is the q_core of the struct xfs_dquot that
* is kept in kernel memory. We pad this with some more expansion room
* to construct the on disk structure.
*/
-typedef struct xfs_disk_dquot {
+struct xfs_disk_dquot {
__be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
__u8 d_version; /* dquot version */
__u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
@@ -1171,15 +1171,15 @@ typedef struct xfs_disk_dquot {
__be32 d_rtbtimer; /* similar to above; for RT disk blocks */
__be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
__be16 d_pad;
-} xfs_disk_dquot_t;
+};
/*
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
typedef struct xfs_dqblk {
- xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
- char dd_fill[4]; /* filling for posterity */
+ struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
+ char dd_fill[4];/* filling for posterity */
/*
* These two are only present on filesystems with the CRC bits set.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index e9371a8e0e26..ef95ca07d084 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -324,7 +324,7 @@ typedef struct xfs_growfs_rt {
* Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
*/
typedef struct xfs_bstime {
- time_t tv_sec; /* seconds */
+ __kernel_long_t tv_sec; /* seconds */
__s32 tv_nsec; /* and nanoseconds */
} xfs_bstime_t;
@@ -416,7 +416,7 @@ struct xfs_bulkstat {
/*
* Project quota id helpers (previously projid was 16bit only
- * and using two 16bit values to hold new 32bit projid was choosen
+ * and using two 16bit values to hold new 32bit projid was chosen
* to retain compatibility with "old" filesystems).
*/
static inline uint32_t
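The helper declared here combines the two on-disk 16-bit halves into the single 32-bit project id now kept incore; the xfs_inode_buf.c hunks further down show the same shift-and-mask arithmetic in both directions. A standalone sketch of that pack/unpack step (stand-in function names, not the kernel helpers):

/*
 * Standalone sketch: the split/combine arithmetic behind keeping a 32-bit
 * project id incore while the on-disk inode still stores two 16-bit halves,
 * matching the di_projid_hi/di_projid_lo conversion in xfs_inode_buf.c below.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t projid_combine(uint16_t hi, uint16_t lo)
{
        return (uint32_t)hi << 16 | lo;         /* incore: single 32-bit id */
}

static void projid_split(uint32_t projid, uint16_t *hi, uint16_t *lo)
{
        *lo = projid & 0xffff;                  /* on-disk low half */
        *hi = projid >> 16;                     /* on-disk high half */
}

int main(void)
{
        uint16_t hi, lo;

        projid_split(0x00123456, &hi, &lo);
        assert(hi == 0x0012 && lo == 0x3456);
        assert(projid_combine(hi, lo) == 0x00123456);
        return 0;
}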
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 588d44613094..988cde7744e6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -544,7 +544,10 @@ xfs_inobt_insert_sprec(
nrec->ir_free, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
goto out;
}
@@ -557,17 +560,23 @@ xfs_inobt_insert_sprec(
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
- XFS_WANT_CORRUPTED_GOTO(mp,
- rec.ir_startino == nrec->ir_startino,
- error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
/*
* This should never fail. If we have coexisting records that
* cannot merge, something is seriously wrong.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
- error);
+ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
rec.ir_holemask, nrec->ir_startino,
@@ -1057,7 +1066,8 @@ xfs_ialloc_next_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
@@ -1081,7 +1091,8 @@ xfs_ialloc_get_rec(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
}
return 0;
@@ -1161,12 +1172,18 @@ xfs_dialloc_ag_inobt(
error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
+ if (XFS_IS_CORRUPT(mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rec.ir_freecount > 0) {
/*
@@ -1321,19 +1338,28 @@ xfs_dialloc_ag_inobt(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
for (;;) {
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
if (rec.ir_freecount > 0)
break;
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto error0;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
}
alloc_inode:
@@ -1393,7 +1419,8 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(lcur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
/*
* See if we've landed in the parent inode record. The finobt
@@ -1416,10 +1443,16 @@ xfs_dialloc_ag_finobt_near(
error = xfs_inobt_get_rec(rcur, &rrec, &j);
if (error)
goto error_rcur;
- XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
}
- XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
+ if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+ error = -EFSCORRUPTED;
+ goto error_rcur;
+ }
if (i == 1 && j == 1) {
/*
* Both the left and right records are valid. Choose the closer
@@ -1472,7 +1505,8 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
return 0;
}
}
@@ -1483,12 +1517,14 @@ xfs_dialloc_ag_finobt_newino(
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
return 0;
}
@@ -1510,20 +1546,24 @@ xfs_dialloc_ag_update_inobt(
error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+ return -EFSCORRUPTED;
ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
- (rec.ir_freecount == frec->ir_freecount));
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ rec.ir_free != frec->ir_free ||
+ rec.ir_freecount != frec->ir_freecount))
+ return -EFSCORRUPTED;
return xfs_inobt_update(cur, &rec);
}
@@ -1933,14 +1973,20 @@ xfs_difree_inobt(
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error) {
xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
__func__, error);
goto error0;
}
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error0;
+ }
/*
* Get the offset in the inode chunk.
*/
@@ -2052,7 +2098,10 @@ xfs_difree_finobt(
* freed an inode in a previously fully allocated chunk. If not,
* something is out of sync.
*/
- XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
+ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
ibtrec->ir_count,
@@ -2075,14 +2124,20 @@ xfs_difree_finobt(
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
rec.ir_free |= XFS_INOBT_MASK(offset);
rec.ir_freecount++;
- XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
- (rec.ir_freecount == ibtrec->ir_freecount),
- error);
+ if (XFS_IS_CORRUPT(mp,
+ rec.ir_free != ibtrec->ir_free ||
+ rec.ir_freecount != ibtrec->ir_freecount)) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
/*
* The content of inobt records should always match between the inobt
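The recurring change through xfs_ialloc.c (and the refcount and rmap files below) swaps the old XFS_WANT_CORRUPTED_GOTO()/_RETURN() macros, which hid both the -EFSCORRUPTED assignment and the jump, for an explicit if (XFS_IS_CORRUPT(mp, bad)) { error = -EFSCORRUPTED; goto label; } at each call site. A standalone sketch of the converted shape, with IS_CORRUPT() and EFSCORRUPTED_STUB as stand-ins for illustration only:

/*
 * Standalone sketch of the error-handling shape this series converts to.
 * IS_CORRUPT() and EFSCORRUPTED_STUB are stand-ins; the kernel's
 * XFS_IS_CORRUPT() also reports the mount and location of the failed check.
 */
#include <stdio.h>

#define EFSCORRUPTED_STUB       117     /* stand-in errno value */

/* stand-in predicate: report the failed check and return its truth value */
#define IS_CORRUPT(cond) \
        ((cond) ? (fprintf(stderr, "corruption: %s\n", #cond), 1) : 0)

static int lookup_record(int key, int *found)
{
        *found = (key == 42);   /* pretend only key 42 exists */
        return 0;               /* no I/O error */
}

static int use_record(int key)
{
        int error, found;

        error = lookup_record(key, &found);
        if (error)
                goto out;
        if (IS_CORRUPT(found != 1)) {   /* converted form of the old macro */
                error = -EFSCORRUPTED_STUB;
                goto out;
        }
        printf("record %d ok\n", key);
out:
        return error;
}

int main(void)
{
        use_record(42);         /* succeeds */
        use_record(7);          /* trips the corruption check */
        return 0;
}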
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 7bc87408f1a0..52451809c478 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -596,7 +596,7 @@ xfs_iext_realloc_root(
struct xfs_ifork *ifp,
struct xfs_iext_cursor *cur)
{
- size_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
+ int64_t new_size = ifp->if_bytes + sizeof(struct xfs_iext_rec);
void *new;
/* account for the prev/next pointers */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 28ab3c5255e1..8afacfe4be0a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -213,13 +213,12 @@ xfs_inode_from_disk(
to->di_version = from->di_version;
if (to->di_version == 1) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
- to->di_projid_lo = 0;
- to->di_projid_hi = 0;
+ to->di_projid = 0;
to->di_version = 2;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ be16_to_cpu(from->di_projid_lo);
}
to->di_format = from->di_format;
@@ -256,8 +255,8 @@ xfs_inode_from_disk(
if (to->di_version == 3) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
- to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
+ to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec);
+ to->di_crtime.tv_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
}
@@ -279,8 +278,8 @@ xfs_inode_to_disk(
to->di_format = from->di_format;
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
@@ -306,8 +305,8 @@ xfs_inode_to_disk(
if (from->di_version == 3) {
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
- to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
- to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec);
+ to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec);
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
@@ -632,8 +631,6 @@ xfs_iread(
if ((iget_flags & XFS_IGET_CREATE) &&
xfs_sb_version_hascrc(&mp->m_sb) &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
- /* initialise the on-disk inode core */
- memset(&ip->i_d, 0, sizeof(ip->i_d));
VFS_I(ip)->i_generation = prandom_u32();
ip->i_d.di_version = 3;
return 0;
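The di_crtime hunks above stop carrying the created time as the raw on-disk pair and instead widen it into a struct timespec64 incore, converting the big-endian 32-bit seconds and nanoseconds fields on the way in and out. A standalone sketch of that widening step (stand-in types; be32_to_cpu_stub() is a hypothetical helper standing in for the kernel's be32_to_cpu()):

/*
 * Standalone sketch (stand-in types, not the kernel's): copy a big-endian
 * 32-bit sec/nsec pair, as stored on disk, into a 64-bit-seconds incore
 * timestamp, mirroring the di_crtime conversion above.
 */
#include <stdint.h>
#include <stdio.h>

struct disk_ts {                /* stand-in for the on-disk timestamp */
        uint32_t t_sec;         /* stored big-endian on disk */
        uint32_t t_nsec;
};

struct core_ts {                /* stand-in for struct timespec64 */
        int64_t tv_sec;
        long    tv_nsec;
};

static uint32_t be32_to_cpu_stub(uint32_t v)
{
        const uint8_t *b = (const uint8_t *)&v;

        return (uint32_t)b[0] << 24 | (uint32_t)b[1] << 16 |
               (uint32_t)b[2] << 8  | b[3];
}

int main(void)
{
        struct disk_ts  d = { 0 };
        struct core_ts  c;
        uint8_t        *raw = (uint8_t *)&d.t_sec;

        raw[0] = 0x5e; raw[1] = 0x0b; raw[2] = 0xe1; raw[3] = 0x00; /* BE bytes */

        /* widen from the 32-bit on-disk field into the 64-bit incore one */
        c.tv_sec  = be32_to_cpu_stub(d.t_sec);
        c.tv_nsec = be32_to_cpu_stub(d.t_nsec);
        printf("crtime: %lld.%09ld\n", (long long)c.tv_sec, c.tv_nsec);
        return 0;
}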
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index ab0f84165317..fd94b1078722 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -21,8 +21,7 @@ struct xfs_icdinode {
uint16_t di_flushiter; /* incremented on flush */
uint32_t di_uid; /* owner's user id */
uint32_t di_gid; /* owner's group id */
- uint16_t di_projid_lo; /* lower part of owner's project id */
- uint16_t di_projid_hi; /* higher part of owner's project id */
+ uint32_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
@@ -37,7 +36,7 @@ struct xfs_icdinode {
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
- xfs_ictimestamp_t di_crtime; /* time created */
+ struct timespec64 di_crtime; /* time created */
};
/*
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index c643beeb5a24..ad2b9c313fd2 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -75,11 +75,15 @@ xfs_iformat_fork(
error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
break;
default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
return -EFSCORRUPTED;
}
break;
default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
return -EFSCORRUPTED;
}
if (error)
@@ -110,14 +114,16 @@ xfs_iformat_fork(
error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
break;
default:
+ xfs_inode_verifier_error(ip, error, __func__, dip,
+ sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
break;
}
if (error) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
if (ip->i_cowfp)
- kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
ip->i_cowfp = NULL;
xfs_idestroy_fork(ip, XFS_DATA_FORK);
}
@@ -129,7 +135,7 @@ xfs_init_local_fork(
struct xfs_inode *ip,
int whichfork,
const void *data,
- int size)
+ int64_t size)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
int mem_size = size, real_size = 0;
@@ -467,11 +473,11 @@ xfs_iroot_realloc(
void
xfs_idata_realloc(
struct xfs_inode *ip,
- int byte_diff,
+ int64_t byte_diff,
int whichfork)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int new_size = (int)ifp->if_bytes + byte_diff;
+ int64_t new_size = ifp->if_bytes + byte_diff;
ASSERT(new_size >= 0);
ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork));
@@ -525,10 +531,10 @@ xfs_idestroy_fork(
}
if (whichfork == XFS_ATTR_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_afp);
ip->i_afp = NULL;
} else if (whichfork == XFS_COW_FORK) {
- kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+ kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
ip->i_cowfp = NULL;
}
}
@@ -552,7 +558,7 @@ xfs_iextents_copy(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec rec;
- int copied = 0;
+ int64_t copied = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
ASSERT(ifp->if_bytes > 0);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 00c62ce170d0..500333d0101e 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -13,16 +13,16 @@ struct xfs_dinode;
* File incore extent information, present for each of data & attr forks.
*/
struct xfs_ifork {
- int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* fork mod counter */
+ int64_t if_bytes; /* bytes in if_u1 */
struct xfs_btree_block *if_broot; /* file's incore btree root */
- short if_broot_bytes; /* bytes allocated for root */
- unsigned char if_flags; /* per-fork flags */
+ unsigned int if_seq; /* fork mod counter */
int if_height; /* height of the extent tree */
union {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ short if_broot_bytes; /* bytes allocated for root */
+ unsigned char if_flags; /* per-fork flags */
};
/*
@@ -87,18 +87,24 @@ struct xfs_ifork {
#define XFS_IFORK_MAXEXT(ip, w) \
(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
+#define xfs_ifork_has_extents(ip, w) \
+ (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \
+ XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE)
+
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_inode *, int);
-void xfs_idata_realloc(struct xfs_inode *, int, int);
+void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+ int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
int);
-void xfs_init_local_fork(struct xfs_inode *, int, const void *, int);
+void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
+ const void *data, int64_t size);
xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
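Besides widening if_bytes to int64_t, the struct xfs_ifork hunk regroups members so the wider fields sit together; presumably this avoids padding holes once the 64-bit counter is introduced (an assumption, not stated in the patch). A standalone illustration of how member ordering alone changes a structure's size:

/*
 * Standalone illustration (not the xfs_ifork layout itself): placing the
 * widest members first avoids padding holes.  On a typical LP64 build the
 * first layout pads out to 40 bytes while the second packs into 24.
 */
#include <stdint.h>
#include <stdio.h>

struct mixed {                  /* wide and narrow members interleaved */
        int32_t  a;
        int64_t  b;
        int16_t  c;
        void    *d;
        uint8_t  e;
};

struct grouped {                /* widest members first, narrowest last */
        int64_t  b;
        void    *d;
        int32_t  a;
        int16_t  c;
        uint8_t  e;
};

int main(void)
{
        printf("mixed=%zu grouped=%zu\n", sizeof(struct mixed),
               sizeof(struct grouped));
        return 0;
}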
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e5f97c69b320..8ef31d71a9c7 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -432,9 +432,9 @@ static inline uint xfs_log_dinode_size(int version)
}
/*
- * Buffer Log Format defintions
+ * Buffer Log Format definitions
*
- * These are the physical dirty bitmap defintions for the log format structure.
+ * These are the physical dirty bitmap definitions for the log format structure.
*/
#define XFS_BLF_CHUNK 128
#define XFS_BLF_SHIFT 7
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index f3d18eaecebb..3bf671637a91 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -30,14 +30,14 @@ typedef struct xlog_recover_item {
xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
} xlog_recover_item_t;
-typedef struct xlog_recover {
+struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
xfs_trans_header_t r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */
-} xlog_recover_t;
+};
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 9a7fadb1361c..d7d702ee4d1a 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -200,7 +200,10 @@ xfs_refcount_insert(
error = xfs_btree_insert(cur, i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
out_error:
if (error)
@@ -227,10 +230,16 @@ xfs_refcount_delete(
error = xfs_refcount_get_rec(cur, &irec, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
error = xfs_btree_delete(cur, i);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (error)
goto out_error;
error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
@@ -349,7 +358,10 @@ xfs_refcount_split_extent(
error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
return 0;
@@ -371,7 +383,10 @@ xfs_refcount_split_extent(
error = xfs_refcount_insert(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
return error;
out_error:
@@ -410,19 +425,27 @@ xfs_refcount_merge_center_extents(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (center->rc_refcount > 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the left extent. */
@@ -430,7 +453,10 @@ xfs_refcount_merge_center_extents(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
left->rc_blockcount = extlen;
error = xfs_refcount_update(cur, left);
@@ -469,14 +495,18 @@ xfs_refcount_merge_left_extent(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the left extent. */
@@ -484,7 +514,10 @@ xfs_refcount_merge_left_extent(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
left->rc_blockcount += cleft->rc_blockcount;
error = xfs_refcount_update(cur, left);
@@ -526,14 +559,18 @@ xfs_refcount_merge_right_extent(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* Enlarge the right extent. */
@@ -541,7 +578,10 @@ xfs_refcount_merge_right_extent(
&found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
right->rc_startblock -= cright->rc_blockcount;
right->rc_blockcount += cright->rc_blockcount;
@@ -587,7 +627,10 @@ xfs_refcount_find_left_extents(
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (xfs_refc_next(&tmp) != agbno)
return 0;
@@ -605,8 +648,10 @@ xfs_refcount_find_left_extents(
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* if tmp starts at the end of our range, just use that */
if (tmp.rc_startblock == agbno)
@@ -671,7 +716,10 @@ xfs_refcount_find_right_extents(
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (tmp.rc_startblock != agbno + aglen)
return 0;
@@ -689,8 +737,10 @@ xfs_refcount_find_right_extents(
error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
- out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* if tmp ends at the end of our range, just use that */
if (xfs_refc_next(&tmp) == agbno + aglen)
@@ -913,8 +963,11 @@ xfs_refcount_adjust_extents(
&found_tmp);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_tmp == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
cur->bc_private.a.priv.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
@@ -955,8 +1008,10 @@ xfs_refcount_adjust_extents(
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
cur->bc_private.a.priv.refc.nr_ops++;
goto advloop;
} else {
@@ -1122,7 +1177,7 @@ xfs_refcount_finish_one(
XFS_ALLOC_FLAG_FREEING, &agbp);
if (error)
return error;
- if (!agbp)
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
return -EFSCORRUPTED;
rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno);
@@ -1272,7 +1327,10 @@ xfs_refcount_find_shared(
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* If the extent ends before the start, look at the next one */
if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
@@ -1284,7 +1342,10 @@ xfs_refcount_find_shared(
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* If the extent starts after the range we want, bail out */
@@ -1312,7 +1373,10 @@ xfs_refcount_find_shared(
error = xfs_refcount_get_rec(cur, &tmp, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (tmp.rc_startblock >= agbno + aglen ||
tmp.rc_startblock != *fbno + *flen)
break;
@@ -1413,8 +1477,11 @@ xfs_refcount_adjust_cow_extents(
switch (adj) {
case XFS_REFCOUNT_ADJUST_COW_ALLOC:
/* Adding a CoW reservation, there should be nothing here. */
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_startblock >= agbno + aglen, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ agbno + aglen > ext.rc_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
tmp.rc_startblock = agbno;
tmp.rc_blockcount = aglen;
@@ -1426,17 +1493,25 @@ xfs_refcount_adjust_cow_extents(
&found_tmp);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_tmp == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
break;
case XFS_REFCOUNT_ADJUST_COW_FREE:
/* Removing a CoW reservation, there should be one extent. */
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_startblock == agbno, out_error);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_blockcount == aglen, out_error);
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- ext.rc_refcount == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ext.rc_refcount = 0;
trace_xfs_refcount_modify_extent(cur->bc_mp,
@@ -1444,8 +1519,10 @@ xfs_refcount_adjust_cow_extents(
error = xfs_refcount_delete(cur, &found_rec);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
- found_rec == 1, out_error);
+ if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
break;
default:
ASSERT(0);
@@ -1584,14 +1661,15 @@ struct xfs_refcount_recovery {
/* Stuff an extent on the recovery list. */
STATIC int
xfs_refcount_recover_extent(
- struct xfs_btree_cur *cur,
+ struct xfs_btree_cur *cur,
union xfs_btree_rec *rec,
void *priv)
{
struct list_head *debris = priv;
struct xfs_refcount_recovery *rr;
- if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+ if (XFS_IS_CORRUPT(cur->bc_mp,
+ be32_to_cpu(rec->refc.rc_refcount) != 1))
return -EFSCORRUPTED;
rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 38e9414878b3..ff9412f113c4 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -113,7 +113,10 @@ xfs_rmap_insert(
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 0, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
rcur->bc_rec.r.rm_startblock = agbno;
rcur->bc_rec.r.rm_blockcount = len;
@@ -123,7 +126,10 @@ xfs_rmap_insert(
error = xfs_btree_insert(rcur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
done:
if (error)
trace_xfs_rmap_insert_error(rcur->bc_mp,
@@ -149,12 +155,18 @@ xfs_rmap_delete(
error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_delete(rcur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+ if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
done:
if (error)
trace_xfs_rmap_delete_error(rcur->bc_mp,
@@ -406,24 +418,39 @@ xfs_rmap_free_check_owner(
return 0;
/* Make sure the unwritten flag matches. */
- XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
- (rec->rm_flags & XFS_RMAP_UNWRITTEN), out);
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
/* Make sure the owner matches what we expect to find in the tree. */
- XFS_WANT_CORRUPTED_GOTO(mp, owner == rec->rm_owner, out);
+ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
/* Check the offset, if necessary. */
if (XFS_RMAP_NON_INODE_OWNER(owner))
goto out;
if (flags & XFS_RMAP_BMBT_BLOCK) {
- XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_flags & XFS_RMAP_BMBT_BLOCK,
- out);
+ if (XFS_IS_CORRUPT(mp,
+ !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
} else {
- XFS_WANT_CORRUPTED_GOTO(mp, rec->rm_offset <= offset, out);
- XFS_WANT_CORRUPTED_GOTO(mp,
- ltoff + rec->rm_blockcount >= offset + len,
- out);
+ if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ offset + len > ltoff + rec->rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
}
out:
@@ -482,12 +509,18 @@ xfs_rmap_unmap(
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_rmap_get_rec(cur, &ltrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_private.a.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -502,8 +535,12 @@ xfs_rmap_unmap(
* be the case that the "left" extent goes all the way to EOFS.
*/
if (owner == XFS_RMAP_OWN_NULL) {
- XFS_WANT_CORRUPTED_GOTO(mp, bno >= ltrec.rm_startblock +
- ltrec.rm_blockcount, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ bno <
+ ltrec.rm_startblock + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
goto out_done;
}
@@ -526,15 +563,22 @@ xfs_rmap_unmap(
error = xfs_rmap_get_rec(cur, &rtrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (rtrec.rm_startblock >= bno + len)
goto out_done;
}
/* Make sure the extent we found covers the entire freeing range. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
- ltrec.rm_startblock + ltrec.rm_blockcount >=
- bno + len, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Check owner information. */
error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
@@ -551,7 +595,10 @@ xfs_rmap_unmap(
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
} else if (ltrec.rm_startblock == bno) {
/*
* overlap left hand side of extent: move the start, trim the
@@ -743,7 +790,10 @@ xfs_rmap_map(
error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_lt == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_private.a.agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -753,9 +803,12 @@ xfs_rmap_map(
have_lt = 0;
}
- XFS_WANT_CORRUPTED_GOTO(mp,
- have_lt == 0 ||
- ltrec.rm_startblock + ltrec.rm_blockcount <= bno, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ have_lt != 0 &&
+ ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/*
* Increment the cursor to see if we have a right-adjacent record to our
@@ -769,9 +822,14 @@ xfs_rmap_map(
error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= gtrec.rm_startblock,
- out_error);
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
cur->bc_private.a.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
@@ -821,7 +879,10 @@ xfs_rmap_map(
error = xfs_btree_delete(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
/* point the cursor back to the left record and update */
@@ -865,7 +926,10 @@ xfs_rmap_map(
error = xfs_btree_insert(cur, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
}
trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
@@ -957,12 +1021,18 @@ xfs_rmap_convert(
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_rmap_get_rec(cur, &PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_private.a.agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
@@ -995,10 +1065,16 @@ xfs_rmap_convert(
error = xfs_rmap_get_rec(cur, &LEFT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp,
- LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
cur->bc_private.a.agno, LEFT.rm_startblock,
LEFT.rm_blockcount, LEFT.rm_owner,
@@ -1017,7 +1093,10 @@ xfs_rmap_convert(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
@@ -1026,9 +1105,14 @@ xfs_rmap_convert(
error = xfs_rmap_get_rec(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
cur->bc_private.a.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
@@ -1055,7 +1139,10 @@ xfs_rmap_convert(
error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Switch out based on the FILLING and CONTIG state bits.
@@ -1071,7 +1158,10 @@ xfs_rmap_convert(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
@@ -1079,11 +1169,17 @@ xfs_rmap_convert(
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
PREV.rm_startblock, PREV.rm_blockcount,
PREV.rm_owner, PREV.rm_offset,
@@ -1091,11 +1187,17 @@ xfs_rmap_convert(
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = LEFT;
NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
@@ -1115,11 +1217,17 @@ xfs_rmap_convert(
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = LEFT;
NEW.rm_blockcount += PREV.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
@@ -1135,7 +1243,10 @@ xfs_rmap_convert(
error = xfs_btree_increment(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_delete(mp, cur->bc_private.a.agno,
RIGHT.rm_startblock, RIGHT.rm_blockcount,
RIGHT.rm_owner, RIGHT.rm_offset,
@@ -1143,11 +1254,17 @@ xfs_rmap_convert(
error = xfs_btree_delete(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
error = xfs_btree_decrement(cur, 0, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW = PREV;
NEW.rm_blockcount = len + RIGHT.rm_blockcount;
NEW.rm_flags = newext;
@@ -1214,7 +1331,10 @@ xfs_rmap_convert(
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
@@ -1253,7 +1373,10 @@ xfs_rmap_convert(
oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_startblock = bno;
NEW.rm_owner = owner;
NEW.rm_offset = offset;
@@ -1265,7 +1388,10 @@ xfs_rmap_convert(
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case 0:
@@ -1295,7 +1421,10 @@ xfs_rmap_convert(
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/*
* Reset the cursor to the position of the new extent
* we are about to insert as we can't trust it after
@@ -1305,7 +1434,10 @@ xfs_rmap_convert(
oldext, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
+ if (XFS_IS_CORRUPT(mp, i != 0)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
/* new middle extent - newext */
cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN;
cur->bc_rec.r.rm_flags |= newext;
@@ -1314,7 +1446,10 @@ xfs_rmap_convert(
error = xfs_btree_insert(cur, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
break;
case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
@@ -1383,7 +1518,10 @@ xfs_rmap_convert_shared(
&PREV, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
ASSERT(PREV.rm_offset <= offset);
ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
@@ -1406,9 +1544,12 @@ xfs_rmap_convert_shared(
goto done;
if (i) {
state |= RMAP_LEFT_VALID;
- XFS_WANT_CORRUPTED_GOTO(mp,
- LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
- done);
+ if (XFS_IS_CORRUPT(mp,
+ LEFT.rm_startblock + LEFT.rm_blockcount >
+ bno)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
state |= RMAP_LEFT_CONTIG;
}
@@ -1423,9 +1564,14 @@ xfs_rmap_convert_shared(
error = xfs_rmap_get_rec(cur, &RIGHT, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
- XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
- done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
+ if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
cur->bc_private.a.agno, RIGHT.rm_startblock,
RIGHT.rm_blockcount, RIGHT.rm_owner,
@@ -1472,7 +1618,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1495,7 +1644,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += PREV.rm_blockcount;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1518,7 +1670,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += RIGHT.rm_blockcount;
NEW.rm_flags = RIGHT.rm_flags;
error = xfs_rmap_update(cur, &NEW);
@@ -1538,7 +1693,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_flags = newext;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1570,7 +1728,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount += len;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1612,7 +1773,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount = offset - NEW.rm_offset;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1644,7 +1808,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount -= len;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1679,7 +1846,10 @@ xfs_rmap_convert_shared(
NEW.rm_offset, NEW.rm_flags, &i);
if (error)
goto done;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto done;
+ }
NEW.rm_blockcount = offset - NEW.rm_offset;
error = xfs_rmap_update(cur, &NEW);
if (error)
@@ -1765,25 +1935,44 @@ xfs_rmap_unmap_shared(
&ltrec, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltoff = ltrec.rm_offset;
/* Make sure the extent we found covers the entire freeing range. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
- ltrec.rm_startblock + ltrec.rm_blockcount >=
- bno + len, out_error);
+ if (XFS_IS_CORRUPT(mp,
+ ltrec.rm_startblock > bno ||
+ ltrec.rm_startblock + ltrec.rm_blockcount <
+ bno + len)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Make sure the owner matches what we expect to find in the tree. */
- XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Make sure the unwritten flag matches. */
- XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
- (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+ if (XFS_IS_CORRUPT(mp,
+ (flags & XFS_RMAP_UNWRITTEN) !=
+ (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
/* Check the offset. */
- XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
- XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
- out_error);
+ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
+ if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
/* Exact match, simply remove the record from rmap tree. */
@@ -1836,7 +2025,10 @@ xfs_rmap_unmap_shared(
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltrec.rm_blockcount -= len;
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -1862,7 +2054,10 @@ xfs_rmap_unmap_shared(
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
ltrec.rm_blockcount = bno - ltrec.rm_startblock;
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -1938,7 +2133,10 @@ xfs_rmap_map_shared(
error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
cur->bc_private.a.agno, gtrec.rm_startblock,
gtrec.rm_blockcount, gtrec.rm_owner,
@@ -1987,7 +2185,10 @@ xfs_rmap_map_shared(
ltrec.rm_offset, ltrec.rm_flags, &i);
if (error)
goto out_error;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_error;
+ }
error = xfs_rmap_update(cur, &ltrec);
if (error)
@@ -2199,7 +2400,7 @@ xfs_rmap_finish_one(
error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
if (error)
return error;
- if (!agbp)
+ if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
return -EFSCORRUPTED;
rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno);
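
The rmap hunks above all apply one mechanical conversion: the old XFS_WANT_CORRUPTED_GOTO() macro, which hid both the error assignment and the jump, is open-coded so the -EFSCORRUPTED return is visible at the call site and the corruption predicate is inverted. A minimal sketch of that shape, wrapped in a hypothetical helper (xfs_rmap_check_lookup() is illustrative only, not a function from this patch):

	/* Hypothetical wrapper showing the conversion pattern used above. */
	STATIC int
	xfs_rmap_check_lookup(
		struct xfs_mount	*mp,
		int			i)
	{
		int			error = 0;

		/* Before: XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); */

		/* After: the check is explicit and the predicate inverted. */
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto done;
		}
	done:
		return error;
	}
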
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 8ea1efc97b41..f42c74cb8be5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -15,7 +15,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_error.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -70,7 +70,7 @@ xfs_rtbuf_get(
if (error)
return error;
- if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+ if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_real_extent(&map)))
return -EFSCORRUPTED;
ASSERT(map.br_startblock != NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index ac6cdca63e15..0ac69751fe85 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -10,6 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index a9ad90926b87..2b8ccb5b975d 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -55,7 +55,7 @@ xfs_trans_ichgtime(
int flags)
{
struct inode *inode = VFS_I(ip);
- struct timespec64 tv;
+ struct timespec64 tv;
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -66,10 +66,8 @@ xfs_trans_ichgtime(
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
- if (flags & XFS_ICHGTIME_CREATE) {
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
- }
+ if (flags & XFS_ICHGTIME_CREATE)
+ ip->i_d.di_crtime = tv;
}
/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index d12bbd526e7c..c55cd9a3dec9 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -718,7 +718,7 @@ xfs_calc_clear_agi_bucket_reservation(
/*
* Adjusting quota limits.
- * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ * the disk quota buffer: sizeof(struct xfs_disk_dquot)
*/
STATIC uint
xfs_calc_qm_setqlim_reservation(void)
@@ -742,7 +742,7 @@ xfs_calc_qm_dqalloc_reservation(
/*
* Turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
* the superblock for the quota flags: sector size
*/
STATIC uint
@@ -755,7 +755,7 @@ xfs_calc_qm_quotaoff_reservation(
/*
* End of turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
*/
STATIC uint
xfs_calc_qm_quotaoff_end_reservation(void)
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 300b3e91ca3a..397d94775440 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -21,7 +21,6 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
-typedef int32_t xfs_tid_t; /* transaction identifier */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
@@ -33,7 +32,6 @@ typedef uint64_t xfs_fileoff_t; /* block number in a file */
typedef uint64_t xfs_filblks_t; /* number of blocks in a file */
typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
-typedef int64_t xfs_sfiloff_t; /* signed block number in a file */
/*
* New verifiers will return the instruction address of the failing check.
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 0edc7f8eb96e..d9f0dd444b80 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -398,15 +398,14 @@ out:
STATIC int
xchk_xattr_rec(
struct xchk_da_btree *ds,
- int level,
- void *rec)
+ int level)
{
struct xfs_mount *mp = ds->state->mp;
- struct xfs_attr_leaf_entry *ent = rec;
- struct xfs_da_state_blk *blk;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
struct xfs_attr_leaf_name_local *lentry;
struct xfs_attr_leaf_name_remote *rentry;
struct xfs_buf *bp;
+ struct xfs_attr_leaf_entry *ent;
xfs_dahash_t calc_hash;
xfs_dahash_t hash;
int nameidx;
@@ -414,7 +413,9 @@ xchk_xattr_rec(
unsigned int badflags;
int error;
- blk = &ds->state->path.blk[level];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+ ent = xfs_attr3_leaf_entryp(blk->bp->b_addr) + blk->index;
/* Check the whole block, if necessary. */
error = xchk_xattr_block(ds, level);
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 3d47d111be5a..18a684e18a69 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -294,5 +294,6 @@ xfs_bitmap_set_btblocks(
struct xfs_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
- return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, bitmap);
+ return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 003a772cd26c..2e50d146105d 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -14,8 +14,15 @@
static inline bool
xchk_should_terminate(
struct xfs_scrub *sc,
- int *error)
+ int *error)
{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
if (fatal_signal_pending(current)) {
if (*error == 0)
*error = -EAGAIN;
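
Callers later in this patch (fscounters, parent, quota, dir) poll this helper from their iteration loops; the shape is roughly the sketch below, where the per-AG body is a placeholder rather than code taken from the patch:

	int		error = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/* Yield and react to fatal signals between iterations. */
		if (xchk_should_terminate(sc, &error))
			break;
		/* ... per-AG scrub work would go here ... */
	}
	if (error)
		return error;
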
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 77ff9f97bcda..97a15b6f2865 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -77,40 +77,18 @@ xchk_da_set_corrupt(
__return_address);
}
-/* Find an entry at a certain level in a da btree. */
-STATIC void *
-xchk_da_btree_entry(
- struct xchk_da_btree *ds,
- int level,
- int rec)
+static struct xfs_da_node_entry *
+xchk_da_btree_node_entry(
+ struct xchk_da_btree *ds,
+ int level)
{
- char *ents;
- struct xfs_da_state_blk *blk;
- void *baddr;
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
+ struct xfs_da3_icnode_hdr hdr;
- /* Dispatch the entry finding function. */
- blk = &ds->state->path.blk[level];
- baddr = blk->bp->b_addr;
- switch (blk->magic) {
- case XFS_ATTR_LEAF_MAGIC:
- case XFS_ATTR3_LEAF_MAGIC:
- ents = (char *)xfs_attr3_leaf_entryp(baddr);
- return ents + (rec * sizeof(struct xfs_attr_leaf_entry));
- case XFS_DIR2_LEAFN_MAGIC:
- case XFS_DIR3_LEAFN_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
- return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
- case XFS_DIR2_LEAF1_MAGIC:
- case XFS_DIR3_LEAF1_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->leaf_ents_p(baddr);
- return ents + (rec * sizeof(struct xfs_dir2_leaf_entry));
- case XFS_DA_NODE_MAGIC:
- case XFS_DA3_NODE_MAGIC:
- ents = (char *)ds->dargs.dp->d_ops->node_tree_p(baddr);
- return ents + (rec * sizeof(struct xfs_da_node_entry));
- }
+ ASSERT(blk->magic == XFS_DA_NODE_MAGIC);
- return NULL;
+ xfs_da3_node_hdr_from_disk(ds->sc->mp, &hdr, blk->bp->b_addr);
+ return hdr.btree + blk->index;
}
/* Scrub a da btree hash (key). */
@@ -120,7 +98,6 @@ xchk_da_btree_hash(
int level,
__be32 *hashp)
{
- struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *entry;
xfs_dahash_t hash;
xfs_dahash_t parent_hash;
@@ -135,8 +112,7 @@ xchk_da_btree_hash(
return 0;
/* Is this hash no larger than the parent hash? */
- blks = ds->state->path.blk;
- entry = xchk_da_btree_entry(ds, level - 1, blks[level - 1].index);
+ entry = xchk_da_btree_node_entry(ds, level - 1);
parent_hash = be32_to_cpu(entry->hashval);
if (parent_hash < hash)
xchk_da_set_corrupt(ds, level);
@@ -355,8 +331,8 @@ xchk_da_btree_block(
goto out_nobuf;
/* Read the buffer. */
- error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno, -2,
- &blk->bp, dargs->whichfork,
+ error = xfs_da_read_buf(dargs->trans, dargs->dp, blk->blkno,
+ XFS_DABUF_MAP_HOLE_OK, &blk->bp, dargs->whichfork,
&xchk_da_btree_buf_ops);
if (!xchk_da_process_error(ds, level, &error))
goto out_nobuf;
@@ -433,8 +409,8 @@ xchk_da_btree_block(
XFS_BLFT_DA_NODE_BUF);
blk->magic = XFS_DA_NODE_MAGIC;
node = blk->bp->b_addr;
- ip->d_ops->node_hdr_from_disk(&nodehdr, node);
- btree = ip->d_ops->node_tree_p(node);
+ xfs_da3_node_hdr_from_disk(ip->i_mount, &nodehdr, node);
+ btree = nodehdr.btree;
*pmaxrecs = nodehdr.count;
blk->hashval = be32_to_cpu(btree[*pmaxrecs - 1].hashval);
if (level == 0) {
@@ -479,14 +455,12 @@ xchk_da_btree(
struct xfs_mount *mp = sc->mp;
struct xfs_da_state_blk *blks;
struct xfs_da_node_entry *key;
- void *rec;
xfs_dablk_t blkno;
int level;
int error;
/* Skip short format data structures; no btree to scan. */
- if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ if (!xfs_ifork_has_extents(sc->ip, whichfork))
return 0;
/* Set up initial da state. */
@@ -538,9 +512,7 @@ xchk_da_btree(
}
/* Dispatch record scrubbing. */
- rec = xchk_da_btree_entry(&ds, level,
- blks[level].index);
- error = scrub_fn(&ds, level, rec);
+ error = scrub_fn(&ds, level);
if (error)
break;
if (xchk_should_terminate(sc, &error) ||
@@ -562,7 +534,7 @@ xchk_da_btree(
}
/* Hashes in order for scrub? */
- key = xchk_da_btree_entry(&ds, level, blks[level].index);
+ key = xchk_da_btree_node_entry(&ds, level);
error = xchk_da_btree_hash(&ds, level, &key->hashval);
if (error)
goto out;
diff --git a/fs/xfs/scrub/dabtree.h b/fs/xfs/scrub/dabtree.h
index cb3f0003245b..1f3515c6d5a8 100644
--- a/fs/xfs/scrub/dabtree.h
+++ b/fs/xfs/scrub/dabtree.h
@@ -28,8 +28,7 @@ struct xchk_da_btree {
int tree_level;
};
-typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds,
- int level, void *rec);
+typedef int (*xchk_da_btree_rec_fn)(struct xchk_da_btree *ds, int level);
/* Check for da btree operation errors. */
bool xchk_da_process_error(struct xchk_da_btree *ds, int level, int *error);
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 1e2e11721eb9..266da4e4bde6 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -113,6 +113,9 @@ xchk_dir_actor(
offset = xfs_dir2_db_to_da(mp->m_dir_geo,
xfs_dir2_dataptr_to_db(mp->m_dir_geo, pos));
+ if (xchk_should_terminate(sdc->sc, &error))
+ return error;
+
/* Does this inode number make sense? */
if (!xfs_verify_dir_ino(mp, ino)) {
xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
@@ -179,15 +182,17 @@ out:
STATIC int
xchk_dir_rec(
struct xchk_da_btree *ds,
- int level,
- void *rec)
+ int level)
{
+ struct xfs_da_state_blk *blk = &ds->state->path.blk[level];
struct xfs_mount *mp = ds->state->mp;
- struct xfs_dir2_leaf_entry *ent = rec;
struct xfs_inode *dp = ds->dargs.dp;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
struct xfs_dir2_data_entry *dent;
struct xfs_buf *bp;
- char *p, *endp;
+ struct xfs_dir2_leaf_entry *ent;
+ unsigned int end;
+ unsigned int iter_off;
xfs_ino_t ino;
xfs_dablk_t rec_bno;
xfs_dir2_db_t db;
@@ -195,9 +200,16 @@ xchk_dir_rec(
xfs_dir2_dataptr_t ptr;
xfs_dahash_t calc_hash;
xfs_dahash_t hash;
+ struct xfs_dir3_icleaf_hdr hdr;
unsigned int tag;
int error;
+ ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC ||
+ blk->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr);
+ ent = hdr.ents + blk->index;
+
/* Check the hash of the entry. */
error = xchk_da_btree_hash(ds, level, &ent->hashval);
if (error)
@@ -209,15 +221,16 @@ xchk_dir_rec(
return 0;
/* Find the directory entry's location. */
- db = xfs_dir2_dataptr_to_db(mp->m_dir_geo, ptr);
- off = xfs_dir2_dataptr_to_off(mp->m_dir_geo, ptr);
- rec_bno = xfs_dir2_db_to_da(mp->m_dir_geo, db);
+ db = xfs_dir2_dataptr_to_db(geo, ptr);
+ off = xfs_dir2_dataptr_to_off(geo, ptr);
+ rec_bno = xfs_dir2_db_to_da(geo, db);
- if (rec_bno >= mp->m_dir_geo->leafblk) {
+ if (rec_bno >= geo->leafblk) {
xchk_da_set_corrupt(ds, level);
goto out;
}
- error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno, -2, &bp);
+ error = xfs_dir3_data_read(ds->dargs.trans, dp, rec_bno,
+ XFS_DABUF_MAP_HOLE_OK, &bp);
if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
&error))
goto out;
@@ -230,38 +243,37 @@ xchk_dir_rec(
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out_relse;
- dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
+ dent = bp->b_addr + off;
/* Make sure we got a real directory entry. */
- p = (char *)mp->m_dir_inode_ops->data_entry_p(bp->b_addr);
- endp = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
- if (!endp) {
+ iter_off = geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ if (!end) {
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
goto out_relse;
}
- while (p < endp) {
- struct xfs_dir2_data_entry *dep;
- struct xfs_dir2_data_unused *dup;
+ for (;;) {
+ struct xfs_dir2_data_entry *dep = bp->b_addr + iter_off;
+ struct xfs_dir2_data_unused *dup = bp->b_addr + iter_off;
+
+ if (iter_off >= end) {
+ xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
+ goto out_relse;
+ }
- dup = (struct xfs_dir2_data_unused *)p;
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- p += be16_to_cpu(dup->length);
+ iter_off += be16_to_cpu(dup->length);
continue;
}
- dep = (struct xfs_dir2_data_entry *)p;
if (dep == dent)
break;
- p += mp->m_dir_inode_ops->data_entsize(dep->namelen);
- }
- if (p >= endp) {
- xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
- goto out_relse;
+ iter_off += xfs_dir2_data_entsize(mp, dep->namelen);
}
/* Retrieve the entry, sanity check it, and compare hashes. */
ino = be64_to_cpu(dent->inumber);
hash = be32_to_cpu(ent->hashval);
- tag = be16_to_cpup(dp->d_ops->data_entry_tag_p(dent));
+ tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent));
if (!xfs_verify_dir_ino(mp, ino) || tag != off)
xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
if (dent->namelen == 0) {
@@ -319,19 +331,15 @@ xchk_directory_data_bestfree(
struct xfs_buf *bp;
struct xfs_dir2_data_free *bf;
struct xfs_mount *mp = sc->mp;
- const struct xfs_dir_ops *d_ops;
- char *ptr;
- char *endptr;
u16 tag;
unsigned int nr_bestfrees = 0;
unsigned int nr_frees = 0;
unsigned int smallest_bestfree;
int newlen;
- int offset;
+ unsigned int offset;
+ unsigned int end;
int error;
- d_ops = sc->ip->d_ops;
-
if (is_block) {
/* dir block format */
if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
@@ -339,7 +347,7 @@ xchk_directory_data_bestfree(
error = xfs_dir3_block_read(sc->tp, sc->ip, &bp);
} else {
/* dir data format */
- error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, -1, &bp);
+ error = xfs_dir3_data_read(sc->tp, sc->ip, lblk, 0, &bp);
}
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
@@ -351,7 +359,7 @@ xchk_directory_data_bestfree(
goto out_buf;
/* Do the bestfrees correspond to actual free space? */
- bf = d_ops->data_bestfree_p(bp->b_addr);
+ bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr);
smallest_bestfree = UINT_MAX;
for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
offset = be16_to_cpu(dfp->offset);
@@ -361,13 +369,13 @@ xchk_directory_data_bestfree(
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
- dup = (struct xfs_dir2_data_unused *)(bp->b_addr + offset);
+ dup = bp->b_addr + offset;
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
/* bestfree doesn't match the entry it points at? */
if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
- tag != ((char *)dup - (char *)bp->b_addr)) {
+ tag != offset) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
@@ -383,30 +391,30 @@ xchk_directory_data_bestfree(
}
/* Make sure the bestfrees are actually the best free spaces. */
- ptr = (char *)d_ops->data_entry_p(bp->b_addr);
- endptr = xfs_dir3_data_endp(mp->m_dir_geo, bp->b_addr);
+ offset = mp->m_dir_geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr);
/* Iterate the entries, stopping when we hit or go past the end. */
- while (ptr < endptr) {
- dup = (struct xfs_dir2_data_unused *)ptr;
+ while (offset < end) {
+ dup = bp->b_addr + offset;
+
/* Skip real entries */
if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
- struct xfs_dir2_data_entry *dep;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
- dep = (struct xfs_dir2_data_entry *)ptr;
- newlen = d_ops->data_entsize(dep->namelen);
+ newlen = xfs_dir2_data_entsize(mp, dep->namelen);
if (newlen <= 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
lblk);
goto out_buf;
}
- ptr += newlen;
+ offset += newlen;
continue;
}
/* Spot check this free entry */
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
- if (tag != ((char *)dup - (char *)bp->b_addr)) {
+ if (tag != offset) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
@@ -425,13 +433,13 @@ xchk_directory_data_bestfree(
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out_buf;
}
- ptr += newlen;
- if (ptr <= endptr)
+ offset += newlen;
+ if (offset <= end)
nr_frees++;
}
/* We're required to fill all the space. */
- if (ptr != endptr)
+ if (offset != end)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
/* Did we see at least as many free slots as there are bestfrees? */
@@ -458,7 +466,7 @@ xchk_directory_check_freesp(
{
struct xfs_dir2_data_free *dfp;
- dfp = sc->ip->d_ops->data_bestfree_p(dbp->b_addr);
+ dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr);
if (len != be16_to_cpu(dfp->length))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
@@ -475,12 +483,10 @@ xchk_directory_leaf1_bestfree(
xfs_dablk_t lblk)
{
struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
struct xfs_dir2_leaf_tail *ltp;
struct xfs_dir2_leaf *leaf;
struct xfs_buf *dbp;
struct xfs_buf *bp;
- const struct xfs_dir_ops *d_ops = sc->ip->d_ops;
struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
__be16 *bestp;
__u16 best;
@@ -492,14 +498,13 @@ xchk_directory_leaf1_bestfree(
int error;
/* Read the free space block. */
- error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, -1, &bp);
+ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
xchk_buffer_recheck(sc, bp);
leaf = bp->b_addr;
- d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
- ents = d_ops->leaf_ents_p(leaf);
+ xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
bestcount = be32_to_cpu(ltp->bestcount);
bestp = xfs_dir2_leaf_bests_p(ltp);
@@ -521,24 +526,25 @@ xchk_directory_leaf1_bestfree(
}
/* Is the leaf count even remotely sane? */
- if (leafhdr.count > d_ops->leaf_max_ents(geo)) {
+ if (leafhdr.count > geo->leaf_max_ents) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
/* Leaves and bests don't overlap in leaf format. */
- if ((char *)&ents[leafhdr.count] > (char *)bestp) {
+ if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
goto out;
}
/* Check hash value order, count stale entries. */
for (i = 0; i < leafhdr.count; i++) {
- hash = be32_to_cpu(ents[i].hashval);
+ hash = be32_to_cpu(leafhdr.ents[i].hashval);
if (i > 0 && lasthash > hash)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
lasthash = hash;
- if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+ if (leafhdr.ents[i].address ==
+ cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
}
if (leafhdr.stale != stale)
@@ -552,7 +558,7 @@ xchk_directory_leaf1_bestfree(
if (best == NULLDATAOFF)
continue;
error = xfs_dir3_data_read(sc->tp, sc->ip,
- i * args->geo->fsbcount, -1, &dbp);
+ i * args->geo->fsbcount, 0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
@@ -575,7 +581,6 @@ xchk_directory_free_bestfree(
struct xfs_dir3_icfree_hdr freehdr;
struct xfs_buf *dbp;
struct xfs_buf *bp;
- __be16 *bestp;
__u16 best;
unsigned int stale = 0;
int i;
@@ -595,17 +600,16 @@ xchk_directory_free_bestfree(
}
/* Check all the entries. */
- sc->ip->d_ops->free_hdr_from_disk(&freehdr, bp->b_addr);
- bestp = sc->ip->d_ops->free_bests_p(bp->b_addr);
- for (i = 0; i < freehdr.nvalid; i++, bestp++) {
- best = be16_to_cpu(*bestp);
+ xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr);
+ for (i = 0; i < freehdr.nvalid; i++) {
+ best = be16_to_cpu(freehdr.bests[i]);
if (best == NULLDATAOFF) {
stale++;
continue;
}
error = xfs_dir3_data_read(sc->tp, sc->ip,
(freehdr.firstdb + i) * args->geo->fsbcount,
- -1, &dbp);
+ 0, &dbp);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
break;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 98f82d7c8b40..7251c66a82c9 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -104,7 +104,7 @@ next_loop_perag:
pag = NULL;
error = 0;
- if (fatal_signal_pending(current))
+ if (xchk_should_terminate(sc, &error))
break;
}
@@ -163,6 +163,7 @@ xchk_fscount_aggregate_agcounts(
uint64_t delayed;
xfs_agnumber_t agno;
int tries = 8;
+ int error = 0;
retry:
fsc->icount = 0;
@@ -196,10 +197,13 @@ retry:
xfs_perag_put(pag);
- if (fatal_signal_pending(current))
+ if (xchk_should_terminate(sc, &error))
break;
}
+ if (error)
+ return error;
+
/*
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index b2f602811e9d..83d27cdf579b 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -11,6 +11,7 @@
#include "xfs_sb.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
+#include "scrub/health.h"
/*
* Scrub and In-Core Filesystem Health Assessments
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index c962bd534690..5705adc43a75 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -32,8 +32,10 @@ xchk_setup_parent(
struct xchk_parent_ctx {
struct dir_context dc;
+ struct xfs_scrub *sc;
xfs_ino_t ino;
xfs_nlink_t nlink;
+ bool cancelled;
};
/* Look for a single entry in a directory pointing to an inode. */
@@ -47,11 +49,21 @@ xchk_parent_actor(
unsigned type)
{
struct xchk_parent_ctx *spc;
+ int error = 0;
spc = container_of(dc, struct xchk_parent_ctx, dc);
if (spc->ino == ino)
spc->nlink++;
- return 0;
+
+ /*
+ * If we're facing a fatal signal, bail out. Store the cancellation
+ * status separately because the VFS readdir code squashes error codes
+ * into short directory reads.
+ */
+ if (xchk_should_terminate(spc->sc, &error))
+ spc->cancelled = true;
+
+ return error;
}
/* Count the number of dentries in the parent dir that point to this inode. */
@@ -62,10 +74,9 @@ xchk_parent_count_parent_dentries(
xfs_nlink_t *nlink)
{
struct xchk_parent_ctx spc = {
- .dc.actor = xchk_parent_actor,
- .dc.pos = 0,
- .ino = sc->ip->i_ino,
- .nlink = 0,
+ .dc.actor = xchk_parent_actor,
+ .ino = sc->ip->i_ino,
+ .sc = sc,
};
size_t bufsize;
loff_t oldpos;
@@ -80,7 +91,7 @@ xchk_parent_count_parent_dentries(
*/
lock_mode = xfs_ilock_data_map_shared(parent);
if (parent->i_d.di_nextents > 0)
- error = xfs_dir3_data_readahead(parent, 0, -1);
+ error = xfs_dir3_data_readahead(parent, 0, 0);
xfs_iunlock(parent, lock_mode);
if (error)
return error;
@@ -97,6 +108,10 @@ xchk_parent_count_parent_dentries(
error = xfs_readdir(sc->tp, parent, &spc.dc, bufsize);
if (error)
goto out;
+ if (spc.cancelled) {
+ error = -EAGAIN;
+ goto out;
+ }
if (oldpos == spc.dc.pos)
break;
oldpos = spc.dc.pos;
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 0a33b4421c32..905a34558361 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -93,6 +93,10 @@ xchk_quota_item(
unsigned long long rcount;
xfs_ino_t fs_icount;
xfs_dqid_t id = be32_to_cpu(d->d_id);
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
/*
* Except for the root dquot, the actual dquot we got must either have
@@ -178,6 +182,9 @@ xchk_quota_item(
if (id != 0 && rhard != 0 && rcount > rhard)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -EFSCORRUPTED;
+
return 0;
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 15c8c5f3f688..f1775bb19313 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -16,6 +16,7 @@
#include "xfs_qm.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 96d7071cfa46..91693fce34a8 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -12,8 +12,10 @@
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
-#include <linux/posix_acl_xattr.h>
+#include "xfs_error.h"
+#include "xfs_acl.h"
+#include <linux/posix_acl_xattr.h>
/*
* Locking scheme:
@@ -23,6 +25,7 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
+ struct xfs_mount *mp,
const struct xfs_acl *aclp,
int len,
int max_entries)
@@ -32,11 +35,18 @@ xfs_acl_from_disk(
const struct xfs_acl_entry *ace;
unsigned int count, i;
- if (len < sizeof(*aclp))
+ if (len < sizeof(*aclp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
+
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries || XFS_ACL_SIZE(count) != len)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
@@ -145,7 +155,7 @@ xfs_get_acl(struct inode *inode, int type)
if (error != -ENOATTR)
acl = ERR_PTR(error);
} else {
- acl = xfs_acl_from_disk(xfs_acl, len,
+ acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len,
XFS_ACL_MAX_ENTRIES(ip->i_mount));
kmem_free(xfs_acl);
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f16d5f196c6b..3a688eb5c5ae 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -18,108 +18,22 @@
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
-/*
- * structure owned by writepages passed to individual writepage calls
- */
struct xfs_writepage_ctx {
- struct xfs_bmbt_irec imap;
- int fork;
+ struct iomap_writepage_ctx ctx;
unsigned int data_seq;
unsigned int cow_seq;
- struct xfs_ioend *ioend;
};
-struct block_device *
-xfs_find_bdev_for_inode(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return mp->m_rtdev_targp->bt_bdev;
- else
- return mp->m_ddev_targp->bt_bdev;
-}
-
-struct dax_device *
-xfs_find_daxdev_for_inode(
- struct inode *inode)
+static inline struct xfs_writepage_ctx *
+XFS_WPC(struct iomap_writepage_ctx *ctx)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return mp->m_rtdev_targp->bt_daxdev;
- else
- return mp->m_ddev_targp->bt_daxdev;
-}
-
-static void
-xfs_finish_page_writeback(
- struct inode *inode,
- struct bio_vec *bvec,
- int error)
-{
- struct iomap_page *iop = to_iomap_page(bvec->bv_page);
-
- if (error) {
- SetPageError(bvec->bv_page);
- mapping_set_error(inode->i_mapping, -EIO);
- }
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) > 0);
-
- if (!iop || atomic_dec_and_test(&iop->write_count))
- end_page_writeback(bvec->bv_page);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
- struct xfs_ioend *ioend,
- int error)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- bool quiet = bio_flagged(bio, BIO_QUIET);
-
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, iter_all)
- xfs_finish_page_writeback(inode, bvec, error);
- bio_put(bio);
- }
-
- if (unlikely(error && !quiet)) {
- xfs_err_ratelimited(XFS_I(inode)->i_mount,
- "writeback error on sector %llu", start);
- }
+ return container_of(ctx, struct xfs_writepage_ctx, ctx);
}
/*
* Fast and loose check if this write could update the on-disk inode size.
*/
-static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
return ioend->io_offset + ioend->io_size >
XFS_I(ioend->io_inode)->i_d.di_size;
@@ -127,7 +41,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
STATIC int
xfs_setfilesize_trans_alloc(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
struct xfs_trans *tp;
@@ -137,7 +51,7 @@ xfs_setfilesize_trans_alloc(
if (error)
return error;
- ioend->io_append_trans = tp;
+ ioend->io_private = tp;
/*
* We may pass freeze protection with a transaction. So tell lockdep
@@ -200,11 +114,11 @@ xfs_setfilesize(
STATIC int
xfs_setfilesize_ioend(
- struct xfs_ioend *ioend,
+ struct iomap_ioend *ioend,
int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_trans *tp = ioend->io_append_trans;
+ struct xfs_trans *tp = ioend->io_private;
/*
* The transaction may have been allocated in the I/O submission thread,
@@ -228,9 +142,8 @@ xfs_setfilesize_ioend(
*/
STATIC void
xfs_end_ioend(
- struct xfs_ioend *ioend)
+ struct iomap_ioend *ioend)
{
- struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
@@ -257,7 +170,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
goto done;
}
@@ -265,154 +178,86 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_fork == XFS_COW_FORK)
+ if (ioend->io_flags & IOMAP_F_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_state == XFS_EXT_UNWRITTEN)
+ else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
else
- ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
+ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
done:
- if (ioend->io_append_trans)
+ if (ioend->io_private)
error = xfs_setfilesize_ioend(ioend, error);
- list_replace_init(&ioend->io_list, &ioend_list);
- xfs_destroy_ioend(ioend, error);
-
- while (!list_empty(&ioend_list)) {
- ioend = list_first_entry(&ioend_list, struct xfs_ioend,
- io_list);
- list_del_init(&ioend->io_list);
- xfs_destroy_ioend(ioend, error);
- }
-
+ iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-xfs_ioend_can_merge(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
-{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
- return false;
- if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
- return false;
- if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
- (next->io_state == XFS_EXT_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- return true;
-}
-
-/*
* If the to be merged ioend has a preallocated transaction for file
* size updates we need to ensure the ioend it is merged into also
* has one. If it already has one we can simply cancel the transaction
* as it is guaranteed to be clean.
*/
static void
-xfs_ioend_merge_append_transactions(
- struct xfs_ioend *ioend,
- struct xfs_ioend *next)
+xfs_ioend_merge_private(
+ struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
{
- if (!ioend->io_append_trans) {
- ioend->io_append_trans = next->io_append_trans;
- next->io_append_trans = NULL;
+ if (!ioend->io_private) {
+ ioend->io_private = next->io_private;
+ next->io_private = NULL;
} else {
xfs_setfilesize_ioend(next, -ECANCELED);
}
}
-/* Try to merge adjacent completions. */
-STATIC void
-xfs_ioend_try_merge(
- struct xfs_ioend *ioend,
- struct list_head *more_ioends)
-{
- struct xfs_ioend *next_ioend;
-
- while (!list_empty(more_ioends)) {
- next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
- io_list);
- if (!xfs_ioend_can_merge(ioend, next_ioend))
- break;
- list_move_tail(&next_ioend->io_list, &ioend->io_list);
- ioend->io_size += next_ioend->io_size;
- if (next_ioend->io_append_trans)
- xfs_ioend_merge_append_transactions(ioend, next_ioend);
- }
-}
-
-/* list_sort compare function for ioends */
-static int
-xfs_ioend_compare(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_ioend *ia;
- struct xfs_ioend *ib;
-
- ia = container_of(a, struct xfs_ioend, io_list);
- ib = container_of(b, struct xfs_ioend, io_list);
- if (ia->io_offset < ib->io_offset)
- return -1;
- else if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
/* Finish all pending io completions. */
void
xfs_end_io(
struct work_struct *work)
{
- struct xfs_inode *ip;
- struct xfs_ioend *ioend;
- struct list_head completion_list;
+ struct xfs_inode *ip =
+ container_of(work, struct xfs_inode, i_ioend_work);
+ struct iomap_ioend *ioend;
+ struct list_head tmp;
unsigned long flags;
- ip = container_of(work, struct xfs_inode, i_ioend_work);
-
spin_lock_irqsave(&ip->i_ioend_lock, flags);
- list_replace_init(&ip->i_ioend_list, &completion_list);
+ list_replace_init(&ip->i_ioend_list, &tmp);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- list_sort(NULL, &completion_list, xfs_ioend_compare);
-
- while (!list_empty(&completion_list)) {
- ioend = list_first_entry(&completion_list, struct xfs_ioend,
- io_list);
+ iomap_sort_ioends(&tmp);
+ while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+ io_list))) {
list_del_init(&ioend->io_list);
- xfs_ioend_try_merge(ioend, &completion_list);
+ iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
xfs_end_ioend(ioend);
}
}
+static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
+{
+ return ioend->io_private ||
+ ioend->io_type == IOMAP_UNWRITTEN ||
+ (ioend->io_flags & IOMAP_F_SHARED);
+}
+
STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct xfs_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = bio->bi_private;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
- if (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state == XFS_EXT_UNWRITTEN ||
- ioend->io_append_trans != NULL) {
- spin_lock_irqsave(&ip->i_ioend_lock, flags);
- if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
- &ip->i_ioend_work));
- list_add_tail(&ioend->io_list, &ip->i_ioend_list);
- spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
- } else
- xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ ASSERT(xfs_ioend_needs_workqueue(ioend));
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
/*
@@ -421,19 +266,19 @@ xfs_end_bio(
*/
static bool
xfs_imap_valid(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ loff_t offset)
{
- if (offset_fsb < wpc->imap.br_startoff ||
- offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length)
return false;
/*
* If this is a COW mapping, it is sufficient to check that the mapping
* covers the offset. Be careful to check this first because the caller
* can revalidate a COW mapping without updating the data seqno.
*/
- if (wpc->fork == XFS_COW_FORK)
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
return true;
/*
@@ -443,17 +288,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
return false;
if (xfs_inode_has_cow_data(ip) &&
- wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
return false;
return true;
}
/*
* Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->imap.
+ * extent that maps offset_fsb in wpc->iomap.
*
* The current page is held locked so nothing could have removed the block
* backing offset_fsb, although it could have moved from the COW to the data
@@ -461,32 +306,38 @@ xfs_imap_valid(
*/
static int
xfs_convert_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb)
+ int whichfork,
+ loff_t offset)
{
int error;
+ unsigned *seq;
+
+ if (whichfork == XFS_COW_FORK)
+ seq = &XFS_WPC(wpc)->cow_seq;
+ else
+ seq = &XFS_WPC(wpc)->data_seq;
/*
- * Attempt to allocate whatever delalloc extent currently backs
- * offset_fsb and put the result into wpc->imap. Allocate in a loop
- * because it may take several attempts to allocate real blocks for a
- * contiguous delalloc extent if free space is sufficiently fragmented.
+ * Attempt to allocate whatever delalloc extent currently backs offset
+ * and put the result into wpc->iomap. Allocate in a loop because it
+ * may take several attempts to allocate real blocks for a contiguous
+ * delalloc extent if free space is sufficiently fragmented.
*/
do {
- error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
- &wpc->imap, wpc->fork == XFS_COW_FORK ?
- &wpc->cow_seq : &wpc->data_seq);
+ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+ &wpc->iomap, seq);
if (error)
return error;
- } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+ } while (wpc->iomap.offset + wpc->iomap.length <= offset);
return 0;
}
-STATIC int
+static int
xfs_map_blocks(
- struct xfs_writepage_ctx *wpc,
+ struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset)
{
@@ -496,6 +347,7 @@ xfs_map_blocks(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ int whichfork = XFS_DATA_FORK;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -519,7 +371,7 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb))
+ if (xfs_imap_valid(wpc, ip, offset))
return 0;
/*
@@ -541,10 +393,10 @@ retry:
xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
cow_fsb = imap.br_startoff;
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
- wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+ XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_COW_FORK;
+ whichfork = XFS_COW_FORK;
goto allocate_blocks;
}
@@ -552,7 +404,7 @@ retry:
* No COW extent overlap. Revalidate now that we may have updated
* ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (xfs_imap_valid(wpc, ip, offset_fsb)) {
+ if (xfs_imap_valid(wpc, ip, offset)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -564,11 +416,9 @@ retry:
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
- wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
+ XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- wpc->fork = XFS_DATA_FORK;
-
/* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
@@ -592,11 +442,11 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- wpc->imap = imap;
- trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+ trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
- error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ error = xfs_convert_blocks(wpc, ip, whichfork, offset);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
@@ -605,7 +455,7 @@ allocate_blocks:
* the former case, but prevent additional retries to avoid
* looping forever for the latter case.
*/
- if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
goto retry;
ASSERT(error != -EAGAIN);
return error;
@@ -616,34 +466,22 @@ allocate_blocks:
* original delalloc one. Trim the return extent to the next COW
* boundary again to force a re-lookup.
*/
- if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
- cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
- wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+ if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
+ loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
+
+ if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
+ wpc->iomap.length = cow_offset - wpc->iomap.offset;
+ }
- ASSERT(wpc->imap.br_startoff <= offset_fsb);
- ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
+ ASSERT(wpc->iomap.offset <= offset);
+ ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
+ trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
return 0;
}
-/*
- * Submit the bio for an ioend. We are passed an ioend with a bio attached to
- * it, and we submit that bio. The ioend may be used for multiple bio
- * submissions, so we only want to allocate an append transaction for the ioend
- * once. In the case of multiple bio submission, each bio will take an IO
- * reference to the ioend to ensure that the ioend completion is only done once
- * all bios have been submitted and the ioend is really done.
- *
- * If @status is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the bio and ioend
- * rather than submit it to IO. This typically only happens on a filesystem
- * shutdown.
- */
-STATIC int
-xfs_submit_ioend(
- struct writeback_control *wbc,
- struct xfs_ioend *ioend,
+static int
+xfs_prepare_ioend(
+ struct iomap_ioend *ioend,
int status)
{
unsigned int nofs_flag;
@@ -656,157 +494,24 @@ xfs_submit_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && ioend->io_fork == XFS_COW_FORK) {
+ if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- (ioend->io_fork == XFS_COW_FORK ||
- ioend->io_state != XFS_EXT_UNWRITTEN) &&
+ ((ioend->io_flags & IOMAP_F_SHARED) ||
+ ioend->io_type != IOMAP_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
- !ioend->io_append_trans)
+ !ioend->io_private)
status = xfs_setfilesize_trans_alloc(ioend);
memalloc_nofs_restore(nofs_flag);
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = xfs_end_bio;
-
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (status) {
- ioend->io_bio->bi_status = errno_to_blk_status(status);
- bio_endio(ioend->io_bio);
- return status;
- }
-
- submit_bio(ioend->io_bio);
- return 0;
-}
-
-static struct xfs_ioend *
-xfs_alloc_ioend(
- struct inode *inode,
- int fork,
- xfs_exntst_t state,
- xfs_off_t offset,
- struct block_device *bdev,
- sector_t sector,
- struct writeback_control *wbc)
-{
- struct xfs_ioend *ioend;
- struct bio *bio;
-
- bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
- bio_set_dev(bio, bdev);
- bio->bi_iter.bi_sector = sector;
- bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
-
- ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_fork = fork;
- ioend->io_state = state;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = offset;
- ioend->io_append_trans = NULL;
- ioend->io_bio = bio;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to do perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in xfs_destroy_ioend().
- */
-static struct bio *
-xfs_chain_bio(
- struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
- bio_copy_dev(new, prev);/* also copies over blkcg information */
- new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_opf = prev->bi_opf;
- new->bi_write_hint = prev->bi_write_hint;
-
- bio_chain(prev, new);
- bio_get(prev); /* for xfs_destroy_ioend */
- submit_bio(prev);
- return new;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first, otherwise finish off the current ioend and start another.
- */
-STATIC void
-xfs_add_to_ioend(
- struct inode *inode,
- xfs_off_t offset,
- struct page *page,
- struct iomap_page *iop,
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct list_head *iolist)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- struct block_device *bdev = xfs_find_bdev_for_inode(inode);
- unsigned len = i_blocksize(inode);
- unsigned poff = offset & (PAGE_SIZE - 1);
- bool merged, same_page = false;
- sector_t sector;
-
- sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
- ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
-
- if (!wpc->ioend ||
- wpc->fork != wpc->ioend->io_fork ||
- wpc->imap.br_state != wpc->ioend->io_state ||
- sector != bio_end_sector(wpc->ioend->io_bio) ||
- offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
- wpc->imap.br_state, offset, bdev, sector, wbc);
- }
-
- merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
- &same_page);
-
- if (iop && !same_page)
- atomic_inc(&iop->write_count);
-
- if (!merged) {
- if (bio_full(wpc->ioend->io_bio, len))
- wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
- bio_add_page(wpc->ioend->io_bio, page, len, poff);
- }
-
- wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, page, len);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
- struct page *page,
- unsigned int offset,
- unsigned int length)
-{
- trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
- iomap_invalidatepage(page, offset, length);
+ if (xfs_ioend_needs_workqueue(ioend))
+ ioend->io_bio->bi_end_io = xfs_end_bio;
+ return status;
}
/*
@@ -820,8 +525,8 @@ xfs_vm_invalidatepage(
* transaction as there is no space left for block reservation (typically why we
* see a ENOSPC in writeback).
*/
-STATIC void
-xfs_aops_discard_page(
+static void
+xfs_discard_page(
struct page *page)
{
struct inode *inode = page->mapping->host;
@@ -843,246 +548,14 @@ xfs_aops_discard_page(
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
- xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
-}
-
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-xfs_writepage_map(
- struct xfs_writepage_ctx *wpc,
- struct writeback_control *wbc,
- struct inode *inode,
- struct page *page,
- uint64_t end_offset)
-{
- LIST_HEAD(submit_list);
- struct iomap_page *iop = to_iomap_page(page);
- unsigned len = i_blocksize(inode);
- struct xfs_ioend *ioend, *next;
- uint64_t file_offset; /* file offset of page */
- int error = 0, count = 0, i;
-
- ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
- ASSERT(!iop || atomic_read(&iop->write_count) == 0);
-
- /*
- * Walk through the page to find areas to write back. If we run off the
- * end of the current map or find the current map invalid, grab a new
- * one.
- */
- for (i = 0, file_offset = page_offset(page);
- i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
- i++, file_offset += len) {
- if (iop && !test_bit(i, iop->uptodate))
- continue;
-
- error = xfs_map_blocks(wpc, inode, file_offset);
- if (error)
- break;
- if (wpc->imap.br_startblock == HOLESTARTBLOCK)
- continue;
- xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
- &submit_list);
- count++;
- }
-
- ASSERT(wpc->ioend || list_empty(&submit_list));
- ASSERT(PageLocked(page));
- ASSERT(!PageWriteback(page));
-
- /*
- * On error, we have to fail the ioend here because we may have set
- * pages under writeback, we have to make sure we run IO completion to
- * mark the error state of the IO appropriately, so we can't cancel the
- * ioend directly here. That means we have to mark this page as under
- * writeback if we included any blocks from it in the ioend chain so
- * that completion treats it correctly.
- *
- * If we didn't include the page in the ioend, the on error we can
- * simply discard and unlock it as there are no other users of the page
- * now. The caller will still need to trigger submission of outstanding
- * ioends on the writepage context so they are treated correctly on
- * error.
- */
- if (unlikely(error)) {
- if (!count) {
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- unlock_page(page);
- goto done;
- }
-
- /*
- * If the page was not fully cleaned, we need to ensure that the
- * higher layers come back to it correctly. That means we need
- * to keep the page dirty, and for WB_SYNC_ALL writeback we need
- * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
- * so another attempt to write this page in this writeback sweep
- * will be made.
- */
- set_page_writeback_keepwrite(page);
- } else {
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
- }
-
- unlock_page(page);
-
- /*
- * Preserve the original error if there was one, otherwise catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = xfs_submit_ioend(wbc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- end_page_writeback(page);
-done:
- mapping_set_error(page->mapping, error);
- return error;
+ iomap_invalidatepage(page, 0, PAGE_SIZE);
}
-/*
- * Write out a dirty page.
- *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
- * regular allocated space.
- */
-STATIC int
-xfs_do_writepage(
- struct page *page,
- struct writeback_control *wbc,
- void *data)
-{
- struct xfs_writepage_ctx *wpc = data;
- struct inode *inode = page->mapping->host;
- loff_t offset;
- uint64_t end_offset;
- pgoff_t end_index;
-
- trace_xfs_writepage(inode, page, 0, 0);
-
- /*
- * Refuse to write the page out if we are called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called while in a filesystem transaction.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
- * Is this page beyond the end of the file?
- *
- * The page index is less than the end_index, adjust the end_offset
- * to the highest offset that this page should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_SHIFT;
- if (page->index < end_index)
- end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
- else {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
- unsigned offset_into_page = offset & (PAGE_SIZE - 1);
-
- /*
- * Skip the page if it is fully outside i_size, e.g. due to a
- * truncate operation that is in progress. We must redirty the
- * page so that reclaim stops reclaiming it. Otherwise
- * xfs_vm_releasepage() is called on it and gets confused.
- *
- * Note that the end_index is unsigned long, it would overflow
- * if the given offset is greater than 16TB on 32-bit system
- * and if we do check the page is fully outside i_size or not
- * via "if (page->index >= end_index + 1)" as "end_index + 1"
- * will be evaluated to 0. Hence this page will be redirtied
- * and be written out repeatedly which would result in an
- * infinite loop, the user program that performs this operation
- * will hang. Instead, we can verify this situation by checking
- * if the page to write is totally beyond the i_size or if its
- * offset is just equal to the EOF.
- */
- if (page->index > end_index ||
- (page->index == end_index && offset_into_page == 0))
- goto redirty;
-
- /*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
- */
- zero_user_segment(page, offset_into_page, PAGE_SIZE);
-
- /* Adjust the end_offset to the end of file */
- end_offset = offset;
- }
-
- return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
-
-redirty:
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
-}
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .map_blocks = xfs_map_blocks,
+ .prepare_ioend = xfs_prepare_ioend,
+ .discard_page = xfs_discard_page,
+};
STATIC int
xfs_vm_writepage(
@@ -1090,12 +563,8 @@ xfs_vm_writepage(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
- ret = xfs_do_writepage(page, wbc, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1104,13 +573,9 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
- int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
- if (wpc.ioend)
- ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
- return ret;
+ return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
@@ -1118,18 +583,11 @@ xfs_dax_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return dax_writeback_mapping_range(mapping,
- xfs_find_bdev_for_inode(mapping->host), wbc);
-}
+ struct xfs_inode *ip = XFS_I(mapping->host);
-STATIC int
-xfs_vm_releasepage(
- struct page *page,
- gfp_t gfp_mask)
-{
- trace_xfs_releasepage(page->mapping->host, page, 0, 0);
- return iomap_releasepage(page, gfp_mask);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+ return dax_writeback_mapping_range(mapping,
+ xfs_inode_buftarg(ip)->bt_bdev, wbc);
}
STATIC sector_t
@@ -1152,7 +610,7 @@ xfs_vm_bmap(
*/
if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
- return iomap_bmap(mapping, block, &xfs_iomap_ops);
+ return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}
STATIC int
@@ -1160,8 +618,7 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
- trace_xfs_vm_readpage(page->mapping->host, 1);
- return iomap_readpage(page, &xfs_iomap_ops);
+ return iomap_readpage(page, &xfs_read_iomap_ops);
}
STATIC int
@@ -1171,8 +628,7 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
- trace_xfs_vm_readpages(mapping->host, nr_pages);
- return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
+ return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops);
}
static int
@@ -1181,8 +637,9 @@ xfs_iomap_swapfile_activate(
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
- return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
+ sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &xfs_read_iomap_ops);
}
const struct address_space_operations xfs_address_space_operations = {
@@ -1191,8 +648,8 @@ const struct address_space_operations xfs_address_space_operations = {
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
- .releasepage = xfs_vm_releasepage,
- .invalidatepage = xfs_vm_invalidatepage,
+ .releasepage = iomap_releasepage,
+ .invalidatepage = iomap_invalidatepage,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
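For reference, a minimal sketch of how a filesystem wires itself into the iomap writeback machinery that the hunks above now delegate to. The myfs_* callbacks are placeholders a filesystem would have to supply, and the embedded "ctx" member follows the &wpc.ctx usage in the hunks; this is an assumption-laden sketch, not the XFS implementation.

	#include <linux/iomap.h>

	struct myfs_writepage_ctx {
		struct iomap_writepage_ctx ctx;	/* generic state handed to iomap */
		/* filesystem-private mapping state cached across blocks goes here */
	};

	static const struct iomap_writeback_ops myfs_writeback_ops = {
		.map_blocks	= myfs_map_blocks,	/* placeholder: fill in the iomap for the next block */
		.prepare_ioend	= myfs_prepare_ioend,	/* placeholder: optionally override bio completion */
		.discard_page	= myfs_discard_page,	/* placeholder: undo delalloc state on failure */
	};

	static int myfs_writepages(struct address_space *mapping,
			struct writeback_control *wbc)
	{
		struct myfs_writepage_ctx wpc = { };

		/* all page walking and ioend building happens in generic code */
		return iomap_writepages(mapping, wbc, &wpc.ctx, &myfs_writeback_ops);
	}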
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 45a1ea240cbb..e0bd68419764 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -6,29 +6,9 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
-extern struct bio_set xfs_ioend_bioset;
-
-/*
- * Structure for buffered I/O completions.
- */
-struct xfs_ioend {
- struct list_head io_list; /* next ioend in chain */
- int io_fork; /* inode fork written back */
- xfs_exntst_t io_state; /* extent state */
- struct inode *io_inode; /* file being written to */
- size_t io_size; /* size of the extent */
- xfs_off_t io_offset; /* offset in the file */
- struct xfs_trans *io_append_trans;/* xact. for size update */
- struct bio *io_bio; /* bio being built */
- struct bio io_inline_bio; /* MUST BE LAST! */
-};
-
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
-extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
-extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
-
#endif /* __XFS_AOPS_H__ */
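The per-write ioend bookkeeping removed from this header is carried by the generic ioend object that ->prepare_ioend receives (struct iomap_ioend in the iomap headers, as far as this series is concerned). A hedged sketch of such a callback, mirroring the xfs_prepare_ioend tail at the top of this section; myfs_ioend_needs_workqueue() and myfs_end_bio() are placeholders:

	static int myfs_prepare_ioend(struct iomap_ioend *ioend, int status)
	{
		/* defer completion to a workqueue only when extra work is required */
		if (myfs_ioend_needs_workqueue(ioend))
			ioend->io_bio->bi_end_io = myfs_end_bio;
		return status;
	}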
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index a640a285cc52..5ff49523d8ea 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -22,6 +22,7 @@
#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_dir2.h"
+#include "xfs_error.h"
/*
* Look at all the extents for this logical region,
@@ -190,37 +191,35 @@ xfs_attr3_leaf_inactive(
*/
STATIC int
xfs_attr3_node_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp,
- int level)
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp,
+ int level)
{
- xfs_da_blkinfo_t *info;
- xfs_da_intnode_t *node;
- xfs_dablk_t child_fsb;
- xfs_daddr_t parent_blkno, child_blkno;
- int error, i;
- struct xfs_buf *child_bp;
- struct xfs_da_node_entry *btree;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_blkinfo *info;
+ xfs_dablk_t child_fsb;
+ xfs_daddr_t parent_blkno, child_blkno;
+ struct xfs_buf *child_bp;
struct xfs_da3_icnode_hdr ichdr;
+ int error, i;
/*
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return -EIO;
+ xfs_buf_corruption_error(bp);
+ return -EFSCORRUPTED;
}
- node = bp->b_addr;
- dp->d_ops->node_hdr_from_disk(&ichdr, node);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr);
parent_blkno = bp->b_bn;
if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
return 0;
}
- btree = dp->d_ops->node_tree_p(node);
- child_fsb = be32_to_cpu(btree[0].before);
+ child_fsb = be32_to_cpu(ichdr.btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
/*
@@ -235,7 +234,7 @@ xfs_attr3_node_inactive(
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da3_node_read(*trans, dp, child_fsb, -1, &child_bp,
+ error = xfs_da3_node_read(*trans, dp, child_fsb, &child_bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -258,8 +257,9 @@ xfs_attr3_node_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
break;
default:
- error = -EIO;
+ xfs_buf_corruption_error(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ error = -EFSCORRUPTED;
break;
}
if (error)
@@ -268,10 +268,16 @@ xfs_attr3_node_inactive(
/*
* Remove the subsidiary block from the cache and from the log.
*/
- error = xfs_da_get_buf(*trans, dp, 0, child_blkno, &child_bp,
- XFS_ATTR_FORK);
- if (error)
+ child_bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp,
+ child_blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0);
+ if (!child_bp)
+ return -EIO;
+ error = bp->b_error;
+ if (error) {
+ xfs_trans_brelse(*trans, child_bp);
return error;
+ }
xfs_trans_binval(*trans, child_bp);
/*
@@ -279,13 +285,15 @@ xfs_attr3_node_inactive(
* child block number.
*/
if (i + 1 < ichdr.count) {
- error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
+ struct xfs_da3_icnode_hdr phdr;
+
+ error = xfs_da3_node_read_mapped(*trans, dp,
+ parent_blkno, &bp, XFS_ATTR_FORK);
if (error)
return error;
- node = bp->b_addr;
- btree = dp->d_ops->node_tree_p(node);
- child_fsb = be32_to_cpu(btree[i + 1].before);
+ xfs_da3_node_hdr_from_disk(dp->i_mount, &phdr,
+ bp->b_addr);
+ child_fsb = be32_to_cpu(phdr.btree[i + 1].before);
xfs_trans_brelse(*trans, bp);
}
/*
@@ -310,6 +318,7 @@ xfs_attr3_root_inactive(
struct xfs_trans **trans,
struct xfs_inode *dp)
{
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_da_blkinfo *info;
struct xfs_buf *bp;
xfs_daddr_t blkno;
@@ -321,7 +330,7 @@ xfs_attr3_root_inactive(
* the extents in reverse order the extent containing
* block 0 must still be there.
*/
- error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK);
if (error)
return error;
blkno = bp->b_bn;
@@ -341,7 +350,8 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
- error = -EIO;
+ error = -EFSCORRUPTED;
+ xfs_buf_corruption_error(bp);
xfs_trans_brelse(*trans, bp);
break;
}
@@ -351,9 +361,15 @@ xfs_attr3_root_inactive(
/*
* Invalidate the incore copy of the root block.
*/
- error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
- if (error)
+ bp = xfs_trans_get_buf(*trans, mp->m_ddev_targp, blkno,
+ XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0);
+ if (!bp)
+ return -EIO;
+ error = bp->b_error;
+ if (error) {
+ xfs_trans_brelse(*trans, bp);
return error;
+ }
xfs_trans_binval(*trans, bp); /* remove from cache */
/*
* Commit the invalidate and start the next transaction.
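A recurring change in this file is how structurally bad metadata is reported: the block is logged via xfs_buf_corruption_error() and the caller sees -EFSCORRUPTED instead of a bare -EIO. A minimal sketch of that pattern inside fs/xfs, with myfs_check_block() as a hypothetical wrapper around the helpers used in the hunks above:

	static int myfs_check_block(struct xfs_trans **trans, struct xfs_buf *bp,
			bool block_is_recognised)
	{
		if (!block_is_recognised) {
			xfs_buf_corruption_error(bp);	/* logs block number and owner */
			xfs_trans_brelse(*trans, bp);	/* drop the buffer reference */
			return -EFSCORRUPTED;		/* not -EIO: the media read succeeded */
		}
		return 0;
	}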
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 00758fdc2fec..d37743bdf274 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -49,14 +49,16 @@ xfs_attr_shortform_compare(const void *a, const void *b)
* we can begin returning them to the user.
*/
static int
-xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+xfs_attr_shortform_list(
+ struct xfs_attr_list_context *context)
{
- attrlist_cursor_kern_t *cursor;
- xfs_attr_sf_sort_t *sbuf, *sbp;
- xfs_attr_shortform_t *sf;
- xfs_attr_sf_entry_t *sfe;
- xfs_inode_t *dp;
- int sbsize, nsbuf, count, i;
+ struct attrlist_cursor_kern *cursor;
+ struct xfs_attr_sf_sort *sbuf, *sbp;
+ struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_entry *sfe;
+ struct xfs_inode *dp;
+ int sbsize, nsbuf, count, i;
+ int error = 0;
ASSERT(context != NULL);
dp = context->dp;
@@ -84,6 +86,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
(XFS_ISRESET_CURSOR(cursor) &&
(dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sfe->nameval,
+ sfe->namelen)))
+ return -EFSCORRUPTED;
context->put_listent(context,
sfe->flags,
sfe->nameval,
@@ -161,10 +167,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
break;
}
}
- if (i == nsbuf) {
- kmem_free(sbuf);
- return 0;
- }
+ if (i == nsbuf)
+ goto out;
/*
* Loop putting entries into the user buffer.
@@ -174,6 +178,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
cursor->hashval = sbp->hash;
cursor->offset = 0;
}
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(sbp->name,
+ sbp->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
context->put_listent(context,
sbp->flags,
sbp->name,
@@ -183,9 +193,9 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
break;
cursor->offset++;
}
-
+out:
kmem_free(sbuf);
- return 0;
+ return error;
}
/*
@@ -213,7 +223,7 @@ xfs_attr_node_list_lookup(
ASSERT(*pbp == NULL);
cursor->blkno = 0;
for (;;) {
- error = xfs_da3_node_read(tp, dp, cursor->blkno, -1, &bp,
+ error = xfs_da3_node_read(tp, dp, cursor->blkno, &bp,
XFS_ATTR_FORK);
if (error)
return error;
@@ -229,7 +239,7 @@ xfs_attr_node_list_lookup(
goto out_corruptbuf;
}
- dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ xfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/* Tree taller than we can handle; bail out! */
if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
@@ -243,7 +253,7 @@ xfs_attr_node_list_lookup(
else
expected_level--;
- btree = dp->d_ops->node_tree_p(node);
+ btree = nodehdr.btree;
for (i = 0; i < nodehdr.count; btree++, i++) {
if (cursor->hashval <= be32_to_cpu(btree->hashval)) {
cursor->blkno = be32_to_cpu(btree->before);
@@ -258,7 +268,7 @@ xfs_attr_node_list_lookup(
return 0;
/* We can't point back to the root. */
- if (cursor->blkno == 0)
+ if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
return -EFSCORRUPTED;
}
@@ -269,6 +279,7 @@ xfs_attr_node_list_lookup(
return 0;
out_corruptbuf:
+ xfs_buf_corruption_error(bp);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
@@ -284,7 +295,7 @@ xfs_attr_node_list(
struct xfs_buf *bp;
struct xfs_inode *dp = context->dp;
struct xfs_mount *mp = dp->i_mount;
- int error;
+ int error = 0;
trace_xfs_attr_node_list(context);
@@ -298,8 +309,8 @@ xfs_attr_node_list(
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da3_node_read(context->tp, dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
+ XFS_ATTR_FORK);
if ((error != 0) && (error != -EFSCORRUPTED))
return error;
if (bp) {
@@ -358,24 +369,27 @@ xfs_attr_node_list(
*/
for (;;) {
leaf = bp->b_addr;
- xfs_attr3_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
+ if (error)
+ break;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
if (context->seen_enough || leafhdr.forw == 0)
break;
cursor->blkno = leafhdr.forw;
xfs_trans_brelse(context->tp, bp);
- error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno, -1, &bp);
+ error = xfs_attr3_leaf_read(context->tp, dp, cursor->blkno,
+ &bp);
if (error)
return error;
}
xfs_trans_brelse(context->tp, bp);
- return 0;
+ return error;
}
/*
* Copy out attribute list entries for attr_list(), for leaf attribute lists.
*/
-void
+int
xfs_attr3_leaf_list_int(
struct xfs_buf *bp,
struct xfs_attr_list_context *context)
@@ -417,7 +431,7 @@ xfs_attr3_leaf_list_int(
}
if (i == ichdr.count) {
trace_xfs_attr_list_notfound(context);
- return;
+ return 0;
}
} else {
entry = &entries[0];
@@ -457,6 +471,9 @@ xfs_attr3_leaf_list_int(
valuelen = be32_to_cpu(name_rmt->valuelen);
}
+ if (XFS_IS_CORRUPT(context->dp->i_mount,
+ !xfs_attr_namecheck(name, namelen)))
+ return -EFSCORRUPTED;
context->put_listent(context, entry->flags,
name, namelen, valuelen);
if (context->seen_enough)
@@ -464,7 +481,7 @@ xfs_attr3_leaf_list_int(
cursor->offset++;
}
trace_xfs_attr_list_leaf_end(context);
- return;
+ return 0;
}
/*
@@ -479,13 +496,13 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
trace_xfs_attr_leaf_list(context);
context->cursor->blkno = 0;
- error = xfs_attr3_leaf_read(context->tp, context->dp, 0, -1, &bp);
+ error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp);
if (error)
return error;
- xfs_attr3_leaf_list_int(bp, context);
+ error = xfs_attr3_leaf_list_int(bp, context);
xfs_trans_brelse(context->tp, bp);
- return 0;
+ return error;
}
int
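Every attr-listing path in this file now validates the on-disk name before handing it to the ->put_listent callback. A small sketch of that check, using only the helpers visible in the hunks above; myfs_emit_attr() and its argument types are placeholders for illustration:

	static int myfs_emit_attr(struct xfs_attr_list_context *context, int flags,
			unsigned char *name, int namelen, int valuelen)
	{
		/* reject garbage names before they reach the listing callback */
		if (XFS_IS_CORRUPT(context->dp->i_mount,
				   !xfs_attr_namecheck(name, namelen)))
			return -EFSCORRUPTED;
		context->put_listent(context, flags, name, namelen, valuelen);
		return 0;
	}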
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 83d24e983d4c..ee6f4229cebc 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -21,7 +21,7 @@
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
@@ -35,7 +35,7 @@ void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
- kmem_zone_free(xfs_bui_zone, buip);
+ kmem_cache_free(xfs_bui_zone, buip);
}
/*
@@ -201,7 +201,7 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
- kmem_zone_free(xfs_bud_zone, budp);
+ kmem_cache_free(xfs_bud_zone, budp);
}
static const struct xfs_item_ops xfs_bud_item_ops = {
@@ -456,7 +456,7 @@ xfs_bui_recover(
if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
xfs_bui_release(buip);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -490,7 +490,7 @@ xfs_bui_recover(
*/
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
xfs_bui_release(buip);
- return -EIO;
+ return -EFSCORRUPTED;
}
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
@@ -525,6 +525,7 @@ xfs_bui_recover(
type = bui_type;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto err_inode;
}
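This hunk is part of the tree-wide switch from the XFS kmem_zone_* wrappers to the plain slab allocator API, also visible in xfs_buf.c and xfs_dquot.c below. A self-contained sketch of the mapping, with my_item/my_zone as placeholder names:

	#include <linux/slab.h>

	struct my_item { int payload; };	/* placeholder object type */
	static struct kmem_cache *my_zone;

	static int my_cache_init(void)
	{
		/* was: kmem_zone_init(sizeof(struct my_item), "my_item") */
		my_zone = kmem_cache_create("my_item", sizeof(struct my_item),
				0, 0, NULL);
		return my_zone ? 0 : -ENOMEM;
	}

	static void my_item_free(struct my_item *item)
	{
		/* was: kmem_zone_free(my_zone, item) */
		kmem_cache_free(my_zone, item);
	}

	static void my_cache_exit(void)
	{
		/* was: kmem_zone_destroy(my_zone) */
		kmem_cache_destroy(my_zone);
	}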
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4f443703065e..2efd78a9719e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -53,15 +53,16 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
*/
int
xfs_zero_extent(
- struct xfs_inode *ip,
- xfs_fsblock_t start_fsb,
- xfs_off_t count_fsb)
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
- sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
- return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
+ return blkdev_issue_zeroout(target->bt_bdev,
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
GFP_NOFS, 0);
@@ -164,13 +165,6 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
-
- /* Zero the extent if we were asked to do so */
- if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) {
- error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
- if (error)
- return error;
- }
} else {
ap->length = 0;
}
@@ -179,29 +173,6 @@ xfs_bmap_rtalloc(
#endif /* CONFIG_XFS_RT */
/*
- * Check if the endoff is outside the last extent. If so the caller will grow
- * the allocation to a stripe unit boundary. All offsets are considered outside
- * the end of file for an empty fork, so 1 is returned in *eof in that case.
- */
-int
-xfs_bmap_eof(
- struct xfs_inode *ip,
- xfs_fileoff_t endoff,
- int whichfork,
- int *eof)
-{
- struct xfs_bmbt_irec rec;
- int error;
-
- error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
- if (error || *eof)
- return error;
-
- *eof = endoff >= rec.br_startoff + rec.br_blockcount;
- return 0;
-}
-
-/*
* Extent tree block counting routines.
*/
@@ -229,106 +200,6 @@ xfs_bmap_count_leaves(
}
/*
- * Count leaf blocks given a range of extent records originally
- * in btree format.
- */
-STATIC void
-xfs_bmap_disk_count_leaves(
- struct xfs_mount *mp,
- struct xfs_btree_block *block,
- int numrecs,
- xfs_filblks_t *count)
-{
- int b;
- xfs_bmbt_rec_t *frp;
-
- for (b = 1; b <= numrecs; b++) {
- frp = XFS_BMBT_REC_ADDR(mp, block, b);
- *count += xfs_bmbt_disk_get_blockcount(frp);
- }
-}
-
-/*
- * Recursively walks each level of a btree
- * to count total fsblocks in use.
- */
-STATIC int
-xfs_bmap_count_tree(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- struct xfs_ifork *ifp,
- xfs_fsblock_t blockno,
- int levelin,
- xfs_extnum_t *nextents,
- xfs_filblks_t *count)
-{
- int error;
- struct xfs_buf *bp, *nbp;
- int level = levelin;
- __be64 *pp;
- xfs_fsblock_t bno = blockno;
- xfs_fsblock_t nextbno;
- struct xfs_btree_block *block, *nextblock;
- int numrecs;
-
- error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
-
- if (--level) {
- /* Not at node above leaves, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- nextblock = XFS_BUF_TO_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
- xfs_trans_brelse(tp, nbp);
- }
-
- /* Dive to the next level */
- pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
- bno = be64_to_cpu(*pp);
- error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, nextents,
- count);
- if (error) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- xfs_trans_brelse(tp, bp);
- } else {
- /* count all level 1 nodes and their leaves */
- for (;;) {
- nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
- numrecs = be16_to_cpu(block->bb_numrecs);
- (*nextents) += numrecs;
- xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
- xfs_trans_brelse(tp, bp);
- if (nextbno == NULLFSBLOCK)
- break;
- bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, &bp,
- XFS_BMAP_BTREE_REF,
- &xfs_bmbt_buf_ops);
- if (error)
- return error;
- *count += 1;
- block = XFS_BUF_TO_BLOCK(bp);
- }
- }
- return 0;
-}
-
-/*
* Count fsblocks of the given fork. Delayed allocation extents are
* not counted towards the totals.
*/
@@ -340,26 +211,19 @@ xfs_bmap_count_blocks(
xfs_extnum_t *nextents,
xfs_filblks_t *count)
{
- struct xfs_mount *mp; /* file system mount structure */
- __be64 *pp; /* pointer to block address */
- struct xfs_btree_block *block; /* current btree block */
- struct xfs_ifork *ifp; /* fork structure */
- xfs_fsblock_t bno; /* block # of "block" */
- int level; /* btree level, for checking */
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur;
+ xfs_extlen_t btblocks = 0;
int error;
- bno = NULLFSBLOCK;
- mp = ip->i_mount;
*nextents = 0;
*count = 0;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+
if (!ifp)
return 0;
switch (XFS_IFORK_FORMAT(ip, whichfork)) {
- case XFS_DINODE_FMT_EXTENTS:
- *nextents = xfs_bmap_count_leaves(ifp, count);
- return 0;
case XFS_DINODE_FMT_BTREE:
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
@@ -367,26 +231,23 @@ xfs_bmap_count_blocks(
return error;
}
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ error = xfs_btree_count_blocks(cur, &btblocks);
+ xfs_btree_del_cursor(cur, error);
+ if (error)
+ return error;
+
/*
- * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+ * xfs_btree_count_blocks includes the root block contained in
+ * the inode fork in @btblocks, so subtract one because we're
+ * only interested in allocated disk blocks.
*/
- block = ifp->if_broot;
- level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
- bno = be64_to_cpu(*pp);
- ASSERT(bno != NULLFSBLOCK);
- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
-
- error = xfs_bmap_count_tree(mp, tp, ifp, bno, level,
- nextents, count);
- if (error) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)",
- XFS_ERRLEVEL_LOW, mp);
- return -EFSCORRUPTED;
- }
- return 0;
+ *count += btblocks - 1;
+
+ /* fall through */
+ case XFS_DINODE_FMT_EXTENTS:
+ *nextents = xfs_bmap_count_leaves(ifp, count);
+ break;
}
return 0;
@@ -964,8 +825,8 @@ xfs_alloc_file_space(
xfs_trans_ijoin(tp, ip, 0);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
- allocatesize_fsb, alloc_type, resblks,
- imapp, &nimaps);
+ allocatesize_fsb, alloc_type, 0, imapp,
+ &nimaps);
if (error)
goto error0;
@@ -1039,6 +900,7 @@ out_trans_cancel:
goto out_unlock;
}
+/* Caller must first wait for the completion of any pending DIOs if required. */
int
xfs_flush_unmap_range(
struct xfs_inode *ip,
@@ -1050,9 +912,6 @@ xfs_flush_unmap_range(
xfs_off_t rounding, start, end;
int error;
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(inode);
-
rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
start = round_down(offset, rounding);
end = round_up(offset + len, rounding) - 1;
@@ -1084,10 +943,6 @@ xfs_free_file_space(
if (len <= 0) /* if nothing being freed */
return 0;
- error = xfs_flush_unmap_range(ip, offset, len);
- if (error)
- return error;
-
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
@@ -1113,7 +968,8 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
+ error = iomap_zero_range(VFS_I(ip), offset, len, NULL,
+ &xfs_buffered_write_iomap_ops);
if (error)
return error;
@@ -1131,43 +987,6 @@ xfs_free_file_space(
return error;
}
-/*
- * Preallocate and zero a range of a file. This mechanism has the allocation
- * semantics of fallocate and in addition converts data in the range to zeroes.
- */
-int
-xfs_zero_file_space(
- struct xfs_inode *ip,
- xfs_off_t offset,
- xfs_off_t len)
-{
- struct xfs_mount *mp = ip->i_mount;
- uint blksize;
- int error;
-
- trace_xfs_zero_file_space(ip);
-
- blksize = 1 << mp->m_sb.sb_blocklog;
-
- /*
- * Punch a hole and prealloc the range. We use hole punch rather than
- * unwritten extent conversion for two reasons:
- *
- * 1.) Hole punch handles partial block zeroing for us.
- *
- * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
- * by virtue of the hole punch.
- */
- error = xfs_free_file_space(ip, offset, len);
- if (error || xfs_is_always_cow_inode(ip))
- return error;
-
- return xfs_alloc_file_space(ip, round_down(offset, blksize),
- round_up(offset + len, blksize) -
- round_down(offset, blksize),
- XFS_BMAPI_PREALLOC);
-}
-
static int
xfs_prepare_shift(
struct xfs_inode *ip,
@@ -1750,6 +1569,14 @@ xfs_swap_extents(
goto out_unlock;
}
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out_unlock;
+
+ error = xfs_qm_dqattach(tip);
+ if (error)
+ goto out_unlock;
+
error = xfs_swap_extent_flush(ip);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 7a78229cf1a7..9f993168b55b 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -30,8 +30,6 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
}
#endif /* CONFIG_XFS_RT */
-int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
- int whichfork, int *eof);
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
xfs_fileoff_t start_fsb, xfs_fileoff_t length);
@@ -59,8 +57,6 @@ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len, int alloc_type);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
-int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
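The xfs_bmap_count_blocks() rewrite above replaces the hand-rolled bmap-btree walk with the generic cursor-based counter. A sketch of the new btree branch, pulled together into one helper with error handling trimmed; the helper name is hypothetical, the calls are the ones from the hunk:

	static int myfs_count_fork_btree_blocks(struct xfs_mount *mp,
			struct xfs_trans *tp, struct xfs_inode *ip,
			int whichfork, xfs_filblks_t *count)
	{
		struct xfs_btree_cur	*cur;
		xfs_extlen_t		btblocks = 0;
		int			error;

		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		error = xfs_btree_count_blocks(cur, &btblocks);
		xfs_btree_del_cursor(cur, error);
		if (error)
			return error;

		/* the count includes the root kept in the inode fork; discount it */
		*count += btblocks - 1;
		return 0;
	}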
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 0abba171aa89..a0229c368e78 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -238,7 +238,7 @@ _xfs_buf_alloc(
*/
error = xfs_buf_get_maps(bp, nmaps);
if (error) {
- kmem_zone_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_zone, bp);
return NULL;
}
@@ -304,7 +304,7 @@ _xfs_buf_free_pages(
* The buffer must not be on any hash - use xfs_buf_rele instead for
* hashed and refcounted buffers
*/
-void
+static void
xfs_buf_free(
xfs_buf_t *bp)
{
@@ -328,7 +328,7 @@ xfs_buf_free(
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
xfs_buf_free_maps(bp);
- kmem_zone_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_zone, bp);
}
/*
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
unsigned nofs_flag;
/*
- * vm_map_ram() will allocate auxillary structures (e.g.
+ * vm_map_ram() will allocate auxiliary structures (e.g.
* pagetables) with GFP_KERNEL, yet we are likely to be under
* GFP_NOFS context here. Hence we need to tell memory reclaim
* that we are in such a context via PF_MEMALLOC_NOFS to prevent
@@ -949,7 +949,7 @@ xfs_buf_get_uncached(
_xfs_buf_free_pages(bp);
fail_free_buf:
xfs_buf_free_maps(bp);
- kmem_zone_free(xfs_buf_zone, bp);
+ kmem_cache_free(xfs_buf_zone, bp);
fail:
return NULL;
}
@@ -1261,8 +1261,7 @@ xfs_buf_ioapply_map(
int map,
int *buf_offset,
int *count,
- int op,
- int op_flags)
+ int op)
{
int page_index;
int total_nr_pages = bp->b_page_count;
@@ -1297,7 +1296,7 @@ next_chunk:
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
- bio_set_op_attrs(bio, op, op_flags);
+ bio->bi_opf = op;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
@@ -1342,7 +1341,6 @@ _xfs_buf_ioapply(
{
struct blk_plug plug;
int op;
- int op_flags = 0;
int offset;
int size;
int i;
@@ -1384,15 +1382,14 @@ _xfs_buf_ioapply(
dump_stack();
}
}
- } else if (bp->b_flags & XBF_READ_AHEAD) {
- op = REQ_OP_READ;
- op_flags = REQ_RAHEAD;
} else {
op = REQ_OP_READ;
+ if (bp->b_flags & XBF_READ_AHEAD)
+ op |= REQ_RAHEAD;
}
/* we only use the buffer cache for meta-data */
- op_flags |= REQ_META;
+ op |= REQ_META;
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
@@ -1404,7 +1401,7 @@ _xfs_buf_ioapply(
size = BBTOB(bp->b_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
- xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
+ xfs_buf_ioapply_map(bp, i, &offset, &size, op);
if (bp->b_error)
break;
if (size <= 0)
@@ -2063,8 +2060,9 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
- xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
- KM_ZONE_HWALIGN, NULL);
+ xfs_buf_zone = kmem_cache_create("xfs_buf",
+ sizeof(struct xfs_buf), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
if (!xfs_buf_zone)
goto out;
@@ -2077,7 +2075,7 @@ xfs_buf_init(void)
void
xfs_buf_terminate(void)
{
- kmem_zone_destroy(xfs_buf_zone);
+ kmem_cache_destroy(xfs_buf_zone);
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
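With bio_set_op_attrs() gone from _xfs_buf_ioapply(), the request operation and its modifier flags are ORed into a single value and assigned to bi_opf directly. A sketch of the read side only, as a standalone helper (myfs_set_read_opf is hypothetical; the write path carries additional flags not shown here):

	#include <linux/bio.h>

	static void myfs_set_read_opf(struct xfs_buf *bp, struct bio *bio)
	{
		int op = REQ_OP_READ;

		if (bp->b_flags & XBF_READ_AHEAD)
			op |= REQ_RAHEAD;	/* advisory readahead request */
		op |= REQ_META;			/* the buffer cache holds metadata only */

		bio->bi_opf = op;
	}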
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index f6ce17d8d848..56e081dd1d96 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -244,7 +244,6 @@ int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
-extern void xfs_buf_free(xfs_buf_t *);
extern void xfs_buf_rele(xfs_buf_t *);
/* Locking and Unlocking Buffers */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d74fbd1e9d3e..3458a1264a3f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -763,7 +763,7 @@ xfs_buf_item_init(
error = xfs_buf_item_get_format(bip, bp->b_map_count);
ASSERT(error == 0);
if (error) { /* to stop gcc throwing set-but-unused warnings */
- kmem_zone_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_zone, bip);
return error;
}
@@ -851,7 +851,7 @@ xfs_buf_item_log_segment(
* first_bit and last_bit.
*/
while ((bits_to_set - bits_set) >= NBWORD) {
- *wordp |= 0xffffffff;
+ *wordp = 0xffffffff;
bits_set += NBWORD;
wordp++;
}
@@ -939,7 +939,7 @@ xfs_buf_item_free(
{
xfs_buf_item_free_format(bip);
kmem_free(bip->bli_item.li_lv_shadow);
- kmem_zone_free(xfs_buf_item_zone, bip);
+ kmem_cache_free(xfs_buf_item_zone, bip);
}
/*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 283df898dd9f..0d3b640cf1cc 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -17,6 +17,7 @@
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
+#include "xfs_error.h"
/*
* Directory file type support functions
@@ -47,6 +48,7 @@ xfs_dir2_sf_getdents(
{
int i; /* shortform entry number */
struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_mount *mp = dp->i_mount;
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
@@ -68,15 +70,15 @@ xfs_dir2_sf_getdents(
return 0;
/*
- * Precalculate offsets for . and .. as we will always need them.
- *
- * XXX(hch): the second argument is sometimes 0 and sometimes
- * geo->datablk
+ * Precalculate offsets for "." and ".." as we will always need them.
+ * This relies on the fact that directories always start with the
+ * entries for "." and "..".
*/
dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- dp->d_ops->data_dot_offset);
+ geo->data_entry_offset);
dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- dp->d_ops->data_dotdot_offset);
+ geo->data_entry_offset +
+ xfs_dir2_data_entsize(mp, sizeof(".") - 1));
/*
* Put . entry unless we're starting past it.
@@ -91,7 +93,7 @@ xfs_dir2_sf_getdents(
* Put .. entry unless we're starting past it.
*/
if (ctx->pos <= dotdot_offset) {
- ino = dp->d_ops->sf_get_parent_ino(sfp);
+ ino = xfs_dir2_sf_get_parent_ino(sfp);
ctx->pos = dotdot_offset & 0x7fffffff;
if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
@@ -108,17 +110,21 @@ xfs_dir2_sf_getdents(
xfs_dir2_sf_get_offset(sfep));
if (ctx->pos > off) {
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
continue;
}
- ino = dp->d_ops->sf_get_ino(sfp, sfep);
- filetype = dp->d_ops->sf_get_ftype(sfep);
+ ino = xfs_dir2_sf_get_ino(mp, sfp, sfep);
+ filetype = xfs_dir2_sf_get_ftype(mp, sfep);
ctx->pos = off & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(sfep->name,
+ sfep->namelen)))
+ return -EFSCORRUPTED;
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
- xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ xfs_dir3_get_dtype(mp, filetype)))
return 0;
- sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+ sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
@@ -135,17 +141,14 @@ xfs_dir2_block_getdents(
struct dir_context *ctx)
{
struct xfs_inode *dp = args->dp; /* incore directory inode */
- xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
- xfs_dir2_data_entry_t *dep; /* block data entry */
- xfs_dir2_data_unused_t *dup; /* block unused entry */
- char *endptr; /* end of the data entries */
int error; /* error return value */
- char *ptr; /* current data entry */
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
int lock_mode;
+ unsigned int offset;
+ unsigned int end;
/*
* If the block number in the offset is out of range, we're done.
@@ -164,56 +167,55 @@ xfs_dir2_block_getdents(
* We'll skip entries before this.
*/
wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
- hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- /*
- * Set up values for the loop.
- */
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
- endptr = xfs_dir3_data_endp(geo, hdr);
/*
* Loop over the data portion of the block.
* Each object is a real entry (dep) or an unused one (dup).
*/
- while (ptr < endptr) {
+ offset = geo->data_entry_offset;
+ end = xfs_dir3_data_end_offset(geo, bp->b_addr);
+ while (offset < end) {
+ struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
+ struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
uint8_t filetype;
- dup = (xfs_dir2_data_unused_t *)ptr;
/*
* Unused, skip it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ptr += be16_to_cpu(dup->length);
+ offset += be16_to_cpu(dup->length);
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
-
/*
* Bump pointer for the next iteration.
*/
- ptr += dp->d_ops->data_entsize(dep->namelen);
+ offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen);
+
/*
* The entry is before the desired starting point, skip it.
*/
- if ((char *)dep - (char *)hdr < wantoff)
+ if (offset < wantoff)
continue;
- cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
- (char *)dep - (char *)hdr);
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, offset);
ctx->pos = cook & 0x7fffffff;
- filetype = dp->d_ops->data_get_ftype(dep);
+ filetype = xfs_dir2_data_get_ftype(dp->i_mount, dep);
/*
* If it didn't fit, set the final offset to here & return.
*/
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ goto out_rele;
+ }
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(dp->i_mount, filetype))) {
- xfs_trans_brelse(args->trans, bp);
- return 0;
- }
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
+ goto out_rele;
}
/*
@@ -222,8 +224,9 @@ xfs_dir2_block_getdents(
*/
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
0x7fffffff;
+out_rele:
xfs_trans_brelse(args->trans, bp);
- return 0;
+ return error;
}
/*
@@ -276,7 +279,7 @@ xfs_dir2_leaf_readbuf(
new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
if (new_off > *cur_off)
*cur_off = new_off;
- error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
+ error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, 0, &bp);
if (error)
goto out;
@@ -311,7 +314,8 @@ xfs_dir2_leaf_readbuf(
break;
}
if (next_ra > *ra_blk) {
- xfs_dir3_data_readahead(dp, next_ra, -2);
+ xfs_dir3_data_readahead(dp, next_ra,
+ XFS_DABUF_MAP_HOLE_OK);
*ra_blk = next_ra;
}
ra_want -= geo->fsbcount;
@@ -343,17 +347,17 @@ xfs_dir2_leaf_getdents(
size_t bufsize)
{
struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp = NULL; /* data block buffer */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
- char *ptr = NULL; /* pointer to current data */
struct xfs_da_geometry *geo = args->geo;
xfs_dablk_t rablk = 0; /* current readahead block */
xfs_dir2_off_t curoff; /* current overall offset */
int length; /* temporary length value */
int byteoff; /* offset in current block */
int lock_mode;
+ unsigned int offset = 0;
int error = 0; /* error return value */
/*
@@ -380,7 +384,7 @@ xfs_dir2_leaf_getdents(
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
- if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+ if (!bp || offset >= geo->blksize) {
if (bp) {
xfs_trans_brelse(args->trans, bp);
bp = NULL;
@@ -393,36 +397,35 @@ xfs_dir2_leaf_getdents(
if (error || !bp)
break;
- hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
*/
- ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ offset = geo->data_entry_offset;
byteoff = xfs_dir2_byte_to_off(geo, curoff);
/*
* Skip past the header.
*/
if (byteoff == 0)
- curoff += dp->d_ops->data_entry_offset;
+ curoff += geo->data_entry_offset;
/*
* Skip past entries until we reach our offset.
*/
else {
- while ((char *)ptr - (char *)hdr < byteoff) {
- dup = (xfs_dir2_data_unused_t *)ptr;
+ while (offset < byteoff) {
+ dup = bp->b_addr + offset;
if (be16_to_cpu(dup->freetag)
== XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
- ptr += length;
+ offset += length;
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
- length =
- dp->d_ops->data_entsize(dep->namelen);
- ptr += length;
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp,
+ dep->namelen);
+ offset += length;
}
/*
* Now set our real offset.
@@ -430,32 +433,38 @@ xfs_dir2_leaf_getdents(
curoff =
xfs_dir2_db_off_to_byte(geo,
xfs_dir2_byte_to_db(geo, curoff),
- (char *)ptr - (char *)hdr);
- if (ptr >= (char *)hdr + geo->blksize) {
+ offset);
+ if (offset >= geo->blksize)
continue;
- }
}
}
+
/*
- * We have a pointer to an entry.
- * Is it a live one?
+ * We have a pointer to an entry. Is it a live one?
*/
- dup = (xfs_dir2_data_unused_t *)ptr;
+ dup = bp->b_addr + offset;
+
/*
* No, it's unused, skip over it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
- ptr += length;
+ offset += length;
curoff += length;
continue;
}
- dep = (xfs_dir2_data_entry_t *)ptr;
- length = dp->d_ops->data_entsize(dep->namelen);
- filetype = dp->d_ops->data_get_ftype(dep);
+ dep = bp->b_addr + offset;
+ length = xfs_dir2_data_entsize(mp, dep->namelen);
+ filetype = xfs_dir2_data_get_ftype(mp, dep);
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
+ if (XFS_IS_CORRUPT(dp->i_mount,
+ !xfs_dir2_namecheck(dep->name,
+ dep->namelen))) {
+ error = -EFSCORRUPTED;
+ break;
+ }
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype)))
@@ -464,7 +473,7 @@ xfs_dir2_leaf_getdents(
/*
* Advance to next entry in the block.
*/
- ptr += length;
+ offset += length;
curoff += length;
/* bufsize may have just been a guess; don't go negative */
bufsize = bufsize > length ? bufsize - length : 0;
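The getdents paths above now walk directory data blocks by byte offset from bp->b_addr instead of advancing a raw char pointer. A sketch of the core loop shape, with the dir_emit step reduced to a comment; myfs_walk_data_block() is a placeholder wrapper around the calls shown in the hunks:

	static void myfs_walk_data_block(struct xfs_mount *mp,
			struct xfs_da_geometry *geo, struct xfs_buf *bp)
	{
		unsigned int offset = geo->data_entry_offset;
		unsigned int end = xfs_dir3_data_end_offset(geo, bp->b_addr);

		while (offset < end) {
			struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
			struct xfs_dir2_data_entry *dep = bp->b_addr + offset;

			if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
				offset += be16_to_cpu(dup->length);	/* unused space, skip */
				continue;
			}
			/* live entry: dep->name, dep->namelen and dep->inumber are valid here */
			offset += xfs_dir2_data_entsize(mp, dep->namelen);
		}
	}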
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8ec7aab89044..cae613620175 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -13,6 +13,7 @@
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
@@ -70,7 +71,10 @@ xfs_trim_extents(
error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
if (error)
goto out_del_cursor;
- XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
+ if (XFS_IS_CORRUPT(mp, i != 1)) {
+ error = -EFSCORRUPTED;
+ goto out_del_cursor;
+ }
ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
/*
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index aeb95e7391c1..2bff21ca9d78 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -48,7 +48,7 @@ static struct lock_class_key xfs_dquot_project_class;
*/
void
xfs_qm_dqdestroy(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
ASSERT(list_empty(&dqp->q_lru));
@@ -56,7 +56,7 @@ xfs_qm_dqdestroy(
mutex_destroy(&dqp->q_qlock);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
- kmem_zone_free(xfs_qm_dqzone, dqp);
+ kmem_cache_free(xfs_qm_dqzone, dqp);
}
/*
@@ -113,8 +113,8 @@ xfs_qm_adjust_dqlimits(
*/
void
xfs_qm_adjust_dqtimers(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *d)
{
ASSERT(d->d_id);
@@ -305,8 +305,8 @@ xfs_dquot_disk_alloc(
/* Create the block mapping. */
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
- XFS_QM_DQALLOC_SPACE_RES(mp), &map, &nmaps);
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map,
+ &nmaps);
if (error)
return error;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -497,7 +497,7 @@ xfs_dquot_from_disk(
struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
/* copy everything from disk dquot to the incore dquot */
- memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+ memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot));
/*
* Reservation counters are defined as reservation plus current usage
@@ -833,7 +833,7 @@ xfs_qm_id_for_quotatype(
case XFS_DQ_GROUP:
return ip->i_d.di_gid;
case XFS_DQ_PROJ:
- return xfs_get_projid(ip);
+ return ip->i_d.di_projid;
}
ASSERT(0);
return 0;
@@ -989,7 +989,7 @@ xfs_qm_dqput(
*/
void
xfs_qm_dqrele(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
if (!dqp)
return;
@@ -1018,8 +1018,8 @@ xfs_qm_dqflush_done(
struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
- xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip;
+ struct xfs_dquot *dqp = qip->qli_dquot;
struct xfs_ail *ailp = lip->li_ailp;
/*
@@ -1126,11 +1126,11 @@ xfs_qm_dqflush(
xfs_buf_relse(bp);
xfs_dqfunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* This is the only portion of data that needs to persist */
- memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+ memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot));
/*
* Clear the dirty field and remember the flush lsn for later use.
@@ -1188,8 +1188,8 @@ out_unlock:
*/
void
xfs_dqlock2(
- xfs_dquot_t *d1,
- xfs_dquot_t *d2)
+ struct xfs_dquot *d1,
+ struct xfs_dquot *d2)
{
if (d1 && d2) {
ASSERT(d1 != d2);
@@ -1211,20 +1211,22 @@ xfs_dqlock2(
int __init
xfs_qm_init(void)
{
- xfs_qm_dqzone =
- kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+ xfs_qm_dqzone = kmem_cache_create("xfs_dquot",
+ sizeof(struct xfs_dquot),
+ 0, 0, NULL);
if (!xfs_qm_dqzone)
goto out;
- xfs_qm_dqtrxzone =
- kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+ xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx",
+ sizeof(struct xfs_dquot_acct),
+ 0, 0, NULL);
if (!xfs_qm_dqtrxzone)
goto out_free_dqzone;
return 0;
out_free_dqzone:
- kmem_zone_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_qm_dqzone);
out:
return -ENOMEM;
}
@@ -1232,8 +1234,8 @@ out:
void
xfs_qm_exit(void)
{
- kmem_zone_destroy(xfs_qm_dqtrxzone);
- kmem_zone_destroy(xfs_qm_dqzone);
+ kmem_cache_destroy(xfs_qm_dqtrxzone);
+ kmem_cache_destroy(xfs_qm_dqzone);
}
/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4fe85709d55d..fe3e46df604b 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -30,33 +30,36 @@ enum {
/*
* The incore dquot structure
*/
-typedef struct xfs_dquot {
- uint dq_flags; /* various flags (XFS_DQ_*) */
- struct list_head q_lru; /* global free list of dquots */
- struct xfs_mount*q_mount; /* filesystem this relates to */
- uint q_nrefs; /* # active refs from inodes */
- xfs_daddr_t q_blkno; /* blkno of dquot buffer */
- int q_bufoffset; /* off of dq in buffer (# dquots) */
- xfs_fileoff_t q_fileoffset; /* offset in quotas file */
-
- xfs_disk_dquot_t q_core; /* actual usage & quotas */
- xfs_dq_logitem_t q_logitem; /* dquot log item */
- xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
- xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
- xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
- xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
- xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
- int64_t q_low_space[XFS_QLOWSP_MAX];
- struct mutex q_qlock; /* quota lock */
- struct completion q_flush; /* flush completion queue */
- atomic_t q_pincount; /* dquot pin count */
- wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
-} xfs_dquot_t;
+struct xfs_dquot {
+ uint dq_flags;
+ struct list_head q_lru;
+ struct xfs_mount *q_mount;
+ uint q_nrefs;
+ xfs_daddr_t q_blkno;
+ int q_bufoffset;
+ xfs_fileoff_t q_fileoffset;
+
+ struct xfs_disk_dquot q_core;
+ struct xfs_dq_logitem q_logitem;
+ /* total regular nblks used+reserved */
+ xfs_qcnt_t q_res_bcount;
+ /* total inos allocd+reserved */
+ xfs_qcnt_t q_res_icount;
+ /* total realtime blks used+reserved */
+ xfs_qcnt_t q_res_rtbcount;
+ xfs_qcnt_t q_prealloc_lo_wmark;
+ xfs_qcnt_t q_prealloc_hi_wmark;
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct mutex q_qlock;
+ struct completion q_flush;
+ atomic_t q_pincount;
+ struct wait_queue_head q_pinwait;
+};
/*
* Lock hierarchy for q_qlock:
* XFS_QLOCK_NORMAL is the implicit default,
- * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
*/
enum {
XFS_QLOCK_NORMAL = 0,
@@ -64,21 +67,21 @@ enum {
};
/*
- * Manage the q_flush completion queue embedded in the dquot. This completion
+ * Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
* disk.
*/
-static inline void xfs_dqflock(xfs_dquot_t *dqp)
+static inline void xfs_dqflock(struct xfs_dquot *dqp)
{
wait_for_completion(&dqp->q_flush);
}
-static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp)
{
return try_wait_for_completion(&dqp->q_flush);
}
-static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+static inline void xfs_dqfunlock(struct xfs_dquot *dqp)
{
complete(&dqp->q_flush);
}
@@ -112,7 +115,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
}
}
-static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
{
switch (type & XFS_DQ_ALLTYPES) {
case XFS_DQ_USER:
@@ -147,31 +150,30 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
-extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
- xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
- struct xfs_dquot *);
-extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
- uint type);
-extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
+int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
+void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
+void xfs_qm_adjust_dqtimers(struct xfs_mount *mp,
+ struct xfs_disk_dquot *d);
+void xfs_qm_adjust_dqlimits(struct xfs_mount *mp,
+ struct xfs_dquot *d);
+xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type);
+int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
uint type, bool can_alloc,
struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
- bool can_alloc,
- struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp);
+int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
uint type, struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
- xfs_dqid_t id, uint type,
- struct xfs_dquot **dqpp);
-extern void xfs_qm_dqput(xfs_dquot_t *);
+int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, uint type,
+ struct xfs_dquot **dqpp);
+void xfs_qm_dqput(struct xfs_dquot *dqp);
-extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 1aed34ccdabc..3bb19e556ade 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -11,25 +11,27 @@ struct xfs_trans;
struct xfs_mount;
struct xfs_qoff_logitem;
-typedef struct xfs_dq_logitem {
- struct xfs_log_item qli_item; /* common portion */
- struct xfs_dquot *qli_dquot; /* dquot ptr */
- xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
-} xfs_dq_logitem_t;
+struct xfs_dq_logitem {
+ struct xfs_log_item qli_item; /* common portion */
+ struct xfs_dquot *qli_dquot; /* dquot ptr */
+ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+};
-typedef struct xfs_qoff_logitem {
- struct xfs_log_item qql_item; /* common portion */
- struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+struct xfs_qoff_logitem {
+ struct xfs_log_item qql_item; /* common portion */
+ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
unsigned int qql_flags;
-} xfs_qoff_logitem_t;
+};
-extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
-extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
- struct xfs_qoff_logitem *, uint);
-extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *, uint);
-extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *);
+void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
+struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags);
+struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
+ uint flags);
+void xfs_trans_log_quotaoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */
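The quota headers drop the _t struct typedefs and the redundant "extern" on prototypes, and give every parameter a name. The resulting declaration style, copied from the hunk above rather than invented:

	struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
			struct xfs_qoff_logitem *start, uint flags);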
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 849fd4476950..331765afc53e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -257,7 +257,7 @@ xfs_errortag_test(
xfs_warn_ratelimited(mp,
"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, mp->m_fsname);
+ expression, file, line, mp->m_super->s_id);
return true;
}
@@ -329,19 +329,40 @@ xfs_corruption_error(
const char *tag,
int level,
struct xfs_mount *mp,
- void *buf,
+ const void *buf,
size_t bufsize,
const char *filename,
int linenum,
xfs_failaddr_t failaddr)
{
- if (level <= xfs_error_level)
+ if (buf && level <= xfs_error_level)
xfs_hex_dump(buf, bufsize);
xfs_error_report(tag, level, mp, filename, linenum, failaddr);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
/*
+ * Complain about the kinds of metadata corruption that we can't detect from a
+ * verifier, such as incorrect inter-block relationship data. Does not set
+ * bp->b_error.
+ */
+void
+xfs_buf_corruption_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, %s block 0x%llx",
+ __return_address, bp->b_ops->name, bp->b_bn);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
+/*
* Warnings specifically for verifier errors. Differentiate CRC vs. invalid
* values, and omit the stack trace unless the error level is tuned high.
*/
@@ -350,7 +371,7 @@ xfs_buf_verifier_error(
struct xfs_buf *bp,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
@@ -402,7 +423,7 @@ xfs_inode_verifier_error(
struct xfs_inode *ip,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 602aa7d62b66..31a5d321ba9a 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -12,16 +12,17 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, void *buf, size_t bufsize,
+ struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
+void xfs_buf_corruption_error(struct xfs_buf *bp);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
extern void xfs_verifier_error(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
@@ -37,32 +38,6 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
/* Dump 128 bytes of any corrupt buffer */
#define XFS_CORRUPTION_DUMP_LEN (128)
-/*
- * Macros to set EFSCORRUPTED & return/branch.
- */
-#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
- { \
- int fs_is_ok = (x); \
- ASSERT(fs_is_ok); \
- if (unlikely(!fs_is_ok)) { \
- XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
- XFS_ERRLEVEL_LOW, mp); \
- error = -EFSCORRUPTED; \
- goto l; \
- } \
- }
-
-#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
- { \
- int fs_is_ok = (x); \
- ASSERT(fs_is_ok); \
- if (unlikely(!fs_is_ok)) { \
- XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
- XFS_ERRLEVEL_LOW, mp); \
- return -EFSCORRUPTED; \
- } \
- }
-
#ifdef DEBUG
extern int xfs_errortag_init(struct xfs_mount *mp);
extern void xfs_errortag_del(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2183d87be4cf..3991e59cfd18 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -367,7 +367,7 @@ restart:
* If this is a metadata allocation, try to reuse the busy
* extent instead of trimming the allocation.
*/
- if (!xfs_alloc_is_userdata(args->datatype) &&
+ if (!(args->datatype & XFS_ALLOC_USERDATA) &&
!(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
if (!xfs_extent_busy_update_extent(args->mp, args->pag,
busyp, fbno, flen,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc41a041..6ea847f6e298 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -21,7 +21,7 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_efi_zone;
kmem_zone_t *xfs_efd_zone;
@@ -39,7 +39,7 @@ xfs_efi_item_free(
if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
kmem_free(efip);
else
- kmem_zone_free(xfs_efi_zone, efip);
+ kmem_cache_free(xfs_efi_zone, efip);
}
/*
@@ -228,6 +228,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -243,7 +244,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
kmem_free(efdp);
else
- kmem_zone_free(xfs_efd_zone, efdp);
+ kmem_cache_free(xfs_efd_zone, efdp);
}
/*
@@ -624,7 +625,7 @@ xfs_efi_recover(
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1ffb179f35d2..c93250108952 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -188,7 +188,8 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+ ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
+ is_sync_kiocb(iocb));
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -215,7 +216,7 @@ xfs_file_dax_read(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
}
- ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
@@ -351,7 +352,7 @@ restart:
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
- NULL, &xfs_iomap_ops);
+ NULL, &xfs_buffered_write_iomap_ops);
if (error)
return error;
} else
@@ -486,8 +487,7 @@ xfs_file_dio_aio_write(
int unaligned_io = 0;
int iolock;
size_t count = iov_iter_count(from);
- struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
/* DIO must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
@@ -547,15 +547,13 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
- ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
-
/*
- * If unaligned, this is the only IO in-flight. If it has not yet
- * completed, wait on it before we release the iolock to prevent
- * subsequent overlapping IO.
+ * If unaligned, this is the only IO in-flight. Wait on it before we
+ * release the iolock to prevent subsequent overlapping IO.
*/
- if (ret == -EIOCBQUEUED && unaligned_io)
- inode_dio_wait(inode);
+ ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+ &xfs_dio_write_ops,
+ is_sync_kiocb(iocb) || unaligned_io);
out:
xfs_iunlock(ip, iolock);
@@ -594,7 +592,7 @@ xfs_file_dax_write(
count = iov_iter_count(from);
trace_xfs_file_dax_write(ip, count, pos);
- ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
@@ -641,7 +639,8 @@ write_retry:
current->backing_dev_info = inode_to_bdi(inode);
trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
- ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops);
if (likely(ret >= 0))
iocb->ki_pos += ret;
@@ -818,6 +817,36 @@ xfs_file_fallocate(
if (error)
goto out_unlock;
+ /*
+ * Must wait for all AIO to complete before we continue as AIO can
+ * change the file size on completion without holding any locks we
+ * currently hold. We must do this first because AIO can update both
+ * the on disk and in memory inode sizes, and the operations that follow
+ * require the in-memory size to be fully up-to-date.
+ */
+ inode_dio_wait(inode);
+
+ /*
+ * Now AIO and DIO have drained, we flush and (if necessary) invalidate
+ * the cached range over the first operation we are about to run.
+ *
+ * We care about zero and collapse here because they both run a hole
+ * punch over the range first. Because that can zero data, and the range
+ * of invalidation for the shift operations is much larger, we still do
+ * the required flush for collapse in xfs_prepare_shift().
+ *
+ * Insert has the same range requirements as collapse, and we extend the
+ * file first which can zero data. Hence insert has the same
+ * flush/invalidate requirements as collapse and so they are both
+ * handled at the right time by xfs_prepare_shift().
+ */
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)) {
+ error = xfs_flush_unmap_range(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -881,16 +910,30 @@ xfs_file_fallocate(
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- error = xfs_zero_file_space(ip, offset, len);
+ /*
+ * Punch a hole and prealloc the range. We use a hole
+ * punch rather than unwritten extent conversion for two
+ * reasons:
+ *
+ * 1.) Hole punch handles partial block zeroing for us.
+ * 2.) If prealloc returns ENOSPC, the file range is
+ * still zero-valued by virtue of the hole punch.
+ */
+ unsigned int blksize = i_blocksize(inode);
+
+ trace_xfs_zero_file_space(ip);
+
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
error = xfs_reflink_unshare(ip, offset, len);
if (error)
goto out_unlock;
-
- if (!xfs_is_always_cow_inode(ip)) {
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
- }
} else {
/*
* If always_cow mode we can't use preallocations and
@@ -900,12 +943,14 @@ xfs_file_fallocate(
error = -EOPNOTSUPP;
goto out_unlock;
}
+ }
+ if (!xfs_is_always_cow_inode(ip)) {
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
+ if (error)
+ goto out_unlock;
}
- if (error)
- goto out_unlock;
}
if (file->f_flags & O_DSYNC)
@@ -1059,7 +1104,7 @@ xfs_dir_open(
*/
mode = xfs_ilock_data_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- error = xfs_dir3_data_readahead(ip, 0, -1);
+ error = xfs_dir3_data_readahead(ip, 0, 0);
xfs_iunlock(ip, mode);
return error;
}
@@ -1156,12 +1201,16 @@ __xfs_filemap_fault(
if (IS_DAX(inode)) {
pfn_t pfn;
- ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
+ (write_fault && !vmf->cow_page) ?
+ &xfs_direct_write_iomap_ops :
+ &xfs_read_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
- ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf,
+ &xfs_buffered_write_iomap_ops);
else
ret = filemap_fault(vmf);
}
@@ -1225,22 +1274,22 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
STATIC int
xfs_file_mmap(
- struct file *filp,
- struct vm_area_struct *vma)
+ struct file *file,
+ struct vm_area_struct *vma)
{
- struct dax_device *dax_dev;
+ struct inode *inode = file_inode(file);
+ struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
- dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(vma, target->bt_daxdev))
return -EOPNOTSUPP;
- file_accessed(filp);
+ file_accessed(file);
vma->vm_ops = &xfs_file_vm_ops;
- if (IS_DAX(file_inode(filp)))
+ if (IS_DAX(inode))
vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 574a7a8b4736..5f12b5d8527a 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -18,6 +18,7 @@
#include "xfs_trace.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
+#include "xfs_filestream.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
@@ -374,7 +375,7 @@ xfs_filestream_new_ag(
startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- if (xfs_alloc_is_userdata(ap->datatype))
+ if (ap->datatype & XFS_ALLOC_USERDATA)
flags |= XFS_PICK_USERDATA;
if (ap->tp->t_flags & XFS_TRANS_LOWMODE)
flags |= XFS_PICK_LOWSPACE;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index d082143feb5a..918456ca29e1 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -146,6 +146,7 @@ xfs_fsmap_owner_from_rmap(
dest->fmr_owner = XFS_FMR_OWN_FREE;
break;
default:
+ ASSERT(0);
return -EFSCORRUPTED;
}
return 0;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 944add5ff8e0..8dc2e5414276 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -44,7 +44,7 @@ xfs_inode_alloc(
if (!ip)
return NULL;
if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_zone_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_zone, ip);
return NULL;
}
@@ -104,7 +104,7 @@ xfs_inode_free_callback(
ip->i_itemp = NULL;
}
- kmem_zone_free(xfs_inode_zone, ip);
+ kmem_cache_free(xfs_inode_zone, ip);
}
static void
@@ -1419,7 +1419,7 @@ xfs_inode_match_id(
return 0;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) != eofb->eof_prid)
+ ip->i_d.di_projid != eofb->eof_prid)
return 0;
return 1;
@@ -1443,7 +1443,7 @@ xfs_inode_match_id_union(
return 1;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) == eofb->eof_prid)
+ ip->i_d.di_projid == eofb->eof_prid)
return 1;
return 0;
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 3ebd1b7f49d8..490fee22b878 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -55,7 +55,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
- kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip));
+ kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip));
}
static const struct xfs_item_ops xfs_icreate_item_ops = {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 18f4b262e61c..401da197f012 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,6 +55,12 @@ xfs_extlen_t
xfs_get_extsz_hint(
struct xfs_inode *ip)
{
+ /*
+ * No point in aligning allocations if we need to COW to actually
+ * write to them.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ return 0;
if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
return ip->i_d.di_extsize;
if (XFS_IS_REALTIME_INODE(ip))
@@ -809,7 +815,7 @@ xfs_ialloc(
ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
inode->i_rdev = rdev;
- xfs_set_projid(ip, prid);
+ ip->i_d.di_projid = prid;
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
@@ -845,8 +851,7 @@ xfs_ialloc(
inode_set_iversion(inode, 1);
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
- ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
- ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+ ip->i_d.di_crtime = tv;
}
@@ -1418,7 +1423,7 @@ xfs_link(
* the tree quota mechanism could be circumvented.
*/
if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ tdp->i_d.di_projid != sip->i_d.di_projid)) {
error = -EXDEV;
goto error_return;
}
@@ -2130,8 +2135,10 @@ xfs_iunlink_update_bucket(
* passed in because either we're adding or removing ourselves from the
* head of the list.
*/
- if (old_value == new_agino)
+ if (old_value == new_agino) {
+ xfs_buf_corruption_error(agibp);
return -EFSCORRUPTED;
+ }
agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
offset = offsetof(struct xfs_agi, agi_unlinked) +
@@ -2194,6 +2201,8 @@ xfs_iunlink_update_inode(
/* Make sure the old pointer isn't garbage. */
old_value = be32_to_cpu(dip->di_next_unlinked);
if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
goto out;
}
@@ -2205,8 +2214,11 @@ xfs_iunlink_update_inode(
*/
*old_next_agino = old_value;
if (old_value == next_agino) {
- if (next_agino != NULLAGINO)
+ if (next_agino != NULLAGINO) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
+ }
goto out;
}
@@ -2257,8 +2269,10 @@ xfs_iunlink(
*/
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, agno, next_agino))
+ !xfs_verify_agino_or_null(mp, agno, next_agino)) {
+ xfs_buf_corruption_error(agibp);
return -EFSCORRUPTED;
+ }
if (next_agino != NULLAGINO) {
struct xfs_perag *pag;
@@ -3196,6 +3210,7 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
+ struct xfs_buf *agibp;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3270,7 +3285,7 @@ xfs_rename(
* tree quota mechanism would be circumvented.
*/
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
error = -EXDEV;
goto out_trans_cancel;
}
@@ -3327,7 +3342,6 @@ xfs_rename(
goto out_trans_cancel;
xfs_bumplink(tp, wip);
- xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
VFS_I(wip)->i_state &= ~I_LINKABLE;
}
@@ -3361,6 +3375,22 @@ xfs_rename(
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
+
+ /*
+ * Check whether the replace operation will need to allocate
+ * blocks. This happens when the shortform directory lacks
+ * space and we have to convert it to a block format directory.
+ * When more blocks are necessary, we must lock the AGI first
+ * to preserve locking order (AGI -> AGF).
+ */
+ if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
+ error = xfs_read_agi(mp, tp,
+ XFS_INO_TO_AGNO(mp, target_ip->i_ino),
+ &agibp);
+ if (error)
+ goto out_trans_cancel;
+ }
+
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f95a03..492e53992fa9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -37,9 +37,6 @@ typedef struct xfs_inode {
struct xfs_ifork *i_cowfp; /* copy on write extents */
struct xfs_ifork i_df; /* data fork */
- /* operations vectors */
- const struct xfs_dir_ops *d_ops; /* directory ops vector */
-
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
@@ -177,30 +174,11 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
return ret;
}
-/*
- * Project quota id helpers (previously projid was 16bit only
- * and using two 16bit values to hold new 32bit projid was chosen
- * to retain compatibility with "old" filesystems).
- */
-static inline prid_t
-xfs_get_projid(struct xfs_inode *ip)
-{
- return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
-}
-
-static inline void
-xfs_set_projid(struct xfs_inode *ip,
- prid_t projid)
-{
- ip->i_d.di_projid_hi = (uint16_t) (projid >> 16);
- ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff);
-}
-
static inline prid_t
xfs_get_initial_prid(struct xfs_inode *dp)
{
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- return xfs_get_projid(dp);
+ return dp->i_d.di_projid;
return XFS_PROJID_DEFAULT;
}
@@ -220,6 +198,13 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip)
}
/*
+ * Return the buftarg used for data allocations on a given inode.
+ */
+#define xfs_inode_buftarg(ip) \
+ (XFS_IS_REALTIME_INODE(ip) ? \
+ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+
+/*
* In-core inode flags.
*/
#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index bb8f076805b9..8bd5d0de6321 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_error.h"
#include <linux/iversion.h>
@@ -309,8 +310,8 @@ xfs_inode_to_log_dinode(
to->di_format = from->di_format;
to->di_uid = from->di_uid;
to->di_gid = from->di_gid;
- to->di_projid_lo = from->di_projid_lo;
- to->di_projid_hi = from->di_projid_hi;
+ to->di_projid_lo = from->di_projid & 0xffff;
+ to->di_projid_hi = from->di_projid >> 16;
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
@@ -340,8 +341,8 @@ xfs_inode_to_log_dinode(
if (from->di_version == 3) {
to->di_changecount = inode_peek_iversion(inode);
- to->di_crtime.t_sec = from->di_crtime.t_sec;
- to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+ to->di_crtime.t_sec = from->di_crtime.tv_sec;
+ to->di_crtime.t_nsec = from->di_crtime.tv_nsec;
to->di_flags2 = from->di_flags2;
to->di_cowextsize = from->di_cowextsize;
to->di_ino = ip->i_ino;
@@ -666,7 +667,7 @@ xfs_inode_item_destroy(
xfs_inode_t *ip)
{
kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
- kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+ kmem_cache_free(xfs_ili_zone, ip->i_itemp);
}
@@ -828,8 +829,10 @@ xfs_inode_item_format_convert(
{
struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
- if (buf->i_len != sizeof(*in_f32))
+ if (buf->i_len != sizeof(*in_f32)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
in_f->ilf_type = in_f32->ilf_type;
in_f->ilf_size = in_f32->ilf_size;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d58f0d6a699e..7b35d62ede9f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -33,6 +33,8 @@
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_health.h"
+#include "xfs_reflink.h"
+#include "xfs_ioctl.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -290,82 +292,6 @@ xfs_readlink_by_handle(
return error;
}
-int
-xfs_set_dmattrs(
- xfs_inode_t *ip,
- uint evmask,
- uint16_t state)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_trans_t *tp;
- int error;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- ip->i_d.di_dmevmask = evmask;
- ip->i_d.di_dmstate = state;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp);
-
- return error;
-}
-
-STATIC int
-xfs_fssetdm_by_handle(
- struct file *parfilp,
- void __user *arg)
-{
- int error;
- struct fsdmidata fsd;
- xfs_fsop_setdm_handlereq_t dmhreq;
- struct dentry *dentry;
-
- if (!capable(CAP_MKNOD))
- return -EPERM;
- if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
- return -EFAULT;
-
- error = mnt_want_write_file(parfilp);
- if (error)
- return error;
-
- dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry)) {
- mnt_drop_write_file(parfilp);
- return PTR_ERR(dentry);
- }
-
- if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
- error = -EPERM;
- goto out;
- }
-
- if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
- error = -EFAULT;
- goto out;
- }
-
- error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
- fsd.fsd_dmstate);
-
- out:
- mnt_drop_write_file(parfilp);
- dput(dentry);
- return error;
-}
-
STATIC int
xfs_attrlist_by_handle(
struct file *parfilp,
@@ -588,13 +514,12 @@ xfs_attrmulti_by_handle(
int
xfs_ioc_space(
struct file *filp,
- unsigned int cmd,
xfs_flock64_t *bf)
{
struct inode *inode = file_inode(filp);
struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
- enum xfs_prealloc_flags flags = 0;
+ enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
@@ -607,6 +532,9 @@ xfs_ioc_space(
if (!S_ISREG(inode->i_mode))
return -EINVAL;
+ if (xfs_is_always_cow_inode(ip))
+ return -EOPNOTSUPP;
+
if (filp->f_flags & O_DSYNC)
flags |= XFS_PREALLOC_SYNC;
if (filp->f_mode & FMODE_NOCMTIME)
@@ -620,6 +548,7 @@ xfs_ioc_space(
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
+ inode_dio_wait(inode);
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
@@ -635,73 +564,21 @@ xfs_ioc_space(
goto out_unlock;
}
- /*
- * length of <= 0 for resv/unresv/zero is invalid. length for
- * alloc/free is ignored completely and we have no idea what userspace
- * might have set it to, so set it to zero to allow range
- * checks to pass.
- */
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if (bf->l_len <= 0) {
- error = -EINVAL;
- goto out_unlock;
- }
- break;
- default:
- bf->l_len = 0;
- break;
- }
-
- if (bf->l_start < 0 ||
- bf->l_start > inode->i_sb->s_maxbytes ||
- bf->l_start + bf->l_len < 0 ||
- bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
+ if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) {
error = -EINVAL;
goto out_unlock;
}
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- flags |= XFS_PREALLOC_SET;
- error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
- break;
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- flags |= XFS_PREALLOC_SET;
- error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
- XFS_BMAPI_PREALLOC);
- break;
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
- break;
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP:
- case XFS_IOC_FREESP64:
- flags |= XFS_PREALLOC_CLEAR;
- if (bf->l_start > XFS_ISIZE(ip)) {
- error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
- bf->l_start - XFS_ISIZE(ip), 0);
- if (error)
- goto out_unlock;
- }
-
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = bf->l_start;
-
- error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
+ if (bf->l_start > XFS_ISIZE(ip)) {
+ error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+ bf->l_start - XFS_ISIZE(ip), 0);
+ if (error)
+ goto out_unlock;
}
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = bf->l_start;
+ error = xfs_vn_setattr_size(file_dentry(filp), &iattr);
if (error)
goto out_unlock;
@@ -1116,7 +993,7 @@ xfs_fill_fsxattr(
fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
ip->i_mount->m_sb.sb_blocklog;
- fa->fsx_projid = xfs_get_projid(ip);
+ fa->fsx_projid = ip->i_d.di_projid;
if (attr) {
if (ip->i_afp) {
@@ -1311,10 +1188,9 @@ xfs_ioctl_setattr_dax_invalidate(
* have to check the device for dax support or flush pagecache.
*/
if (fa->fsx_xflags & FS_XFLAG_DAX) {
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
- return -EINVAL;
- if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
- sb->s_blocksize))
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (!bdev_dax_supported(target->bt_bdev, sb->s_blocksize))
return -EINVAL;
}
@@ -1569,7 +1445,7 @@ xfs_ioctl_setattr(
}
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- xfs_get_projid(ip) != fa->fsx_projid) {
+ ip->i_d.di_projid != fa->fsx_projid) {
code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
@@ -1606,13 +1482,13 @@ xfs_ioctl_setattr(
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
- if (xfs_get_projid(ip) != fa->fsx_projid) {
+ if (ip->i_d.di_projid != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
ASSERT(ip->i_d.di_version > 1);
- xfs_set_projid(ip, fa->fsx_projid);
+ ip->i_d.di_projid = fa->fsx_projid;
}
/*
@@ -2122,24 +1998,17 @@ xfs_file_ioctl(
return xfs_ioc_setlabel(filp, mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
- case XFS_IOC_RESVSP:
- case XFS_IOC_UNRESVSP:
case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP64:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- case XFS_IOC_ZERO_RANGE: {
+ case XFS_IOC_FREESP64: {
xfs_flock64_t bf;
if (copy_from_user(&bf, arg, sizeof(bf)))
return -EFAULT;
- return xfs_ioc_space(filp, cmd, &bf);
+ return xfs_ioc_space(filp, &bf);
}
case XFS_IOC_DIOINFO: {
- struct dioattr da;
- xfs_buftarg_t *target =
- XFS_IS_REALTIME_INODE(ip) ?
- mp->m_rtdev_targp : mp->m_ddev_targp;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct dioattr da;
da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
@@ -2183,22 +2052,6 @@ xfs_file_ioctl(
case XFS_IOC_SETXFLAGS:
return xfs_ioc_setxflags(ip, filp, arg);
- case XFS_IOC_FSSETDM: {
- struct fsdmidata dmi;
-
- if (copy_from_user(&dmi, arg, sizeof(dmi)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
- dmi.fsd_dmstate);
- mnt_drop_write_file(filp);
- return error;
- }
-
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
case XFS_IOC_GETBMAPX:
@@ -2226,8 +2079,6 @@ xfs_file_ioctl(
return -EFAULT;
return xfs_open_by_handle(filp, &hreq);
}
- case XFS_IOC_FSSETDM_BY_HANDLE:
- return xfs_fssetdm_by_handle(filp, arg);
case XFS_IOC_READLINK_BY_HANDLE: {
xfs_fsop_handlereq_t hreq;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 654c0bb1bcf8..420bd95dc326 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -9,7 +9,6 @@
extern int
xfs_ioc_space(
struct file *filp,
- unsigned int cmd,
xfs_flock64_t *bf);
int
@@ -71,12 +70,6 @@ xfs_file_compat_ioctl(
unsigned int cmd,
unsigned long arg);
-extern int
-xfs_set_dmattrs(
- struct xfs_inode *ip,
- uint evmask,
- uint16_t state);
-
struct xfs_ibulk;
struct xfs_bstat;
struct xfs_inogrp;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1e08bf79b478..c4c4f09113d3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -500,44 +500,6 @@ xfs_compat_attrmulti_by_handle(
return error;
}
-STATIC int
-xfs_compat_fssetdm_by_handle(
- struct file *parfilp,
- void __user *arg)
-{
- int error;
- struct fsdmidata fsd;
- compat_xfs_fsop_setdm_handlereq_t dmhreq;
- struct dentry *dentry;
-
- if (!capable(CAP_MKNOD))
- return -EPERM;
- if (copy_from_user(&dmhreq, arg,
- sizeof(compat_xfs_fsop_setdm_handlereq_t)))
- return -EFAULT;
-
- dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
-
- if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) {
- error = -EPERM;
- goto out;
- }
-
- if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
- error = -EFAULT;
- goto out;
- }
-
- error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask,
- fsd.fsd_dmstate);
-
-out:
- dput(dentry);
- return error;
-}
-
long
xfs_file_compat_ioctl(
struct file *filp,
@@ -557,18 +519,13 @@ xfs_file_compat_ioctl(
case XFS_IOC_ALLOCSP_32:
case XFS_IOC_FREESP_32:
case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32:
- case XFS_IOC_RESVSP_32:
- case XFS_IOC_UNRESVSP_32:
- case XFS_IOC_RESVSP64_32:
- case XFS_IOC_UNRESVSP64_32:
- case XFS_IOC_ZERO_RANGE_32: {
+ case XFS_IOC_FREESP64_32: {
struct xfs_flock64 bf;
if (xfs_compat_flock64_copyin(&bf, arg))
return -EFAULT;
cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- return xfs_ioc_space(filp, cmd, &bf);
+ return xfs_ioc_space(filp, &bf);
}
case XFS_IOC_FSGEOMETRY_V1_32:
return xfs_compat_ioc_fsgeometry_v1(mp, arg);
@@ -651,8 +608,6 @@ xfs_file_compat_ioctl(
return xfs_compat_attrlist_by_handle(filp, arg);
case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
return xfs_compat_attrmulti_by_handle(filp, arg);
- case XFS_IOC_FSSETDM_BY_HANDLE_32:
- return xfs_compat_fssetdm_by_handle(filp, arg);
default:
/* try the native version */
return xfs_file_ioctl(filp, cmd, (unsigned long)arg);
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 7985344d3aa6..8c7743cd490e 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -99,7 +99,7 @@ typedef struct compat_xfs_fsop_handlereq {
_IOWR('X', 108, struct compat_xfs_fsop_handlereq)
/* The bstat field in the swapext struct needs translation */
-typedef struct compat_xfs_swapext {
+struct compat_xfs_swapext {
int64_t sx_version; /* version */
int64_t sx_fdtarget; /* fd of target file */
int64_t sx_fdtmp; /* fd of tmp file */
@@ -107,7 +107,7 @@ typedef struct compat_xfs_swapext {
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */
-} __compat_packed compat_xfs_swapext_t;
+} __compat_packed;
#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
@@ -143,15 +143,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq {
#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
_IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
-typedef struct compat_xfs_fsop_setdm_handlereq {
- struct compat_xfs_fsop_handlereq hreq; /* handle information */
- /* ptr to struct fsdmidata */
- compat_uptr_t data; /* DMAPI data */
-} compat_xfs_fsop_setdm_handlereq_t;
-
-#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
- _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
-
#ifdef BROKEN_X86_ALIGNMENT
/* on ia32 l_start is on a 32-bit boundary */
typedef struct compat_xfs_flock64 {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f780e223b118..28e2d1f37267 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -29,8 +29,8 @@
#include "xfs_reflink.h"
-#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
- << mp->m_writeio_log)
+#define XFS_ALLOC_ALIGN(mp, off) \
+ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
static int
xfs_alert_fsblock_zero(
@@ -54,9 +54,10 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
- bool shared)
+ u16 flags)
{
struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
return xfs_alert_fsblock_zero(ip, imap);
@@ -77,14 +78,13 @@ xfs_bmbt_to_iomap(
}
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
- iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+ iomap->bdev = target->bt_bdev;
+ iomap->dax_dev = target->bt_daxdev;
+ iomap->flags = flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
return 0;
}
@@ -95,18 +95,30 @@ xfs_hole_to_iomap(
xfs_fileoff_t offset_fsb,
xfs_fileoff_t end_fsb)
{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
- iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
- iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+ iomap->bdev = target->bt_bdev;
+ iomap->dax_dev = target->bt_daxdev;
+}
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+ struct xfs_mount *mp,
+ loff_t offset,
+ loff_t count)
+{
+ ASSERT(offset <= mp->m_super->s_maxbytes);
+ return min(XFS_B_TO_FSB(mp, offset + count),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
}
-xfs_extlen_t
+static xfs_extlen_t
xfs_eof_alignment(
- struct xfs_inode *ip,
- xfs_extlen_t extsize)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
xfs_extlen_t align = 0;
@@ -129,111 +141,80 @@ xfs_eof_alignment(
align = 0;
}
- /*
- * Always round up the allocation request to an extent boundary
- * (when file on a real-time subvolume or has di_extsize hint).
- */
- if (extsize) {
- if (align)
- align = roundup_64(align, extsize);
- else
- align = extsize;
- }
-
return align;
}
-STATIC int
+/*
+ * Check if last_fsb is outside the last extent, and if so grow it to the next
+ * stripe unit boundary.
+ */
+xfs_fileoff_t
xfs_iomap_eof_align_last_fsb(
struct xfs_inode *ip,
- xfs_extlen_t extsize,
- xfs_fileoff_t *last_fsb)
+ xfs_fileoff_t end_fsb)
{
- xfs_extlen_t align = xfs_eof_alignment(ip, extsize);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ xfs_extlen_t extsz = xfs_get_extsz_hint(ip);
+ xfs_extlen_t align = xfs_eof_alignment(ip);
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+
+ /*
+ * Always round up the allocation request to the extent hint boundary.
+ */
+ if (extsz) {
+ if (align)
+ align = roundup_64(align, extsz);
+ else
+ align = extsz;
+ }
if (align) {
- xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align);
- int eof, error;
+ xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align);
- error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
- if (error)
- return error;
- if (eof)
- *last_fsb = new_last_fsb;
+ xfs_iext_last(ifp, &icur);
+ if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
+ aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
+ return aligned_end_fsb;
}
- return 0;
+
+ return end_fsb;
}
int
xfs_iomap_write_direct(
- xfs_inode_t *ip,
- xfs_off_t offset,
- size_t count,
- xfs_bmbt_irec_t *imap,
- int nmaps)
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb,
+ struct xfs_bmbt_irec *imap)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb;
- xfs_fileoff_t last_fsb;
- xfs_filblks_t count_fsb, resaligned;
- xfs_extlen_t extsz;
- int nimaps;
- int quota_flag;
- int rt;
- xfs_trans_t *tp;
- uint qblocks, resblks, resrtextents;
- int error;
- int lockmode;
- int bmapi_flags = XFS_BMAPI_PREALLOC;
- uint tflags = 0;
-
- rt = XFS_IS_REALTIME_INODE(ip);
- extsz = xfs_get_extsz_hint(ip);
- lockmode = XFS_ILOCK_SHARED; /* locked by caller */
-
- ASSERT(xfs_isilocked(ip, lockmode));
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ xfs_filblks_t resaligned;
+ int nimaps;
+ int quota_flag;
+ uint qblocks, resblks;
+ unsigned int resrtextents = 0;
+ int error;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
+ uint tflags = 0;
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > XFS_ISIZE(ip)) {
- /*
- * Assert that the in-core extent list is present since this can
- * call xfs_iread_extents() and we only have the ilock shared.
- * This should be safe because the lock was held around a bmapi
- * call in the caller and we only need it to access the in-core
- * list.
- */
- ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
- XFS_IFEXTENTS);
- error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
- if (error)
- goto out_unlock;
- } else {
- if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
- last_fsb = min(last_fsb, (xfs_fileoff_t)
- imap->br_blockcount +
- imap->br_startoff);
- }
- count_fsb = last_fsb - offset_fsb;
ASSERT(count_fsb > 0);
- resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);
- if (unlikely(rt)) {
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_extsz_hint(ip));
+ if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
resrtextents = qblocks = resaligned;
resrtextents /= mp->m_sb.sb_rextsize;
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
quota_flag = XFS_QMOPT_RES_RTBLKS;
} else {
- resrtextents = 0;
resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
quota_flag = XFS_QMOPT_RES_REGBLKS;
}
- /*
- * Drop the shared lock acquired by the caller, attach the dquot if
- * necessary and move on to transaction setup.
- */
- xfs_iunlock(ip, lockmode);
error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -263,8 +244,7 @@ xfs_iomap_write_direct(
if (error)
return error;
- lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockmode);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -277,8 +257,8 @@ xfs_iomap_write_direct(
* caller gave to us.
*/
nimaps = 1;
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- bmapi_flags, resblks, imap, &nimaps);
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
+ imap, &nimaps);
if (error)
goto out_res_cancel;
@@ -301,7 +281,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, lockmode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_res_cancel:
@@ -410,19 +390,19 @@ xfs_iomap_prealloc_size(
if (offset + count <= XFS_ISIZE(ip))
return 0;
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
- (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+ if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+ (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)))
return 0;
/*
* If an explicit allocsize is set, the file is small, or we
* are writing behind a hole, then use the minimum prealloc:
*/
- if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+ if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) ||
XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
!xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
prev.br_startoff + prev.br_blockcount < offset_fsb)
- return mp->m_writeio_blocks;
+ return mp->m_allocsize_blocks;
/*
* Determine the initial size of the preallocation. We are beyond the
@@ -515,219 +495,13 @@ xfs_iomap_prealloc_size(
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
check_writeio:
- if (alloc_blocks < mp->m_writeio_blocks)
- alloc_blocks = mp->m_writeio_blocks;
+ if (alloc_blocks < mp->m_allocsize_blocks)
+ alloc_blocks = mp->m_allocsize_blocks;
trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
- mp->m_writeio_blocks);
+ mp->m_allocsize_blocks);
return alloc_blocks;
}
-static int
-xfs_file_iomap_begin_delay(
- struct inode *inode,
- loff_t offset,
- loff_t count,
- unsigned flags,
- struct iomap *iomap)
-{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
- xfs_fileoff_t maxbytes_fsb =
- XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- xfs_fileoff_t end_fsb;
- struct xfs_bmbt_irec imap, cmap;
- struct xfs_iext_cursor icur, ccur;
- xfs_fsblock_t prealloc_blocks = 0;
- bool eof = false, cow_eof = false, shared = false;
- int whichfork = XFS_DATA_FORK;
- int error = 0;
-
- ASSERT(!XFS_IS_REALTIME_INODE(ip));
- ASSERT(!xfs_get_extsz_hint(ip));
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
- mp, XFS_ERRTAG_BMAPIFORMAT))) {
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
- error = -EFSCORRUPTED;
- goto out_unlock;
- }
-
- XFS_STATS_INC(mp, xs_blk_mapw);
-
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
-
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
- /*
- * Search the data fork fork first to look up our source mapping. We
- * always need the data fork map, as we have to return it to the
- * iomap code so that the higher level write code can read data in to
- * perform read-modify-write cycles for unaligned writes.
- */
- eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
- if (eof)
- imap.br_startoff = end_fsb; /* fake hole until the end */
-
- /* We never need to allocate blocks for zeroing a hole. */
- if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
- goto out_unlock;
- }
-
- /*
- * Search the COW fork extent list even if we did not find a data fork
- * extent. This serves two purposes: first this implements the
- * speculative preallocation using cowextsize, so that we also unshare
- * block adjacent to shared blocks instead of just the shared blocks
- * themselves. Second the lookup in the extent list is generally faster
- * than going out to the shared extent tree.
- */
- if (xfs_is_cow_inode(ip)) {
- if (!ip->i_cowfp) {
- ASSERT(!xfs_is_reflink_inode(ip));
- xfs_ifork_init_cow(ip);
- }
- cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
- &ccur, &cmap);
- if (!cow_eof && cmap.br_startoff <= offset_fsb) {
- trace_xfs_reflink_cow_found(ip, &cmap);
- whichfork = XFS_COW_FORK;
- goto done;
- }
- }
-
- if (imap.br_startoff <= offset_fsb) {
- /*
- * For reflink files we may need a delalloc reservation when
- * overwriting shared extents. This includes zeroing of
- * existing extents that contain data.
- */
- if (!xfs_is_cow_inode(ip) ||
- ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
- trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
- &imap);
- goto done;
- }
-
- xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
-
- /* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_inode_need_cow(ip, &imap, &shared);
- if (error)
- goto out_unlock;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared) {
- trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
- &imap);
- goto done;
- }
-
- /*
- * Fork all the shared blocks from our write offset until the
- * end of the extent.
- */
- whichfork = XFS_COW_FORK;
- end_fsb = imap.br_startoff + imap.br_blockcount;
- } else {
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
- * pages to keep the chunks of work done where somewhat
- * symmetric with the work writeback does. This is a completely
- * arbitrary number pulled out of thin air.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
- if (xfs_is_always_cow_inode(ip))
- whichfork = XFS_COW_FORK;
- }
-
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_unlock;
-
- if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
- count, &icur);
- if (prealloc_blocks) {
- xfs_extlen_t align;
- xfs_off_t end_offset;
- xfs_fileoff_t p_end_fsb;
-
- end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
- p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
- prealloc_blocks;
-
- align = xfs_eof_alignment(ip, 0);
- if (align)
- p_end_fsb = roundup_64(p_end_fsb, align);
-
- p_end_fsb = min(p_end_fsb, maxbytes_fsb);
- ASSERT(p_end_fsb > offset_fsb);
- prealloc_blocks = p_end_fsb - end_fsb;
- }
- }
-
-retry:
- error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks,
- whichfork == XFS_DATA_FORK ? &imap : &cmap,
- whichfork == XFS_DATA_FORK ? &icur : &ccur,
- whichfork == XFS_DATA_FORK ? eof : cow_eof);
- switch (error) {
- case 0:
- break;
- case -ENOSPC:
- case -EDQUOT:
- /* retry without any preallocation */
- trace_xfs_delalloc_enospc(ip, offset, count);
- if (prealloc_blocks) {
- prealloc_blocks = 0;
- goto retry;
- }
- /*FALLTHRU*/
- default:
- goto out_unlock;
- }
-
- /*
- * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
- * them out if the write happens to fail.
- */
- iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, whichfork,
- whichfork == XFS_DATA_FORK ? &imap : &cmap);
-done:
- if (whichfork == XFS_COW_FORK) {
- if (imap.br_startoff > offset_fsb) {
- xfs_trim_extent(&cmap, offset_fsb,
- imap.br_startoff - offset_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
- goto out_unlock;
- }
- /* ensure we only report blocks we have a reservation for */
- xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
- shared = true;
- }
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
int
xfs_iomap_write_unwritten(
xfs_inode_t *ip,
@@ -765,6 +539,11 @@ xfs_iomap_write_unwritten(
*/
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ /* Attach dquots so that bmbt splits are accounted correctly. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
do {
/*
* Set up a transaction to convert the range of extents
@@ -783,6 +562,11 @@ xfs_iomap_write_unwritten(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto error_on_bmapi_transaction;
+
/*
* Modify the unwritten extent state of the buffer.
*/
@@ -840,23 +624,42 @@ error_on_bmapi_transaction:
static inline bool
imap_needs_alloc(
struct inode *inode,
+ unsigned flags,
struct xfs_bmbt_irec *imap,
int nimaps)
{
- return !nimaps ||
- imap->br_startblock == HOLESTARTBLOCK ||
- imap->br_startblock == DELAYSTARTBLOCK ||
- (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
+ /* don't allocate blocks when just zeroing */
+ if (flags & IOMAP_ZERO)
+ return false;
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_startblock == DELAYSTARTBLOCK)
+ return true;
+ /* we convert unwritten extents before copying the data for DAX */
+ if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+ return true;
+ return false;
}
static inline bool
-needs_cow_for_zeroing(
+imap_needs_cow(
+ struct xfs_inode *ip,
+ unsigned int flags,
struct xfs_bmbt_irec *imap,
int nimaps)
{
- return nimaps &&
- imap->br_startblock != HOLESTARTBLOCK &&
- imap->br_state != XFS_EXT_UNWRITTEN;
+ if (!xfs_is_cow_inode(ip))
+ return false;
+
+ /* when zeroing we don't have to COW holes or unwritten extents */
+ if (flags & IOMAP_ZERO) {
+ if (!nimaps ||
+ imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_state == XFS_EXT_UNWRITTEN)
+ return false;
+ }
+
+ return true;
}
static int
@@ -872,15 +675,8 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_cow_inode(ip) && is_write) {
- /*
- * FIXME: It could still overwrite on unshared extents and not
- * need allocation.
- */
- if (flags & IOMAP_NOWAIT)
- return -EAGAIN;
+ if (xfs_is_cow_inode(ip) && is_write)
mode = XFS_ILOCK_EXCL;
- }
/*
* Extents not yet cached requires exclusive access, don't block. This
@@ -917,111 +713,73 @@ relock:
}
static int
-xfs_file_iomap_begin(
+xfs_direct_write_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
- xfs_fileoff_t offset_fsb, end_fsb;
+ struct xfs_bmbt_irec imap, cmap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
bool shared = false;
+ u16 iomap_flags = 0;
unsigned lockmode;
+ ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
- !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
- /* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, flags,
- iomap);
- }
-
/*
- * Lock the inode in the manner required for the specified operation and
- * check for as many conditions that would result in blocking as
- * possible. This removes most of the non-blocking checks from the
- * mapping code below.
+ * Writes that span EOF might trigger an IO size update on completion,
+ * so consider them to be dirty for the purposes of O_DSYNC even if
+ * no other metadata changes are pending or have been made here.
*/
+ if (offset + length > i_size_read(inode))
+ iomap_flags |= IOMAP_F_DIRTY;
+
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
- ASSERT(offset <= mp->m_super->s_maxbytes);
- if (offset > mp->m_super->s_maxbytes - length)
- length = mp->m_super->s_maxbytes - offset;
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- end_fsb = XFS_B_TO_FSB(mp, offset + length);
-
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, 0);
if (error)
goto out_unlock;
- if (flags & IOMAP_REPORT) {
- /* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
- if (error)
+ if (imap_needs_cow(ip, flags, &imap, nimaps)) {
+ error = -EAGAIN;
+ if (flags & IOMAP_NOWAIT)
goto out_unlock;
- }
-
- /* Non-modifying mapping requested, so we are done */
- if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
- goto out_found;
-
- /*
- * Break shared extents if necessary. Checks for non-blocking IO have
- * been done up front, so we don't need to do them here.
- */
- if (xfs_is_cow_inode(ip)) {
- struct xfs_bmbt_irec cmap;
- bool directio = (flags & IOMAP_DIRECT);
-
- /* if zeroing doesn't need COW allocation, then we are done. */
- if ((flags & IOMAP_ZERO) &&
- !needs_cow_for_zeroing(&imap, nimaps))
- goto out_found;
/* may drop and re-acquire the ilock */
- cmap = imap;
- error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
- directio);
+ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+ &lockmode, flags & IOMAP_DIRECT);
if (error)
goto out_unlock;
-
- /*
- * For buffered writes we need to report the address of the
- * previous block (if there was any) so that the higher level
- * write code can perform read-modify-write operations; we
- * won't need the CoW fork mapping until writeback. For direct
- * I/O, which must be block aligned, we need to report the
- * newly allocated address. If the data fork has a hole, copy
- * the COW fork mapping to avoid allocating to the data fork.
- */
- if (directio || imap.br_startblock == HOLESTARTBLOCK)
- imap = cmap;
-
+ if (shared)
+ goto out_found_cow;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- /* Don't need to allocate over holes when doing zeroing operations. */
- if (flags & IOMAP_ZERO)
- goto out_found;
+ if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ goto allocate_blocks;
- if (!imap_needs_alloc(inode, &imap, nimaps))
- goto out_found;
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
- /* If nowait is set bail since we are going to make allocations. */
- if (flags & IOMAP_NOWAIT) {
- error = -EAGAIN;
+allocate_blocks:
+ error = -EAGAIN;
+ if (flags & IOMAP_NOWAIT)
goto out_unlock;
- }
/*
* We cap the maximum length we map to a sane size to keep the chunks
@@ -1033,48 +791,273 @@ xfs_file_iomap_begin(
* lower level functions are updated.
*/
length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, length);
- /*
- * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
- * return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ xfs_iunlock(ip, lockmode);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap);
if (error)
return error;
- iomap->flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
-out_finish:
- return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
-
-out_found:
- ASSERT(nimaps);
+out_found_cow:
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- goto out_finish;
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ if (error)
+ return error;
+ }
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
}
+const struct iomap_ops xfs_direct_write_iomap_ops = {
+ .iomap_begin = xfs_direct_write_iomap_begin,
+};
+
static int
-xfs_file_iomap_end_delalloc(
- struct xfs_inode *ip,
+xfs_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
+ xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int allocfork = XFS_DATA_FORK;
+ int error = 0;
+
+ /* we can't use delayed allocations when using extent size hints */
+ if (xfs_get_extsz_hint(ip))
+ return xfs_direct_write_iomap_begin(inode, offset, count,
+ flags, iomap, srcmap);
+
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ /*
+ * Search the data fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+ if (eof)
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
+
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * blocks adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ goto found_cow;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto found_imap;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ allocfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+
+ if (xfs_is_always_cow_inode(ip))
+ allocfork = XFS_COW_FORK;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error)
+ goto out_unlock;
+
+ if (eof) {
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset,
+ count, &icur);
+ if (prealloc_blocks) {
+ xfs_extlen_t align;
+ xfs_off_t end_offset;
+ xfs_fileoff_t p_end_fsb;
+
+ end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
+ p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+ prealloc_blocks;
+
+ align = xfs_eof_alignment(ip);
+ if (align)
+ p_end_fsb = roundup_64(p_end_fsb, align);
+
+ p_end_fsb = min(p_end_fsb,
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ ASSERT(p_end_fsb > offset_fsb);
+ prealloc_blocks = p_end_fsb - end_fsb;
+ }
+ }
+
+retry:
+ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ allocfork == XFS_DATA_FORK ? &imap : &cmap,
+ allocfork == XFS_DATA_FORK ? &icur : &ccur,
+ allocfork == XFS_DATA_FORK ? eof : cow_eof);
+ switch (error) {
+ case 0:
+ break;
+ case -ENOSPC:
+ case -EDQUOT:
+ /* retry without any preallocation */
+ trace_xfs_delalloc_enospc(ip, offset, count);
+ if (prealloc_blocks) {
+ prealloc_blocks = 0;
+ goto retry;
+ }
+ /*FALLTHRU*/
+ default:
+ goto out_unlock;
+ }
+
+ if (allocfork == XFS_COW_FORK) {
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
+ goto found_cow;
+ }
+
+ /*
+ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+ * them out if the write happens to fail.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+
+found_imap:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+
+found_cow:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (imap.br_startoff <= offset_fsb) {
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+ if (error)
+ return error;
+ } else {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ }
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+static int
+xfs_buffered_write_iomap_end(
+ struct inode *inode,
loff_t offset,
loff_t length,
ssize_t written,
+ unsigned flags,
struct iomap *iomap)
{
+ struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_fsb;
xfs_fileoff_t end_fsb;
int error = 0;
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
/*
* Behave as if the write failed if drop writes is enabled. Set the NEW
* flag to force delalloc cleanup.
@@ -1119,24 +1102,51 @@ xfs_file_iomap_end_delalloc(
return 0;
}
+const struct iomap_ops xfs_buffered_write_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+ .iomap_end = xfs_buffered_write_iomap_end,
+};
+
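/*
 * Illustrative sketch (not part of the patch): with the old multiplexed
 * iomap ops split into per-path tables, a write path can pick its mapping
 * operations up front instead of branching inside one ->iomap_begin. The
 * helper name below is hypothetical; only the two ops tables come from
 * this change.
 */
static const struct iomap_ops *
xfs_pick_write_iomap_ops(struct kiocb *iocb)
{
	/* direct I/O allocates real blocks up front, buffered I/O uses delalloc */
	if (iocb->ki_flags & IOCB_DIRECT)
		return &xfs_direct_write_iomap_ops;
	return &xfs_buffered_write_iomap_ops;
}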
static int
-xfs_file_iomap_end(
+xfs_read_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
- ssize_t written,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
- if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
- return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
- length, written, iomap);
- return 0;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error = 0;
+ bool shared = false;
+ unsigned lockmode;
+
+ ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (!error && (flags & IOMAP_REPORT))
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
}
-const struct iomap_ops xfs_iomap_ops = {
- .iomap_begin = xfs_file_iomap_begin,
- .iomap_end = xfs_file_iomap_end,
+const struct iomap_ops xfs_read_iomap_ops = {
+ .iomap_begin = xfs_read_iomap_begin,
};
static int
@@ -1145,7 +1155,8 @@ xfs_seek_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1178,8 +1189,7 @@ xfs_seek_iomap_begin(
/*
* Fake a hole until the end of the file.
*/
- data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
- XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ data_fsb = xfs_iomap_end_fsb(mp, offset, length);
}
/*
@@ -1193,7 +1203,7 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1215,7 +1225,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1231,7 +1241,8 @@ xfs_xattr_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
- struct iomap *iomap)
+ struct iomap *iomap,
+ struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1261,7 +1272,7 @@ out_unlock:
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 5c2f6aa6d78f..7d3703556d0e 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -11,13 +11,14 @@
struct xfs_inode;
struct xfs_bmbt_irec;
-int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
- struct xfs_bmbt_irec *, int);
+int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb, struct xfs_bmbt_irec *imap);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
+ xfs_fileoff_t end_fsb);
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *, bool shared);
-xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
+ struct xfs_bmbt_irec *, u16);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -39,7 +40,9 @@ xfs_aligned_fsb_count(
return count_fsb;
}
-extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index fe285d123d69..8afe69ca188b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -20,6 +20,7 @@
#include "xfs_symlink.h"
#include "xfs_dir2.h"
#include "xfs_iomap.h"
+#include "xfs_error.h"
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -470,20 +471,57 @@ xfs_vn_get_link_inline(
struct inode *inode,
struct delayed_call *done)
{
+ struct xfs_inode *ip = XFS_I(inode);
char *link;
- ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+ ASSERT(ip->i_df.if_flags & XFS_IFINLINE);
/*
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
* if_data is junk.
*/
- link = XFS_I(inode)->i_df.if_u1.if_data;
- if (!link)
+ link = ip->i_df.if_u1.if_data;
+ if (XFS_IS_CORRUPT(ip->i_mount, !link))
return ERR_PTR(-EFSCORRUPTED);
return link;
}
+static uint32_t
+xfs_stat_blksize(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If the file blocks are being allocated from a realtime volume, then
+ * always return the realtime extent size.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+
+ /*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of
+ * caching so that we don't get inefficient read/modify/write I/O from
+ * user apps. Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in
+ * bytes as the recommended I/O size. If it is not a stripe and we've set
+ * a default buffered I/O size, return that; otherwise return the compat
+ * default.
+ */
+ if (mp->m_flags & XFS_MOUNT_LARGEIO) {
+ if (mp->m_swidth)
+ return mp->m_swidth << mp->m_sb.sb_blocklog;
+ if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
+ return 1U << mp->m_allocsize_log;
+ }
+
+ return PAGE_SIZE;
+}
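/*
 * Worked example with illustrative numbers: on a striped volume mounted
 * with "largeio", m_swidth = 16 blocks and a 4k block size (sb_blocklog
 * = 12), st_blksize is reported as 16 << 12 = 64k; without "largeio" the
 * function falls back to PAGE_SIZE.
 */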
+
STATIC int
xfs_vn_getattr(
const struct path *path,
@@ -516,8 +554,7 @@ xfs_vn_getattr(
if (ip->i_d.di_version == 3) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
- stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
- stat->btime.tv_nsec = ip->i_d.di_crtime.t_nsec;
+ stat->btime = ip->i_d.di_crtime;
}
}
@@ -543,16 +580,7 @@ xfs_vn_getattr(
stat->rdev = inode->i_rdev;
break;
default:
- if (XFS_IS_REALTIME_INODE(ip)) {
- /*
- * If the file blocks are being allocated from a
- * realtime volume, then return the inode's realtime
- * extent size or the realtime volume's extent size.
- */
- stat->blksize =
- xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
- } else
- stat->blksize = xfs_preferred_iosize(mp);
+ stat->blksize = xfs_stat_blksize(ip);
stat->rdev = 0;
break;
}
@@ -664,7 +692,7 @@ xfs_setattr_nonsize(
ASSERT(gdqp == NULL);
error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
xfs_kgid_to_gid(gid),
- xfs_get_projid(ip),
+ ip->i_d.di_projid,
qflags, &udqp, &gdqp, NULL);
if (error)
return error;
@@ -883,10 +911,10 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = iomap_zero_range(inode, oldsize, newsize - oldsize,
- &did_zeroing, &xfs_iomap_ops);
+ &did_zeroing, &xfs_buffered_write_iomap_ops);
} else {
error = iomap_truncate_page(inode, newsize, &did_zeroing,
- &xfs_iomap_ops);
+ &xfs_buffered_write_iomap_ops);
}
if (error)
@@ -1114,7 +1142,7 @@ xfs_vn_fiemap(
&xfs_xattr_iomap_ops);
} else {
error = iomap_fiemap(inode, fieinfo, start, length,
- &xfs_iomap_ops);
+ &xfs_read_iomap_ops);
}
xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
@@ -1227,7 +1255,7 @@ xfs_inode_supports_dax(
return false;
/* Device has to support DAX too. */
- return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL;
+ return xfs_inode_buftarg(ip)->bt_daxdev != NULL;
}
STATIC void
@@ -1290,9 +1318,7 @@ xfs_setup_inode(
lockdep_set_class(&inode->i_rwsem,
&inode->i_sb->s_type->i_mutex_dir_key);
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
- ip->d_ops = ip->i_mount->m_dir_inode_ops;
} else {
- ip->d_ops = ip->i_mount->m_nondir_inode_ops;
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 884950adbd16..4b31c29b7e6b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -84,7 +84,7 @@ xfs_bulkstat_one_int(
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projectid = xfs_get_projid(ip);
+ buf->bs_projectid = ip->i_d.di_projid;
buf->bs_ino = ino;
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
@@ -97,8 +97,8 @@ xfs_bulkstat_one_int(
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
buf->bs_ctime = inode->i_ctime.tv_sec;
buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
- buf->bs_btime = dic->di_crtime.t_sec;
- buf->bs_btime_nsec = dic->di_crtime.t_nsec;
+ buf->bs_btime = dic->di_crtime.tv_sec;
+ buf->bs_btime_nsec = dic->di_crtime.tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index aa375cf53021..233dcc8784db 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -298,7 +298,8 @@ xfs_iwalk_ag_start(
error = xfs_inobt_get_rec(*curpp, irec, has_more);
if (error)
return error;
- XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+ if (XFS_IS_CORRUPT(mp, *has_more != 1))
+ return -EFSCORRUPTED;
/*
* If the LE lookup yielded an inobt record before the cursor position,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ca15105681ca..8738bb03f253 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -223,26 +223,32 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, unsigned int op);
#define ASSERT_ALWAYS(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
#ifdef DEBUG
#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
#else /* !DEBUG */
#ifdef XFS_WARN
#define ASSERT(expr) \
- (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
+ (likely(expr) ? (void)0 : asswarn(NULL, #expr, __FILE__, __LINE__))
#else /* !DEBUG && !XFS_WARN */
-#define ASSERT(expr) ((void)0)
+#define ASSERT(expr) ((void)0)
#endif /* XFS_WARN */
#endif /* DEBUG */
+#define XFS_IS_CORRUPT(mp, expr) \
+ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \
+ NULL, 0, __FILE__, __LINE__, \
+ __this_address), \
+ true : false)
+
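/*
 * Minimal usage sketch: XFS_IS_CORRUPT() both reports the corruption via
 * xfs_corruption_error() and evaluates to the truth of the expression,
 * which is what lets the conversions below fold the old unlikely() +
 * XFS_ERROR_REPORT() pairs into a single branch (example mirrors the
 * xfs_mountfs() hunk later in this patch):
 */
	if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
		xfs_warn(mp, "no log defined");
		return -EFSCORRUPTED;
	}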
#define STATIC static noinline
#ifdef CONFIG_XFS_RT
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 641d07f30a27..6a147c63a8a6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -57,10 +57,6 @@ xlog_state_get_iclog_space(
struct xlog_ticket *ticket,
int *continued_write,
int *logoffsetp);
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_state_switch_iclogs(
struct xlog *log,
@@ -83,7 +79,10 @@ STATIC void
xlog_ungrant_log_space(
struct xlog *log,
struct xlog_ticket *ticket);
-
+STATIC void
+xlog_sync(
+ struct xlog *log,
+ struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
xlog_verify_dest_ptr(
@@ -552,16 +551,71 @@ xfs_log_done(
return lsn;
}
+static bool
+__xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+ /* update tail before writing to iclog */
+ xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog, tail_lsn);
+ /* cycle incremented when incrementing curr_block */
+ return true;
+ }
+
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+ return false;
+}
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and the
+ * it is in the WANT_SYNC state.
+ */
+static int
+xlog_state_release_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog)
+{
+ lockdep_assert_held(&log->l_icloglock);
+
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
+ return -EIO;
+
+ if (atomic_dec_and_test(&iclog->ic_refcnt) &&
+ __xlog_state_release_iclog(log, iclog)) {
+ spin_unlock(&log->l_icloglock);
+ xlog_sync(log, iclog);
+ spin_lock(&log->l_icloglock);
+ }
+
+ return 0;
+}
+
int
xfs_log_release_iclog(
- struct xfs_mount *mp,
+ struct xfs_mount *mp,
struct xlog_in_core *iclog)
{
- if (xlog_state_release_iclog(mp->m_log, iclog)) {
+ struct xlog *log = mp->m_log;
+ bool sync;
+
+ if (iclog->ic_state == XLOG_STATE_IOERROR) {
xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
return -EIO;
}
+ if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) {
+ sync = __xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ if (sync)
+ xlog_sync(log, iclog);
+ }
return 0;
}
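/*
 * Caller-side sketch of the new locking convention, composited from the
 * xfs_log_force() and xlog_write_copy_finish() hunks later in this patch
 * (illustrative, not a verbatim call site): xlog_state_release_iclog() is
 * now entered with l_icloglock held and only drops/retakes it internally
 * around xlog_sync().
 */
	spin_lock(&log->l_icloglock);
	atomic_inc(&iclog->ic_refcnt);
	xlog_state_switch_iclogs(log, iclog, 0);
	error = xlog_state_release_iclog(log, iclog);
	spin_unlock(&log->l_icloglock);
	if (error)
		return error;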
@@ -866,10 +920,7 @@ out_err:
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
switch (iclog->ic_state) {
default:
if (!XLOG_FORCED_SHUTDOWN(log)) {
@@ -924,8 +975,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
#ifdef DEBUG
first_iclog = iclog = log->l_iclog;
do {
- if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
- ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
+ if (iclog->ic_state != XLOG_STATE_IOERROR) {
+ ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
ASSERT(iclog->ic_offset == 0);
}
iclog = iclog->ic_next;
@@ -950,21 +1001,17 @@ xfs_log_unmount_write(xfs_mount_t *mp)
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
-
xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
error = xlog_state_release_iclog(log, iclog);
-
- spin_lock(&log->l_icloglock);
-
- if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
- || iclog->ic_state == XLOG_STATE_DIRTY
- || iclog->ic_state == XLOG_STATE_IOERROR) ) {
-
- xlog_wait(&iclog->ic_force_wait,
- &log->l_icloglock);
- } else {
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ case XLOG_STATE_DIRTY:
+ case XLOG_STATE_IOERROR:
spin_unlock(&log->l_icloglock);
+ break;
+ default:
+ xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
+ break;
}
}
@@ -1254,7 +1301,7 @@ xlog_ioend_work(
* didn't succeed.
*/
aborted = true;
- } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ } else if (iclog->ic_state == XLOG_STATE_IOERROR) {
aborted = true;
}
@@ -1479,7 +1526,7 @@ xlog_alloc_log(
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
- mp->m_fsname);
+ mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1727,7 +1774,7 @@ xlog_write_iclog(
* across the log IO to achieve that.
*/
down(&iclog->ic_sema);
- if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
+ if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
@@ -1735,13 +1782,11 @@ xlog_write_iclog(
* the buffer manually, the code needs to be kept in sync
* with the I/O completion path.
*/
- xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
+ xlog_state_done_syncing(iclog, true);
up(&iclog->ic_sema);
return;
}
- iclog->ic_io_size = count;
-
bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
@@ -1751,9 +1796,9 @@ xlog_write_iclog(
if (need_flush)
iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
- xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+ xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count);
if (is_vmalloc_addr(iclog->ic_data))
- flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+ flush_kernel_vmap_range(iclog->ic_data, count);
/*
* If this log buffer would straddle the end of the log we will have
@@ -1969,7 +2014,6 @@ xlog_dealloc_log(
/*
* Update counters atomically now that memcpy is done.
*/
-/* ARGSUSED */
static inline void
xlog_state_finish_copy(
struct xlog *log,
@@ -1977,16 +2021,11 @@ xlog_state_finish_copy(
int record_cnt,
int copy_bytes)
{
- spin_lock(&log->l_icloglock);
+ lockdep_assert_held(&log->l_icloglock);
be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
iclog->ic_offset += copy_bytes;
-
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_finish_copy */
-
-
-
+}
/*
* print out info relating to regions written which consume
@@ -2263,15 +2302,18 @@ xlog_write_copy_finish(
int log_offset,
struct xlog_in_core **commit_iclog)
{
+ int error;
+
if (*partial_copy) {
/*
* This iclog has already been marked WANT_SYNC by
* xlog_state_get_iclog_space.
*/
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
}
*partial_copy = 0;
@@ -2279,21 +2321,25 @@ xlog_write_copy_finish(
if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
/* no more space in this iclog - push it. */
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
*record_cnt = 0;
*data_cnt = 0;
- spin_lock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
- spin_unlock(&log->l_icloglock);
-
if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ goto release_iclog;
+ spin_unlock(&log->l_icloglock);
ASSERT(flags & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
}
return 0;
+
+release_iclog:
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ return error;
}
/*
@@ -2355,7 +2401,7 @@ xlog_write(
int contwr = 0;
int record_cnt = 0;
int data_cnt = 0;
- int error;
+ int error = 0;
*start_lsn = 0;
@@ -2506,13 +2552,17 @@ next_lv:
ASSERT(len == 0);
+ spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
- if (!commit_iclog)
- return xlog_state_release_iclog(log, iclog);
+ if (commit_iclog) {
+ ASSERT(flags & XLOG_COMMIT_TRANS);
+ *commit_iclog = iclog;
+ } else {
+ error = xlog_state_release_iclog(log, iclog);
+ }
+ spin_unlock(&log->l_icloglock);
- ASSERT(flags & XLOG_COMMIT_TRANS);
- *commit_iclog = iclog;
- return 0;
+ return error;
}
@@ -2548,7 +2598,7 @@ xlog_state_clean_iclog(
int changed = 0;
/* Prepare the completed iclog. */
- if (!(dirty_iclog->ic_state & XLOG_STATE_IOERROR))
+ if (dirty_iclog->ic_state != XLOG_STATE_IOERROR)
dirty_iclog->ic_state = XLOG_STATE_DIRTY;
/* Walk all the iclogs to update the ordered active state. */
@@ -2639,7 +2689,8 @@ xlog_get_lowest_lsn(
xfs_lsn_t lowest_lsn = 0, lsn;
do {
- if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY)
continue;
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
@@ -2699,61 +2750,48 @@ static bool
xlog_state_iodone_process_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- struct xlog_in_core *completed_iclog,
bool *ioerror)
{
xfs_lsn_t lowest_lsn;
xfs_lsn_t header_lsn;
- /* Skip all iclogs in the ACTIVE & DIRTY states */
- if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ switch (iclog->ic_state) {
+ case XLOG_STATE_ACTIVE:
+ case XLOG_STATE_DIRTY:
+ /*
+ * Skip all iclogs in the ACTIVE & DIRTY states:
+ */
return false;
-
- /*
- * Between marking a filesystem SHUTDOWN and stopping the log, we do
- * flush all iclogs to disk (if there wasn't a log I/O error). So, we do
- * want things to go smoothly in case of just a SHUTDOWN w/o a
- * LOG_IO_ERROR.
- */
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
+ case XLOG_STATE_IOERROR:
+ /*
+ * Between marking a filesystem SHUTDOWN and stopping the log,
+ * we do flush all iclogs to disk (if there wasn't a log I/O
+ * error). So, we do want things to go smoothly in case of just
+ * a SHUTDOWN w/o a LOG_IO_ERROR.
+ */
*ioerror = true;
return false;
- }
-
- /*
- * Can only perform callbacks in order. Since this iclog is not in the
- * DONE_SYNC/ DO_CALLBACK state, we skip the rest and just try to clean
- * up. If we set our iclog to DO_CALLBACK, we will not process it when
- * we retry since a previous iclog is in the CALLBACK and the state
- * cannot change since we are holding the l_icloglock.
- */
- if (!(iclog->ic_state &
- (XLOG_STATE_DONE_SYNC | XLOG_STATE_DO_CALLBACK))) {
- if (completed_iclog &&
- (completed_iclog->ic_state == XLOG_STATE_DONE_SYNC)) {
- completed_iclog->ic_state = XLOG_STATE_DO_CALLBACK;
- }
+ case XLOG_STATE_DONE_SYNC:
+ /*
+ * Now that we have an iclog that is in the DONE_SYNC state, do
+ * one more check here to see if we have chased our tail around.
+ * If this is not the lowest lsn iclog, then we will leave it
+ * for another completion to process.
+ */
+ header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lowest_lsn = xlog_get_lowest_lsn(log);
+ if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
+ return false;
+ xlog_state_set_callback(log, iclog, header_lsn);
+ return false;
+ default:
+ /*
+ * Can only perform callbacks in order. Since this iclog is not
+ * in the DONE_SYNC state, we skip the rest and just try to
+ * clean up.
+ */
return true;
}
-
- /*
- * We now have an iclog that is in either the DO_CALLBACK or DONE_SYNC
- * states. The other states (WANT_SYNC, SYNCING, or CALLBACK were caught
- * by the above if and are going to clean (i.e. we aren't doing their
- * callbacks) see the above if.
- *
- * We will do one more check here to see if we have chased our tail
- * around. If this is not the lowest lsn iclog, then we will leave it
- * for another completion to process.
- */
- header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
- lowest_lsn = xlog_get_lowest_lsn(log);
- if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
- return false;
-
- xlog_state_set_callback(log, iclog, header_lsn);
- return false;
-
}
/*
@@ -2770,6 +2808,8 @@ xlog_state_do_iclog_callbacks(
struct xlog *log,
struct xlog_in_core *iclog,
bool aborted)
+ __releases(&log->l_icloglock)
+ __acquires(&log->l_icloglock)
{
spin_unlock(&log->l_icloglock);
spin_lock(&iclog->ic_callback_lock);
@@ -2792,57 +2832,13 @@ xlog_state_do_iclog_callbacks(
spin_unlock(&iclog->ic_callback_lock);
}
-#ifdef DEBUG
-/*
- * Make one last gasp attempt to see if iclogs are being left in limbo. If the
- * above loop finds an iclog earlier than the current iclog and in one of the
- * syncing states, the current iclog is put into DO_CALLBACK and the callbacks
- * are deferred to the completion of the earlier iclog. Walk the iclogs in order
- * and make sure that no iclog is in DO_CALLBACK unless an earlier iclog is in
- * one of the syncing states.
- *
- * Note that SYNCING|IOERROR is a valid state so we cannot just check for
- * ic_state == SYNCING.
- */
-static void
-xlog_state_callback_check_state(
- struct xlog *log)
-{
- struct xlog_in_core *first_iclog = log->l_iclog;
- struct xlog_in_core *iclog = first_iclog;
-
- do {
- ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
- /*
- * Terminate the loop if iclogs are found in states
- * which will cause other threads to clean up iclogs.
- *
- * SYNCING - i/o completion will go through logs
- * DONE_SYNC - interrupt thread should be waiting for
- * l_icloglock
- * IOERROR - give up hope all ye who enter here
- */
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state & XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_DONE_SYNC ||
- iclog->ic_state == XLOG_STATE_IOERROR )
- break;
- iclog = iclog->ic_next;
- } while (first_iclog != iclog);
-}
-#else
-#define xlog_state_callback_check_state(l) ((void)0)
-#endif
-
STATIC void
xlog_state_do_callback(
struct xlog *log,
- bool aborted,
- struct xlog_in_core *ciclog)
+ bool aborted)
{
struct xlog_in_core *iclog;
struct xlog_in_core *first_iclog;
- bool did_callbacks = false;
bool cycled_icloglock;
bool ioerror;
int flushcnt = 0;
@@ -2866,11 +2862,11 @@ xlog_state_do_callback(
do {
if (xlog_state_iodone_process_iclog(log, iclog,
- ciclog, &ioerror))
+ &ioerror))
break;
- if (!(iclog->ic_state &
- (XLOG_STATE_CALLBACK | XLOG_STATE_IOERROR))) {
+ if (iclog->ic_state != XLOG_STATE_CALLBACK &&
+ iclog->ic_state != XLOG_STATE_IOERROR) {
iclog = iclog->ic_next;
continue;
}
@@ -2886,8 +2882,6 @@ xlog_state_do_callback(
iclog = iclog->ic_next;
} while (first_iclog != iclog);
- did_callbacks |= cycled_icloglock;
-
if (repeats > 5000) {
flushcnt += repeats;
repeats = 0;
@@ -2897,10 +2891,8 @@ xlog_state_do_callback(
}
} while (!ioerror && cycled_icloglock);
- if (did_callbacks)
- xlog_state_callback_check_state(log);
-
- if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+ if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE ||
+ log->l_iclog->ic_state == XLOG_STATE_IOERROR)
wake_up_all(&log->l_flush_wait);
spin_unlock(&log->l_icloglock);
@@ -2929,8 +2921,6 @@ xlog_state_done_syncing(
spin_lock(&log->l_icloglock);
- ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
- iclog->ic_state == XLOG_STATE_IOERROR);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/*
@@ -2939,8 +2929,10 @@ xlog_state_done_syncing(
* and none should ever be attempted to be written to disk
* again.
*/
- if (iclog->ic_state != XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_SYNCING)
iclog->ic_state = XLOG_STATE_DONE_SYNC;
+ else
+ ASSERT(iclog->ic_state == XLOG_STATE_IOERROR);
/*
* Someone could be sleeping prior to writing out the next
@@ -2949,7 +2941,7 @@ xlog_state_done_syncing(
*/
wake_up_all(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
- xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
+ xlog_state_do_callback(log, aborted); /* also cleans log */
} /* xlog_state_done_syncing */
@@ -2983,7 +2975,6 @@ xlog_state_get_iclog_space(
int log_offset;
xlog_rec_header_t *head;
xlog_in_core_t *iclog;
- int error;
restart:
spin_lock(&log->l_icloglock);
@@ -3032,24 +3023,22 @@ restart:
* can fit into remaining data section.
*/
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ int error = 0;
+
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
/*
- * If I'm the only one writing to this iclog, sync it to disk.
- * We need to do an atomic compare and decrement here to avoid
- * racing with concurrent atomic_dec_and_lock() calls in
+ * If we are the only one writing to this iclog, sync it to
+ * disk. We need to do an atomic compare and decrement here to
+ * avoid racing with concurrent atomic_dec_and_lock() calls in
* xlog_state_release_iclog() when there is more than one
* reference to the iclog.
*/
- if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
- /* we are the only one */
- spin_unlock(&log->l_icloglock);
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
error = xlog_state_release_iclog(log, iclog);
- if (error)
- return error;
- } else {
- spin_unlock(&log->l_icloglock);
- }
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
goto restart;
}
@@ -3161,60 +3150,6 @@ xlog_ungrant_log_space(
}
/*
- * Flush iclog to disk if this is the last reference to the given iclog and
- * the WANT_SYNC bit is set.
- *
- * When this function is entered, the iclog is not necessarily in the
- * WANT_SYNC state. It may be sitting around waiting to get filled.
- *
- *
- */
-STATIC int
-xlog_state_release_iclog(
- struct xlog *log,
- struct xlog_in_core *iclog)
-{
- int sync = 0; /* do we sync? */
-
- if (iclog->ic_state & XLOG_STATE_IOERROR)
- return -EIO;
-
- ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
- if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
- return 0;
-
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- spin_unlock(&log->l_icloglock);
- return -EIO;
- }
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
- iclog->ic_state == XLOG_STATE_WANT_SYNC);
-
- if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
- /* update tail before writing to iclog */
- xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- sync++;
- iclog->ic_state = XLOG_STATE_SYNCING;
- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
- xlog_verify_tail_lsn(log, iclog, tail_lsn);
- /* cycle incremented when incrementing curr_block */
- }
- spin_unlock(&log->l_icloglock);
-
- /*
- * We let the log lock go, so it's possible that we hit a log I/O
- * error or some other SHUTDOWN condition that marks the iclog
- * as XLOG_STATE_IOERROR before the bwrite. However, we know that
- * this iclog has consistent data, so we ignore IOERROR
- * flags after this point.
- */
- if (sync)
- xlog_sync(log, iclog);
- return 0;
-} /* xlog_state_release_iclog */
-
-
-/*
* This routine will mark the current iclog in the ring as WANT_SYNC
* and move the current iclog pointer to the next iclog in the ring.
* When this routine is called from xlog_state_get_iclog_space(), the
@@ -3307,7 +3242,7 @@ xfs_log_force(
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
if (iclog->ic_state == XLOG_STATE_DIRTY ||
@@ -3337,12 +3272,9 @@ xfs_log_force(
atomic_inc(&iclog->ic_refcnt);
lsn = be64_to_cpu(iclog->ic_header.h_lsn);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
-
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
- spin_lock(&log->l_icloglock);
if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn ||
iclog->ic_state == XLOG_STATE_DIRTY)
goto out_unlock;
@@ -3367,11 +3299,11 @@ xfs_log_force(
if (!(flags & XFS_LOG_SYNC))
goto out_unlock;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
return 0;
@@ -3396,7 +3328,7 @@ __xfs_log_force_lsn(
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
@@ -3425,10 +3357,8 @@ __xfs_log_force_lsn(
* will go out then.
*/
if (!already_slept &&
- (iclog->ic_prev->ic_state &
- (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
- ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
-
+ (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
@@ -3437,24 +3367,23 @@ __xfs_log_force_lsn(
}
atomic_inc(&iclog->ic_refcnt);
xlog_state_switch_iclogs(log, iclog, 0);
- spin_unlock(&log->l_icloglock);
if (xlog_state_release_iclog(log, iclog))
- return -EIO;
+ goto out_error;
if (log_flushed)
*log_flushed = 1;
- spin_lock(&log->l_icloglock);
}
if (!(flags & XFS_LOG_SYNC) ||
- (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)))
+ (iclog->ic_state == XLOG_STATE_ACTIVE ||
+ iclog->ic_state == XLOG_STATE_DIRTY))
goto out_unlock;
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
goto out_error;
XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
- if (iclog->ic_state & XLOG_STATE_IOERROR)
+ if (iclog->ic_state == XLOG_STATE_IOERROR)
return -EIO;
return 0;
@@ -3517,8 +3446,8 @@ xlog_state_want_sync(
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
xlog_state_switch_iclogs(log, iclog, 0);
} else {
- ASSERT(iclog->ic_state &
- (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ iclog->ic_state == XLOG_STATE_IOERROR);
}
}
@@ -3539,7 +3468,7 @@ xfs_log_ticket_put(
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
- kmem_zone_free(xfs_log_ticket_zone, ticket);
+ kmem_cache_free(xfs_log_ticket_zone, ticket);
}
xlog_ticket_t *
@@ -3895,7 +3824,7 @@ xlog_state_ioerror(
xlog_in_core_t *iclog, *ic;
iclog = log->l_iclog;
- if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+ if (iclog->ic_state != XLOG_STATE_IOERROR) {
/*
* Mark all the incore logs IOERROR.
* From now on, no log flushes will result.
@@ -3955,7 +3884,7 @@ xfs_log_force_umount(
* Somebody could've already done the hard work for us.
* No need to get locks for this.
*/
- if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+ if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) {
ASSERT(XLOG_FORCED_SHUTDOWN(log));
return 1;
}
@@ -4006,21 +3935,8 @@ xfs_log_force_umount(
spin_lock(&log->l_cilp->xc_push_lock);
wake_up_all(&log->l_cilp->xc_commit_wait);
spin_unlock(&log->l_cilp->xc_push_lock);
- xlog_state_do_callback(log, true, NULL);
-
-#ifdef XFSERRORDEBUG
- {
- xlog_in_core_t *iclog;
+ xlog_state_do_callback(log, true);
- spin_lock(&log->l_icloglock);
- iclog = log->l_iclog;
- do {
- ASSERT(iclog->ic_callback == 0);
- iclog = iclog->ic_next;
- } while (iclog != log->l_iclog);
- spin_unlock(&log->l_icloglock);
- }
-#endif
/* return non-zero if log IOERROR transition had already happened */
return retval;
}
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef652abd112c..48435cf2aa16 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -179,7 +179,7 @@ xlog_cil_alloc_shadow_bufs(
/*
* We free and allocate here as a realloc would copy
- * unecessary data. We don't use kmem_zalloc() for the
+ * unnecessary data. We don't use kmem_zalloc() for the
* same reason - we don't need to zero the data area in
* the buffer, only the log vector header and the iovec
* storage.
@@ -682,7 +682,7 @@ xlog_cil_push(
}
- /* check for a previously pushed seqeunce */
+ /* check for a previously pushed sequence */
if (push_seq < cil->xc_ctx->sequence) {
spin_unlock(&cil->xc_push_lock);
goto out_skip;
@@ -847,7 +847,7 @@ restart:
goto out_abort;
spin_lock(&commit_iclog->ic_callback_lock);
- if (commit_iclog->ic_state & XLOG_STATE_IOERROR) {
+ if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
spin_unlock(&commit_iclog->ic_callback_lock);
goto out_abort;
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b880c23cb6e4..b192c5a9f9fd 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -40,17 +40,15 @@ static inline uint xlog_get_client_id(__be32 i)
/*
* In core log state
*/
-#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
-#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
-#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
-#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
-#define XLOG_STATE_DO_CALLBACK \
- 0x0010 /* Process callback functions */
-#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
-#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
-#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
-#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
+enum xlog_iclog_state {
+ XLOG_STATE_ACTIVE, /* Current IC log being written to */
+ XLOG_STATE_WANT_SYNC, /* Want to sync this iclog; no more writes */
+ XLOG_STATE_SYNCING, /* This IC log is syncing */
+ XLOG_STATE_DONE_SYNC, /* Done syncing to disk */
+ XLOG_STATE_CALLBACK, /* Callback functions now */
+ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */
+ XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */
+};
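/*
 * Illustrative before/after of the checks this forces throughout the log
 * code: ic_state is now a single enum value rather than a flag word, so
 * bitmask membership tests become explicit comparisons (pattern taken
 * from the xlog_get_lowest_lsn() hunk earlier in this patch).
 */
	/* old: ic_state was a set of bits */
	if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
		continue;
	/* new: ic_state holds exactly one state */
	if (iclog->ic_state == XLOG_STATE_ACTIVE ||
	    iclog->ic_state == XLOG_STATE_DIRTY)
		continue;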
/*
* Flags to log ticket
@@ -179,8 +177,6 @@ typedef struct xlog_ticket {
* - ic_next is the pointer to the next iclog in the ring.
* - ic_log is a pointer back to the global log structure.
* - ic_size is the full size of the log buffer, minus the cycle headers.
- * - ic_io_size is the size of the currently pending log buffer write, which
- * might be smaller than ic_size
* - ic_offset is the current number of bytes written to in this iclog.
* - ic_refcnt is bumped when someone is writing to the log.
* - ic_state is the state of the iclog.
@@ -205,9 +201,8 @@ typedef struct xlog_in_core {
struct xlog_in_core *ic_prev;
struct xlog *ic_log;
u32 ic_size;
- u32 ic_io_size;
u32 ic_offset;
- unsigned short ic_state;
+ enum xlog_iclog_state ic_state;
char *ic_datap; /* pointer to iclog data */
/* Callback structures need their own cacheline */
@@ -399,8 +394,6 @@ struct xlog {
/* The following fields are used for debugging; need to hold icloglock */
#ifdef DEBUG
void *l_iclog_bak[XLOG_MAX_ICLOGS];
- /* log record crc error injection factor */
- uint32_t l_badcrc_factor;
#endif
/* log recovery lsn tracking (for buffer submission) */
xfs_lsn_t l_recovery_lsn;
@@ -542,7 +535,11 @@ xlog_cil_force(struct xlog *log)
* by a spinlock. This matches the semantics of all the wait queues used in the
* log code.
*/
-static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
+static inline void
+xlog_wait(
+ struct wait_queue_head *wq,
+ struct spinlock *lock)
+ __releases(lock)
{
DECLARE_WAITQUEUE(wait, current);
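/*
 * Caller-side sketch, mirroring the xfs_log_force() hunks earlier in this
 * patch: xlog_wait() consumes the spinlock while sleeping, which is what
 * the new __releases(lock) annotation documents for sparse.
 */
	spin_lock(&log->l_icloglock);
	XFS_STATS_INC(mp, xs_log_force_sleep);
	xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
	/* no spin_unlock() here: the lock was dropped inside xlog_wait() */
	if (iclog->ic_state == XLOG_STATE_IOERROR)
		return -EIO;
	return 0;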
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c1a514ffff55..99ec3fba4548 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -103,10 +103,9 @@ xlog_alloc_buffer(
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
*/
- if (!xlog_verify_bno(log, 0, nbblks)) {
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return NULL;
}
@@ -152,11 +151,10 @@ xlog_do_io(
{
int error;
- if (!xlog_verify_bno(log, blk_no, nbblks)) {
+ if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
xfs_warn(log->l_mp,
"Invalid log block/length (0x%llx, 0x%x) for buffer",
blk_no, nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return -EFSCORRUPTED;
}
@@ -244,19 +242,17 @@ xlog_header_check_recover(
* (XLOG_FMT_UNKNOWN). This stops us from trying to recover
* a dirty log created in IRIX.
*/
- if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
+ if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
xfs_warn(mp,
"dirty log written in incompatible format - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_recover(1)",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ }
+ if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
xfs_warn(mp,
"dirty log entry has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_recover(2)",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -279,11 +275,10 @@ xlog_header_check_mount(
* by IRIX and continue.
*/
xfs_warn(mp, "null uuid in log - IRIX style log");
- } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
+ } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
+ &head->h_fs_uuid))) {
xfs_warn(mp, "log has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
- XFS_ERROR_REPORT("xlog_header_check_mount",
- XFS_ERRLEVEL_HIGH, mp);
return -EFSCORRUPTED;
}
return 0;
@@ -471,7 +466,7 @@ xlog_find_verify_log_record(
xfs_warn(log->l_mp,
"Log inconsistent (didn't find previous header)");
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto out;
}
@@ -1347,10 +1342,11 @@ xlog_find_tail(
error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
&rhead_blk, &rhead, &wrapped);
if (error < 0)
- return error;
+ goto done;
if (!error) {
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- return -EIO;
+ error = -EFSCORRUPTED;
+ goto done;
}
*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
@@ -1699,11 +1695,10 @@ xlog_clear_stale_blocks(
* the distance from the beginning of the log to the
* tail.
*/
- if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block < tail_block ||
+ head_block >= log->l_logBBsize))
return -EFSCORRUPTED;
- }
tail_distance = tail_block + (log->l_logBBsize - head_block);
} else {
/*
@@ -1711,11 +1706,10 @@ xlog_clear_stale_blocks(
* so the distance from the head to the tail is just
* the tail block minus the head block.
*/
- if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
- XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ head_block >= tail_block ||
+ head_cycle != tail_cycle + 1))
return -EFSCORRUPTED;
- }
tail_distance = tail_block - head_block;
}
@@ -2135,13 +2129,11 @@ xlog_recover_do_inode_buffer(
*/
logged_nextp = item->ri_buf[item_index].i_addr +
next_unlinked_offset - reg_buf_offset;
- if (unlikely(*logged_nextp == 0)) {
+ if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
xfs_alert(mp,
"Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). "
"Trying to replay bad (0) inode di_next_unlinked field.",
item, bp);
- XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
- XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
}
@@ -2576,6 +2568,7 @@ xlog_recover_do_reg_buffer(
int bit;
int nbits;
xfs_failaddr_t fa;
+ const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
@@ -2618,7 +2611,7 @@ xlog_recover_do_reg_buffer(
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
+ if (item->ri_buf[i].i_len < size_disk_dquot) {
xfs_alert(mp,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[i].i_len, __func__);
@@ -2969,22 +2962,18 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
+ if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
- XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_release;
}
ldip = item->ri_buf[1].i_addr;
- if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
+ if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
__func__, item, in_f->ilf_ino);
- XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
- XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -3166,7 +3155,7 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto out_release;
}
}
@@ -3247,12 +3236,12 @@ xlog_recover_dquot_pass2(
recddq = item->ri_buf[1].i_addr;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
- if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -3279,7 +3268,7 @@ xlog_recover_dquot_pass2(
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
dq_f->qlf_id, fa);
- return -EIO;
+ return -EFSCORRUPTED;
}
ASSERT(dq_f->qlf_len == 1);
@@ -3537,6 +3526,7 @@ xfs_cui_copy_format(
memcpy(dst_cui_fmt, src_cui_fmt, len);
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -3601,8 +3591,10 @@ xlog_recover_cud_pass2(
struct xfs_ail *ailp = log->l_ailp;
cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
cui_id = cud_formatp->cud_cui_id;
/*
@@ -3654,6 +3646,7 @@ xfs_bui_copy_format(
memcpy(dst_bui_fmt, src_bui_fmt, len);
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -3677,8 +3670,10 @@ xlog_recover_bui_pass2(
bui_formatp = item->ri_buf[0].i_addr;
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
buip = xfs_bui_init(mp);
error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
if (error) {
@@ -3720,8 +3715,10 @@ xlog_recover_bud_pass2(
struct xfs_ail *ailp = log->l_ailp;
bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
bui_id = bud_formatp->bud_bui_id;
/*
@@ -4018,7 +4015,7 @@ xlog_recover_commit_pass1(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4066,7 +4063,7 @@ xlog_recover_commit_pass2(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4187,7 +4184,7 @@ xlog_recover_add_to_cont_trans(
ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
xlog_recover_add_item(&trans->r_itemq);
@@ -4243,13 +4240,13 @@ xlog_recover_add_to_trans(
xfs_warn(log->l_mp, "%s: bad header magic number",
__func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4285,7 +4282,7 @@ xlog_recover_add_to_trans(
in_f->ilf_size);
ASSERT(0);
kmem_free(ptr);
- return -EIO;
+ return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
@@ -4293,7 +4290,16 @@ xlog_recover_add_to_trans(
kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
0);
}
- ASSERT(item->ri_total > item->ri_cnt);
+
+ if (item->ri_total <= item->ri_cnt) {
+ xfs_warn(log->l_mp,
+ "log item region count (%d) overflowed size (%d)",
+ item->ri_cnt, item->ri_total);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EFSCORRUPTED;
+ }
+
/* Description region is ri_buf[0] */
item->ri_buf[item->ri_cnt].i_addr = ptr;
item->ri_buf[item->ri_cnt].i_len = len;
@@ -4380,7 +4386,7 @@ xlog_recovery_process_trans(
default:
xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
break;
}
if (error || freeit)
@@ -4460,7 +4466,7 @@ xlog_recover_process_ophdr(
xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
__func__, ohead->oh_clientid);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4470,7 +4476,7 @@ xlog_recover_process_ophdr(
if (dp + len > end) {
xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
WARN_ON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
@@ -5172,8 +5178,10 @@ xlog_recover_process(
* If the filesystem is CRC enabled, this mismatch becomes a
* fatal log corruption failure.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
}
xlog_unpack_data(rhead, dp, log);
@@ -5190,31 +5198,25 @@ xlog_valid_rec_header(
{
int hlen;
- if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
return -EFSCORRUPTED;
- }
- if (unlikely(
- (!rhead->h_version ||
- (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
+ if (XFS_IS_CORRUPT(log->l_mp,
+ (!rhead->h_version ||
+ (be32_to_cpu(rhead->h_version) &
+ (~XLOG_VERSION_OKBITS))))) {
xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
__func__, be32_to_cpu(rhead->h_version));
- return -EIO;
+ return -EFSCORRUPTED;
}
/* LR body must have data or it wouldn't have been written */
hlen = be32_to_cpu(rhead->h_len);
- if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
return -EFSCORRUPTED;
- }
- if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
- XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
- XFS_ERRLEVEL_LOW, log->l_mp);
+ if (XFS_IS_CORRUPT(log->l_mp,
+ blkno > log->l_logBBsize || blkno > INT_MAX))
return -EFSCORRUPTED;
- }
return 0;
}
@@ -5296,8 +5298,12 @@ xlog_do_recovery_pass(
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
h_size, log->l_mp->m_logbsize);
h_size = log->l_mp->m_logbsize;
- } else
- return -EFSCORRUPTED;
+ } else {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
+ log->l_mp);
+ error = -EFSCORRUPTED;
+ goto bread_err1;
+ }
}
if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 9804efe525a9..e0f9d3b6abe9 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -20,8 +20,8 @@ __xfs_printk(
const struct xfs_mount *mp,
struct va_format *vaf)
{
- if (mp && mp->m_fsname) {
- printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+ if (mp && mp->m_super) {
+ printk("%sXFS (%s): %pV\n", level, mp->m_super->s_id, vaf);
return;
}
printk("%sXFS: %pV\n", level, vaf);
@@ -86,17 +86,25 @@ xfs_alert_tag(
}
void
-asswarn(char *expr, char *file, int line)
+asswarn(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
{
- xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+ xfs_warn(mp, "Assertion failed: %s, file: %s, line: %d",
expr, file, line);
WARN_ON(1);
}
void
-assfail(char *expr, char *file, int line)
+assfail(
+ struct xfs_mount *mp,
+ char *expr,
+ char *file,
+ int line)
{
- xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+ xfs_emerg(mp, "Assertion failed: %s, file: %s, line: %d",
expr, file, line);
if (xfs_globals.bug_on_assert)
BUG();
@@ -105,7 +113,7 @@ assfail(char *expr, char *file, int line)
}
void
-xfs_hex_dump(void *p, int length)
+xfs_hex_dump(const void *p, int length)
{
print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 34447dca97d1..0b05e10995a0 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -57,9 +57,9 @@ do { \
#define xfs_debug_ratelimited(dev, fmt, ...) \
xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
-extern void assfail(char *expr, char *f, int l);
-extern void asswarn(char *expr, char *f, int l);
+void assfail(struct xfs_mount *mp, char *expr, char *f, int l);
+void asswarn(struct xfs_mount *mp, char *expr, char *f, int l);
-extern void xfs_hex_dump(void *p, int length);
+extern void xfs_hex_dump(const void *p, int length);
#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ba5b6f3b2b88..fca65109cf24 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -426,45 +426,6 @@ xfs_update_alignment(xfs_mount_t *mp)
}
/*
- * Set the default minimum read and write sizes unless
- * already specified in a mount option.
- * We use smaller I/O sizes when the file system
- * is being used for NFS service (wsync mount option).
- */
-STATIC void
-xfs_set_rw_sizes(xfs_mount_t *mp)
-{
- xfs_sb_t *sbp = &(mp->m_sb);
- int readio_log, writeio_log;
-
- if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
- readio_log = XFS_WSYNC_READIO_LOG;
- writeio_log = XFS_WSYNC_WRITEIO_LOG;
- } else {
- readio_log = XFS_READIO_LOG_LARGE;
- writeio_log = XFS_WRITEIO_LOG_LARGE;
- }
- } else {
- readio_log = mp->m_readio_log;
- writeio_log = mp->m_writeio_log;
- }
-
- if (sbp->sb_blocklog > readio_log) {
- mp->m_readio_log = sbp->sb_blocklog;
- } else {
- mp->m_readio_log = readio_log;
- }
- mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
- if (sbp->sb_blocklog > writeio_log) {
- mp->m_writeio_log = sbp->sb_blocklog;
- } else {
- mp->m_writeio_log = writeio_log;
- }
- mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
-}
-
-/*
* precalculate the low space thresholds for dynamic speculative preallocation.
*/
void
@@ -706,7 +667,8 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
- error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+ NULL, mp->m_super->s_id);
if (error)
goto out;
@@ -728,9 +690,12 @@ xfs_mountfs(
goto out_remove_errortag;
/*
- * Set the minimum read and write sizes
+ * Update the preferred write size based on the information from the
+ * on-disk superblock.
*/
- xfs_set_rw_sizes(mp);
+ mp->m_allocsize_log =
+ max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
+ mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
/* set the low space thresholds for dynamic preallocation */
xfs_set_low_space_thresholds(mp);
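
The replacement for xfs_set_rw_sizes() boils down to two lines: clamp the preferred allocation size log so it is never smaller than the filesystem block size log, then express it as a block count. A self-contained worked example of that arithmetic, using made-up sample values (4k filesystem blocks, allocsize=64k):

/* Worked example of the m_allocsize_log / m_allocsize_blocks math above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t sb_blocklog = 12;	/* 4096-byte filesystem blocks */
	uint32_t allocsize_log = 16;	/* allocsize=64k from the mount options */

	/* Never let the preferred size drop below one filesystem block. */
	if (allocsize_log < sb_blocklog)
		allocsize_log = sb_blocklog;

	uint32_t allocsize_blocks = 1U << (allocsize_log - sb_blocklog);

	/* Prints: allocsize = 65536 bytes = 16 blocks */
	printf("allocsize = %u bytes = %u blocks\n",
	       1U << allocsize_log, allocsize_blocks);
	return 0;
}
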
@@ -796,9 +761,8 @@ xfs_mountfs(
goto out_free_dir;
}
- if (!sbp->sb_logblocks) {
+ if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
xfs_warn(mp, "no log defined");
- XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_free_perag;
}
@@ -836,12 +800,10 @@ xfs_mountfs(
ASSERT(rip != NULL);
- if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
+ if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
xfs_warn(mp, "corrupted root inode %llu: not a directory",
(unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
- XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
- mp);
error = -EFSCORRUPTED;
goto out_rele_rip;
}
@@ -1277,7 +1239,7 @@ xfs_mod_fdblocks(
printk_once(KERN_WARNING
"Filesystem \"%s\": reserve blocks depleted! "
"Consider increasing reserve pool size.",
- mp->m_fsname);
+ mp->m_super->s_id);
fdblocks_enospc:
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fdb60e09a9c5..88ab09ed29e7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -9,10 +9,8 @@
struct xlog;
struct xfs_inode;
struct xfs_mru_cache;
-struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
-struct xfs_dir_ops;
struct xfs_da_geometry;
/* dynamic preallocation free space thresholds, 5% down to 1% */
@@ -59,7 +57,6 @@ struct xfs_error_cfg {
typedef struct xfs_mount {
struct super_block *m_super;
- xfs_tid_t m_tid; /* next unused tid for fs */
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -89,8 +86,6 @@ typedef struct xfs_mount {
struct percpu_counter m_delalloc_blks;
struct xfs_buf *m_sb_bp; /* buffer for superblock */
- char *m_fsname; /* filesystem name */
- int m_fsname_len; /* strlen of fs name */
char *m_rtname; /* realtime device name */
char *m_logname; /* external log device name */
int m_bsize; /* fs logical block size */
@@ -98,10 +93,8 @@ typedef struct xfs_mount {
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
- uint m_readio_log; /* min read size log bytes */
- uint m_readio_blocks; /* min read size blocks */
- uint m_writeio_log; /* min write size log bytes */
- uint m_writeio_blocks; /* min write size blocks */
+ uint m_allocsize_log;/* min write size log bytes */
+ uint m_allocsize_blocks; /* min write size blocks */
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
@@ -159,10 +152,6 @@ typedef struct xfs_mount {
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
- const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
- const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
- const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
- uint m_chsize; /* size of next field */
atomic_t m_active_trans; /* number trans frozen */
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
struct delayed_work m_reclaim_work; /* background inode reclaim */
@@ -229,7 +218,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
-#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
+#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */
#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
@@ -238,7 +227,7 @@ typedef struct xfs_mount {
* allocation */
#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */
#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
-#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
+#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred
* I/O size in stat() */
#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
allocator */
@@ -246,13 +235,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */
-
-/*
- * Default minimum read and write sizes.
- */
-#define XFS_READIO_LOG_LARGE 16
-#define XFS_WRITEIO_LOG_LARGE 16
-
/*
* Max and min values for mount-option defined I/O
* preallocation sizes.
@@ -260,37 +242,6 @@ typedef struct xfs_mount {
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-/*
- * Synchronous read and write sizes. This should be
- * better for NFSv2 wsync filesystems.
- */
-#define XFS_WSYNC_READIO_LOG 15 /* 32k */
-#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */
-
-/*
- * Allow large block sizes to be reported to userspace programs if the
- * "largeio" mount option is used.
- *
- * If compatibility mode is specified, simply return the basic unit of caching
- * so that we don't get inefficient read/modify/write I/O from user apps.
- * Otherwise....
- *
- * If the underlying volume is a stripe, then return the stripe width in bytes
- * as the recommended I/O size. It is not a stripe and we've set a default
- * buffered I/O size, return that, otherwise return the compat default.
- */
-static inline unsigned long
-xfs_preferred_iosize(xfs_mount_t *mp)
-{
- if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
- return PAGE_SIZE;
- return (mp->m_swidth ?
- (mp->m_swidth << mp->m_sb.sb_blocklog) :
- ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
- (1 << (int)max(mp->m_readio_log, mp->m_writeio_log)) :
- PAGE_SIZE));
-}
-
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index a339bd5fa260..bb3008d390aa 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -12,6 +12,7 @@
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_iomap.h"
+#include "xfs_pnfs.h"
/*
* Ensure that we do not have any outstanding pNFS layouts that can be used by
@@ -59,7 +60,7 @@ xfs_fs_get_uuid(
printk_once(KERN_NOTICE
"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
- mp->m_fsname);
+ mp->m_super->s_id);
if (*len < sizeof(uuid_t))
return -EINVAL;
@@ -142,43 +143,38 @@ xfs_fs_map_blocks(
lock_flags = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, bmapi_flags);
- xfs_iunlock(ip, lock_flags);
- if (error)
- goto out_unlock;
+ ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
+
+ if (!error && write &&
+ (!nimaps || imap.br_startblock == HOLESTARTBLOCK)) {
+ if (offset + length > XFS_ISIZE(ip))
+ end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+ else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+ end_fsb = min(end_fsb, imap.br_startoff +
+ imap.br_blockcount);
+ xfs_iunlock(ip, lock_flags);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb,
+ end_fsb - offset_fsb, &imap);
+ if (error)
+ goto out_unlock;
- if (write) {
- enum xfs_prealloc_flags flags = 0;
-
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-
- if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
- /*
- * xfs_iomap_write_direct() expects to take ownership of
- * the shared ilock.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_iomap_write_direct(ip, offset, length,
- &imap, nimaps);
- if (error)
- goto out_unlock;
-
- /*
- * Ensure the next transaction is committed
- * synchronously so that the blocks allocated and
- * handed out to the client are guaranteed to be
- * present even after a server crash.
- */
- flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
- }
-
- error = xfs_update_prealloc_flags(ip, flags);
+ /*
+ * Ensure the next transaction is committed synchronously so
+ * that the blocks allocated and handed out to the client are
+ * guaranteed to be present even after a server crash.
+ */
+ error = xfs_update_prealloc_flags(ip,
+ XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
if (error)
goto out_unlock;
+ } else {
+ xfs_iunlock(ip, lock_flags);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ecd8ce152ab1..0b0909657bad 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -22,6 +22,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_error.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -29,10 +30,10 @@
* quota functionality, including maintaining the freelist and hash
* tables of dquots.
*/
-STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
-STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
+STATIC int xfs_qm_init_quotainos(struct xfs_mount *mp);
+STATIC int xfs_qm_init_quotainfo(struct xfs_mount *mp);
-STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi);
+STATIC void xfs_qm_destroy_quotainos(struct xfs_quotainfo *qi);
STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp);
/*
* We use the batch lookup interface to iterate over the dquots as it
@@ -243,14 +244,14 @@ xfs_qm_unmount_quotas(
STATIC int
xfs_qm_dqattach_one(
- xfs_inode_t *ip,
- xfs_dqid_t id,
- uint type,
- bool doalloc,
- xfs_dquot_t **IO_idqpp)
+ struct xfs_inode *ip,
+ xfs_dqid_t id,
+ uint type,
+ bool doalloc,
+ struct xfs_dquot **IO_idqpp)
{
- xfs_dquot_t *dqp;
- int error;
+ struct xfs_dquot *dqp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
error = 0;
@@ -341,7 +342,7 @@ xfs_qm_dqattach_locked(
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -539,12 +540,12 @@ xfs_qm_shrink_count(
STATIC void
xfs_qm_set_defquota(
- xfs_mount_t *mp,
- uint type,
- xfs_quotainfo_t *qinf)
+ struct xfs_mount *mp,
+ uint type,
+ struct xfs_quotainfo *qinf)
{
- xfs_dquot_t *dqp;
- struct xfs_def_quota *defq;
+ struct xfs_dquot *dqp;
+ struct xfs_def_quota *defq;
struct xfs_disk_dquot *ddqp;
int error;
@@ -642,7 +643,7 @@ xfs_qm_init_quotainfo(
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), 0);
+ qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
error = list_lru_init(&qinf->qi_lru);
if (error)
@@ -709,9 +710,9 @@ out_free_qinf:
*/
void
xfs_qm_destroy_quotainfo(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_quotainfo_t *qi;
+ struct xfs_quotainfo *qi;
qi = mp->m_quotainfo;
ASSERT(qi != NULL);
@@ -754,11 +755,15 @@ xfs_qm_qino_alloc(
if ((flags & XFS_QMOPT_PQUOTA) &&
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
- ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_pquotino != NULLFSINO))
+ return -EFSCORRUPTED;
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
- ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+ if (XFS_IS_CORRUPT(mp,
+ mp->m_sb.sb_gquotino != NULLFSINO))
+ return -EFSCORRUPTED;
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ip);
@@ -1559,7 +1564,7 @@ error_rele:
STATIC void
xfs_qm_destroy_quotainos(
- xfs_quotainfo_t *qi)
+ struct xfs_quotainfo *qi)
{
if (qi->qi_uquotaip) {
xfs_irele(qi->qi_uquotaip);
@@ -1693,7 +1698,7 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
- if (xfs_get_projid(ip) != prid) {
+ if (ip->i_d.di_projid != prid) {
xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
true, &pq);
@@ -1737,14 +1742,14 @@ error_rele:
* Actually transfer ownership, and do dquot modifications.
* These were already reserved.
*/
-xfs_dquot_t *
+struct xfs_dquot *
xfs_qm_vop_chown(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t **IO_olddq,
- xfs_dquot_t *newdq)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot **IO_olddq,
+ struct xfs_dquot *newdq)
{
- xfs_dquot_t *prevdq;
+ struct xfs_dquot *prevdq;
uint bfield = XFS_IS_REALTIME_INODE(ip) ?
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
@@ -1827,7 +1832,7 @@ xfs_qm_vop_chown_reserve(
}
if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+ ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) {
prjflags = XFS_QMOPT_ENOSPC;
pdq_delblks = pdqp;
if (delblks) {
@@ -1928,7 +1933,7 @@ xfs_qm_vop_create_dqattach(
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
- ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+ ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id));
ip->i_pdquot = xfs_qm_dqhold(pdqp);
xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index b41b75089548..7823af39008b 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -54,7 +54,7 @@ struct xfs_def_quota {
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
*/
-typedef struct xfs_quotainfo {
+struct xfs_quotainfo {
struct radix_tree_root qi_uquota_tree;
struct radix_tree_root qi_gquota_tree;
struct radix_tree_root qi_pquota_tree;
@@ -76,8 +76,8 @@ typedef struct xfs_quotainfo {
struct xfs_def_quota qi_usr_default;
struct xfs_def_quota qi_grp_default;
struct xfs_def_quota qi_prj_default;
- struct shrinker qi_shrinker;
-} xfs_quotainfo_t;
+ struct shrinker qi_shrinker;
+};
static inline struct radix_tree_root *
xfs_dquot_tree(
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 5d72e88598b4..fc2fa418919f 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -54,13 +54,13 @@ xfs_fill_statvfs_from_dquot(
*/
void
xfs_qm_statvfs(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
struct kstatfs *statp)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_dquot_t *dqp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
- if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
+ if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index da7ad0383037..1ea82764bf89 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -19,9 +19,72 @@
#include "xfs_qm.h"
#include "xfs_icache.h"
-STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
- uint);
+STATIC int
+xfs_qm_log_quotaoff(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem **qoffstartp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ *qoffstartp = NULL;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_log_sb(tp);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotaoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+
+ *qoffstartp = qoffi;
+out:
+ return error;
+}
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem *startqoff,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+ flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotaoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -40,7 +103,7 @@ xfs_qm_scall_quotaoff(
uint dqtype;
int error;
uint inactivate_flags;
- xfs_qoff_logitem_t *qoffstart;
+ struct xfs_qoff_logitem *qoffstart;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -538,74 +601,6 @@ out_unlock:
return error;
}
-STATIC int
-xfs_qm_log_quotaoff_end(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t *startqoff,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
- if (error)
- return error;
-
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t **qoffstartp,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- *qoffstartp = NULL;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
- if (error)
- goto out;
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_log_sb(tp);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp);
- if (error)
- goto out;
-
- *qoffstartp = qoffi;
-out:
- return error;
-}
-
/* Fill out the quota context. */
static void
xfs_qm_scall_getquota_fill_qc(
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index cd6c7210a373..c7de17deeae6 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -201,6 +201,9 @@ xfs_fs_rm_xquota(
if (XFS_IS_QUOTA_ON(mp))
return -EINVAL;
+ if (uflags & ~(FS_USER_QUOTA | FS_GROUP_QUOTA | FS_PROJ_QUOTA))
+ return -EINVAL;
+
if (uflags & FS_USER_QUOTA)
flags |= XFS_DQ_USER;
if (uflags & FS_GROUP_QUOTA)
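
The check added to xfs_fs_rm_xquota() rejects any flag bits the filesystem does not understand before translating the ones it does. A standalone illustration of that "mask off the known bits, fail if anything is left over" idiom, with made-up flag values:

/* Illustrative only: reject unknown flag bits up front. */
#include <stdio.h>

#define Q_USER		0x1
#define Q_GROUP		0x2
#define Q_PROJ		0x4
#define EINVAL_DEMO	22	/* stand-in errno for this demo */

static int check_flags(unsigned int uflags)
{
	if (uflags & ~(Q_USER | Q_GROUP | Q_PROJ))
		return -EINVAL_DEMO;	/* some bit we do not recognise */
	return 0;
}

int main(void)
{
	printf("%d\n", check_flags(Q_USER | Q_PROJ));	/* 0 */
	printf("%d\n", check_flags(0x8));		/* -22 */
	return 0;
}
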
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2328268e6245..8eeed73928cd 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -17,7 +17,7 @@
#include "xfs_refcount_item.h"
#include "xfs_log.h"
#include "xfs_refcount.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_cui_zone;
kmem_zone_t *xfs_cud_zone;
@@ -34,7 +34,7 @@ xfs_cui_item_free(
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
- kmem_zone_free(xfs_cui_zone, cuip);
+ kmem_cache_free(xfs_cui_zone, cuip);
}
/*
@@ -206,7 +206,7 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
- kmem_zone_free(xfs_cud_zone, cudp);
+ kmem_cache_free(xfs_cud_zone, cudp);
}
static const struct xfs_item_ops xfs_cud_item_ops = {
@@ -497,7 +497,7 @@ xfs_cui_recover(
*/
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
xfs_cui_release(cuip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -536,6 +536,7 @@ xfs_cui_recover(
type = refc_type;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto abort_error;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 0f08153b4994..de451235c4ee 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -308,13 +308,13 @@ static int
xfs_find_trim_cow_extent(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
bool *shared,
bool *found)
{
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
*found = false;
@@ -322,23 +322,22 @@ xfs_find_trim_cow_extent(
* If we don't find an overlapping extent, trim the range we need to
* allocate to fit the hole we found.
*/
- if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
- got.br_startoff = offset_fsb + count_fsb;
- if (got.br_startoff > offset_fsb) {
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
+ cmap->br_startoff = offset_fsb + count_fsb;
+ if (cmap->br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
- got.br_startoff - imap->br_startoff);
+ cmap->br_startoff - imap->br_startoff);
return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
- if (isnullstartblock(got.br_startblock)) {
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ if (isnullstartblock(cmap->br_startblock)) {
+ xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
return 0;
}
/* real extent found - no need to allocate */
- xfs_trim_extent(&got, offset_fsb, count_fsb);
- *imap = got;
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
*found = true;
return 0;
}
@@ -348,6 +347,7 @@ int
xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
bool convert_now)
@@ -367,7 +367,7 @@ xfs_reflink_allocate_cow(
xfs_ifork_init_cow(ip);
}
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
return error;
if (found)
@@ -392,7 +392,7 @@ xfs_reflink_allocate_cow(
/*
* Check for an overlapping extent again now that we dropped the ilock.
*/
- error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
if (found) {
@@ -410,8 +410,8 @@ xfs_reflink_allocate_cow(
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
- resblks, imap, &nimaps);
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
+ &nimaps);
if (error)
goto out_unreserve;
@@ -427,15 +427,15 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- xfs_trim_extent(imap, offset_fsb, count_fsb);
+ xfs_trim_extent(cmap, offset_fsb, count_fsb);
/*
* COW fork extents are supposed to remain unwritten until we're ready
* to initiate a disk write. For direct I/O we are going to write the
* data and need the conversion, but for buffered writes we're done.
*/
- if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
- trace_xfs_reflink_convert_cow(ip, imap);
+ trace_xfs_reflink_convert_cow(ip, cmap);
return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
@@ -1270,7 +1270,7 @@ xfs_reflink_zero_posteof(
trace_xfs_zero_eof(ip, isize, pos - isize);
return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
- &xfs_iomap_ops);
+ &xfs_buffered_write_iomap_ops);
}
/*
@@ -1381,85 +1381,6 @@ out_unlock:
return ret;
}
-/*
- * The user wants to preemptively CoW all shared blocks in this file,
- * which enables us to turn off the reflink flag. Iterate all
- * extents which are not prealloc/delalloc to see which ranges are
- * mentioned in the refcount tree, then read those blocks into the
- * pagecache, dirty them, fsync them back out, and then we can update
- * the inode flag. What happens if we run out of memory? :)
- */
-STATIC int
-xfs_reflink_dirty_extents(
- struct xfs_inode *ip,
- xfs_fileoff_t fbno,
- xfs_filblks_t end,
- xfs_off_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- xfs_extlen_t aglen;
- xfs_agblock_t rbno;
- xfs_extlen_t rlen;
- xfs_off_t fpos;
- xfs_off_t flen;
- struct xfs_bmbt_irec map[2];
- int nmaps;
- int error = 0;
-
- while (end - fbno > 0) {
- nmaps = 1;
- /*
- * Look for extents in the file. Skip holes, delalloc, or
- * unwritten extents; they can't be reflinked.
- */
- error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
- if (error)
- goto out;
- if (nmaps == 0)
- break;
- if (!xfs_bmap_is_real_extent(&map[0]))
- goto next;
-
- map[1] = map[0];
- while (map[1].br_blockcount) {
- agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
- aglen = map[1].br_blockcount;
-
- error = xfs_reflink_find_shared(mp, NULL, agno, agbno,
- aglen, &rbno, &rlen, true);
- if (error)
- goto out;
- if (rbno == NULLAGBLOCK)
- break;
-
- /* Dirty the pages */
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
- (rbno - agbno));
- flen = XFS_FSB_TO_B(mp, rlen);
- if (fpos + flen > isize)
- flen = isize - fpos;
- error = iomap_file_dirty(VFS_I(ip), fpos, flen,
- &xfs_iomap_ops);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
-
- map[1].br_blockcount -= (rbno - agbno + rlen);
- map[1].br_startoff += (rbno - agbno + rlen);
- map[1].br_startblock += (rbno - agbno + rlen);
- }
-
-next:
- fbno = map[0].br_startoff + map[0].br_blockcount;
- }
-out:
- return error;
-}
-
/* Does this inode need the reflink flag? */
int
xfs_reflink_inode_has_shared_extents(
@@ -1596,10 +1517,7 @@ xfs_reflink_unshare(
xfs_off_t offset,
xfs_off_t len)
{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t fbno;
- xfs_filblks_t end;
- xfs_off_t isize;
+ struct inode *inode = VFS_I(ip);
int error;
if (!xfs_is_reflink_inode(ip))
@@ -1607,20 +1525,13 @@ xfs_reflink_unshare(
trace_xfs_reflink_unshare(ip, offset, len);
- inode_dio_wait(VFS_I(ip));
+ inode_dio_wait(inode);
- /* Try to CoW the selected ranges */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- fbno = XFS_B_TO_FSBT(mp, offset);
- isize = i_size_read(VFS_I(ip));
- end = XFS_B_TO_FSB(mp, offset + len);
- error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+ error = iomap_file_unshare(inode, offset, len,
+ &xfs_buffered_write_iomap_ops);
if (error)
- goto out_unlock;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- /* Wait for the IO to finish */
- error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ goto out;
+ error = filemap_write_and_wait(inode->i_mapping);
if (error)
goto out;
@@ -1628,11 +1539,8 @@ xfs_reflink_unshare(
error = xfs_reflink_try_clear_inode_flag(ip);
if (error)
goto out;
-
return 0;
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
return error;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 28a43b7f581d..d18ad7f4fb64 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -25,8 +25,8 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool *shared);
-extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 8939e0ea09cd..4911b68f95dd 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -17,7 +17,7 @@
#include "xfs_rmap_item.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_rui_zone;
kmem_zone_t *xfs_rud_zone;
@@ -34,7 +34,7 @@ xfs_rui_item_free(
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
- kmem_zone_free(xfs_rui_zone, ruip);
+ kmem_cache_free(xfs_rui_zone, ruip);
}
/*
@@ -171,8 +171,10 @@ xfs_rui_copy_format(
src_rui_fmt = buf->i_addr;
len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
- if (buf->i_len != len)
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
memcpy(dst_rui_fmt, src_rui_fmt, len);
return 0;
@@ -227,7 +229,7 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
+ kmem_cache_free(xfs_rud_zone, rudp);
}
static const struct xfs_item_ops xfs_rud_item_ops = {
@@ -539,7 +541,7 @@ xfs_rui_recover(
*/
set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
xfs_rui_release(ruip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -581,6 +583,7 @@ xfs_rui_recover(
type = XFS_RMAP_FREE;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
error = -EFSCORRUPTED;
goto abort_error;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 4a48a8c75b4f..d42b5a2047e0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -792,8 +792,7 @@ xfs_growfs_rt_alloc(
*/
nmap = 1;
error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, resblks, &map,
- &nmap);
+ XFS_BMAPI_METADATA, 0, &map, &nmap);
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8d1df9f8be07..d9ae27ddf253 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -37,10 +37,10 @@
#include "xfs_reflink.h"
#include <linux/magic.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
static const struct super_operations xfs_super_operations;
-struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
@@ -51,7 +51,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
+ Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
@@ -59,382 +59,67 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+ Opt_discard, Opt_nodiscard, Opt_dax,
};
-static const match_table_t tokens = {
- {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
- {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
- {Opt_logdev, "logdev=%s"}, /* log device */
- {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
- {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
- {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
- {Opt_noalign, "noalign"}, /* turn off stripe alignment */
- {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
- {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
- {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
- {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
- {Opt_grpid, "grpid"}, /* group-ID from parent directory */
- {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
- {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
- {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
- {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
- {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
- {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
- {Opt_inode32, "inode32"}, /* inode allocation limited to
- * XFS_MAXINUMBER_32 */
- {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
- {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
- {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
- {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
- * in stat(). */
- {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
- {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
- {Opt_filestreams,"filestreams"},/* use filestreams allocator */
- {Opt_quota, "quota"}, /* disk quotas (user) */
- {Opt_noquota, "noquota"}, /* no quotas */
- {Opt_usrquota, "usrquota"}, /* user quota enabled */
- {Opt_grpquota, "grpquota"}, /* group quota enabled */
- {Opt_prjquota, "prjquota"}, /* project quota enabled */
- {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
- {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
- {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
- {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
- {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
- {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
- {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
- {Opt_discard, "discard"}, /* Discard unused blocks */
- {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
- {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
- {Opt_err, NULL},
+static const struct fs_parameter_spec xfs_param_specs[] = {
+ fsparam_u32("logbufs", Opt_logbufs),
+ fsparam_string("logbsize", Opt_logbsize),
+ fsparam_string("logdev", Opt_logdev),
+ fsparam_string("rtdev", Opt_rtdev),
+ fsparam_flag("wsync", Opt_wsync),
+ fsparam_flag("noalign", Opt_noalign),
+ fsparam_flag("swalloc", Opt_swalloc),
+ fsparam_u32("sunit", Opt_sunit),
+ fsparam_u32("swidth", Opt_swidth),
+ fsparam_flag("nouuid", Opt_nouuid),
+ fsparam_flag("grpid", Opt_grpid),
+ fsparam_flag("nogrpid", Opt_nogrpid),
+ fsparam_flag("bsdgroups", Opt_bsdgroups),
+ fsparam_flag("sysvgroups", Opt_sysvgroups),
+ fsparam_string("allocsize", Opt_allocsize),
+ fsparam_flag("norecovery", Opt_norecovery),
+ fsparam_flag("inode64", Opt_inode64),
+ fsparam_flag("inode32", Opt_inode32),
+ fsparam_flag("ikeep", Opt_ikeep),
+ fsparam_flag("noikeep", Opt_noikeep),
+ fsparam_flag("largeio", Opt_largeio),
+ fsparam_flag("nolargeio", Opt_nolargeio),
+ fsparam_flag("attr2", Opt_attr2),
+ fsparam_flag("noattr2", Opt_noattr2),
+ fsparam_flag("filestreams", Opt_filestreams),
+ fsparam_flag("quota", Opt_quota),
+ fsparam_flag("noquota", Opt_noquota),
+ fsparam_flag("usrquota", Opt_usrquota),
+ fsparam_flag("grpquota", Opt_grpquota),
+ fsparam_flag("prjquota", Opt_prjquota),
+ fsparam_flag("uquota", Opt_uquota),
+ fsparam_flag("gquota", Opt_gquota),
+ fsparam_flag("pquota", Opt_pquota),
+ fsparam_flag("uqnoenforce", Opt_uqnoenforce),
+ fsparam_flag("gqnoenforce", Opt_gqnoenforce),
+ fsparam_flag("pqnoenforce", Opt_pqnoenforce),
+ fsparam_flag("qnoenforce", Opt_qnoenforce),
+ fsparam_flag("discard", Opt_discard),
+ fsparam_flag("nodiscard", Opt_nodiscard),
+ fsparam_flag("dax", Opt_dax),
+ {}
};
-
-STATIC int
-suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
-{
- int last, shift_left_factor = 0, _res;
- char *value;
- int ret = 0;
-
- value = match_strdup(s);
- if (!value)
- return -ENOMEM;
-
- last = strlen(value) - 1;
- if (value[last] == 'K' || value[last] == 'k') {
- shift_left_factor = 10;
- value[last] = '\0';
- }
- if (value[last] == 'M' || value[last] == 'm') {
- shift_left_factor = 20;
- value[last] = '\0';
- }
- if (value[last] == 'G' || value[last] == 'g') {
- shift_left_factor = 30;
- value[last] = '\0';
- }
-
- if (kstrtoint(value, base, &_res))
- ret = -EINVAL;
- kfree(value);
- *res = _res << shift_left_factor;
- return ret;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- *
- * Note that this function leaks the various device name allocations on
- * failure. The caller takes care of them.
- *
- * *sb is const because this is also used to test options on the remount
- * path, and we don't want this to have any side effects at remount time.
- * Today this function does not change *sb, but just to future-proof...
- */
-STATIC int
-xfs_parseargs(
- struct xfs_mount *mp,
- char *options)
-{
- const struct super_block *sb = mp->m_super;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int dsunit = 0;
- int dswidth = 0;
- int iosize = 0;
- uint8_t iosizelog = 0;
-
- /*
- * set up the mount name first so all the errors will refer to the
- * correct device.
- */
- mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
- if (!mp->m_fsname)
- return -ENOMEM;
- mp->m_fsname_len = strlen(mp->m_fsname) + 1;
-
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (sb_rdonly(sb))
- mp->m_flags |= XFS_MOUNT_RDONLY;
- if (sb->s_flags & SB_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
- if (sb->s_flags & SB_SYNCHRONOUS)
- mp->m_flags |= XFS_MOUNT_WSYNC;
-
- /*
- * Set some default flags that could be cleared by the mount option
- * parsing.
- */
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
- /*
- * These can be overridden by the mount option parsing.
- */
- mp->m_logbufs = -1;
- mp->m_logbsize = -1;
-
- if (!options)
- goto done;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_logbufs:
- if (match_int(args, &mp->m_logbufs))
- return -EINVAL;
- break;
- case Opt_logbsize:
- if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
- return -EINVAL;
- break;
- case Opt_logdev:
- kfree(mp->m_logname);
- mp->m_logname = match_strdup(args);
- if (!mp->m_logname)
- return -ENOMEM;
- break;
- case Opt_rtdev:
- kfree(mp->m_rtname);
- mp->m_rtname = match_strdup(args);
- if (!mp->m_rtname)
- return -ENOMEM;
- break;
- case Opt_allocsize:
- case Opt_biosize:
- if (suffix_kstrtoint(args, 10, &iosize))
- return -EINVAL;
- iosizelog = ffs(iosize) - 1;
- break;
- case Opt_grpid:
- case Opt_bsdgroups:
- mp->m_flags |= XFS_MOUNT_GRPID;
- break;
- case Opt_nogrpid:
- case Opt_sysvgroups:
- mp->m_flags &= ~XFS_MOUNT_GRPID;
- break;
- case Opt_wsync:
- mp->m_flags |= XFS_MOUNT_WSYNC;
- break;
- case Opt_norecovery:
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
- break;
- case Opt_noalign:
- mp->m_flags |= XFS_MOUNT_NOALIGN;
- break;
- case Opt_swalloc:
- mp->m_flags |= XFS_MOUNT_SWALLOC;
- break;
- case Opt_sunit:
- if (match_int(args, &dsunit))
- return -EINVAL;
- break;
- case Opt_swidth:
- if (match_int(args, &dswidth))
- return -EINVAL;
- break;
- case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- break;
- case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- break;
- case Opt_nouuid:
- mp->m_flags |= XFS_MOUNT_NOUUID;
- break;
- case Opt_ikeep:
- mp->m_flags |= XFS_MOUNT_IKEEP;
- break;
- case Opt_noikeep:
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
- break;
- case Opt_largeio:
- mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
- break;
- case Opt_nolargeio:
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
- break;
- case Opt_attr2:
- mp->m_flags |= XFS_MOUNT_ATTR2;
- break;
- case Opt_noattr2:
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
- break;
- case Opt_filestreams:
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
- break;
- case Opt_noquota:
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
- break;
- case Opt_quota:
- case Opt_uquota:
- case Opt_usrquota:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
- XFS_UQUOTA_ENFD);
- break;
- case Opt_qnoenforce:
- case Opt_uqnoenforce:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_UQUOTA_ENFD;
- break;
- case Opt_pquota:
- case Opt_prjquota:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
- XFS_PQUOTA_ENFD);
- break;
- case Opt_pqnoenforce:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_PQUOTA_ENFD;
- break;
- case Opt_gquota:
- case Opt_grpquota:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
- XFS_GQUOTA_ENFD);
- break;
- case Opt_gqnoenforce:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_GQUOTA_ENFD;
- break;
- case Opt_discard:
- mp->m_flags |= XFS_MOUNT_DISCARD;
- break;
- case Opt_nodiscard:
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
- break;
-#ifdef CONFIG_FS_DAX
- case Opt_dax:
- mp->m_flags |= XFS_MOUNT_DAX;
- break;
-#endif
- default:
- xfs_warn(mp, "unknown mount option [%s].", p);
- return -EINVAL;
- }
- }
-
- /*
- * no recovery flag requires a read-only mount
- */
- if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- xfs_warn(mp, "no-recovery mounts must be read-only.");
- return -EINVAL;
- }
-
- if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
- xfs_warn(mp,
- "sunit and swidth options incompatible with the noalign option");
- return -EINVAL;
- }
-
-#ifndef CONFIG_XFS_QUOTA
- if (XFS_IS_QUOTA_RUNNING(mp)) {
- xfs_warn(mp, "quota support not available in this kernel.");
- return -EINVAL;
- }
-#endif
-
- if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
- xfs_warn(mp, "sunit and swidth must be specified together");
- return -EINVAL;
- }
-
- if (dsunit && (dswidth % dsunit != 0)) {
- xfs_warn(mp,
- "stripe width (%d) must be a multiple of the stripe unit (%d)",
- dswidth, dsunit);
- return -EINVAL;
- }
-
-done:
- if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
- /*
- * At this point the superblock has not been read
- * in, therefore we do not know the block size.
- * Before the mount call ends we will convert
- * these to FSBs.
- */
- mp->m_dalign = dsunit;
- mp->m_swidth = dswidth;
- }
-
- if (mp->m_logbufs != -1 &&
- mp->m_logbufs != 0 &&
- (mp->m_logbufs < XLOG_MIN_ICLOGS ||
- mp->m_logbufs > XLOG_MAX_ICLOGS)) {
- xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
- mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
- return -EINVAL;
- }
- if (mp->m_logbsize != -1 &&
- mp->m_logbsize != 0 &&
- (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
- mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
- !is_power_of_2(mp->m_logbsize))) {
- xfs_warn(mp,
- "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
- mp->m_logbsize);
- return -EINVAL;
- }
-
- if (iosizelog) {
- if (iosizelog > XFS_MAX_IO_LOG ||
- iosizelog < XFS_MIN_IO_LOG) {
- xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
- iosizelog, XFS_MIN_IO_LOG,
- XFS_MAX_IO_LOG);
- return -EINVAL;
- }
-
- mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
- mp->m_readio_log = iosizelog;
- mp->m_writeio_log = iosizelog;
- }
-
- return 0;
-}
+static const struct fs_parameter_description xfs_fs_parameters = {
+ .name = "xfs",
+ .specs = xfs_param_specs,
+};
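
The old match_table_t/match_token() parser is replaced here by a declarative fs_parameter_spec table that the new mount API walks via fs_parse(), which hands back a token plus an already-typed result. As a rough, self-contained analogue of that table-driven lookup (this only mimics the flag and u32 cases and is not the kernel's fs_parser implementation):

/* Illustrative userspace analogue of a table-driven option parser. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { OPT_LOGBUFS, OPT_WSYNC, OPT_UNKNOWN };

struct param_spec {
	const char	*name;
	int		token;
	int		takes_u32;	/* 0: bare flag, 1: "name=<number>" */
};

static const struct param_spec specs[] = {
	{ "logbufs",	OPT_LOGBUFS,	1 },
	{ "wsync",	OPT_WSYNC,	0 },
	{ NULL,		OPT_UNKNOWN,	0 },
};

static int parse_param(const char *arg, unsigned int *u32_out)
{
	for (const struct param_spec *s = specs; s->name; s++) {
		size_t n = strlen(s->name);

		if (strncmp(arg, s->name, n) != 0)
			continue;
		if (!s->takes_u32 && arg[n] == '\0')
			return s->token;
		if (s->takes_u32 && arg[n] == '=') {
			*u32_out = (unsigned int)strtoul(arg + n + 1, NULL, 10);
			return s->token;
		}
	}
	return OPT_UNKNOWN;
}

int main(void)
{
	unsigned int val = 0;

	printf("%d\n", parse_param("wsync", &val));		 /* OPT_WSYNC */
	printf("%d %u\n", parse_param("logbufs=8", &val), val);	 /* OPT_LOGBUFS 8 */
	return 0;
}
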
struct proc_xfs_info {
uint64_t flag;
char *str;
};
-STATIC void
-xfs_showargs(
- struct xfs_mount *mp,
- struct seq_file *m)
+static int
+xfs_fs_show_options(
+ struct seq_file *m,
+ struct dentry *root)
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
@@ -448,30 +133,24 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, ",filestreams" },
{ XFS_MOUNT_GRPID, ",grpid" },
{ XFS_MOUNT_DISCARD, ",discard" },
- { XFS_MOUNT_SMALL_INUMS, ",inode32" },
+ { XFS_MOUNT_LARGEIO, ",largeio" },
{ XFS_MOUNT_DAX, ",dax" },
{ 0, NULL }
};
- static struct proc_xfs_info xfs_info_unset[] = {
- /* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
- { XFS_MOUNT_SMALL_INUMS, ",inode64" },
- { 0, NULL }
- };
+ struct xfs_mount *mp = XFS_M(root->d_sb);
struct proc_xfs_info *xfs_infop;
for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
if (mp->m_flags & xfs_infop->flag)
seq_puts(m, xfs_infop->str);
}
- for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
- if (!(mp->m_flags & xfs_infop->flag))
- seq_puts(m, xfs_infop->str);
- }
- if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+ seq_printf(m, ",inode%d",
+ (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64);
+
+ if (mp->m_flags & XFS_MOUNT_ALLOCSIZE)
seq_printf(m, ",allocsize=%dk",
- (int)(1 << mp->m_writeio_log) >> 10);
+ (1 << mp->m_allocsize_log) >> 10);
if (mp->m_logbufs > 0)
seq_printf(m, ",logbufs=%d", mp->m_logbufs);
@@ -510,6 +189,8 @@ xfs_showargs(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
+
+ return 0;
}
static uint64_t
@@ -808,33 +489,33 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND,
- 0, mp->m_fsname);
+ 0, mp->m_super->s_id);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_super->s_id);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
- mp->m_fsname);
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_eofb;
@@ -1038,13 +719,13 @@ xfs_fs_drop_inode(
return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
}
-STATIC void
-xfs_free_fsname(
+static void
+xfs_mount_free(
struct xfs_mount *mp)
{
- kfree(mp->m_fsname);
kfree(mp->m_rtname);
kfree(mp->m_logname);
+ kmem_free(mp);
}
STATIC int
@@ -1205,181 +886,6 @@ xfs_quiesce_attr(
xfs_log_quiesce(mp);
}
-STATIC int
-xfs_test_remount_options(
- struct super_block *sb,
- char *options)
-{
- int error = 0;
- struct xfs_mount *tmp_mp;
-
- tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
- if (!tmp_mp)
- return -ENOMEM;
-
- tmp_mp->m_super = sb;
- error = xfs_parseargs(tmp_mp, options);
- xfs_free_fsname(tmp_mp);
- kmem_free(tmp_mp);
-
- return error;
-}
-
-STATIC int
-xfs_fs_remount(
- struct super_block *sb,
- int *flags,
- char *options)
-{
- struct xfs_mount *mp = XFS_M(sb);
- xfs_sb_t *sbp = &mp->m_sb;
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int error;
-
- /* First, check for complete junk; i.e. invalid options */
- error = xfs_test_remount_options(sb, options);
- if (error)
- return error;
-
- sync_filesystem(sb);
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
- break;
- case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
- break;
- default:
- /*
- * Logically we would return an error here to prevent
- * users from believing they might have changed
- * mount options using remount which can't be changed.
- *
- * But unfortunately mount(8) adds all options from
- * mtab and fstab to the mount arguments in some cases
- * so we can't blindly reject options, but have to
- * check for each specified option if it actually
- * differs from the currently set option and only
- * reject it if that's the case.
- *
- * Until that is implemented we return success for
- * every remount request, and silently ignore all
- * options that we can't actually change.
- */
-#if 0
- xfs_info(mp,
- "mount option \"%s\" not supported for remount", p);
- return -EINVAL;
-#else
- break;
-#endif
- }
- }
-
- /* ro -> rw */
- if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) {
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
- xfs_warn(mp,
- "ro->rw transition prohibited on norecovery mount");
- return -EINVAL;
- }
-
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
- xfs_sb_has_ro_compat_feature(sbp,
- XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
- xfs_warn(mp,
-"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
- (sbp->sb_features_ro_compat &
- XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
- return -EINVAL;
- }
-
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
- /*
- * If this is the first remount to writeable state we
- * might have some superblock changes to update.
- */
- if (mp->m_update_sb) {
- error = xfs_sync_sb(mp, false);
- if (error) {
- xfs_warn(mp, "failed to write sb changes");
- return error;
- }
- mp->m_update_sb = false;
- }
-
- /*
- * Fill out the reserve pool if it is empty. Use the stashed
- * value if it is non-zero, otherwise go with the default.
- */
- xfs_restore_resvblks(mp);
- xfs_log_work_queue(mp);
-
- /* Recover any CoW blocks that never got remapped. */
- error = xfs_reflink_recover_cow(mp);
- if (error) {
- xfs_err(mp,
- "Error %d recovering leftover CoW allocations.", error);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
- xfs_start_block_reaping(mp);
-
- /* Create the per-AG metadata reservation pool .*/
- error = xfs_fs_reserve_ag_blocks(mp);
- if (error && error != -ENOSPC)
- return error;
- }
-
- /* rw -> ro */
- if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
- /*
- * Cancel background eofb scanning so it cannot race with the
- * final log force+buftarg wait and deadlock the remount.
- */
- xfs_stop_block_reaping(mp);
-
- /* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
-
- /* Free the per-AG metadata reservation pool. */
- error = xfs_fs_unreserve_ag_blocks(mp);
- if (error) {
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return error;
- }
-
- /*
- * Before we sync the metadata, we need to free up the reserve
- * block pool so that the used block count in the superblock on
- * disk is correct at the end of the remount. Stash the current
- * reserve pool size so that if we get remounted rw, we can
- * return it to the same size.
- */
- xfs_save_resvblks(mp);
-
- xfs_quiesce_attr(mp);
- mp->m_flags |= XFS_MOUNT_RDONLY;
- }
-
- return 0;
-}
-
/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
@@ -1410,15 +916,6 @@ xfs_fs_unfreeze(
return 0;
}
-STATIC int
-xfs_fs_show_options(
- struct seq_file *m,
- struct dentry *root)
-{
- xfs_showargs(XFS_M(root->d_sb), m);
- return 0;
-}
-
/*
* This function fills in xfs_mount_t fields based on mount args.
* Note: the superblock _has_ now been read in.
@@ -1541,60 +1038,337 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_delalloc_blks);
}
-static struct xfs_mount *
-xfs_mount_alloc(
+static void
+xfs_fs_put_super(
struct super_block *sb)
{
- struct xfs_mount *mp;
+ struct xfs_mount *mp = XFS_M(sb);
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
- if (!mp)
- return NULL;
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
- mp->m_super = sb;
- spin_lock_init(&mp->m_sb_lock);
- spin_lock_init(&mp->m_agirotor_lock);
- INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
- spin_lock_init(&mp->m_perag_lock);
- mutex_init(&mp->m_growlock);
- atomic_set(&mp->m_active_trans, 0);
- INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
- INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
- INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
- mp->m_kobj.kobject.kset = xfs_kset;
- /*
- * We don't create the finobt per-ag space reservation until after log
- * recovery, so we must set this to true so that an ifree transaction
- * started during log recovery will not depend on space reservations
- * for finobt expansion.
- */
- mp->m_finobt_nores = true;
- return mp;
+ xfs_notice(mp, "Unmounting Filesystem");
+ xfs_filestream_unmount(mp);
+ xfs_unmountfs(mp);
+
+ xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
+ xfs_destroy_percpu_counters(mp);
+ xfs_destroy_mount_workqueues(mp);
+ xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
+ xfs_mount_free(mp);
}
+static long
+xfs_fs_nr_cached_objects(
+ struct super_block *sb,
+ struct shrink_control *sc)
+{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
+ return xfs_reclaim_inodes_count(XFS_M(sb));
+}
-STATIC int
-xfs_fs_fill_super(
+static long
+xfs_fs_free_cached_objects(
struct super_block *sb,
- void *data,
- int silent)
+ struct shrink_control *sc)
{
- struct inode *root;
- struct xfs_mount *mp = NULL;
- int flags = 0, error = -ENOMEM;
+ return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+}
+static const struct super_operations xfs_super_operations = {
+ .alloc_inode = xfs_fs_alloc_inode,
+ .destroy_inode = xfs_fs_destroy_inode,
+ .dirty_inode = xfs_fs_dirty_inode,
+ .drop_inode = xfs_fs_drop_inode,
+ .put_super = xfs_fs_put_super,
+ .sync_fs = xfs_fs_sync_fs,
+ .freeze_fs = xfs_fs_freeze,
+ .unfreeze_fs = xfs_fs_unfreeze,
+ .statfs = xfs_fs_statfs,
+ .show_options = xfs_fs_show_options,
+ .nr_cached_objects = xfs_fs_nr_cached_objects,
+ .free_cached_objects = xfs_fs_free_cached_objects,
+};
+
+static int
+suffix_kstrtoint(
+ const char *s,
+ unsigned int base,
+ int *res)
+{
+ int last, shift_left_factor = 0, _res;
+ char *value;
+ int ret = 0;
+
+ value = kstrdup(s, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ last = strlen(value) - 1;
+ if (value[last] == 'K' || value[last] == 'k') {
+ shift_left_factor = 10;
+ value[last] = '\0';
+ }
+ if (value[last] == 'M' || value[last] == 'm') {
+ shift_left_factor = 20;
+ value[last] = '\0';
+ }
+ if (value[last] == 'G' || value[last] == 'g') {
+ shift_left_factor = 30;
+ value[last] = '\0';
+ }
+
+ if (kstrtoint(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
+ *res = _res << shift_left_factor;
+ return ret;
+}
+
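
suffix_kstrtoint() keeps the behaviour of the old option parser: strip a trailing K/M/G suffix, convert the remaining digits, then shift left by 10, 20 or 30 bits. A small userspace rewrite showing the resulting values for typical option strings (illustrative only, not the kernel helper):

/* Illustrative only: unit-suffix parsing as used for logbsize/allocsize. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_size(const char *s, int *res)
{
	char buf[32];
	int shift = 0;
	size_t last;

	if (!s || !*s || strlen(s) >= sizeof(buf))
		return -1;
	strcpy(buf, s);
	last = strlen(buf) - 1;

	if (buf[last] == 'K' || buf[last] == 'k') {
		shift = 10;
		buf[last] = '\0';
	} else if (buf[last] == 'M' || buf[last] == 'm') {
		shift = 20;
		buf[last] = '\0';
	} else if (buf[last] == 'G' || buf[last] == 'g') {
		shift = 30;
		buf[last] = '\0';
	}

	*res = (int)strtol(buf, NULL, 10) << shift;
	return 0;
}

int main(void)
{
	int v;

	parse_size("32k", &v);
	printf("logbsize=32k  -> %d bytes\n", v);	/* 32768 */
	parse_size("64k", &v);
	printf("allocsize=64k -> %d bytes\n", v);	/* 65536 */
	return 0;
}
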
+/*
+ * Set mount state from a mount option.
+ *
+ * NOTE: mp->m_super is NULL here!
+ */
+static int
+xfs_fc_parse_param(
+ struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct xfs_mount *mp = fc->s_fs_info;
+ struct fs_parse_result result;
+ int size = 0;
+ int opt;
+
+ opt = fs_parse(fc, &xfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_logbufs:
+ mp->m_logbufs = result.uint_32;
+ return 0;
+ case Opt_logbsize:
+ if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize))
+ return -EINVAL;
+ return 0;
+ case Opt_logdev:
+ kfree(mp->m_logname);
+ mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+ if (!mp->m_logname)
+ return -ENOMEM;
+ return 0;
+ case Opt_rtdev:
+ kfree(mp->m_rtname);
+ mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+ if (!mp->m_rtname)
+ return -ENOMEM;
+ return 0;
+ case Opt_allocsize:
+ if (suffix_kstrtoint(param->string, 10, &size))
+ return -EINVAL;
+ mp->m_allocsize_log = ffs(size) - 1;
+ mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
+ return 0;
+ case Opt_grpid:
+ case Opt_bsdgroups:
+ mp->m_flags |= XFS_MOUNT_GRPID;
+ return 0;
+ case Opt_nogrpid:
+ case Opt_sysvgroups:
+ mp->m_flags &= ~XFS_MOUNT_GRPID;
+ return 0;
+ case Opt_wsync:
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+ return 0;
+ case Opt_norecovery:
+ mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ return 0;
+ case Opt_noalign:
+ mp->m_flags |= XFS_MOUNT_NOALIGN;
+ return 0;
+ case Opt_swalloc:
+ mp->m_flags |= XFS_MOUNT_SWALLOC;
+ return 0;
+ case Opt_sunit:
+ mp->m_dalign = result.uint_32;
+ return 0;
+ case Opt_swidth:
+ mp->m_swidth = result.uint_32;
+ return 0;
+ case Opt_inode32:
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ return 0;
+ case Opt_inode64:
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ return 0;
+ case Opt_nouuid:
+ mp->m_flags |= XFS_MOUNT_NOUUID;
+ return 0;
+ case Opt_ikeep:
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_noikeep:
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ return 0;
+ case Opt_largeio:
+ mp->m_flags |= XFS_MOUNT_LARGEIO;
+ return 0;
+ case Opt_nolargeio:
+ mp->m_flags &= ~XFS_MOUNT_LARGEIO;
+ return 0;
+ case Opt_attr2:
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ return 0;
+ case Opt_noattr2:
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+ return 0;
+ case Opt_filestreams:
+ mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ return 0;
+ case Opt_noquota:
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
+ return 0;
+ case Opt_quota:
+ case Opt_uquota:
+ case Opt_usrquota:
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD);
+ return 0;
+ case Opt_qnoenforce:
+ case Opt_uqnoenforce:
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ return 0;
+ case Opt_pquota:
+ case Opt_prjquota:
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_PQUOTA_ENFD);
+ return 0;
+ case Opt_pqnoenforce:
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ return 0;
+ case Opt_gquota:
+ case Opt_grpquota:
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_GQUOTA_ENFD);
+ return 0;
+ case Opt_gqnoenforce:
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ return 0;
+ case Opt_discard:
+ mp->m_flags |= XFS_MOUNT_DISCARD;
+ return 0;
+ case Opt_nodiscard:
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ return 0;
+#ifdef CONFIG_FS_DAX
+ case Opt_dax:
+ mp->m_flags |= XFS_MOUNT_DAX;
+ return 0;
+#endif
+ default:
+ xfs_warn(mp, "unknown mount option [%s].", param->key);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
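In the Opt_allocsize case above, the byte count returned by suffix_kstrtoint() is reduced to a log2 value with ffs(); for instance allocsize=64k yields 65536 bytes and an m_allocsize_log of 16. A small user-space sketch of that arithmetic (illustrative, not part of the patch):

#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	int size = 65536;			/* allocsize=64k after suffix parsing */
	int allocsize_log = ffs(size) - 1;	/* lowest set bit is bit 17, so log2 = 16 */

	printf("allocsize_log = %d\n", allocsize_log);
	return 0;
}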
+static int
+xfs_fc_validate_params(
+ struct xfs_mount *mp)
+{
/*
- * allocate mp and do all low-level struct initializations before we
- * attach it to the super
+ * no recovery flag requires a read-only mount
*/
- mp = xfs_mount_alloc(sb);
- if (!mp)
- goto out;
- sb->s_fs_info = mp;
+ if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+ !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_warn(mp, "no-recovery mounts must be read-only.");
+ return -EINVAL;
+ }
+
+ if ((mp->m_flags & XFS_MOUNT_NOALIGN) &&
+ (mp->m_dalign || mp->m_swidth)) {
+ xfs_warn(mp,
+ "sunit and swidth options incompatible with the noalign option");
+ return -EINVAL;
+ }
+
+ if (!IS_ENABLED(CONFIG_XFS_QUOTA) && mp->m_qflags != 0) {
+ xfs_warn(mp, "quota support not available in this kernel.");
+ return -EINVAL;
+ }
+
+ if ((mp->m_dalign && !mp->m_swidth) ||
+ (!mp->m_dalign && mp->m_swidth)) {
+ xfs_warn(mp, "sunit and swidth must be specified together");
+ return -EINVAL;
+ }
+
+ if (mp->m_dalign && (mp->m_swidth % mp->m_dalign != 0)) {
+ xfs_warn(mp,
+ "stripe width (%d) must be a multiple of the stripe unit (%d)",
+ mp->m_swidth, mp->m_dalign);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbufs != -1 &&
+ mp->m_logbufs != 0 &&
+ (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+ mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+ xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+ mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return -EINVAL;
+ }
+
+ if (mp->m_logbsize != -1 &&
+ mp->m_logbsize != 0 &&
+ (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+ mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(mp->m_logbsize))) {
+ xfs_warn(mp,
+ "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ mp->m_logbsize);
+ return -EINVAL;
+ }
+
+ if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+ (mp->m_allocsize_log > XFS_MAX_IO_LOG ||
+ mp->m_allocsize_log < XFS_MIN_IO_LOG)) {
+ xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+ mp->m_allocsize_log, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG);
+ return -EINVAL;
+ }
- error = xfs_parseargs(mp, (char *)data);
+ return 0;
+}
+
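The stripe-geometry checks in xfs_fc_validate_params() require sunit and swidth to be specified together and swidth to be a whole multiple of sunit. A compact user-space illustration of that rule (hypothetical values, not part of the patch):

#include <assert.h>

/* Mirrors the sunit/swidth consistency rules enforced above:
 * both or neither, and swidth % sunit == 0.
 */
static int stripe_geometry_valid(int dalign, int swidth)
{
	if ((dalign && !swidth) || (!dalign && swidth))
		return 0;		/* must be specified together */
	if (dalign && (swidth % dalign != 0))
		return 0;		/* swidth must be a multiple of sunit */
	return 1;
}

int main(void)
{
	assert(stripe_geometry_valid(64, 256));		/* ok: 256 is 4 * 64 */
	assert(!stripe_geometry_valid(64, 100));	/* 100 is not a multiple of 64 */
	assert(!stripe_geometry_valid(0, 128));		/* swidth without sunit */
	return 0;
}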
+static int
+xfs_fc_fill_super(
+ struct super_block *sb,
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp = sb->s_fs_info;
+ struct inode *root;
+ int flags = 0, error;
+
+ mp->m_super = sb;
+
+ error = xfs_fc_validate_params(mp);
if (error)
- goto out_free_fsname;
+ goto out_free_names;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1616,12 +1390,12 @@ xfs_fs_fill_super(
msleep(xfs_globals.mount_delay * 1000);
}
- if (silent)
+ if (fc->sb_flags & SB_SILENT)
flags |= XFS_MFSI_QUIET;
error = xfs_open_devices(mp);
if (error)
- goto out_free_fsname;
+ goto out_free_names;
error = xfs_init_mount_workqueues(mp);
if (error)
@@ -1758,11 +1532,9 @@ xfs_fs_fill_super(
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
- out_free_fsname:
+ out_free_names:
sb->s_fs_info = NULL;
- xfs_free_fsname(mp);
- kfree(mp);
- out:
+ xfs_mount_free(mp);
return error;
out_unmount:
@@ -1771,80 +1543,252 @@ xfs_fs_fill_super(
goto out_free_sb;
}
-STATIC void
-xfs_fs_put_super(
- struct super_block *sb)
+static int
+xfs_fc_get_tree(
+ struct fs_context *fc)
{
- struct xfs_mount *mp = XFS_M(sb);
+ return get_tree_bdev(fc, xfs_fc_fill_super);
+}
- /* if ->fill_super failed, we have no mount to tear down */
- if (!sb->s_fs_info)
- return;
+static int
+xfs_remount_rw(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ int error;
- xfs_notice(mp, "Unmounting Filesystem");
- xfs_filestream_unmount(mp);
- xfs_unmountfs(mp);
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on norecovery mount");
+ return -EINVAL;
+ }
- xfs_freesb(mp);
- free_percpu(mp->m_stats.xs_stats);
- xfs_destroy_percpu_counters(mp);
- xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem",
+ (sbp->sb_features_ro_compat &
+ XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
+ return -EINVAL;
+ }
- sb->s_fs_info = NULL;
- xfs_free_fsname(mp);
- kfree(mp);
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+ /*
+ * If this is the first remount to writeable state we might have some
+ * superblock changes to update.
+ */
+ if (mp->m_update_sb) {
+ error = xfs_sync_sb(mp, false);
+ if (error) {
+ xfs_warn(mp, "failed to write sb changes");
+ return error;
+ }
+ mp->m_update_sb = false;
+ }
+
+ /*
+ * Fill out the reserve pool if it is empty. Use the stashed value if
+ * it is non-zero, otherwise go with the default.
+ */
+ xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
+
+ /* Recover any CoW blocks that never got remapped. */
+ error = xfs_reflink_recover_cow(mp);
+ if (error) {
+ xfs_err(mp,
+ "Error %d recovering leftover CoW allocations.", error);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+ xfs_start_block_reaping(mp);
+
+ /* Create the per-AG metadata reservation pool. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+ return error;
+
+ return 0;
}
-STATIC struct dentry *
-xfs_fs_mount(
- struct file_system_type *fs_type,
- int flags,
- const char *dev_name,
- void *data)
+static int
+xfs_remount_ro(
+ struct xfs_mount *mp)
{
- return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+ int error;
+
+ /*
+ * Cancel background eofb scanning so it cannot race with the final
+ * log force+buftarg wait and deadlock the remount.
+ */
+ xfs_stop_block_reaping(mp);
+
+ /* Get rid of any leftover CoW reservations... */
+ error = xfs_icache_free_cowblocks(mp, NULL);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /* Free the per-AG metadata reservation pool. */
+ error = xfs_fs_unreserve_ag_blocks(mp);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return error;
+ }
+
+ /*
+ * Before we sync the metadata, we need to free up the reserve block
+ * pool so that the used block count in the superblock on disk is
+ * correct at the end of the remount. Stash the current reserve pool
+ * size so that if we get remounted rw, we can return it to the same
+ * size.
+ */
+ xfs_save_resvblks(mp);
+
+ xfs_quiesce_attr(mp);
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+
+ return 0;
}
-static long
-xfs_fs_nr_cached_objects(
- struct super_block *sb,
- struct shrink_control *sc)
+/*
+ * Logically we would return an error here to prevent users from believing
+ * they might have changed mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from mtab and fstab to the mount
+ * arguments in some cases so we can't blindly reject options, but have to
+ * check for each specified option if it actually differs from the currently
+ * set option and only reject it if that's the case.
+ *
+ * Until that is implemented we return success for every remount request, and
+ * silently ignore all options that we can't actually change.
+ */
+static int
+xfs_fc_reconfigure(
+ struct fs_context *fc)
{
- /* Paranoia: catch incorrect calls during mount setup or teardown */
- if (WARN_ON_ONCE(!sb->s_fs_info))
- return 0;
- return xfs_reclaim_inodes_count(XFS_M(sb));
+ struct xfs_mount *mp = XFS_M(fc->root->d_sb);
+ struct xfs_mount *new_mp = fc->s_fs_info;
+ xfs_sb_t *sbp = &mp->m_sb;
+ int flags = fc->sb_flags;
+ int error;
+
+ error = xfs_fc_validate_params(new_mp);
+ if (error)
+ return error;
+
+ sync_filesystem(mp->m_super);
+
+ /* inode32 -> inode64 */
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
+ !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ }
+
+ /* inode64 -> inode32 */
+ if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
+ (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
+ }
+
+ /* ro -> rw */
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) {
+ error = xfs_remount_rw(mp);
+ if (error)
+ return error;
+ }
+
+ /* rw -> ro */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) {
+ error = xfs_remount_ro(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
-static long
-xfs_fs_free_cached_objects(
- struct super_block *sb,
- struct shrink_control *sc)
+static void xfs_fc_free(
+ struct fs_context *fc)
{
- return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
+ struct xfs_mount *mp = fc->s_fs_info;
+
+ /*
+ * mp is stored in the fs_context when it is initialized.
+ * mp is transferred to the superblock on a successful mount,
+ * but if an error occurs before the transfer we have to free
+ * it here.
+ */
+ if (mp)
+ xfs_mount_free(mp);
}
-static const struct super_operations xfs_super_operations = {
- .alloc_inode = xfs_fs_alloc_inode,
- .destroy_inode = xfs_fs_destroy_inode,
- .dirty_inode = xfs_fs_dirty_inode,
- .drop_inode = xfs_fs_drop_inode,
- .put_super = xfs_fs_put_super,
- .sync_fs = xfs_fs_sync_fs,
- .freeze_fs = xfs_fs_freeze,
- .unfreeze_fs = xfs_fs_unfreeze,
- .statfs = xfs_fs_statfs,
- .remount_fs = xfs_fs_remount,
- .show_options = xfs_fs_show_options,
- .nr_cached_objects = xfs_fs_nr_cached_objects,
- .free_cached_objects = xfs_fs_free_cached_objects,
+static const struct fs_context_operations xfs_context_ops = {
+ .parse_param = xfs_fc_parse_param,
+ .get_tree = xfs_fc_get_tree,
+ .reconfigure = xfs_fc_reconfigure,
+ .free = xfs_fc_free,
};
+static int xfs_init_fs_context(
+ struct fs_context *fc)
+{
+ struct xfs_mount *mp;
+
+ mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+ if (!mp)
+ return -ENOMEM;
+
+ spin_lock_init(&mp->m_sb_lock);
+ spin_lock_init(&mp->m_agirotor_lock);
+ INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
+ spin_lock_init(&mp->m_perag_lock);
+ mutex_init(&mp->m_growlock);
+ atomic_set(&mp->m_active_trans, 0);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+ INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
+ mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
+
+ /*
+ * These can be overridden by the mount option parsing.
+ */
+ mp->m_logbufs = -1;
+ mp->m_logbsize = -1;
+ mp->m_allocsize_log = 16; /* 64k */
+
+ /*
+ * Copy binary VFS mount flags we are interested in.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+
+ fc->s_fs_info = mp;
+ fc->ops = &xfs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
- .mount = xfs_fs_mount,
+ .init_fs_context = xfs_init_fs_context,
+ .parameters = &xfs_fs_parameters,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
@@ -1853,37 +1797,39 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
- if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct xfs_ioend, io_inline_bio),
- BIOSET_NEED_BVECS))
- goto out;
-
- xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
- "xfs_log_ticket");
+ xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket",
+ sizeof(struct xlog_ticket),
+ 0, 0, NULL);
if (!xfs_log_ticket_zone)
- goto out_free_ioend_bioset;
+ goto out;
- xfs_bmap_free_item_zone = kmem_zone_init(
- sizeof(struct xfs_extent_free_item),
- "xfs_bmap_free_item");
+ xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item",
+ sizeof(struct xfs_extent_free_item),
+ 0, 0, NULL);
if (!xfs_bmap_free_item_zone)
goto out_destroy_log_ticket_zone;
- xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
- "xfs_btree_cur");
+ xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur",
+ sizeof(struct xfs_btree_cur),
+ 0, 0, NULL);
if (!xfs_btree_cur_zone)
goto out_destroy_bmap_free_item_zone;
- xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
- "xfs_da_state");
+ xfs_da_state_zone = kmem_cache_create("xfs_da_state",
+ sizeof(struct xfs_da_state),
+ 0, 0, NULL);
if (!xfs_da_state_zone)
goto out_destroy_btree_cur_zone;
- xfs_ifork_zone = kmem_zone_init(sizeof(struct xfs_ifork), "xfs_ifork");
+ xfs_ifork_zone = kmem_cache_create("xfs_ifork",
+ sizeof(struct xfs_ifork),
+ 0, 0, NULL);
if (!xfs_ifork_zone)
goto out_destroy_da_state_zone;
- xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+ xfs_trans_zone = kmem_cache_create("xfs_trans",
+ sizeof(struct xfs_trans),
+ 0, 0, NULL);
if (!xfs_trans_zone)
goto out_destroy_ifork_zone;
@@ -1893,111 +1839,121 @@ xfs_init_zones(void)
* size possible under XFS. This wastes a little bit of memory,
* but it is much faster.
*/
- xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
- "xfs_buf_item");
+ xfs_buf_item_zone = kmem_cache_create("xfs_buf_item",
+ sizeof(struct xfs_buf_log_item),
+ 0, 0, NULL);
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
- xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
- ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))), "xfs_efd_item");
+ xfs_efd_zone = kmem_cache_create("xfs_efd_item",
+ (sizeof(struct xfs_efd_log_item) +
+ (XFS_EFD_MAX_FAST_EXTENTS - 1) *
+ sizeof(struct xfs_extent)),
+ 0, 0, NULL);
if (!xfs_efd_zone)
goto out_destroy_buf_item_zone;
- xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
- ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))), "xfs_efi_item");
+ xfs_efi_zone = kmem_cache_create("xfs_efi_item",
+ (sizeof(struct xfs_efi_log_item) +
+ (XFS_EFI_MAX_FAST_EXTENTS - 1) *
+ sizeof(struct xfs_extent)),
+ 0, 0, NULL);
if (!xfs_efi_zone)
goto out_destroy_efd_zone;
- xfs_inode_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
- KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
+ xfs_inode_zone = kmem_cache_create("xfs_inode",
+ sizeof(struct xfs_inode), 0,
+ (SLAB_HWCACHE_ALIGN |
+ SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
- xfs_ili_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
- KM_ZONE_SPREAD, NULL);
+ xfs_ili_zone = kmem_cache_create("xfs_ili",
+ sizeof(struct xfs_inode_log_item), 0,
+ SLAB_MEM_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
- xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
- "xfs_icr");
+
+ xfs_icreate_zone = kmem_cache_create("xfs_icr",
+ sizeof(struct xfs_icreate_item),
+ 0, 0, NULL);
if (!xfs_icreate_zone)
goto out_destroy_ili_zone;
- xfs_rud_zone = kmem_zone_init(sizeof(struct xfs_rud_log_item),
- "xfs_rud_item");
+ xfs_rud_zone = kmem_cache_create("xfs_rud_item",
+ sizeof(struct xfs_rud_log_item),
+ 0, 0, NULL);
if (!xfs_rud_zone)
goto out_destroy_icreate_zone;
- xfs_rui_zone = kmem_zone_init(
+ xfs_rui_zone = kmem_cache_create("xfs_rui_item",
xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS),
- "xfs_rui_item");
+ 0, 0, NULL);
if (!xfs_rui_zone)
goto out_destroy_rud_zone;
- xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item),
- "xfs_cud_item");
+ xfs_cud_zone = kmem_cache_create("xfs_cud_item",
+ sizeof(struct xfs_cud_log_item),
+ 0, 0, NULL);
if (!xfs_cud_zone)
goto out_destroy_rui_zone;
- xfs_cui_zone = kmem_zone_init(
+ xfs_cui_zone = kmem_cache_create("xfs_cui_item",
xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
- "xfs_cui_item");
+ 0, 0, NULL);
if (!xfs_cui_zone)
goto out_destroy_cud_zone;
- xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item),
- "xfs_bud_item");
+ xfs_bud_zone = kmem_cache_create("xfs_bud_item",
+ sizeof(struct xfs_bud_log_item),
+ 0, 0, NULL);
if (!xfs_bud_zone)
goto out_destroy_cui_zone;
- xfs_bui_zone = kmem_zone_init(
+ xfs_bui_zone = kmem_cache_create("xfs_bui_item",
xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
- "xfs_bui_item");
+ 0, 0, NULL);
if (!xfs_bui_zone)
goto out_destroy_bud_zone;
return 0;
out_destroy_bud_zone:
- kmem_zone_destroy(xfs_bud_zone);
+ kmem_cache_destroy(xfs_bud_zone);
out_destroy_cui_zone:
- kmem_zone_destroy(xfs_cui_zone);
+ kmem_cache_destroy(xfs_cui_zone);
out_destroy_cud_zone:
- kmem_zone_destroy(xfs_cud_zone);
+ kmem_cache_destroy(xfs_cud_zone);
out_destroy_rui_zone:
- kmem_zone_destroy(xfs_rui_zone);
+ kmem_cache_destroy(xfs_rui_zone);
out_destroy_rud_zone:
- kmem_zone_destroy(xfs_rud_zone);
+ kmem_cache_destroy(xfs_rud_zone);
out_destroy_icreate_zone:
- kmem_zone_destroy(xfs_icreate_zone);
+ kmem_cache_destroy(xfs_icreate_zone);
out_destroy_ili_zone:
- kmem_zone_destroy(xfs_ili_zone);
+ kmem_cache_destroy(xfs_ili_zone);
out_destroy_inode_zone:
- kmem_zone_destroy(xfs_inode_zone);
+ kmem_cache_destroy(xfs_inode_zone);
out_destroy_efi_zone:
- kmem_zone_destroy(xfs_efi_zone);
+ kmem_cache_destroy(xfs_efi_zone);
out_destroy_efd_zone:
- kmem_zone_destroy(xfs_efd_zone);
+ kmem_cache_destroy(xfs_efd_zone);
out_destroy_buf_item_zone:
- kmem_zone_destroy(xfs_buf_item_zone);
+ kmem_cache_destroy(xfs_buf_item_zone);
out_destroy_trans_zone:
- kmem_zone_destroy(xfs_trans_zone);
+ kmem_cache_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
- kmem_zone_destroy(xfs_ifork_zone);
+ kmem_cache_destroy(xfs_ifork_zone);
out_destroy_da_state_zone:
- kmem_zone_destroy(xfs_da_state_zone);
+ kmem_cache_destroy(xfs_da_state_zone);
out_destroy_btree_cur_zone:
- kmem_zone_destroy(xfs_btree_cur_zone);
+ kmem_cache_destroy(xfs_btree_cur_zone);
out_destroy_bmap_free_item_zone:
- kmem_zone_destroy(xfs_bmap_free_item_zone);
+ kmem_cache_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
- kmem_zone_destroy(xfs_log_ticket_zone);
- out_free_ioend_bioset:
- bioset_exit(&xfs_ioend_bioset);
+ kmem_cache_destroy(xfs_log_ticket_zone);
out:
return -ENOMEM;
}
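The kmem_zone_init()/kmem_zone_destroy() calls replaced throughout this hunk were XFS-local wrappers; the conversion uses the generic slab allocator API directly, pairing kmem_cache_create() with kmem_cache_destroy(). A minimal sketch of that pairing in an assumed standalone module (illustrative only; the demo_item structure and names are hypothetical):

#include <linux/module.h>
#include <linux/slab.h>

struct demo_item {
	int value;
};

static struct kmem_cache *demo_zone;

static int __init demo_init(void)
{
	/* Same call shape the patch uses: name, object size, align, flags, ctor. */
	demo_zone = kmem_cache_create("demo_item", sizeof(struct demo_item),
				      0, 0, NULL);
	if (!demo_zone)
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	/* Every cache created on the init path must be destroyed on teardown. */
	kmem_cache_destroy(demo_zone);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
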
@@ -2010,25 +1966,24 @@ xfs_destroy_zones(void)
* destroy caches.
*/
rcu_barrier();
- kmem_zone_destroy(xfs_bui_zone);
- kmem_zone_destroy(xfs_bud_zone);
- kmem_zone_destroy(xfs_cui_zone);
- kmem_zone_destroy(xfs_cud_zone);
- kmem_zone_destroy(xfs_rui_zone);
- kmem_zone_destroy(xfs_rud_zone);
- kmem_zone_destroy(xfs_icreate_zone);
- kmem_zone_destroy(xfs_ili_zone);
- kmem_zone_destroy(xfs_inode_zone);
- kmem_zone_destroy(xfs_efi_zone);
- kmem_zone_destroy(xfs_efd_zone);
- kmem_zone_destroy(xfs_buf_item_zone);
- kmem_zone_destroy(xfs_trans_zone);
- kmem_zone_destroy(xfs_ifork_zone);
- kmem_zone_destroy(xfs_da_state_zone);
- kmem_zone_destroy(xfs_btree_cur_zone);
- kmem_zone_destroy(xfs_bmap_free_item_zone);
- kmem_zone_destroy(xfs_log_ticket_zone);
- bioset_exit(&xfs_ioend_bioset);
+ kmem_cache_destroy(xfs_bui_zone);
+ kmem_cache_destroy(xfs_bud_zone);
+ kmem_cache_destroy(xfs_cui_zone);
+ kmem_cache_destroy(xfs_cud_zone);
+ kmem_cache_destroy(xfs_rui_zone);
+ kmem_cache_destroy(xfs_rud_zone);
+ kmem_cache_destroy(xfs_icreate_zone);
+ kmem_cache_destroy(xfs_ili_zone);
+ kmem_cache_destroy(xfs_inode_zone);
+ kmem_cache_destroy(xfs_efi_zone);
+ kmem_cache_destroy(xfs_efd_zone);
+ kmem_cache_destroy(xfs_buf_item_zone);
+ kmem_cache_destroy(xfs_trans_zone);
+ kmem_cache_destroy(xfs_ifork_zone);
+ kmem_cache_destroy(xfs_da_state_zone);
+ kmem_cache_destroy(xfs_btree_cur_zone);
+ kmem_cache_destroy(xfs_bmap_free_item_zone);
+ kmem_cache_destroy(xfs_log_ticket_zone);
}
STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 763e43d22dee..b552cf6d3379 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -11,9 +11,11 @@
#ifdef CONFIG_XFS_QUOTA
extern int xfs_qm_init(void);
extern void xfs_qm_exit(void);
+# define XFS_QUOTA_STRING "quota, "
#else
# define xfs_qm_init() (0)
# define xfs_qm_exit() do { } while (0)
+# define XFS_QUOTA_STRING
#endif
#ifdef CONFIG_XFS_POSIX_ACL
@@ -50,6 +52,12 @@ extern void xfs_qm_exit(void);
# define XFS_WARN_STRING
#endif
+#ifdef CONFIG_XFS_ASSERT_FATAL
+# define XFS_ASSERT_FATAL_STRING "fatal assert, "
+#else
+# define XFS_ASSERT_FATAL_STRING
+#endif
+
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -63,6 +71,8 @@ extern void xfs_qm_exit(void);
XFS_SCRUB_STRING \
XFS_REPAIR_STRING \
XFS_WARN_STRING \
+ XFS_QUOTA_STRING \
+ XFS_ASSERT_FATAL_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index ed66fd2de327..a25502bc2071 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -17,6 +17,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_quota.h"
+#include "xfs_symlink.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 9743d8c9394b..b1fa091427e6 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -5,7 +5,7 @@
#ifndef __XFS_SYMLINK_H
#define __XFS_SYMLINK_H 1
-/* Kernel only symlink defintions */
+/* Kernel only symlink definitions */
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275ed430..c13bb3655e48 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -725,7 +725,7 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
__entry->writeio_blocks = writeio_blocks;
),
TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d "
- "m_writeio_blocks %u",
+ "m_allocsize_blocks %u",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
__entry->blocks, __entry->shift, __entry->writeio_blocks)
)
@@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
-DECLARE_EVENT_CLASS(xfs_page_class,
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
- unsigned int len),
- TP_ARGS(inode, page, off, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(pgoff_t, pgoff)
- __field(loff_t, size)
- __field(unsigned long, offset)
- __field(unsigned int, length)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = XFS_I(inode)->i_ino;
- __entry->pgoff = page_offset(page);
- __entry->size = i_size_read(inode);
- __entry->offset = off;
- __entry->length = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
- "length %x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->pgoff,
- __entry->size,
- __entry->offset,
- __entry->length)
-)
-
-#define DEFINE_PAGE_EVENT(name) \
-DEFINE_EVENT(xfs_page_class, name, \
- TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
- unsigned int len), \
- TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_readpage_class,
- TP_PROTO(struct inode *inode, int nr_pages),
- TP_ARGS(inode, nr_pages),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(int, nr_pages)
- ),
- TP_fast_assign(
- __entry->dev = inode->i_sb->s_dev;
- __entry->ino = inode->i_ino;
- __entry->nr_pages = nr_pages;
- ),
- TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->nr_pages)
-)
-
-#define DEFINE_READPAGE_EVENT(name) \
-DEFINE_EVENT(xfs_readpage_class, name, \
- TP_PROTO(struct inode *inode, int nr_pages), \
- TP_ARGS(inode, nr_pages))
-DEFINE_READPAGE_EVENT(xfs_vm_readpage);
-DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
@@ -1642,8 +1577,11 @@ DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
+DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
@@ -1663,6 +1601,32 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+TRACE_EVENT(xfs_alloc_cur_check,
+ TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+ xfs_extlen_t len, xfs_extlen_t diff, bool new),
+ TP_ARGS(mp, btnum, bno, len, diff, new),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_btnum_t, btnum)
+ __field(xfs_agblock_t, bno)
+ __field(xfs_extlen_t, len)
+ __field(xfs_extlen_t, diff)
+ __field(bool, new)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->btnum = btnum;
+ __entry->bno = bno;
+ __entry->len = len;
+ __entry->diff = diff;
+ __entry->new = new;
+ ),
+ TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+ __entry->bno, __entry->len, __entry->diff, __entry->new)
+)
+
DECLARE_EVENT_CLASS(xfs_da_class,
TP_PROTO(struct xfs_da_args *args),
TP_ARGS(args),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index f4795fdb7389..3b208f9a865c 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -71,7 +71,7 @@ xfs_trans_free(
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
- kmem_zone_free(xfs_trans_zone, tp);
+ kmem_cache_free(xfs_trans_zone, tp);
}
/*
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 6ccfd75d3c24..00cc5b8734be 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -427,15 +427,15 @@ xfsaild_push(
case XFS_ITEM_FLUSHING:
/*
- * The item or its backing buffer is already beeing
+ * The item or its backing buffer is already being
* flushed. The typical reason for that is that an
* inode buffer is locked because we already pushed the
* updates to it as part of inode clustering.
*
* We do not want to stop flushing just because lots
- * of items are already beeing flushed, but we need to
+ * of items are already being flushed, but we need to
* re-try the flushing relatively soon if most of the
- * AIL is beeing flushed.
+ * AIL is being flushed.
*/
XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
@@ -612,7 +612,7 @@ xfsaild(
* The push is run asynchronously in a workqueue, which means the caller needs
* to handle waiting on the async flush for space to become available.
* We don't want to interrupt any push that is in progress, hence we only queue
- * work if we set the pushing bit approriately.
+ * work if we set the pushing bit appropriately.
*
* We do this unlocked - we only need to know whether there is anything in the
* AIL at the time we are called. We don't need to access the contents of
@@ -836,7 +836,7 @@ xfs_trans_ail_init(
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_fsname);
+ ailp->ail_mount->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 16457465833b..a6fe2d8dc40f 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -25,8 +25,8 @@ STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
*/
void
xfs_trans_dqjoin(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
@@ -49,8 +49,8 @@ xfs_trans_dqjoin(
*/
void
xfs_trans_log_dquot(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
@@ -486,12 +486,12 @@ xfs_trans_apply_dquot_deltas(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
int i, j;
- xfs_dquot_t *dqp;
+ struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
- bool locked;
+ bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -571,21 +571,21 @@ xfs_quota_warn(
*/
STATIC int
xfs_trans_dqresv(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- int64_t nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int64_t nblks,
+ long ninos,
+ uint flags)
{
- xfs_qcnt_t hardlimit;
- xfs_qcnt_t softlimit;
- time_t timer;
- xfs_qwarncnt_t warns;
- xfs_qwarncnt_t warnlimit;
- xfs_qcnt_t total_count;
- xfs_qcnt_t *resbcountp;
- xfs_quotainfo_t *q = mp->m_quotainfo;
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+ time_t timer;
+ xfs_qwarncnt_t warns;
+ xfs_qwarncnt_t warnlimit;
+ xfs_qcnt_t total_count;
+ xfs_qcnt_t *resbcountp;
+ struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_def_quota *defq;
@@ -824,13 +824,13 @@ xfs_trans_reserve_quota_nblks(
/*
* This routine is called to allocate a quotaoff log item.
*/
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
xfs_trans_get_qoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *startqoff,
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
uint flags)
{
- xfs_qoff_logitem_t *q;
+ struct xfs_qoff_logitem *q;
ASSERT(tp != NULL);
@@ -852,8 +852,8 @@ xfs_trans_get_qoff_item(
*/
void
xfs_trans_log_quotaoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *qlp)
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
@@ -872,6 +872,6 @@ xfs_trans_free_dqinfo(
{
if (!tp->t_dqinfo)
return;
- kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+ kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
tp->t_dqinfo = NULL;
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index cb895b1df5e4..383f0203d103 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -11,6 +11,7 @@
#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
+#include "xfs_acl.h"
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>