diff options
Diffstat (limited to 'fs')
156 files changed, 4144 insertions, 2036 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ab5547ff29a1..38d695d66a0b 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -37,7 +37,6 @@ #include <linux/mount.h> #include <linux/idr.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -231,10 +230,8 @@ v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; - lock_kernel(); v9ses = sb->s_fs_info; v9fs_session_cancel(v9ses); - unlock_kernel(); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index 525da2e8f73b..d78e950402c1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,13 @@ config FS_POSIX_ACL bool default n +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" + +endif # BLOCK + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -47,13 +54,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -source "fs/xfs/Kconfig" -source "fs/gfs2/Kconfig" -source "fs/ocfs2/Kconfig" -source "fs/btrfs/Kconfig" - -endif # BLOCK - source "fs/notify/Kconfig" source "fs/quota/Kconfig" @@ -134,7 +134,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ - (S390 && 64BIT) || BROKEN + (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index a6665f37f456..9cc18775b832 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -1,3 +1,6 @@ +#include <linux/fs.h> +#include <linux/adfs_fs.h> + /* Internal data structures for ADFS */ #define ADFS_FREE_FRAG 0 @@ -17,6 +20,58 @@ struct buffer_head; /* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + + uid_t s_uid; /* owner uid */ + gid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + +/* * Directory handling */ struct adfs_dir { diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 4d4073447d1a..23aa52f548a0 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -9,15 +9,7 @@ * * Common directory handling for ADFS */ -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/spinlock.h> #include <linux/smp_lock.h> -#include <linux/buffer_head.h> /* for file_fsync() */ - #include "adfs.h" /* diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 31df6adf0de6..bafc71222e25 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,15 +9,7 @@ * * E and F format directory handling */ -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/spinlock.h> #include <linux/buffer_head.h> -#include <linux/string.h> - #include "adfs.h" #include "dir_f.h" diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 139e0f345f18..1796bb352d05 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -7,15 +7,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/spinlock.h> #include <linux/buffer_head.h> -#include <linux/string.h> - #include "adfs.h" #include "dir_fplus.h" diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 8224d54a2afb..005ea34d1758 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -19,10 +19,6 @@ * * adfs regular file handling primitives */ -#include <linux/fs.h> -#include <linux/buffer_head.h> /* for file_fsync() */ -#include <linux/adfs_fs.h> - #include "adfs.h" const struct file_operations adfs_file_operations = { diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 05b3a677201d..798cb071d132 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -7,17 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/mm.h> #include <linux/smp_lock.h> -#include <linux/module.h> #include <linux/buffer_head.h> - #include "adfs.h" /* @@ -395,4 +386,3 @@ int adfs_write_inode(struct inode *inode, int wait) unlock_kernel(); return ret; } -MODULE_LICENSE("GPL"); diff --git a/fs/adfs/map.c b/fs/adfs/map.c index 568081b93f73..d1a5932bb0f1 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -7,14 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/spinlock.h> #include <linux/buffer_head.h> - #include <asm/unaligned.h> - #include "adfs.h" /* diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 0ec5aaf47aa7..aad92f0a1048 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -8,26 +8,12 @@ * published by the Free Software Foundation. */ #include <linux/module.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/adfs_fs.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/stat.h> -#include <linux/string.h> #include <linux/init.h> #include <linux/buffer_head.h> -#include <linux/vfs.h> #include <linux/parser.h> -#include <linux/bitops.h> #include <linux/mount.h> #include <linux/seq_file.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -#include <stdarg.h> - +#include <linux/statfs.h> #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" @@ -534,3 +520,4 @@ static void __exit exit_adfs_fs(void) module_init(init_adfs_fs) module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 2d33a5f7d218..0dd4dafee10b 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> +#include <rxrpc/packet.h> #include "internal.h" #include "afs_fs.h" @@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df24: return -ENOLCK; case 0x2f6df26: return -ENOTEMPTY; case 0x2f6df78: return -EDQUOT; + + case RXKADINCONSISTENCY: return -EPROTO; + case RXKADPACKETSHORT: return -EPROTO; + case RXKADLEVELFAIL: return -EKEYREJECTED; + case RXKADTICKETLEN: return -EKEYREJECTED; + case RXKADOUTOFSEQUENCE: return -EPROTO; + case RXKADNOAUTH: return -EKEYREJECTED; + case RXKADBADKEY: return -EKEYREJECTED; + case RXKADBADTICKET: return -EKEYREJECTED; + case RXKADUNKNOWNKEY: return -EKEYREJECTED; + case RXKADEXPIRED: return -EKEYEXPIRED; + case RXKADSEALEDINCON: return -EKEYREJECTED; + case RXKADDATALEN: return -EKEYREJECTED; + case RXKADILLEGALLEVEL: return -EKEYREJECTED; + default: return -EREMOTEIO; } } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index ec2a7431e458..6e689208def2 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, goto out; goto rotate; case -ENOMEDIUM: + case -EKEYREJECTED: + case -EKEYEXPIRED: goto out; default: ret = -EIO; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9367b6297d84..615d5496fe0f 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in, { struct nls_table *nls = BEFS_SB(sb)->nls; int i, o; - wchar_t uni; + unicode_t uni; int unilen, utflen; char *result; /* The utf8->nls conversion won't make the final nls string bigger @@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in, for (i = o = 0; i < in_len; i += utflen, o += unilen) { /* convert from UTF-8 to Unicode */ - utflen = utf8_mbtowc(&uni, &in[i], in_len - i); - if (utflen < 0) { + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) goto conv_err; - } /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; unilen = nls->uni2char(uni, &result[o], in_len - o); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } } result[o] = '\0'; *out_len = o; @@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in, /* convert from nls to unicode */ unilen = nls->char2uni(&in[i], in_len - i, &uni); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } /* convert from unicode to UTF-8 */ - utflen = utf8_wctomb(&result[o], uni, 3); - if (utflen <= 0) { + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) goto conv_err; - } } result[o] = '\0'; @@ -737,8 +735,6 @@ parse_options(char *options, befs_mount_options * opts) static void befs_put_super(struct super_block *sb) { - lock_kernel(); - kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; @@ -749,8 +745,6 @@ befs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* Allocate private field of the superblock, fill it. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 40381df34869..9fa212b014a5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); prstatus->pr_pid = task_pid_vnr(p); - prstatus->pr_ppid = task_pid_vnr(p->real_parent); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { @@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); psinfo->pr_pid = task_pid_vnr(p); - psinfo->pr_ppid = task_pid_vnr(p->real_parent); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index fdb66faa24f1..20fbeced472b 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; + rcu_read_lock(); + prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); prstatus->pr_pid = task_pid_vnr(p); - prstatus->pr_ppid = task_pid_vnr(p->real_parent); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { @@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; + rcu_read_lock(); + psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); psinfo->pr_pid = task_pid_vnr(p); - psinfo->pr_ppid = task_pid_vnr(p->real_parent); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); @@ -25,7 +25,6 @@ #include <linux/module.h> #include <linux/mempool.h> #include <linux/workqueue.h> -#include <linux/blktrace_api.h> #include <scsi/sg.h> /* for struct sg_iovec */ #include <trace/events/block.h> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0d50d49d990a..d28d29c95f7c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,8 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); + /* * end_io_wq structs are used to do processing in task context when an IO is * complete. This is used during reads to verify checksums, and it is used @@ -1342,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) free_extent_map(em); } +/* + * If this fails, caller must call bdi_destroy() to get rid of the + * bdi again. + */ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { - bdi_init(bdi); + int err; + + bdi->capabilities = BDI_CAP_MAP_COPY; + err = bdi_init(bdi); + if (err) + return err; + + err = bdi_register(bdi, NULL, "btrfs-%d", + atomic_inc_return(&btrfs_bdi_num)); + if (err) + return err; + bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->state = 0; - bdi->capabilities = default_backing_dev_info.capabilities; bdi->unplug_io_fn = btrfs_unplug_io_fn; bdi->unplug_io_data = info; bdi->congested_fn = btrfs_congested_fn; @@ -1569,7 +1584,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - setup_bdi(fs_info, &fs_info->bdi); + if (setup_bdi(fs_info, &fs_info->bdi)) + goto fail_bdi; fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; @@ -1946,8 +1962,8 @@ fail_iput: btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); +fail_bdi: bdi_destroy(&fs_info->bdi); - fail: kfree(extent_root); kfree(tree_root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2e177d7f4bb9..4e83457ea253 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -543,13 +543,13 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); - if (root->commit_root == root->node) - continue; - - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + if (root->commit_root != root->node) { + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_node(&root->root_item, + root->node); + } - btrfs_set_root_node(&root->root_item, root->node); err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 0aac371bff0b..c5ded5ff72b5 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -788,12 +788,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - if (copy_in_user(&sgio->status, &sgio32->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); if (err >= 0) { diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 33a90120f6ad..4d74fc72c195 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value @@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, mode_t mode, struct dentry *parent, u8 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u8); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value @@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, mode_t mode, struct dentry *parent, u16 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u16); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value @@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, mode_t mode, struct dentry *parent, u32 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u32); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value @@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, mode_t mode, struct dentry *parent, u64 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u64); } EXPORT_SYMBOL_GPL(debugfs_create_u64); DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); /* * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value @@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n" struct dentry *debugfs_create_x8(const char *name, mode_t mode, struct dentry *parent, u8 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x8); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, mode_t mode, struct dentry *parent, u16 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x16); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, mode_t mode, struct dentry *parent, u32 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x32); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -419,7 +482,7 @@ static const struct file_operations fops_blob = { }; /** - * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob + * debugfs_create_blob - create a debugfs file that is used to read a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0662ba6de85a..d22438ef7674 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry) } child = list_entry(parent->d_subdirs.next, struct dentry, d_u.d_child); + next_sibling: /* * If "child" isn't empty, walk down the tree and @@ -417,6 +418,16 @@ void debugfs_remove_recursive(struct dentry *dentry) __debugfs_remove(child, parent); if (parent->d_subdirs.next == &child->d_u.d_child) { /* + * Try the next sibling. + */ + if (child->d_u.d_child.next != &parent->d_subdirs) { + child = list_entry(child->d_u.d_child.next, + struct dentry, + d_u.d_child); + goto next_sibling; + } + + /* * Avoid infinite loop if we fail to remove * one dentry. */ diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b6a719a909f8..a2edb7913447 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb) continue; __iget(inode); spin_unlock(&inode_lock); - __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; spin_lock(&inode_lock); diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 49308a29798a..7ee6f7e3a608 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -5,12 +5,12 @@ */ #include <linux/buffer_head.h> -#include <linux/smp_lock.h> #include "efs.h" static int efs_readdir(struct file *, void *, filldir_t); const struct file_operations efs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = efs_readdir, }; @@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { if (inode->i_size & (EFS_DIRBSIZE-1)) printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); - lock_kernel(); - /* work out where this entry can be found */ block = filp->f_pos >> EFS_DIRBSIZE_BITS; @@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; out: - unlock_kernel(); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index c3fb5f9c4a44..1511bf9e5f80 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -8,7 +8,6 @@ #include <linux/buffer_head.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/exportfs.h> #include "efs.h" @@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei efs_ino_t inodenum; struct inode * inode = NULL; - lock_kernel(); inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); if (inodenum) { inode = efs_iget(dir->i_sb, inodenum); - if (IS_ERR(inode)) { - unlock_kernel(); + if (IS_ERR(inode)) return ERR_CAST(inode); - } } - unlock_kernel(); return d_splice_alias(inode, dentry); } @@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child) struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - lock_kernel(); ino = efs_find_entry(child->d_inode, "..", 2); if (ino) parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); - unlock_kernel(); return parent; } diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 41911ec83aaf..75117d0dac2b 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -9,7 +9,6 @@ #include <linux/string.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> -#include <linux/smp_lock.h> #include "efs.h" static int efs_symlink_readpage(struct file *file, struct page *page) @@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page) err = -ENAMETOOLONG; if (size > 2 * EFS_BLOCKSIZE) - goto fail_notlocked; + goto fail; - lock_kernel(); /* read first 512 bytes of link target */ err = -EIO; bh = sb_bread(inode->i_sb, efs_bmap(inode, 0)); @@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page) brelse(bh); } link[size] = '\0'; - unlock_kernel(); SetPageUptodate(page); kunmap(page); unlock_page(page); return 0; fail: - unlock_kernel(); -fail_notlocked: SetPageError(page); kunmap(page); unlock_page(page); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5458e80fc558..085c5c063420 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -98,7 +98,7 @@ struct epoll_filefd { struct nested_call_node { struct list_head llink; void *cookie; - int cpu; + void *ctx; }; /* @@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls) * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. + * @ctx: This instance context. * * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, int (*nproc)(void *, void *, int), void *priv, - void *cookie) + void *cookie, void *ctx) { int error, call_nests = 0; unsigned long flags; - int this_cpu = get_cpu(); struct list_head *lsthead = &ncalls->tasks_call_list; struct nested_call_node *tncur; struct nested_call_node tnode; @@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, * very much limited. */ list_for_each_entry(tncur, lsthead, llink) { - if (tncur->cpu == this_cpu && + if (tncur->ctx == ctx && (tncur->cookie == cookie || ++call_nests > max_nests)) { /* * Ops ... loop detected or maximum nest level reached. @@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, } /* Add the current task and cookie to the list */ - tnode.cpu = this_cpu; + tnode.ctx = ctx; tnode.cookie = cookie; list_add(&tnode.llink, lsthead); @@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, /* Remove the current task from the list */ spin_lock_irqsave(&ncalls->lock, flags); list_del(&tnode.llink); - out_unlock: +out_unlock: spin_unlock_irqrestore(&ncalls->lock, flags); - put_cpu(); return error; } @@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) */ static void ep_poll_safewake(wait_queue_head_t *wq) { + int this_cpu = get_cpu(); + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, - ep_poll_wakeup_proc, NULL, wq); + ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + + put_cpu(); } /* @@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) * could re-enter here. */ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, ep, ep); + ep_poll_readyevents_proc, ep, ep, current); return pollflags != -1 ? pollflags : 0; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 003500498c22..6cde970b0a1a 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child) /* Releases the page */ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, int update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, ext2_set_de_type(de, inode); err = ext2_commit_chunk(page, pos, len); ext2_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index f2e5811936d0..d988a718aedb 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -111,7 +111,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); -extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, int); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 90ea17998a73..6524ecaebb7a 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -320,7 +320,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (!new_de) goto out_dir; inode_inc_link_count(old_inode); - ext2_set_link(new_dir, new_de, new_page, old_inode); + ext2_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -352,7 +352,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, inode_dec_link_count(old_inode); if (dir_de) { - ext2_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index d81ef2fdb08e..e0c745451715 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT3_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT3_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b0248c6d5d4c..05dea8132fc0 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, while (count < maxblocks && count <= blocks_to_boundary) { ext3_fsblk_t blk; - if (!verify_chain(chain, partial)) { + if (!verify_chain(chain, chain + depth - 1)) { /* * Indirect block might be removed by * truncate while we were reading it. @@ -2374,7 +2374,7 @@ void ext3_truncate(struct inode *inode) struct page *page; if (!ext3_can_truncate(inode)) - return; + goto out_notrans; if (inode->i_size == 0 && ext3_should_writeback_data(inode)) ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE; @@ -2390,7 +2390,7 @@ void ext3_truncate(struct inode *inode) page = grab_cache_page(mapping, inode->i_size >> PAGE_CACHE_SHIFT); if (!page) - return; + goto out_notrans; } handle = start_transaction(inode); @@ -2401,7 +2401,7 @@ void ext3_truncate(struct inode *inode) unlock_page(page); page_cache_release(page); } - return; /* AKPM: return what? */ + goto out_notrans; } last_block = (inode->i_size + blocksize-1) @@ -2525,6 +2525,14 @@ out_stop: ext3_orphan_del(handle, inode); ext3_journal_stop(handle); + return; +out_notrans: + /* + * Delete the inode from orphan list so that it doesn't stay there + * forever and trigger assertion on umount. + */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); } static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, @@ -3122,12 +3130,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) rc = inode_setattr(inode, attr); - /* If inode_setattr's call to ext3_truncate failed to get a - * transaction handle at all, we need to clean up the in-core - * orphan list manually. */ - if (inode->i_nlink) - ext3_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) rc = ext3_acl_chmod(inode); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 8a0b26340b54..8359e7b3dc89 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -990,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) ext3_warning(sb, __func__, - "CONFIG_LBD not enabled\n"); + "CONFIG_LBDAF not enabled\n"); return -EINVAL; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 26aa64dee6aa..601e881e6105 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1812,7 +1812,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) printk(KERN_ERR "EXT3-fs: filesystem on %s:" " too large to mount safely\n", sb->s_id); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not " + printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not " "enabled\n"); goto failed_mount; } diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 8a34710ecf40..8867b2a1e5fe 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 647e0d65a284..605aeed96d68 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT4_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT4_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT4_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7d5edc38c9..17b9998680e3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -352,6 +352,7 @@ struct ext4_new_group_data { /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) /* * ioctl commands in 32 bit emulation @@ -447,6 +448,15 @@ struct ext4_inode { __le32 i_version_hi; /* high 32 bits for 64-bit version */ }; +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; +#define MAX_DEFRAG_SIZE ((1UL<<31) - 1) #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) @@ -674,7 +684,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ -#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -696,17 +705,10 @@ struct ext4_inode_info { #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ -/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ -#ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt #define set_opt(o, opt) o |= EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) -#else -#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD -#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT -#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS -#endif #define ext4_set_bit ext2_set_bit #define ext4_set_bit_atomic ext2_set_bit_atomic @@ -824,6 +826,13 @@ struct ext4_super_block { }; #ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + /* * fourth extended-fs super-block data in memory */ @@ -842,7 +851,8 @@ struct ext4_sb_info { struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ struct buffer_head **s_group_desc; - unsigned long s_mount_opt; + unsigned int s_mount_opt; + unsigned int s_mount_flags; ext4_fsblk_t s_sb_block; uid_t s_resuid; gid_t s_resgid; @@ -853,6 +863,7 @@ struct ext4_sb_info { int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; @@ -1305,7 +1316,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern struct inode *ext4_new_inode(handle_t *, struct inode *, int, + const struct qstr *qstr, __u32 goal); extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); @@ -1329,7 +1341,7 @@ extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, - unsigned long, unsigned long, int, unsigned long *); + ext4_fsblk_t, unsigned long, int, unsigned long *); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern void ext4_mb_update_group_info(struct ext4_group_info *grp, @@ -1647,6 +1659,11 @@ extern int ext4_get_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +/* move_extent.c */ +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); + /* * Add new method to test wether block and inode bitmaps are properly diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index f0c3ec85bd48..20a84105a10b 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) } extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2593f748c3a4..50322a09bd01 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -49,7 +49,7 @@ * ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ -static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) +ext4_fsblk_t ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; @@ -1417,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -static int +int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 588af8c77246..3f1873fef1c6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,6 +21,8 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> +#include <linux/mount.h> +#include <linux/path.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt->mnt_parent; + path.dentry = mnt->mnt_mountpoint; + path_get(&path); + cp = d_path(&path, buf, sizeof(buf)); + path_put(&path); + if (!IS_ERR(cp)) { + memcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + sb->s_dirt = 1; + } + } + return generic_file_open(inode, filp); +} + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, - .open = generic_file_open, + .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 5afe4370840b..83cf6415f599 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,10 +28,12 @@ #include <linux/writeback.h> #include <linux/jbd2.h> #include <linux/blkdev.h> -#include <linux/marker.h> + #include "ext4.h" #include "ext4_jbd2.h" +#include <trace/events/ext4.h> + /* * akpm: A new design for ext4_sync_file(). * @@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", - inode->i_sb->s_id, datasync, inode->i_ino, - dentry->d_parent->d_inode->i_ino); + trace_ext4_sync_file(file, dentry, datasync); /* * data=writeback: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3743bd849bce..2f645732e3b7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -23,11 +23,14 @@ #include <linux/bitops.h> #include <linux/blkdev.h> #include <asm/byteorder.h> + #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include <trace/events/ext4.h> + /* * ialloc.c contains the inodes allocation and deallocation routines */ @@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); - trace_mark(ext4_free_inode, - "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", - sb->s_id, inode->i_ino, inode->i_mode, - (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, - (unsigned long long) inode->i_blocks); + trace_ext4_free_inode(inode); /* * Note: we must free any quota before locking the superblock, @@ -471,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, int mode, + const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -486,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; ngroups = real_ngroups; if (flex_size > 1) { @@ -507,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, int best_ndir = inodes_per_group; int ret = -1; - get_random_bytes(&grp, sizeof(grp)); + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + get_random_bytes(&grp, sizeof(grp)); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -650,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode); + return find_group_orlov(sb, parent, group, mode, 0); } /* @@ -791,7 +798,8 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, + const struct qstr *qstr, __u32 goal) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -815,14 +823,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); - trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, - dir->i_ino, mode); + trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); sbi = EXT4_SB(sb); + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { @@ -841,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else - ret2 = find_group_orlov(sb, dir, &group, mode); + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); } else ret2 = find_group_other(sb, dir, &group, mode); @@ -851,7 +868,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < ngroups; i++) { + for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -863,8 +880,6 @@ got_group: if (!inode_bitmap_bh) goto fail; - ino = 0; - repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, @@ -1047,8 +1062,7 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); - trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", - sb->s_id, inode->i_ino, dir->i_ino, mode); + trace_ext4_allocate_inode(inode, dir, mode); goto really_out; fail: ext4_std_error(sb, err); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 875db944b22f..7c17ae275af4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,11 +37,14 @@ #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> + #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "ext4_extents.h" +#include <trace/events/ext4.h> + #define MPAGE_DA_EXTENT_TAIL 0x01 static inline int ext4_begin_ordered_truncate(struct inode *inode, @@ -78,7 +81,7 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * If the handle isn't valid we're not journaling so there's nothing to do. */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) + struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; @@ -90,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "enter"); jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %lx\n", + "data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); @@ -329,8 +332,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) */ static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) { int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); @@ -362,9 +365,9 @@ static int ext4_block_to_path(struct inode *inode, final = ptrs; } else { ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max in inode %lu", - i_block + direct_blocks + - indirect_blocks + double_blocks, inode->i_ino); + "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); @@ -379,25 +382,25 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, while (bref < p+max) { blk = le32_to_cpu(*bref++); - if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { ext4_error(inode->i_sb, function, "invalid block reference %u " "in inode #%lu", blk, inode->i_ino); - return -EIO; - } - } - return 0; + return -EIO; + } + } + return 0; } #define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ + __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /** @@ -447,7 +450,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, bh = sb_getblk(sb, le32_to_cpu(p->key)); if (unlikely(!bh)) goto failure; - + if (!bh_uptodate_or_lock(bh)) { if (bh_submit_read(bh) < 0) { put_bh(bh); @@ -459,7 +462,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, goto failure; } } - + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) @@ -552,7 +555,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) * returns it. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) + Indirect *partial) { /* * XXX need to get goal block from mballoc's data structures @@ -574,7 +577,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, * direct and indirect blocks. */ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, - int blocks_to_boundary) + int blocks_to_boundary) { unsigned int count = 0; @@ -610,9 +613,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { struct ext4_allocation_request ar; int target, i; @@ -683,10 +686,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, } if (!*err) { if (target == blks) { - /* - * save the new block number - * for the first direct block - */ + /* + * save the new block number + * for the first direct block + */ new_blocks[index] = current_block; } blk_allocated += ar.len; @@ -728,9 +731,9 @@ failed_out: * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -777,7 +780,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, * the chain to point to the new allocated * data blocks numbers */ - for (i=1; i < num; i++) + for (i = 1; i < num; i++) *(branch[n].p + i) = cpu_to_le32(++current_block); } BUFFER_TRACE(bh, "marking uptodate"); @@ -820,7 +823,8 @@ failed: * chain to new block and return 0. */ static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, int blks) + ext4_lblk_t block, Indirect *where, int num, + int blks) { int i; int err = 0; @@ -852,10 +856,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, } /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - /* had we spliced it onto indirect block? */ if (where->bh) { /* @@ -874,8 +874,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, } else { /* * OK, we spliced it into the inode itself on a direct block. - * Inode was dirtied above. */ + ext4_mark_inode_dirty(handle, inode); jbd_debug(5, "splicing direct\n"); } return err; @@ -921,9 +921,9 @@ err_out: * blocks. */ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned int maxblocks, - struct buffer_head *bh_result, - int flags) + ext4_lblk_t iblock, unsigned int maxblocks, + struct buffer_head *bh_result, + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -939,7 +939,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, - &blocks_to_boundary); + &blocks_to_boundary); if (depth == 0) goto out; @@ -987,8 +987,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * Block out ext4_truncate while we alter the tree */ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, - &count, goal, - offsets + (partial - chain), partial); + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -999,8 +999,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, */ if (!err) err = ext4_splice_branch(handle, inode, iblock, - partial, indirect_blks, count); - else + partial, indirect_blks, count); + else goto cleanup; set_buffer_new(bh_result); @@ -1172,7 +1172,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, + int ret = check_block_validity(inode, block, bh->b_blocknr, retval); if (ret != 0) return ret; @@ -1254,7 +1254,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { - int ret = check_block_validity(inode, block, + int ret = check_block_validity(inode, block, bh->b_blocknr, retval); if (ret != 0) return ret; @@ -1405,8 +1405,7 @@ static int walk_page_buffers(handle_t *handle, for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) - { + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { @@ -1447,7 +1446,7 @@ static int walk_page_buffers(handle_t *handle, * write. */ static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) + struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; @@ -1455,27 +1454,24 @@ static int do_journal_get_write_access(handle_t *handle, } static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; - pgoff_t index; + pgoff_t index; unsigned from, to; - trace_mark(ext4_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_write_begin(inode, pos, len, flags); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; - index = pos >> PAGE_CACHE_SHIFT; + index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1523,7 +1519,7 @@ retry: ext4_journal_stop(handle); if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the @@ -1550,9 +1546,9 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int i_size_changed = 0; struct inode *inode = mapping->host; @@ -1603,18 +1599,15 @@ static int ext4_generic_write_end(struct file *file, * buffers are managed internally. */ static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_ordered_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_ordered_write_end(inode, pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1636,7 +1629,7 @@ static int ext4_ordered_write_end(struct file *file, if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. @@ -1650,18 +1643,15 @@ static int ext4_ordered_write_end(struct file *file, } static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; - trace_mark(ext4_writeback_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_writeback_write_end(inode, pos, len, copied); ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; @@ -1681,7 +1671,7 @@ static int ext4_writeback_write_end(struct file *file, if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. @@ -1694,9 +1684,9 @@ static int ext4_writeback_write_end(struct file *file, } static int ext4_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; @@ -1705,10 +1695,7 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; - trace_mark(ext4_journalled_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1747,7 +1734,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); - /* + /* * If vmtruncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. @@ -1854,7 +1841,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned long offset) { int to_release = 0; struct buffer_head *head, *bh; @@ -2554,9 +2541,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_mark(ext4_da_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_da_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2667,19 +2652,7 @@ static int ext4_da_writepages(struct address_space *mapping, int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - trace_mark(ext4_da_writepages, - "dev %s ino %lu nr_t_write %ld " - "pages_skipped %ld range_start %llu " - "range_end %llu nonblocking %d " - "for_kupdate %d for_reclaim %d " - "for_writepages %d range_cyclic %d", - inode->i_sb->s_id, inode->i_ino, - wbc->nr_to_write, wbc->pages_skipped, - (unsigned long long) wbc->range_start, - (unsigned long long) wbc->range_end, - wbc->nonblocking, wbc->for_kupdate, - wbc->for_reclaim, wbc->for_writepages, - wbc->range_cyclic); + trace_ext4_da_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2693,13 +2666,13 @@ static int ext4_da_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_da_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; /* @@ -2845,14 +2818,7 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; - trace_mark(ext4_da_writepage_result, - "dev %s ino %lu ret %d pages_written %d " - "pages_skipped %ld congestion %d " - "more_io %d no_nrwrite_index_update %d", - inode->i_sb->s_id, inode->i_ino, ret, - pages_written, wbc->pages_skipped, - wbc->encountered_congestion, wbc->more_io, - wbc->no_nrwrite_index_update); + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; } @@ -2884,8 +2850,8 @@ static int ext4_nonda_switch(struct super_block *sb) } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { int ret, retries = 0; struct page *page; @@ -2904,11 +2870,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; - - trace_mark(ext4_da_write_begin, - "dev %s ino %lu pos %llu len %u flags %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, flags); + trace_ext4_da_write_begin(inode, pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2959,7 +2921,7 @@ out: * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct page *page, - unsigned long offset) + unsigned long offset) { struct buffer_head *bh; struct inode *inode = page->mapping->host; @@ -2978,9 +2940,9 @@ static int ext4_da_should_update_i_disksize(struct page *page, } static int ext4_da_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; int ret = 0, ret2; @@ -3001,10 +2963,7 @@ static int ext4_da_write_end(struct file *file, } } - trace_mark(ext4_da_write_end, - "dev %s ino %lu pos %llu len %u copied %u", - inode->i_sb->s_id, inode->i_ino, - (unsigned long long) pos, len, copied); + trace_ext4_da_write_end(inode, pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -3081,7 +3040,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: - * + * * ext4_da_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> @@ -3101,7 +3060,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. - * + * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. @@ -3237,7 +3196,7 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) * */ static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -3249,15 +3208,13 @@ static int __ext4_normal_writepage(struct page *page, } static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_normal_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_normal_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -3287,7 +3244,7 @@ static int ext4_normal_writepage(struct page *page, } static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -3337,15 +3294,13 @@ out: } static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; loff_t size = i_size_read(inode); loff_t len; - trace_mark(ext4_journalled_writepage, - "dev %s ino %lu page_index %lu", - inode->i_sb->s_id, inode->i_ino, page->index); + trace_ext4_journalled_writepage(inode, page); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -3442,8 +3397,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * VFS code falls back into buffered path in that case so we are safe. */ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -3763,7 +3718,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q) * (no partially truncated stuff there). */ static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) { Indirect *partial, *p; int k, err; @@ -3819,8 +3775,10 @@ no_top: * than `count' because there can be holes in there. */ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t block_to_free, - unsigned long count, __le32 *first, __le32 *last) + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) { __le32 *p; if (try_to_extend_transaction(handle, inode)) { @@ -3837,10 +3795,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } /* - * Any buffers which are on the journal will be in memory. We find - * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() - * on them. We've already detached each block from the file, so - * bforget() in jbd2_journal_forget() should be safe. + * Any buffers which are on the journal will be in memory. We + * find them on the hash table so jbd2_journal_revoke() will + * run jbd2_journal_forget() on them. We've already detached + * each block from the file, so bforget() in + * jbd2_journal_forget() should be safe. * * AKPM: turn on bforget in jbd2_journal_forget()!!! */ @@ -4212,7 +4171,7 @@ void ext4_truncate(struct inode *inode) (__le32*)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); - brelse (partial->bh); + brelse(partial->bh); partial--; } do_indirects: @@ -4453,8 +4412,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei) if (flags & S_DIRSYNC) ei->i_flags |= EXT4_DIRSYNC_FL; } + static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) + struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); @@ -4569,7 +4529,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ei->i_state |= EXT4_STATE_XATTR; } } else ei->i_extra_isize = 0; @@ -4588,7 +4548,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && - ((ei->i_file_acl < + ((ei->i_file_acl < (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + EXT4_SB(sb)->s_gdb_count)) || (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { @@ -4603,15 +4563,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) !ext4_inode_is_fast_symlink(inode))) /* Validate extent which is part of inode */ ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ + /* Validate block references which are part of inode */ ret = ext4_check_inode_blockref(inode); } if (ret) { - brelse(bh); - goto bad_inode; + brelse(bh); + goto bad_inode; } if (S_ISREG(inode->i_mode)) { @@ -4642,7 +4602,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else { brelse(bh); ret = -EIO; - ext4_error(inode->i_sb, __func__, + ext4_error(inode->i_sb, __func__, "bogus i_mode (%o) for inode=%lu", inode->i_mode, inode->i_ino); goto bad_inode; @@ -4795,8 +4755,9 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; + } else + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -5150,7 +5111,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext4_iloc *iloc) + struct inode *inode, struct ext4_iloc *iloc) { int err = 0; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 91e75f7a9e73..bb415408fdb6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/compat.h> #include <linux/smp_lock.h> #include <linux/mount.h> +#include <linux/file.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -213,6 +214,41 @@ setversion_out: return err; } + + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct file *donor_filp; + int err; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + + if (!capable(CAP_DAC_OVERRIDE)) { + if ((current->real_cred->fsuid != inode->i_uid) || + !(inode->i_mode & S_IRUSR) || + !(donor_filp->f_dentry->d_inode->i_mode & + S_IRUSR)) { + fput(donor_filp); + return -EACCES; + } + } + + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + fput(donor_filp); + + if (!err) + if (copy_to_user((struct move_extent *)arg, + &me, sizeof(me))) + return -EFAULT; + return err; + } + case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; struct super_block *sb = inode->i_sb; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ed8482e22c0e..519a0a686d94 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -22,6 +22,8 @@ */ #include "mballoc.h" +#include <trace/events/ext4.h> + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -2859,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", - sb->s_id, (unsigned long long) discard_block, - entry->count); + trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -3629,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_inode_pa, - "dev %s ino %lu pstart %llu len %u lstart %u", - sb->s_id, ac->ac_inode->i_ino, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3691,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_type = MB_GROUP_PA; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); - trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", - sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3783,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } - trace_mark(ext4_mb_release_inode_pa, - "dev %s ino %lu block %llu count %u", - sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, - next - bit); + trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3820,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", - sb->s_id, pa->pa_pstart, pa->pa_len); + trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3987,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); - trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, - inode->i_ino); + trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = inode; + } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -4276,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, INIT_LIST_HEAD(&discard_list); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + if (ac) + ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4445,8 +4444,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; - trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", - sb->s_id, needed); + trace_ext4_mb_discard_preallocations(sb, needed); for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4475,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " - "lblk %llu goal %llu lleft %llu lright %llu " - "pleft %llu pright %llu ", - sb->s_id, ar->flags, ar->len, - ar->inode ? ar->inode->i_ino : 0, - (unsigned long long) ar->logical, - (unsigned long long) ar->goal, - (unsigned long long) ar->lleft, - (unsigned long long) ar->lright, - (unsigned long long) ar->pleft, - (unsigned long long) ar->pright); + trace_ext4_request_blocks(ar); /* * For delayed allocation, we could skip the ENOSPC and @@ -4521,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (!ac) { + if (ac) { + ac->ac_sb = sb; + ac->ac_inode = ar->inode; + } else { ar->len = 0; *errp = -ENOMEM; goto out1; @@ -4594,18 +4585,7 @@ out3: reserv_blks); } - trace_mark(ext4_allocate_blocks, - "dev %s block %llu flags %u len %u ino %lu " - "logical %llu goal %llu lleft %llu lright %llu " - "pleft %llu pright %llu ", - sb->s_id, (unsigned long long) block, - ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, - (unsigned long long) ar->logical, - (unsigned long long) ar->goal, - (unsigned long long) ar->lleft, - (unsigned long long) ar->lright, - (unsigned long long) ar->pleft, - (unsigned long long) ar->pright); + trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } @@ -4709,7 +4689,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * Main entry point into mballoc to free blocks */ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count, + ext4_fsblk_t block, unsigned long count, int metadata, unsigned long *freed) { struct buffer_head *bitmap_bh = NULL; @@ -4735,15 +4715,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, block + count > ext4_blocks_count(es)) { ext4_error(sb, __func__, "Freeing blocks not in datazone - " - "block = %lu, count = %lu", block, count); + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug("freeing block %lu\n", block); - trace_mark(ext4_free_blocks, - "dev %s block %llu count %lu metadata %d ino %lu", - sb->s_id, (unsigned long long) block, count, metadata, - inode ? inode->i_ino : 0); + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, metadata); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4784,7 +4761,7 @@ do_more: ext4_error(sb, __func__, "Freeing blocks in system zone - " - "Block = %lu, count = %lu", block, count); + "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 75e34f69215b..c96bb19f58f9 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -19,7 +19,6 @@ #include <linux/seq_file.h> #include <linux/version.h> #include <linux/blkdev.h> -#include <linux/marker.h> #include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fe64d9f79852..313a50b39741 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode) struct inode *tmp_inode = NULL; struct list_blocks_struct lb; unsigned long max_entries; + __u32 goal; /* * If the filesystem does not support extents, or the inode @@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode) retval = PTR_ERR(handle); return retval; } - tmp_inode = ext4_new_inode(handle, - inode->i_sb->s_root->d_inode, - S_IFREG); + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, 0, goal); if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 000000000000..bbf2dd9404dc --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1320 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato <t-sato@yk.jp.nec.com> + * Akira Fujita <a-fujita@rs.jp.nec.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "ext4.h" + +#define get_ext_path(path, inode, block, ret) \ + do { \ + path = ext4_ext_find_extent(inode, block, path); \ + if (IS_ERR(path)) { \ + ret = PTR_ERR(path); \ + path = NULL; \ + } \ + } while (0) + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_uninitialized(src)) + ext4_ext_mark_uninitialized(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. + */ +static int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * mext_double_down_read - Acquire two inodes' read semaphore + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. + */ +static void +mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_read(&EXT4_I(first)->i_data_sem); + down_read(&EXT4_I(second)->i_data_sem); +} + +/** + * mext_double_down_write - Acquire two inodes' write semaphore + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure + * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. + */ +static void +mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) +{ + struct inode *first = orig_inode, *second = donor_inode; + + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can + * compare pointers that don't come from the same array. + */ + if (donor_inode->i_ino < orig_inode->i_ino) { + first = donor_inode; + second = orig_inode; + } + + down_write(&EXT4_I(first)->i_data_sem); + down_write(&EXT4_I(second)->i_data_sem); +} + +/** + * mext_double_up_read - Release two inodes' read semaphore + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release read semaphore of two inodes (orig and donor). + */ +static void +mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) +{ + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + up_read(&EXT4_I(orig_inode)->i_data_sem); + up_read(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_double_up_write - Release two inodes' write semaphore + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write semaphore of two inodes (orig and donor). + */ +static void +mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) +{ + BUG_ON(orig_inode == NULL || donor_inode == NULL); + + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. + */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. + */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + get_ext_path(orig_path, orig_inode, eblock, err); + if (orig_path == NULL) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext)) + goto out; + } + + if (end_flag) { + get_ext_path(orig_path, orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, err); + if (orig_path == NULL) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. + */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 1 : 0); + + /* The number of slots between start and end */ + slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) + / sizeof(struct ext4_extent); + + /* Range to move the end of extent */ + range_to_move = need_slots - slots_range; + depth = orig_path->p_depth; + orig_path += depth; + eh = orig_path->p_hdr; + + if (depth) { + /* Register to journal */ + ret = ext4_journal_get_write_access(handle, orig_path->p_bh); + if (ret) + return ret; + } + + /* Expansion */ + if (range_to_move > 0 && + (range_to_move > le16_to_cpu(eh->eh_max) + - le16_to_cpu(eh->eh_entries))) { + + ret = mext_insert_across_blocks(handle, orig_inode, o_start, + o_end, start_ext, new_ext, end_ext); + if (ret < 0) + return ret; + } else + mext_insert_inside_block(o_start, o_end, start_ext, new_ext, + end_ext, eh, range_to_move); + + if (depth) { + ret = ext4_handle_dirty_metadata(handle, orig_inode, + orig_path->p_bh); + if (ret) + return ret; + } else { + ret = ext4_mark_inode_dirty(handle, orig_inode); + if (ret < 0) + return ret; + } + + return 0; +} + +/** + * mext_leaf_block - Move one leaf extent block into the inode. + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @dext: donor extent + * @from: start offset on the target file + * + * In order to insert extents into the leaf block, we must divide the extent + * in the leaf block into three extents. The one is located to be inserted + * extents, and the others are located around it. + * + * Therefore, this function creates structures to save extents of the leaf + * block, and inserts extents by calling mext_insert_extents() with + * created extents. Return 0 on success, or a negative error value on failure. + */ +static int +mext_leaf_block(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, struct ext4_extent *dext, + ext4_lblk_t *from) +{ + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; + ext4_fsblk_t new_phys_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; + + o_start = o_end = oext = orig_path[depth].p_ext; + oext_alen = ext4_ext_get_actual_len(oext); + start_ext.ee_len = end_ext.ee_len = 0; + + new_ext.ee_block = cpu_to_le32(*from); + ext4_ext_store_pblock(&new_ext, ext_pblock(dext)); + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; + new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; + + /* + * Case: original extent is first + * oext |--------| + * new_ext |--| + * start_ext |--| + */ + if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && + le32_to_cpu(new_ext.ee_block) < + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; + /* + * We can merge new_ext into previous extent, + * if these are contiguous and same extent type. + */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximun length of extents + */ +static void +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. */ + diff = donor_off - le32_to_cpu(tmp_dext->ee_block); + + ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff); + tmp_dext->ee_block = + cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff); + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff); + + if (max_count < ext4_ext_get_actual_len(tmp_dext)) + tmp_dext->ee_len = cpu_to_le16(max_count); + + orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); + ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff); + + /* Adjust extent length if donor extent is larger than orig */ + if (ext4_ext_get_actual_len(tmp_dext) > + ext4_ext_get_actual_len(tmp_oext) - orig_diff) + tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - + orig_diff); + + tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); +} + +/** + * mext_replace_branches - Replace original extents with new extents + * + * @handle: journal handle + * @orig_inode: original inode + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: + * 1. Save the block information of original and donor inodes into + * dummy extents. + * 2. Change the block information of original inode to point at the + * donor inode blocks. + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, + ext4_lblk_t count) +{ + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; + int err = 0; + int depth; + int replaced_count = 0; + int dext_alen; + + mext_double_down_write(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ + get_ext_path(orig_path, orig_inode, orig_off, err); + if (orig_path == NULL) + goto out; + + /* Get the donor extent for the head */ + get_ext_path(donor_path, donor_inode, donor_off, err); + if (donor_path == NULL) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + tmp_oext = *oext; + + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + + mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); + + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. */ + BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); + + /* Set donor extent to orig extent */ + err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); + if (err < 0) + goto out; + + /* Set orig extent to donor extent */ + err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); + if (err < 0) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); + replaced_count += dext_alen; + donor_off += dext_alen; + orig_off += dext_alen; + + /* Already moved the expected blocks */ + if (replaced_count >= count) + break; + + if (orig_path) + ext4_ext_drop_refs(orig_path); + get_ext_path(orig_path, orig_inode, orig_off, err); + if (orig_path == NULL) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; + if (le32_to_cpu(oext->ee_block) + + ext4_ext_get_actual_len(oext) <= orig_off) { + err = 0; + goto out; + } + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); + get_ext_path(donor_path, donor_inode, + donor_off, err); + if (donor_path == NULL) + goto out; + depth = ext_depth(donor_inode); + dext = donor_path[depth].p_ext; + if (le32_to_cpu(dext->ee_block) + + ext4_ext_get_actual_len(dext) <= donor_off) { + err = 0; + goto out; + } + tmp_dext = *dext; + + mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, + count - replaced_count); + } + +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (donor_path) { + ext4_ext_drop_refs(donor_path); + kfree(donor_path); + } + + mext_double_up_write(orig_inode, donor_inode); + return err; +} + +/** + * move_extent_per_page - Move extent data per page + * + * @o_filp: file structure of original file + * @donor_inode: donor inode + * @orig_page_offset: page index on original file + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). + * Finally, write out the saved data in new original inode blocks. Return 0 + * on success, or a negative error value on failure. + */ +static int +move_extent_par_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; + struct buffer_head *bh; + struct page *page = NULL; + const struct address_space_operations *a_ops = mapping->a_ops; + handle_t *handle; + ext4_lblk_t orig_blk_offset; + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; + unsigned int tmp_data_len, data_len; + void *fsdata; + int ret, i, jblocks; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* + * It needs twice the amount of ordinary journal buffers because + * inode and donor_inode may change each different metadata blocks. + */ + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + orig_blk_offset = orig_page_offset * blocks_per_page + + data_offset_in_page; + + /* + * If orig extent is uninitialized one, + * it's not necessary force the page into memory + * and then force it to be written out again. + * Just swap data blocks between orig and donor. + */ + if (uninit) { + ret = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page); + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + goto out2; + } + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + + /* Calculate data_len */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ + tmp_data_len = orig_inode->i_size & (blocksize - 1); + /* + * If data_len equal zero, it shows data_len is multiples of + * blocksize. So we set appropriate value. + */ + if (tmp_data_len == 0) + tmp_data_len = blocksize; + + data_len = tmp_data_len + + ((block_len_in_page - 1) << orig_inode->i_blkbits); + } else { + data_len = block_len_in_page << orig_inode->i_blkbits; + } + + ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, + &page, &fsdata); + if (unlikely(ret < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(o_filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call releasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple move extent processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. + */ + if (PageWriteback(page)) + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + + ret = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page); + if (ret < 0) + goto out; + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + + bh = page_buffers(page); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { + ret = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); + if (ret < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, + page, fsdata); + page = NULL; + +out: + if (unlikely(page)) { + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); + } +out2: + ext4_journal_stop(handle); + + return ret < 0 ? ret : 0; +} + +/** + * mext_check_argumants - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_check_arguments(struct inode *orig_inode, + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len, __u64 moved_len) +{ + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " + "regular file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " + "not be swapfile [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Files should be in the same ext4 FS */ + if (orig_inode->i_sb != donor_inode->i_sb) { + ext4_debug("ext4 move extent: The argument files " + "should be in same FS [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* orig and donor should be different file */ + if (orig_inode->i_ino == donor_inode->i_ino) { + ext4_debug("ext4 move extent: The argument files should not " + "be same file [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + /* Ext4 move extent supports only extent based file */ + if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; + } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; + } + + if ((!orig_inode->i_size) || (!donor_inode->i_size)) { + ext4_debug("ext4 move extent: File size is 0 byte\n"); + return -EINVAL; + } + + /* Start offset should be same */ + if (orig_start != donor_start) { + ext4_debug("ext4 move extent: orig and donor's start " + "offset are not same [ino:orig %lu, donor %lu]\n", + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (moved_len) { + ext4_debug("ext4 move extent: moved_len should be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + if ((orig_start > MAX_DEFRAG_SIZE) || + (donor_start > MAX_DEFRAG_SIZE) || + (*len > MAX_DEFRAG_SIZE) || + (orig_start + *len > MAX_DEFRAG_SIZE)) { + ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " + "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_inode->i_size > donor_inode->i_size) { + if (orig_start >= donor_inode->i_size) { + ext4_debug("ext4 move extent: orig start offset " + "[%llu] should be less than donor file size " + "[%lld] [ino:orig %lu, donor_inode %lu]\n", + orig_start, donor_inode->i_size, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > donor_inode->i_size) { + ext4_debug("ext4 move extent: End offset [%llu] should " + "be less than donor file size [%lld]." + "So adjust length from %llu to %lld " + "[ino:orig %lu, donor %lu]\n", + orig_start + *len, donor_inode->i_size, + *len, donor_inode->i_size - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = donor_inode->i_size - orig_start; + } + } else { + if (orig_start >= orig_inode->i_size) { + ext4_debug("ext4 move extent: start offset [%llu] " + "should be less than original file size " + "[%lld] [inode:orig %lu, donor %lu]\n", + orig_start, orig_inode->i_size, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + + if (orig_start + *len > orig_inode->i_size) { + ext4_debug("ext4 move extent: Adjust length " + "from %llu to %lld. Because it should be " + "less than original file size " + "[ino:orig %lu, donor %lu]\n", + *len, orig_inode->i_size - orig_start, + orig_inode->i_ino, donor_inode->i_ino); + *len = orig_inode->i_size - orig_start; + } + } + + if (!*len) { + ext4_debug("ext4 move extent: len shoudld not be 0 " + "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, + donor_inode->i_ino); + return -EINVAL; + } + + return 0; +} + +/** + * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode structure + * @inode2: the inode structure + * + * Lock two inodes' i_mutex by i_ino order. This function is moved from + * fs/inode.c. + */ +static void +mext_inode_double_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { + if (inode1) + mutex_lock(&inode1->i_mutex); + else if (inode2) + mutex_lock(&inode2->i_mutex); + return; + } + + if (inode1->i_ino < inode2->i_ino) { + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } +} + +/** + * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2 + * + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * + * This function is moved from fs/inode.c. + */ + +static void +mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) +{ + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} + +/** + * ext4_move_extents - Exchange the specified range of a file + * + * @o_filp: file structure of the original file + * @d_filp: file structure of the donor file + * @orig_start: start offset in block for orig + * @donor_start: start offset in block for donor + * @len: the number of blocks to be moved + * @moved_len: moved block length + * + * This function returns 0 and moved block length is set in moved_len + * if succeed, otherwise returns error value. + * + * Note: ext4_move_extents() proceeds the following order. + * 1:ext4_move_extents() calculates the last block number of moving extent + * function by the start block number (orig_start) and the number of blocks + * to be moved (len) specified as arguments. + * If the {orig, donor}_start points a hole, the extent's start offset + * pointed by ext_cur (current extent), holecheck_path, orig_path are set + * after hole behind. + * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent + * or the ext_cur exceeds the block_end which is last logical block number. + * 3:To get the length of continues area, call mext_next_extent() + * specified with the ext_cur (initial value is holecheck_path) re-cursive, + * until find un-continuous extent, the start logical block number exceeds + * the block_end or the extent points to the last extent. + * 4:Exchange the original inode data with donor inode data + * from orig_page_offset to seq_end_page. + * The start indexes of data are specified as arguments. + * That of the original inode is orig_page_offset, + * and the donor inode is also orig_page_offset + * (To easily handle blocksize != pagesize case, the offset for the + * donor inode is block unit). + * 5:Update holecheck_path and orig_path to points a next proceeding extent, + * then returns to step 2. + * 6:Release holecheck_path, orig_path and set the len to moved_len + * which shows the number of moved blocks. + * The moved_len is useful for the command to calculate the file offset + * for starting next move extent ioctl. + * 7:Return 0 on success, or a negative error value on failure. + */ +int +ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 orig_start, __u64 donor_start, __u64 len, + __u64 *moved_len) +{ + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct inode *donor_inode = d_filp->f_dentry->d_inode; + struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; + struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; + ext4_lblk_t block_start = orig_start; + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; + int ret, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* protect orig and donor against a truncate */ + mext_inode_double_lock(orig_inode, donor_inode); + + mext_double_down_read(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len, *moved_len); + mext_double_up_read(orig_inode, donor_inode); + if (ret) + goto out2; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; + if (file_end < block_end) + len -= block_end - file_end; + + get_ext_path(orig_path, orig_inode, block_start, ret); + if (orig_path == NULL) + goto out2; + + /* Get path structure to check the hole */ + get_ext_path(holecheck_path, orig_inode, block_start, ret); + if (holecheck_path == NULL) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + if (ext_cur == NULL) { + ret = -EINVAL; + goto out; + } + + /* + * Get proper extent whose ee_block is beyond block_start + * if block_start was within the hole. + */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { + ret = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { + ret = last_extent; + goto out; + } + } + seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); + ret = -EINVAL; + goto out; + } + + /* Adjust start blocks */ + add_blocks = min(le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + + /* Adjust tail blocks */ + if (seq_start + seq_blocks - 1 > block_end) + seq_blocks = block_end - seq_start + 1; + + ext_prev = ext_cur; + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { + ret = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); + + /* + * Extend the length of contiguous block (seq_blocks) + * if extents are contiguous. + */ + if (ext4_can_extents_be_merged(orig_inode, + ext_prev, ext_cur) && + block_end >= le32_to_cpu(ext_cur->ee_block) && + !last_extent) + continue; + + /* Is original extent is uninitialized */ + uninit = ext4_ext_is_uninitialized(ext_prev); + + data_offset_in_page = seq_start % blocks_per_page; + + /* + * Calculate data blocks count that should be swapped + * at the first page. + */ + if (data_offset_in_page + seq_blocks > blocks_per_page) { + /* Swapped blocks are across pages */ + block_len_in_page = + blocks_per_page - data_offset_in_page; + } else { + /* Swapped blocks are in a page */ + block_len_in_page = seq_blocks; + } + + orig_page_offset = seq_start >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_end_page = (seq_start + seq_blocks - 1) >> + (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + + /* Discard preallocations of two inodes */ + down_write(&EXT4_I(orig_inode)->i_data_sem); + ext4_discard_preallocations(orig_inode); + up_write(&EXT4_I(orig_inode)->i_data_sem); + + down_write(&EXT4_I(donor_inode)->i_data_sem); + ext4_discard_preallocations(donor_inode); + up_write(&EXT4_I(donor_inode)->i_data_sem); + + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ + ret = move_extent_par_page(o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit); + if (ret < 0) + goto out; + orig_page_offset++; + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + BUG_ON(*moved_len > len); + + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + get_ext_path(holecheck_path, orig_inode, + seq_start, ret); + if (holecheck_path == NULL) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + get_ext_path(orig_path, orig_inode, seq_start, ret); + if (orig_path == NULL) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } +out2: + mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret) + return ret; + + /* All of the specified blocks must be exchanged in succeed */ + BUG_ON(*moved_len != len); + + return 0; +} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07eb6649e4fa..de04013d16ff 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1782,7 +1782,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode (handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; @@ -1816,7 +1816,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); @@ -1853,7 +1853,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFDIR | mode); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -2264,7 +2265,8 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 27eb289eea37..68b0351fc647 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); + ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 012c4251397e..8bb9e2d3e4b8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,7 +37,6 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/ctype.h> -#include <linux/marker.h> #include <linux/log2.h> #include <linux/crc16.h> #include <asm/uaccess.h> @@ -47,6 +46,9 @@ #include "xattr.h" #include "acl.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ext4.h> + static int default_mb_history_length = 1000; module_param_named(default_mb_history_length, default_mb_history_length, @@ -301,7 +303,7 @@ static void ext4_handle_error(struct super_block *sb) if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; if (journal) jbd2_journal_abort(journal, -EIO); } @@ -414,7 +416,7 @@ void ext4_abort(struct super_block *sb, const char *function, ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } @@ -1474,7 +1476,7 @@ set_qf_format: break; #endif case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; break; case Opt_nobarrier: clear_opt(sbi->s_mount_opt, BARRIER); @@ -1653,7 +1655,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + "bpg=%lu, ipg=%lu, mo=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), @@ -1957,7 +1959,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) /* small i_blocks in vfs inode? */ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LBD is not enabled implies the inode + * CONFIG_LBDAF is not enabled implies the inode * i_block represent total blocks in 512 bytes * 32 == size of vfs inode i_blocks * 8 */ @@ -2000,7 +2002,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * !has_huge_files or CONFIG_LBD not enabled implies that + * !has_huge_files or CONFIG_LBDAF not enabled implies that * the inode i_block field represents total file blocks in * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 */ @@ -2204,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); @@ -2216,6 +2219,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), ATTR_LIST(mb_max_to_scan), ATTR_LIST(mb_min_to_scan), @@ -2436,13 +2440,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (has_huge_files) { /* * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LBD + * mount if kernel is build with CONFIG_LBDAF */ if (sizeof(root->i_blocks) < sizeof(u64) && !(sb->s_flags & MS_RDONLY)) { ext4_msg(sb, KERN_ERR, "Filesystem with huge " "files cannot be mounted read-write " - "without CONFIG_LBD"); + "without CONFIG_LBDAF"); goto failed_mount; } } @@ -2566,7 +2570,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely"); if (sizeof(sector_t) < 8) - ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled"); + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } @@ -3346,7 +3350,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) int ret = 0; tid_t target; - trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); + trace_ext4_sync_fs(sb, wait); if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); @@ -3450,7 +3454,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -3465,7 +3469,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } diff --git a/fs/fat/cache.c b/fs/fat/cache.c index b42602298087..923990e4f16e 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { - fat_fs_panic(sb, "%s: detected the cluster chain loop" + fat_fs_error(sb, "%s: detected the cluster chain loop" " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; @@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) if (nr < 0) goto out; else if (nr == FAT_ENT_FREE) { - fat_fs_panic(sb, "%s: invalid cluster chain" + fat_fs_error(sb, "%s: invalid cluster chain" " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; @@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) if (ret < 0) return ret; else if (ret == FAT_ENT_EOF) { - fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)", + fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); return -EIO; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index f3500294eec5..38ff75a0fe22 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -22,6 +22,19 @@ #include <asm/uaccess.h> #include "fat.h" +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) @@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, unsigned char *buf, int size) { if (sbi->options.utf8) - return utf8_wcstombs(buf, uni, size); + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); else return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, sbi->nls_io); @@ -325,19 +339,6 @@ parse_long: } /* - * Maximum buffer size of short name. - * [(MSDOS_NAME + '.') * max one char + nul] - * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] - */ -#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) -/* - * Maximum buffer size of unicode chars from slots. - * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] - */ -#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) -#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) - -/* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. */ @@ -1334,7 +1335,7 @@ found: goto error_remove; } if (dir->i_size & (sbi->cluster_size - 1)) { - fat_fs_panic(sb, "Odd directory size"); + fat_fs_error(sb, "Odd directory size"); dir->i_size = (dir->i_size + sbi->cluster_size - 1) & ~((loff_t)sbi->cluster_size - 1); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e4d88527b5dd..adb0e72a176d 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -17,6 +17,10 @@ #define VFAT_SFN_CREATE_WIN95 0x0100 /* emulate win95 rule for create */ #define VFAT_SFN_CREATE_WINNT 0x0200 /* emulate winnt rule for create */ +#define FAT_ERRORS_CONT 1 /* ignore error and continue */ +#define FAT_ERRORS_PANIC 2 /* panic on error */ +#define FAT_ERRORS_RO 3 /* remount r/o on error */ + struct fat_mount_options { uid_t fs_uid; gid_t fs_gid; @@ -26,6 +30,7 @@ struct fat_mount_options { char *iocharset; /* Charset used for filename input/display */ unsigned short shortname; /* flags for shortname display/create rule */ unsigned char name_check; /* r = relaxed, n = normal, s = strict */ + unsigned char errors; /* On error: continue, panic, remount-ro */ unsigned short allow_utime;/* permission for setting the [am]time */ unsigned quiet:1, /* set = fake successful chmods and chowns */ showexec:1, /* set = only set x bit for com/exe/bat */ @@ -316,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent, extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ -extern void fat_fs_panic(struct super_block *s, const char *fmt, ...) +extern void fat_fs_error(struct super_block *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))) __cold; extern void fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 618f5305c2e4..a81037721a6f 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -348,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { fatent_brelse(fatent); - fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry); + fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; } @@ -560,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster) err = cluster; goto error; } else if (cluster == FAT_ENT_FREE) { - fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF", + fat_fs_error(sb, "%s: deleting FAT entry beyond EOF", __func__); err = -EIO; goto error; diff --git a/fs/fat/file.c b/fs/fat/file.c index e955a56b4e5e..b28ea646ff60 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -18,106 +18,112 @@ #include <linux/security.h> #include "fat.h" -int fat_generic_ioctl(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { + u32 attr; + + mutex_lock(&inode->i_mutex); + attr = fat_make_attrs(inode); + mutex_unlock(&inode->i_mutex); + + return put_user(attr, user_attr); +} + +static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) +{ + struct inode *inode = file->f_path.dentry->d_inode; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - u32 __user *user_attr = (u32 __user *)arg; + int is_dir = S_ISDIR(inode->i_mode); + u32 attr, oldattr; + struct iattr ia; + int err; - switch (cmd) { - case FAT_IOCTL_GET_ATTRIBUTES: - { - u32 attr; + err = get_user(attr, user_attr); + if (err) + goto out; - mutex_lock(&inode->i_mutex); - attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + mutex_lock(&inode->i_mutex); + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out_unlock_inode; - return put_user(attr, user_attr); + /* + * ATTR_VOLUME and ATTR_DIR cannot be changed; this also + * prevents the user from turning us into a VFAT + * longname entry. Also, we obviously can't set + * any of the NTFS attributes in the high 24 bits. + */ + attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); + /* Merge in ATTR_VOLUME and ATTR_DIR */ + attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | + (is_dir ? ATTR_DIR : 0); + oldattr = fat_make_attrs(inode); + + /* Equivalent to a chmod() */ + ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_fs_time(inode->i_sb); + if (is_dir) + ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); + else { + ia.ia_mode = fat_make_mode(sbi, attr, + S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); } - case FAT_IOCTL_SET_ATTRIBUTES: - { - u32 attr, oldattr; - int err, is_dir = S_ISDIR(inode->i_mode); - struct iattr ia; - err = get_user(attr, user_attr); - if (err) - return err; + /* The root directory has no attributes */ + if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { + err = -EINVAL; + goto out_drop_write; + } - mutex_lock(&inode->i_mutex); - - err = mnt_want_write(filp->f_path.mnt); - if (err) - goto up_no_drop_write; - - /* - * ATTR_VOLUME and ATTR_DIR cannot be changed; this also - * prevents the user from turning us into a VFAT - * longname entry. Also, we obviously can't set - * any of the NTFS attributes in the high 24 bits. - */ - attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR); - /* Merge in ATTR_VOLUME and ATTR_DIR */ - attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) | - (is_dir ? ATTR_DIR : 0); - oldattr = fat_make_attrs(inode); - - /* Equivalent to a chmod() */ - ia.ia_valid = ATTR_MODE | ATTR_CTIME; - ia.ia_ctime = current_fs_time(inode->i_sb); - if (is_dir) - ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO); - else { - ia.ia_mode = fat_make_mode(sbi, attr, - S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO)); - } + if (sbi->options.sys_immutable && + ((attr | oldattr) & ATTR_SYS) && + !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_drop_write; + } - /* The root directory has no attributes */ - if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { - err = -EINVAL; - goto up; - } + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; - if (sbi->options.sys_immutable) { - if ((attr | oldattr) & ATTR_SYS) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - err = -EPERM; - goto up; - } - } - } + /* This MUST be done before doing anything irreversible... */ + err = fat_setattr(file->f_path.dentry, &ia); + if (err) + goto out_drop_write; + + fsnotify_change(file->f_path.dentry, ia.ia_valid); + if (sbi->options.sys_immutable) { + if (attr & ATTR_SYS) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= S_IMMUTABLE; + } - /* - * The security check is questionable... We single - * out the RO attribute for checking by the security - * module, just because it maps to a file mode. - */ - err = security_inode_setattr(filp->f_path.dentry, &ia); - if (err) - goto up; - - /* This MUST be done before doing anything irreversible... */ - err = fat_setattr(filp->f_path.dentry, &ia); - if (err) - goto up; - - fsnotify_change(filp->f_path.dentry, ia.ia_valid); - if (sbi->options.sys_immutable) { - if (attr & ATTR_SYS) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= S_IMMUTABLE; - } + fat_save_attrs(inode, attr); + mark_inode_dirty(inode); +out_drop_write: + mnt_drop_write(file->f_path.mnt); +out_unlock_inode: + mutex_unlock(&inode->i_mutex); +out: + return err; +} - fat_save_attrs(inode, attr); - mark_inode_dirty(inode); -up: - mnt_drop_write(filp->f_path.mnt); -up_no_drop_write: - mutex_unlock(&inode->i_mutex); - return err; - } +int fat_generic_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + u32 __user *user_attr = (u32 __user *)arg; + + switch (cmd) { + case FAT_IOCTL_GET_ATTRIBUTES: + return fat_ioctl_get_attributes(inode, user_attr); + case FAT_IOCTL_SET_ATTRIBUTES: + return fat_ioctl_set_attributes(filp, user_attr); default: return -ENOTTY; /* Inappropriate ioctl for device */ } @@ -225,7 +231,7 @@ static int fat_free(struct inode *inode, int skip) fatent_brelse(&fatent); return 0; } else if (ret == FAT_ENT_FREE) { - fat_fs_panic(sb, + fat_fs_error(sb, "%s: invalid cluster chain (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); ret = -EIO; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 51a5ecf9000a..8970d8c49bb0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return 0; if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { - fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", + fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } @@ -856,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",flush"); if (opts->tz_utc) seq_puts(m, ",tz=UTC"); + if (opts->errors == FAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == FAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); return 0; } @@ -868,7 +874,8 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err, + Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont, + Opt_err_panic, Opt_err_ro, Opt_err, }; static const match_table_t fat_tokens = { @@ -891,6 +898,11 @@ static const match_table_t fat_tokens = { {Opt_showexec, "showexec"}, {Opt_debug, "debug"}, {Opt_immutable, "sys_immutable"}, + {Opt_flush, "flush"}, + {Opt_tz_utc, "tz=UTC"}, + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, {Opt_obsolate, "conv=binary"}, {Opt_obsolate, "conv=text"}, {Opt_obsolate, "conv=auto"}, @@ -902,8 +914,6 @@ static const match_table_t fat_tokens = { {Opt_obsolate, "cvf_format=%20s"}, {Opt_obsolate, "cvf_options=%100s"}, {Opt_obsolate, "posix"}, - {Opt_flush, "flush"}, - {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; static const match_table_t msdos_tokens = { @@ -956,7 +966,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = current_umask(); + opts->fs_fmask = opts->fs_dmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; @@ -973,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_utc = 0; + opts->errors = FAT_ERRORS_RO; *debug = 0; if (!options) @@ -1065,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, case Opt_tz_utc: opts->tz_utc = 1; break; + case Opt_err_cont: + opts->errors = FAT_ERRORS_CONT; + break; + case Opt_err_panic: + opts->errors = FAT_ERRORS_PANIC; + break; + case Opt_err_ro: + opts->errors = FAT_ERRORS_RO; + break; /* msdos specific */ case Opt_dots: diff --git a/fs/fat/misc.c b/fs/fat/misc.c index ac39ebcc1496..a6c20473dfd7 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -12,14 +12,19 @@ #include "fat.h" /* - * fat_fs_panic reports a severe file system problem and sets the file system - * read-only. The file system can be made writable again by remounting it. + * fat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. */ -void fat_fs_panic(struct super_block *s, const char *fmt, ...) +void fat_fs_error(struct super_block *s, const char *fmt, ...) { + struct fat_mount_options *opts = &MSDOS_SB(s)->options; va_list args; - printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id); + printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); printk(KERN_ERR " "); va_start(args, fmt); @@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...) va_end(args); printk("\n"); - if (!(s->s_flags & MS_RDONLY)) { + if (opts->errors == FAT_ERRORS_PANIC) + panic(" FAT fs panic from previous error\n"); + else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { s->s_flags |= MS_RDONLY; printk(KERN_ERR " File system has been set read-only\n"); } } - -EXPORT_SYMBOL_GPL(fat_fs_panic); +EXPORT_SYMBOL_GPL(fat_fs_error); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ @@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) mark_inode_dirty(inode); } if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) { - fat_fs_panic(sb, "clusters badly computed (%d != %llu)", + fat_fs_error(sb, "clusters badly computed (%d != %llu)", new_fclus, (llu)(inode->i_blocks >> (sbi->cluster_bits - 9))); fat_cache_inval_inode(inode); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 20f522861355..82f88733b681 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -608,7 +608,7 @@ error_inode: sinfo.bh = NULL; } if (corrupt < 0) { - fat_fs_panic(new_dir->i_sb, + fat_fs_error(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", __func__, sinfo.i_pos); } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b50ecbe97f83..73471b7ecc8c 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, if (utf8) { int name_len = strlen(name); - *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); + *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); /* * We stripped '.'s before and set len appropriately, - * but utf8_mbstowcs doesn't care about len + * but utf8s_to_utf16s doesn't care about len */ *outlen -= (name_len - len); @@ -1030,7 +1030,7 @@ error_inode: sinfo.bh = NULL; } if (corrupt < 0) { - fat_fs_panic(new_dir->i_sb, + fat_fs_error(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", __func__, sinfo.i_pos); } diff --git a/fs/fcntl.c b/fs/fcntl.c index 1ad703150dee..a040b764f8e3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -198,15 +198,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, - uid_t uid, uid_t euid, int force) + int force) { write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { put_pid(filp->f_owner.pid); filp->f_owner.pid = get_pid(pid); filp->f_owner.pid_type = type; - filp->f_owner.uid = uid; - filp->f_owner.euid = euid; + + if (pid) { + const struct cred *cred = current_cred(); + filp->f_owner.uid = cred->uid; + filp->f_owner.euid = cred->euid; + } } write_unlock_irq(&filp->f_owner.lock); } @@ -214,14 +218,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - const struct cred *cred = current_cred(); int err; - + err = security_file_set_fowner(filp); if (err) return err; - f_modown(filp, pid, type, cred->uid, cred->euid, force); + f_modown(filp, pid, type, force); return 0; } EXPORT_SYMBOL(__f_setown); @@ -247,7 +250,7 @@ EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); + f_modown(filp, NULL, PIDTYPE_PID, 1); } pid_t f_getown(struct file *filp) @@ -425,14 +428,20 @@ static inline int sigio_perm(struct task_struct *p, } static void send_sigio_to_task(struct task_struct *p, - struct fown_struct *fown, + struct fown_struct *fown, int fd, int reason) { - if (!sigio_perm(p, fown, fown->signum)) + /* + * F_SETSIG can change ->signum lockless in parallel, make + * sure we read it once and use the same value throughout. + */ + int signum = ACCESS_ONCE(fown->signum); + + if (!sigio_perm(p, fown, signum)) return; - switch (fown->signum) { + switch (signum) { siginfo_t si; default: /* Queue a rt signal with the appropriate fd as its @@ -441,7 +450,7 @@ static void send_sigio_to_task(struct task_struct *p, delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ - si.si_signo = fown->signum; + si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* Make sure we are called with one of the POLL_* @@ -453,7 +462,7 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!group_send_sig_info(fown->signum, &si, p)) + if (!group_send_sig_info(signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 40308e98c6a4..caf049146ca2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -321,7 +321,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; - if (!(inode->i_state & I_FREEING)) { + if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if (!(inode->i_state & I_DIRTY) && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* @@ -492,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } - if (inode->i_state & I_NEW) { + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; } @@ -523,7 +523,7 @@ void generic_sync_sb_inodes(struct super_block *sb, if (current_is_pdflush() && !writeback_acquire(bdi)) break; - BUG_ON(inode->i_state & I_FREEING); + BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f0df55a52929..d8673ccf90b7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -19,7 +19,6 @@ #include <linux/random.h> #include <linux/sched.h> #include <linux/exportfs.h> -#include <linux/smp_lock.h> MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -260,9 +259,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, static void fuse_umount_begin(struct super_block *sb) { - lock_kernel(); fuse_abort_conn(get_fuse_conn_super(sb)); - unlock_kernel(); } static void fuse_send_destroy(struct fuse_conn *fc) diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index cad957cdb1e5..5971359d2090 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || LBD) + depends on EXPERIMENTAL && (64BIT || LBDAF) select DLM if GFS2_FS_LOCKING_DLM select CONFIGFS_FS if GFS2_FS_LOCKING_DLM select SYSFS if GFS2_FS_LOCKING_DLM diff --git a/fs/ioctl.c b/fs/ioctl.c index 286f38dfc6c0..001f8d3118f2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -70,9 +70,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p) res = get_user(block, p); if (res) return res; - lock_kernel(); res = mapping->a_ops->bmap(mapping, block); - unlock_kernel(); return put_user(res, p); } diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 2f0dc5a14633..8ba5441063be 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, * Do not report hidden files if so instructed, or associated * files unless instructed to do so */ - if ((sbi->s_hide == 'y' && - (de->flags[-sbi->s_high_sierra] & 1)) || - (sbi->s_showassoc =='n' && + if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || + (!sbi->s_showassoc && (de->flags[-sbi->s_high_sierra] & 4))) { filp->f_pos += de_len; continue; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 068b34b5a107..58a7963e168a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -141,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = { }; struct iso9660_options{ - char map; - char rock; + unsigned int rock:1; + unsigned int cruft:1; + unsigned int hide:1; + unsigned int showassoc:1; + unsigned int nocompress:1; + unsigned int overriderockperm:1; + unsigned int uid_set:1; + unsigned int gid_set:1; + unsigned int utf8:1; + unsigned char map; char joliet; - char cruft; - char hide; - char showassoc; - char nocompress; unsigned char check; unsigned int blocksize; mode_t fmode; @@ -155,7 +159,6 @@ struct iso9660_options{ gid_t gid; uid_t uid; char *iocharset; - unsigned char utf8; /* LVE */ s32 session; s32 sbsector; @@ -312,7 +315,7 @@ enum { Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, - Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, + Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm, }; static const match_table_t tokens = { @@ -340,6 +343,7 @@ static const match_table_t tokens = { {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%u"}, {Opt_dmode, "dmode=%u"}, + {Opt_overriderockperm, "overriderockperm"}, {Opt_block, "block=%u"}, {Opt_ignore, "conv=binary"}, {Opt_ignore, "conv=b"}, @@ -359,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt) int option; popt->map = 'n'; - popt->rock = 'y'; - popt->joliet = 'y'; - popt->cruft = 'n'; - popt->hide = 'n'; - popt->showassoc = 'n'; + popt->rock = 1; + popt->joliet = 1; + popt->cruft = 0; + popt->hide = 0; + popt->showassoc = 0; popt->check = 'u'; /* unset */ popt->nocompress = 0; popt->blocksize = 1024; - popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /* - * r-x for all. The disc could - * be shared with DOS machines so - * virtually anything could be - * a valid executable. - */ + popt->fmode = popt->dmode = ISOFS_INVALID_MODE; + popt->uid_set = 0; + popt->gid_set = 0; popt->gid = 0; popt->uid = 0; popt->iocharset = NULL; popt->utf8 = 0; + popt->overriderockperm = 0; popt->session=-1; popt->sbsector=-1; if (!options) @@ -393,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt) token = match_token(p, tokens, args); switch (token) { case Opt_norock: - popt->rock = 'n'; + popt->rock = 0; break; case Opt_nojoliet: - popt->joliet = 'n'; + popt->joliet = 0; break; case Opt_hide: - popt->hide = 'y'; + popt->hide = 1; break; case Opt_unhide: case Opt_showassoc: - popt->showassoc = 'y'; + popt->showassoc = 1; break; case Opt_cruft: - popt->cruft = 'y'; + popt->cruft = 1; break; case Opt_utf8: popt->utf8 = 1; @@ -450,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt) if (match_int(&args[0], &option)) return 0; popt->uid = option; + popt->uid_set = 1; break; case Opt_gid: if (match_int(&args[0], &option)) return 0; popt->gid = option; + popt->gid_set = 1; break; case Opt_mode: if (match_int(&args[0], &option)) @@ -466,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt) return 0; popt->dmode = option; break; + case Opt_overriderockperm: + popt->overriderockperm = 1; + break; case Opt_block: if (match_int(&args[0], &option)) return 0; @@ -650,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) goto out_freebh; sbi->s_high_sierra = 1; - opt.rock = 'n'; + opt.rock = 0; h_pri = (struct hs_primary_descriptor *)vdp; goto root_found; } @@ -673,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) root_found: - if (joliet_level && (pri == NULL || opt.rock == 'n')) { + if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. * A disc with both Joliet and Rock Ridge is handled later */ @@ -802,22 +809,31 @@ root_found: s->s_op = &isofs_sops; s->s_export_op = &isofs_export_ops; sbi->s_mapping = opt.map; - sbi->s_rock = (opt.rock == 'y' ? 2 : 0); + sbi->s_rock = (opt.rock ? 2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ sbi->s_cruft = opt.cruft; sbi->s_hide = opt.hide; sbi->s_showassoc = opt.showassoc; sbi->s_uid = opt.uid; sbi->s_gid = opt.gid; + sbi->s_uid_set = opt.uid_set; + sbi->s_gid_set = opt.gid_set; sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; + sbi->s_overriderockperm = opt.overriderockperm; /* * It would be incredibly stupid to allow people to mark every file * on the disk as suid, so we merely allow them to set the default * permissions. */ - sbi->s_fmode = opt.fmode & 0777; - sbi->s_dmode = opt.dmode & 0777; + if (opt.fmode != ISOFS_INVALID_MODE) + sbi->s_fmode = opt.fmode & 0777; + else + sbi->s_fmode = ISOFS_INVALID_MODE; + if (opt.dmode != ISOFS_INVALID_MODE) + sbi->s_dmode = opt.dmode & 0777; + else + sbi->s_dmode = ISOFS_INVALID_MODE; /* * Read the root inode, which _may_ result in changing @@ -1095,18 +1111,6 @@ static const struct address_space_operations isofs_aops = { .bmap = _isofs_bmap }; -static inline void test_and_set_uid(uid_t *p, uid_t value) -{ - if (value) - *p = value; -} - -static inline void test_and_set_gid(gid_t *p, gid_t value) -{ - if (value) - *p = value; -} - static int isofs_read_level3_size(struct inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); @@ -1261,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode) ei->i_file_format = isofs_file_normal; if (de->flags[-high_sierra] & 2) { - inode->i_mode = sbi->s_dmode | S_IFDIR; + if (sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + else + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_nlink = 1; /* * Set to 1. We know there are 2, but * the find utility tries to optimize @@ -1270,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode) * do it the hard way. */ } else { - /* Everybody gets to read the file. */ - inode->i_mode = sbi->s_fmode | S_IFREG; + if (sbi->s_fmode != ISOFS_INVALID_MODE) { + inode->i_mode = S_IFREG | sbi->s_fmode; + } else { + /* + * Set default permissions: r-x for all. The disc + * could be shared with DOS machines so virtually + * anything could be a valid executable. + */ + inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO; + } inode->i_nlink = 1; } inode->i_uid = sbi->s_uid; @@ -1300,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode) * this CDROM was mounted with the cruft option. */ - if (sbi->s_cruft == 'y') + if (sbi->s_cruft) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { @@ -1346,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode) if (!high_sierra) { parse_rock_ridge_inode(de, inode); /* if we want uid/gid set, override the rock ridge setting */ - test_and_set_uid(&inode->i_uid, sbi->s_uid); - test_and_set_gid(&inode->i_gid, sbi->s_gid); + if (sbi->s_uid_set) + inode->i_uid = sbi->s_uid; + if (sbi->s_gid_set) + inode->i_gid = sbi->s_gid; } + /* Now set final access rights if overriding rock ridge setting */ + if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_dmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFDIR | sbi->s_dmode; + if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm && + sbi->s_fmode != ISOFS_INVALID_MODE) + inode->i_mode = S_IFREG | sbi->s_fmode; /* Install the inode operations vector */ if (S_ISREG(inode->i_mode)) { diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index ccbf72faf27a..7d33de84f52a 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -35,21 +35,20 @@ struct isofs_sb_info { unsigned long s_log_zone_size; unsigned long s_max_size; - unsigned char s_high_sierra; /* A simple flag */ - unsigned char s_mapping; int s_rock_offset; /* offset of SUSP fields within SU area */ - unsigned char s_rock; unsigned char s_joliet_level; - unsigned char s_utf8; - unsigned char s_cruft; /* Broken disks with high - byte of length containing - junk */ - unsigned char s_unhide; - unsigned char s_nosuid; - unsigned char s_nodev; - unsigned char s_nocompress; - unsigned char s_hide; - unsigned char s_showassoc; + unsigned char s_mapping; + unsigned int s_high_sierra:1; + unsigned int s_rock:2; + unsigned int s_utf8:1; + unsigned int s_cruft:1; /* Broken disks with high byte of length + * containing junk */ + unsigned int s_nocompress:1; + unsigned int s_hide:1; + unsigned int s_showassoc:1; + unsigned int s_overriderockperm:1; + unsigned int s_uid_set:1; + unsigned int s_gid_set:1; mode_t s_fmode; mode_t s_dmode; @@ -58,6 +57,8 @@ struct isofs_sb_info { struct nls_table *s_nls_iocharset; /* Native language support table */ }; +#define ISOFS_INVALID_MODE ((mode_t) -1) + static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index 92c14b850e9c..a048de81c093 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) return (op - ascii); } -/* Convert big endian wide character string to utf8 */ -static int -wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen) -{ - const __u8 *ip; - __u8 *op; - int size; - __u16 c; - - op = s; - ip = pwcs; - while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) { - c = (*ip << 8) | ip[1]; - if (c > 0x7f) { - size = utf8_wctomb(op, c, maxlen); - if (size == -1) { - /* Ignore character and move on */ - maxlen--; - } else { - op += size; - maxlen -= size; - } - } else { - *op++ = (__u8) c; - } - ip += 2; - inlen--; - } - return (op - s); -} - int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { @@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; if (utf8) { - len = wcsntombs_be(outname, de->name, - de->name_len[0] >> 1, PAGE_SIZE); + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); } else { len = uni16_to_x8(outname, (__be16 *) de->name, de->name_len[0] >> 1, nls); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 8299889a835e..eaa831311c9c 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, */ match = 0; if (dlen > 0 && - (sbi->s_hide =='n' || + (!sbi->s_hide || (!(de->flags[-sbi->s_high_sierra] & 1))) && - (sbi->s_showassoc =='y' || + (sbi->s_showassoc || (!(de->flags[-sbi->s_high_sierra] & 4)))) { match = (isofs_cmp(dentry, dpnt, dlen) == 0); } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index ed886e6db399..73242ba7c7b1 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1686,35 +1686,6 @@ out: return; } -/* - * journal_try_to_free_buffers() could race with journal_commit_transaction() - * The latter might still hold the a count on buffers when inspecting - * them on t_syncdata_list or t_locked_list. - * - * journal_try_to_free_buffers() will call this function to - * wait for the current transaction to finish syncing data buffers, before - * tryinf to free that buffer. - * - * Called with journal->j_state_lock held. - */ -static void journal_wait_for_transaction_sync_data(journal_t *journal) -{ - transaction_t *transaction = NULL; - tid_t tid; - - spin_lock(&journal->j_state_lock); - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return; - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); -} - /** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -1786,25 +1757,6 @@ int journal_try_to_free_buffers(journal_t *journal, ret = try_to_free_buffers(page); - /* - * There are a number of places where journal_try_to_free_buffers() - * could race with journal_commit_transaction(), the later still - * holds the reference to the buffers to free while processing them. - * try_to_free_buffers() failed to free those buffers. Some of the - * caller of releasepage() request page buffers to be dropped, otherwise - * treat the fail-to-free as errors (such as generic_file_direct_IO()) - * - * So, if the caller of try_to_release_page() wants the synchronous - * behaviour(i.e make sure buffers are dropped upon return), - * let's wait for the current transaction to finish flush of - * dirty data buffers, then try to free those buffers again, - * with the journal locked. - */ - if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { - journal_wait_for_transaction_sync_data(journal); - ret = try_to_free_buffers(page); - } - busy: return ret; } diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 17159cacbd9e..5d70b3e6d49b 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,9 +20,9 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> +#include <trace/events/jbd2.h> /* * Unlink a buffer from a transaction checkpoint list. @@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); - trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", - journal->j_devname, result); + trace_jbd2_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 0b7d3b8226fd..7b4088b2364d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,7 +16,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/mm.h> @@ -26,6 +25,7 @@ #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/bio.h> +#include <trace/events/jbd2.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal, * block allocation with delalloc. We need to write * only allocated blocks here. */ + trace_jbd2_submit_inode_data(jinode->i_vfs_inode); err = journal_submit_inode_data_buffers(mapping); if (!ret) ret = err; @@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); - trace_mark(jbd2_start_commit, "dev %s transaction %d", - journal->j_devname, commit_transaction->t_tid); + trace_jbd2_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC_PLUG; + trace_jbd2_commit_locking(journal, commit_transaction); stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, @@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + trace_jbd2_commit_flushing(journal, commit_transaction); stats.u.run.rs_flushing = jiffies; stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, stats.u.run.rs_flushing); @@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd2_commit_logging(journal, commit_transaction); stats.u.run.rs_logging = jiffies; stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, stats.u.run.rs_logging); @@ -1054,9 +1057,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); - trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, commit_transaction->t_tid, - journal->j_tail_sequence); + trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); if (to_free) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 62be7d294ec2..18bfd5dab642 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -38,6 +38,10 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/math64.h> +#include <linux/hash.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/jbd2.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -2377,6 +2381,71 @@ static void __exit journal_exit(void) jbd2_journal_destroy_caches(); } +/* + * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 + * tracing infrastructure to map a dev_t to a device name. + * + * The caller should use rcu_read_lock() in order to make sure the + * device name stays valid until its done with it. We use + * rcu_read_lock() as well to make sure we're safe in case the caller + * gets sloppy, and because rcu_read_lock() is cheap and can be safely + * nested. + */ +struct devname_cache { + struct rcu_head rcu; + dev_t device; + char devname[BDEVNAME_SIZE]; +}; +#define CACHE_SIZE_BITS 6 +static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; +static DEFINE_SPINLOCK(devname_cache_lock); + +static void free_devcache(struct rcu_head *rcu) +{ + kfree(rcu); +} + +const char *jbd2_dev_to_name(dev_t device) +{ + int i = hash_32(device, CACHE_SIZE_BITS); + char *ret; + struct block_device *bd; + + rcu_read_lock(); + if (devcache[i] && devcache[i]->device == device) { + ret = devcache[i]->devname; + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + + spin_lock(&devname_cache_lock); + if (devcache[i]) { + if (devcache[i]->device == device) { + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; + } + call_rcu(&devcache[i]->rcu, free_devcache); + } + devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!devcache[i]) { + spin_unlock(&devname_cache_lock); + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ + } + devcache[i]->device = device; + bd = bdget(device); + if (bd) { + bdevname(bd, devcache[i]->devname); + bdput(bd); + } else + __bdevname(device, devcache[i]->devname); + ret = devcache[i]->devname; + spin_unlock(&devname_cache_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_dev_to_name); + MODULE_LICENSE("GPL"); module_init(journal_init); module_exit(journal_exit); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 996ffda06bf3..494501edba6b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1547,36 +1547,6 @@ out: return; } -/* - * jbd2_journal_try_to_free_buffers() could race with - * jbd2_journal_commit_transaction(). The later might still hold the - * reference count to the buffers when inspecting them on - * t_syncdata_list or t_locked_list. - * - * jbd2_journal_try_to_free_buffers() will call this function to - * wait for the current transaction to finish syncing data buffers, before - * try to free that buffer. - * - * Called with journal->j_state_lock hold. - */ -static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) -{ - transaction_t *transaction; - tid_t tid; - - spin_lock(&journal->j_state_lock); - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return; - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - jbd2_log_wait_commit(journal, tid); -} - /** * int jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -1649,25 +1619,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, ret = try_to_free_buffers(page); - /* - * There are a number of places where jbd2_journal_try_to_free_buffers() - * could race with jbd2_journal_commit_transaction(), the later still - * holds the reference to the buffers to free while processing them. - * try_to_free_buffers() failed to free those buffers. Some of the - * caller of releasepage() request page buffers to be dropped, otherwise - * treat the fail-to-free as errors (such as generic_file_direct_IO()) - * - * So, if the caller of try_to_release_page() wants the synchronous - * behaviour(i.e make sure buffers are dropped upon return), - * let's wait for the current transaction to finish flush of - * dirty data buffers, then try to free those buffers again, - * with the journal locked. - */ - if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { - jbd2_journal_wait_for_transaction_sync_data(journal); - ret = try_to_free_buffers(page); - } - busy: return ret; } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index bbbd5f202e37..41d6045dbeb0 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) } XADaddress(xp, xaddr); XADlength(xp, xlen); + XADoffset(xp, prev); /* * only preserve the abnr flag within the xad flags * of the returned hint. diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3aebe322271a..6ac693faae49 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -12,13 +12,14 @@ /* bitmap.c contains the code that handles the inode and block bitmaps */ #include "minix.h" -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/sched.h> static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; +static DEFINE_SPINLOCK(bitmap_lock); + static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) { unsigned i, j, sum = 0; @@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block) return; } bh = sbi->s_zmap[zone]; - lock_kernel(); + spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_block (%s:%lu): bit already cleared\n", sb->s_id, block); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); return; } @@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode) struct buffer_head *bh = sbi->s_zmap[i]; int j; - lock_kernel(); + spin_lock(&bitmap_lock); j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); if (j < bits_per_zone) { minix_set_bit(j, bh->b_data); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone + sbi->s_firstdatazone-1; if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) break; return j; } - unlock_kernel(); + spin_unlock(&bitmap_lock); } return 0; } @@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode) minix_clear_inode(inode); /* clear on-disk copy */ bh = sbi->s_imap[ino]; - lock_kernel(); + spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_inode: bit %lu already cleared\n", bit); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); out: clear_inode(inode); /* clear in-memory copy */ @@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) j = bits_per_zone; bh = NULL; *error = -ENOSPC; - lock_kernel(); + spin_lock(&bitmap_lock); for (i = 0; i < sbi->s_imap_blocks; i++) { bh = sbi->s_imap[i]; j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); @@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) break; } if (!bh || j >= bits_per_zone) { - unlock_kernel(); + spin_unlock(&bitmap_lock); iput(inode); return NULL; } if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ - unlock_kernel(); + spin_unlock(&bitmap_lock); printk("minix_new_inode: bit already set\n"); iput(inode); return NULL; } - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index e5f206467e40..d407e7a0b6fe 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -11,7 +11,6 @@ #include "minix.h" #include <linux/buffer_head.h> #include <linux/highmem.h> -#include <linux/smp_lock.h> #include <linux/swap.h> typedef struct minix_dir_entry minix_dirent; @@ -20,6 +19,7 @@ typedef struct minix3_dir_entry minix3_dirent; static int minix_readdir(struct file *, void *, filldir_t); const struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = minix_readdir, .fsync = simple_fsync, @@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) char *name; __u32 inumber; - lock_kernel(); - pos = (pos + chunk_size-1) & ~(chunk_size-1); if (pos >= inode->i_size) goto done; @@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) done: filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; - unlock_kernel(); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f91a23693597..74ea82d72164 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb) int i; struct minix_sb_info *sbi = minix_sb(sb); - lock_kernel(); - if (!(sb->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ sbi->s_ms->s_state = sbi->s_mount_state; @@ -50,8 +48,6 @@ static void minix_put_super(struct super_block *sb) kfree(sbi->s_imap); sb->s_fs_info = NULL; kfree(sbi); - - unlock_kernel(); } static struct kmem_cache * minix_inode_cachep; diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 97645f112114..0ec6237a5970 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; + unicode_t u; - k = utf8_mbtowc(&ec, iname, iname_end - iname); - if (k < 0) + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) return -EINVAL; iname += k; + ec = u; } else { if (*iname == NCP_ESC) { int k; @@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; - k = utf8_wctomb(iname, ec, iname_end - iname); + k = utf32_to_utf8(ec, iname, iname_end - iname); if (k < 0) { err = -ENAMETOOLONG; goto quit; diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a2ab2529b5ca..ceda50aad73c 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->events[stat]++; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_inc_stats(const struct inode *inode, @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->bytes[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_add_stats(const struct inode *inode, @@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, cpu = get_cpu(); iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); iostats->fscache[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } #endif diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 064279e33bbb..36df60b6d8a4 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -31,21 +31,26 @@ #include "dat.h" #include "alloc.h" +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); +} + int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { - __u64 ptr; + sector_t blocknr; int ret; down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); if (ret < 0) goto out; - if (bmap->b_pops->bpop_translate != NULL) { - ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); - if (ret < 0) - goto out; - *ptrp = ptr; + if (NILFS_BMAP_USE_VBN(bmap)) { + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, + &blocknr); + if (!ret) + *ptrp = blocknr; } out: @@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, return ret; } +int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks); + up_read(&bmap->b_sem); + return ret; +} /** * nilfs_bmap_lookup - find a record @@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) if (n < 0) return n; ret = nilfs_btree_convert_and_insert( - bmap, key, ptr, keys, ptrs, n, - NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH); + bmap, key, ptr, keys, ptrs, n); if (ret == 0) bmap->b_u.u_flags |= NILFS_BMAP_LARGE; @@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) if (n < 0) return n; ret = nilfs_direct_delete_and_convert( - bmap, key, keys, ptrs, n, - NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH); + bmap, key, keys, ptrs, n); if (ret == 0) bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; @@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) mark_inode_dirty(bmap->b_inode); } -int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr, - struct buffer_head **bhp) -{ - return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, - ptr, 0, bhp, 0); -} - -void nilfs_bmap_put_block(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - brelse(bh); -} - -int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr, - struct buffer_head **bhp) -{ - int ret; - - ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, - ptr, 0, bhp, 1); - if (ret < 0) - return ret; - set_buffer_nilfs_volatile(*bhp); - return 0; -} - -void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - nilfs_btnode_delete(bh); -} - __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, const struct buffer_head *bh) { @@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) return NILFS_BMAP_INVALID_PTR; } -static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) -{ - return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); -} - #define NILFS_BMAP_GROUP_DIV 8 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) { @@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) (entries_per_group / NILFS_BMAP_GROUP_DIV); } -static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); } -static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); } -static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); } -static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, + sector_t blocknr) { - return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req, - sector_t blocknr) -{ - nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req, - blocknr); -} + struct inode *dat = nilfs_bmap_get_dat(bmap); + int ret; -static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); + ret = nilfs_dat_prepare_start(dat, &req->bpr_req); + if (likely(!ret)) + nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); + return ret; } -static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); } -static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0); -} - -static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } -static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) +void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) { nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); } @@ -566,128 +529,44 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); } -int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) +int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) { + struct inode *dat = nilfs_bmap_get_dat(bmap); int ret; - ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq); + ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); if (ret < 0) return ret; - ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq); + ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); if (ret < 0) - bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + nilfs_dat_abort_end(dat, &oldreq->bpr_req); return ret; } -void nilfs_bmap_commit_update(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) +void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) { - bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq); - bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq); -} + struct inode *dat = nilfs_bmap_get_dat(bmap); -void nilfs_bmap_abort_update(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); - bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq); + nilfs_dat_commit_end(dat, &oldreq->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); + nilfs_dat_commit_alloc(dat, &newreq->bpr_req); } -static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr, - __u64 *ptrp) +void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) { - sector_t blocknr; - int ret; - - ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr); - if (ret < 0) - return ret; - if (ptrp != NULL) - *ptrp = blocknr; - return 0; -} + struct inode *dat = nilfs_bmap_get_dat(bmap); -static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - /* ignore target ptr */ - req->bpr_ptr = bmap->b_last_allocated_ptr++; - return 0; + nilfs_dat_abort_end(dat, &oldreq->bpr_req); + nilfs_dat_abort_alloc(dat, &newreq->bpr_req); } -static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - /* do nothing */ -} - -static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - bmap->b_last_allocated_ptr--; -} - -static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = { - .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, - .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, - .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, - .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, - .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, - .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, - .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, - .bpop_commit_end_ptr = nilfs_bmap_commit_end_v, - .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, - - .bpop_translate = nilfs_bmap_translate_v, -}; - -static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = { - .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, - .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, - .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, - .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, - .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, - .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, - .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, - .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt, - .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, - - .bpop_translate = nilfs_bmap_translate_v, -}; - -static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = { - .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p, - .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p, - .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p, - .bpop_prepare_start_ptr = NULL, - .bpop_commit_start_ptr = NULL, - .bpop_abort_start_ptr = NULL, - .bpop_prepare_end_ptr = NULL, - .bpop_commit_end_ptr = NULL, - .bpop_abort_end_ptr = NULL, - - .bpop_translate = NULL, -}; - -static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { - .bpop_prepare_alloc_ptr = NULL, - .bpop_commit_alloc_ptr = NULL, - .bpop_abort_alloc_ptr = NULL, - .bpop_prepare_start_ptr = NULL, - .bpop_commit_start_ptr = NULL, - .bpop_abort_start_ptr = NULL, - .bpop_prepare_end_ptr = NULL, - .bpop_commit_end_ptr = NULL, - .bpop_abort_end_ptr = NULL, - - .bpop_translate = NULL, -}; - static struct lock_class_key nilfs_bmap_dat_lock_key; /** @@ -714,31 +593,26 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; switch (bmap->b_inode->i_ino) { case NILFS_DAT_INO: - bmap->b_pops = &nilfs_bmap_ptr_ops_p; - bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_ptr_type = NILFS_BMAP_PTR_P; + bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); break; case NILFS_CPFILE_INO: case NILFS_SUFILE_INO: - bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt; - bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_ptr_type = NILFS_BMAP_PTR_VS; + bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; break; default: - bmap->b_pops = &nilfs_bmap_ptr_ops_v; - bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_ptr_type = NILFS_BMAP_PTR_VM; + bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; break; } return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? - nilfs_btree_init(bmap, - NILFS_BMAP_LARGE_LOW, - NILFS_BMAP_LARGE_HIGH) : - nilfs_direct_init(bmap, - NILFS_BMAP_SMALL_LOW, - NILFS_BMAP_SMALL_HIGH); + nilfs_btree_init(bmap) : nilfs_direct_init(bmap); } /** @@ -764,7 +638,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); init_rwsem(&bmap->b_sem); bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; - bmap->b_pops = &nilfs_bmap_ptr_ops_gc; + bmap->b_ptr_type = NILFS_BMAP_PTR_U; bmap->b_last_allocated_key = 0; bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; bmap->b_state = 0; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index 4f2708abb1ba..b2890cdcef12 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -64,6 +64,8 @@ struct nilfs_bmap_stats { */ struct nilfs_bmap_operations { int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, + unsigned); int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); @@ -86,34 +88,6 @@ struct nilfs_bmap_operations { }; -/** - * struct nilfs_bmap_ptr_operations - bmap ptr operation table - */ -struct nilfs_bmap_ptr_operations { - int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - int (*bpop_prepare_start_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - void (*bpop_commit_start_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - sector_t); - void (*bpop_abort_start_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - int (*bpop_prepare_end_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - void (*bpop_commit_end_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - void (*bpop_abort_end_ptr)(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); - - int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *); -}; - - #define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) #define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) #define NILFS_BMAP_NEW_PTR_INIT \ @@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) * @b_sem: semaphore * @b_inode: owner of bmap * @b_ops: bmap operation table - * @b_pops: bmap ptr operation table - * @b_low: low watermark of conversion - * @b_high: high watermark of conversion * @b_last_allocated_key: last allocated key for data block * @b_last_allocated_ptr: last allocated ptr for data block + * @b_ptr_type: pointer type * @b_state: state */ struct nilfs_bmap { @@ -146,14 +118,22 @@ struct nilfs_bmap { struct rw_semaphore b_sem; struct inode *b_inode; const struct nilfs_bmap_operations *b_ops; - const struct nilfs_bmap_ptr_operations *b_pops; - __u64 b_low; - __u64 b_high; __u64 b_last_allocated_key; __u64 b_last_allocated_ptr; + int b_ptr_type; int b_state; }; +/* pointer type */ +#define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. LBN) */ +#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single + version) */ +#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple + versions) */ +#define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ + +#define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) + /* state */ #define NILFS_BMAP_DIRTY 0x00000001 @@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); @@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); /* * Internal use only */ +struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); +int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); +static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + if (NILFS_BMAP_USE_VBN(bmap)) + return nilfs_bmap_prepare_alloc_v(bmap, req); + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_commit_alloc_v(bmap, req); +} + +static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_abort_alloc_v(bmap, req); + else + bmap->b_last_allocated_ptr--; +} + +int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); + +static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_bmap_prepare_end_v(bmap, req) : 0; +} + +static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_commit_end_v(bmap, req); +} + +static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_bmap_abort_end_v(bmap, req); +} + +int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, + sector_t); int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); @@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); -int nilfs_bmap_prepare_update(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_update(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_update(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); +int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_update_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_update_v(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); -int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64, - struct buffer_head **); -void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *); -int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64, - struct buffer_head **); -void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *); - - /* Assume that bmap semaphore is locked. */ static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) { diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 4cc07b2c30e0..7e0b61be212e 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc) INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); } -static struct address_space_operations def_btnode_aops; +static struct address_space_operations def_btnode_aops = { + .sync_page = block_sync_page, +}; -void nilfs_btnode_cache_init(struct address_space *btnc) +void nilfs_btnode_cache_init(struct address_space *btnc, + struct backing_dev_info *bdi) { btnc->host = NULL; /* can safely set to host inode ? */ btnc->flags = 0; mapping_set_gfp_mask(btnc, GFP_NOFS); btnc->assoc_mapping = NULL; - btnc->backing_dev_info = &default_backing_dev_info; + btnc->backing_dev_info = bdi; btnc->a_ops = &def_btnode_aops; } diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 35faa86444a7..3e2275172ed6 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt { }; void nilfs_btnode_cache_init_once(struct address_space *); -void nilfs_btnode_cache_init(struct address_space *); +void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, struct buffer_head **, int); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 6b37a2767293..aa412724b64e 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -29,6 +29,7 @@ #include "btnode.h" #include "btree.h" #include "alloc.h" +#include "dat.h" /** * struct nilfs_btree_path - A path on which B-tree operations are executed @@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree, level < NILFS_BTREE_LEVEL_MAX; level++) { if (path[level].bp_bh != NULL) { - nilfs_bmap_put_block(&btree->bt_bmap, - path[level].bp_bh); + brelse(path[level].bp_bh); path[level].bp_bh = NULL; } /* sib_bh is released or deleted by prepare or commit @@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree, } } - /* * B-tree node operations */ +static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, + struct buffer_head **bhp) +{ + struct address_space *btnc = + &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; + return nilfs_btnode_get(btnc, ptr, 0, bhp, 0); +} + +static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, + __u64 ptr, struct buffer_head **bhp) +{ + struct address_space *btnc = + &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; + int ret; + + ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1); + if (!ret) + set_buffer_nilfs_volatile(*bhp); + return ret; +} static inline int nilfs_btree_node_get_flags(const struct nilfs_btree *btree, @@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, path[level].bp_index = index; for (level--; level >= minlevel; level--) { - ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, - &path[level].bp_bh); + ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(btree, path, level); @@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, path[level].bp_index = index; for (level--; level > 0; level--) { - ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, - &path[level].bp_bh); + ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(btree, path, level); @@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, return ret; } +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, + __u64 key, __u64 *ptrp, unsigned maxblocks) +{ + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int level = NILFS_BTREE_LEVEL_NODE_MIN; + int ret, cnt, index, maxlevel; + + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + if (ret < 0) + goto out; + + if (NILFS_BMAP_USE_VBN(bmap)) { + dat = nilfs_bmap_get_dat(bmap); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + goto out; + ptr = blocknr; + } + cnt = 1; + if (cnt == maxblocks) + goto end; + + maxlevel = nilfs_btree_height(btree) - 1; + node = nilfs_btree_get_node(btree, path, level); + index = path[level].bp_index + 1; + for (;;) { + while (index < nilfs_btree_node_get_nchildren(btree, node)) { + if (nilfs_btree_node_get_key(btree, node, index) != + key + cnt) + goto end; + ptr2 = nilfs_btree_node_get_ptr(btree, node, index); + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + goto out; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt || ++cnt == maxblocks) + goto end; + index++; + continue; + } + if (level == maxlevel) + break; + + /* look-up right sibling node */ + node = nilfs_btree_get_node(btree, path, level + 1); + index = path[level + 1].bp_index + 1; + if (index >= nilfs_btree_node_get_nchildren(btree, node) || + nilfs_btree_node_get_key(btree, node, index) != key + cnt) + break; + ptr2 = nilfs_btree_node_get_ptr(btree, node, index); + path[level + 1].bp_index = index; + + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); + if (ret < 0) + goto out; + node = nilfs_btree_get_nonroot_node(btree, path, level); + index = 0; + path[level].bp_index = index; + } + end: + *ptrp = ptr; + ret = cnt; + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} + static void nilfs_btree_promote_key(struct nilfs_btree *btree, struct nilfs_btree_path *path, int level, __u64 key) @@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, nilfs_btree_node_get_key(btree, node, 0)); if (move) { - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index += lnchildren; path[level + 1].bp_index--; } else { - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level].bp_index -= n; } @@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, path[level + 1].bp_index--; if (move) { - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index -= nilfs_btree_node_get_nchildren(btree, node); path[level + 1].bp_index++; } else { - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; } @@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, *keyp = nilfs_btree_node_get_key(btree, right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; } else { @@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, *keyp = nilfs_btree_node_get_key(btree, right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; } @@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (btree->bt_ops->btop_find_target != NULL) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) path[level].bp_newreq.bpr_ptr = - btree->bt_ops->btop_find_target(btree, path, key); + nilfs_btree_find_target_v(btree, path, key); - ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( - &btree->bt_bmap, &path[level].bp_newreq); + ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); if (ret < 0) goto err_out_data; @@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (pindex > 0) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex - 1); - ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, - &bh); + ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; @@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; goto out; } else - nilfs_bmap_put_block(&btree->bt_bmap, bh); + brelse(bh); } /* right sibling */ @@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, nilfs_btree_node_get_nchildren(btree, parent) - 1) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); - ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, - &bh); + ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; @@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; goto out; } else - nilfs_bmap_put_block(&btree->bt_bmap, bh); + brelse(bh); } /* split */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( - &btree->bt_bmap, &path[level].bp_newreq); + ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); if (ret < 0) goto err_out_child_node; - ret = nilfs_bmap_get_new_block(&btree->bt_bmap, - path[level].bp_newreq.bpr_ptr, - &bh); + ret = nilfs_btree_get_new_block(btree, + path[level].bp_newreq.bpr_ptr, + &bh); if (ret < 0) goto err_out_curr_node; @@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( - &btree->bt_bmap, &path[level].bp_newreq); + ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); if (ret < 0) goto err_out_child_node; - ret = nilfs_bmap_get_new_block(&btree->bt_bmap, - path[level].bp_newreq.bpr_ptr, &bh); + ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, + &bh); if (ret < 0) goto err_out_curr_node; @@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { - nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); - btree->bt_bmap.b_pops->bpop_abort_alloc_ptr( - &btree->bt_bmap, &path[level].bp_newreq); + nilfs_btnode_delete(path[level].bp_sib_bh); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); } - btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); err_out_data: *levelp = level; stats->bs_nblocks = 0; @@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (btree->bt_ops->btop_set_target != NULL) - btree->bt_ops->btop_set_target(btree, key, ptr); + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + nilfs_btree_set_target_v(btree, key, ptr); for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) { - btree->bt_bmap.b_pops->bpop_commit_alloc_ptr( - &btree->bt_bmap, &path[level - 1].bp_newreq); - } + nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, + &path[level - 1].bp_newreq); path[level].bp_op(btree, path, level, &key, &ptr); } @@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(btree, node, 0)); - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level].bp_index += n; } @@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, nilfs_btree_node_get_key(btree, right, 0)); path[level + 1].bp_index--; - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; } @@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); @@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; path[level + 1].bp_index++; } @@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, nilfs_btree_node_move_left(btree, root, child, n); unlock_buffer(path[level].bp_bh); - nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; } @@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); - if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { - ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( - &btree->bt_bmap, &path[level].bp_oldreq); - if (ret < 0) - goto err_out_child_node; - } + ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, + &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; if (nilfs_btree_node_get_nchildren(btree, node) > nilfs_btree_node_nchildren_min(btree, node)) { @@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* left sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex - 1); - ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, - &bh); + ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; @@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* right sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); - ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, - &bh); + ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; @@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); - if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { - ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( - &btree->bt_bmap, &path[level].bp_oldreq); - if (ret < 0) - goto err_out_child_node; - } + + ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, + &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + /* child of the root node is deleted */ path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; @@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) - btree->bt_bmap.b_pops->bpop_abort_end_ptr( - &btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { - nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); - if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) - btree->bt_bmap.b_pops->bpop_abort_end_ptr( - &btree->bt_bmap, &path[level].bp_oldreq); + brelse(path[level].bp_sib_bh); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, + &path[level].bp_oldreq); } *levelp = level; stats->bs_nblocks = 0; @@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree, int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL) - btree->bt_bmap.b_pops->bpop_commit_end_ptr( - &btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_commit_end_ptr(&btree->bt_bmap, + &path[level].bp_oldreq); path[level].bp_op(btree, path, level, NULL, NULL); } @@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); - ret = nilfs_bmap_get_block(bmap, ptr, &bh); + ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; @@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) nextmaxkey = (nchildren > 1) ? nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; if (bh != NULL) - nilfs_bmap_put_block(bmap, bh); + brelse(bh); - return (maxkey == key) && (nextmaxkey < bmap->b_low); + return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, @@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, nchildren = nilfs_btree_node_get_nchildren(btree, root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); - ret = nilfs_bmap_get_block(bmap, ptr, &bh); + ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; @@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, } if (bh != NULL) - nilfs_bmap_put_block(bmap, bh); + brelse(bh); return nitems; } @@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* for data */ /* cannot find near ptr */ - if (btree->bt_ops->btop_find_target != NULL) - dreq->bpr_ptr - = btree->bt_ops->btop_find_target(btree, NULL, key); - ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq); + if (NILFS_BMAP_USE_VBN(bmap)) + dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + + ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); if (ret < 0) return ret; @@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); if (ret < 0) goto err_out_dreq; - ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh); + ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh); if (ret < 0) goto err_out_nreq; @@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq); + nilfs_bmap_abort_alloc_ptr(bmap, nreq); err_out_dreq: - bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq); + nilfs_bmap_abort_alloc_ptr(bmap, dreq); stats->bs_nblocks = 0; return ret; @@ -1624,7 +1708,7 @@ static void nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, - int n, __u64 low, __u64 high, + int n, union nilfs_bmap_ptr_req *dreq, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) @@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, /* convert and insert */ btree = (struct nilfs_btree *)bmap; - nilfs_btree_init(bmap, low, high); + nilfs_btree_init(bmap); if (nreq != NULL) { - if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) { - bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); - bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq); - } + nilfs_bmap_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, nreq); /* create child node at level 1 */ lock_buffer(bh); @@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_bmap_set_dirty(bmap); unlock_buffer(bh); - nilfs_bmap_put_block(bmap, bh); + brelse(bh); /* create root node at level 2 */ node = nilfs_btree_get_root(btree); @@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 2, 1, &keys[0], &tmpptr); } else { - if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) - bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); @@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_bmap_set_dirty(bmap); } - if (btree->bt_ops->btop_set_target != NULL) - btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr); + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); } /** @@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, * @keys: * @ptrs: * @n: - * @low: - * @high: */ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr, - const __u64 *keys, const __u64 *ptrs, - int n, __u64 low, __u64 high) + const __u64 *keys, const __u64 *ptrs, int n) { struct buffer_head *bh; union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; @@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, if (ret < 0) return ret; nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, - low, high, di, ni, bh); + di, ni, bh); nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); return 0; } @@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_update(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); if (ret < 0) return ret; @@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { - nilfs_bmap_abort_update(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_bmap_abort_update_v(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); return ret; } } @@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, { struct nilfs_btree_node *parent; - nilfs_bmap_commit_update(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_bmap_commit_update_v(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( @@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, int level) { - nilfs_bmap_abort_update(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_bmap_abort_update_v(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, @@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, goto out; } - ret = btree->bt_ops->btop_propagate(btree, path, level, bh); + ret = NILFS_BMAP_USE_VBN(bmap) ? + nilfs_btree_propagate_v(btree, path, level, bh) : + nilfs_btree_propagate_p(btree, path, level, bh); out: nilfs_btree_clear_path(btree, path); @@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); req.bpr_ptr = ptr; - ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap, - &req); - if (ret < 0) + ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); + if (unlikely(ret < 0)) return ret; - btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap, - &req, blocknr); key = nilfs_btree_node_get_key(btree, parent, path[level + 1].bp_index); @@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, goto out; } - ret = btree->bt_ops->btop_assign(btree, path, level, bh, - blocknr, binfo); + ret = NILFS_BMAP_USE_VBN(bmap) ? + nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : + nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: nilfs_btree_clear_path(btree, path); @@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) WARN_ON(ret == -ENOENT); goto out; } - ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh); + ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; @@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) if (!buffer_dirty(bh)) nilfs_btnode_mark_dirty(bh); - nilfs_bmap_put_block(&btree->bt_bmap, bh); + brelse(bh); if (!nilfs_bmap_dirty(&btree->bt_bmap)) nilfs_bmap_set_dirty(&btree->bt_bmap); @@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) static const struct nilfs_bmap_operations nilfs_btree_ops = { .bop_lookup = nilfs_btree_lookup, + .bop_lookup_contig = nilfs_btree_lookup_contig, .bop_insert = nilfs_btree_insert, .bop_delete = nilfs_btree_delete, .bop_clear = NULL, @@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = { static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_lookup = NULL, + .bop_lookup_contig = NULL, .bop_insert = NULL, .bop_delete = NULL, .bop_clear = NULL, @@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_gather_data = NULL, }; -static const struct nilfs_btree_operations nilfs_btree_ops_v = { - .btop_find_target = nilfs_btree_find_target_v, - .btop_set_target = nilfs_btree_set_target_v, - .btop_propagate = nilfs_btree_propagate_v, - .btop_assign = nilfs_btree_assign_v, -}; - -static const struct nilfs_btree_operations nilfs_btree_ops_p = { - .btop_find_target = NULL, - .btop_set_target = NULL, - .btop_propagate = nilfs_btree_propagate_p, - .btop_assign = nilfs_btree_assign_p, -}; - -int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +int nilfs_btree_init(struct nilfs_bmap *bmap) { - struct nilfs_btree *btree; - - btree = (struct nilfs_btree *)bmap; bmap->b_ops = &nilfs_btree_ops; - bmap->b_low = low; - bmap->b_high = high; - switch (bmap->b_inode->i_ino) { - case NILFS_DAT_INO: - btree->bt_ops = &nilfs_btree_ops_p; - break; - default: - btree->bt_ops = &nilfs_btree_ops_v; - break; - } - return 0; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) { - bmap->b_low = NILFS_BMAP_LARGE_LOW; - bmap->b_high = NILFS_BMAP_LARGE_HIGH; bmap->b_ops = &nilfs_btree_ops_gc; } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 4766deb52fb1..0e72bbbc6b64 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -34,28 +34,6 @@ struct nilfs_btree; struct nilfs_btree_path; /** - * struct nilfs_btree_operations - B-tree operation table - */ -struct nilfs_btree_operations { - __u64 (*btop_find_target)(const struct nilfs_btree *, - const struct nilfs_btree_path *, __u64); - void (*btop_set_target)(struct nilfs_btree *, __u64, __u64); - - struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *); - - int (*btop_propagate)(struct nilfs_btree *, - struct nilfs_btree_path *, - int, - struct buffer_head *); - int (*btop_assign)(struct nilfs_btree *, - struct nilfs_btree_path *, - int, - struct buffer_head **, - sector_t, - union nilfs_binfo *); -}; - -/** * struct nilfs_btree_node - B-tree node * @bn_flags: flags * @bn_level: level @@ -80,13 +58,9 @@ struct nilfs_btree_node { /** * struct nilfs_btree - B-tree structure * @bt_bmap: bmap base structure - * @bt_ops: B-tree operation table */ struct nilfs_btree { struct nilfs_bmap bt_bmap; - - /* B-tree-specific members */ - const struct nilfs_btree_operations *bt_ops; }; @@ -108,10 +82,9 @@ struct nilfs_btree { int nilfs_btree_path_cache_init(void); void nilfs_btree_path_cache_destroy(void); -int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_btree_init(struct nilfs_bmap *); int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, - const __u64 *, const __u64 *, - int, __u64, __u64); + const __u64 *, const __u64 *, int); void nilfs_btree_init_gc(struct nilfs_bmap *); #endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index cadd36b14d07..7d49813f66d6 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, return -EINVAL; } - /* cannot delete the latest checkpoint */ - if (start == nilfs_mdt_cno(cpfile) - 1) - return -EPERM; - down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); @@ -384,9 +380,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, - struct nilfs_cpinfo *ci, size_t nci) + void *buf, unsigned cisz, size_t nci) { struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; struct buffer_head *bh; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; @@ -410,17 +407,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, kaddr = kmap_atomic(bh->b_page, KM_USER0); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { - if (!nilfs_checkpoint_invalid(cp)) - nilfs_cpfile_checkpoint_to_cpinfo( - cpfile, cp, &ci[n++]); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, + ci); + ci = (void *)ci + cisz; + n++; + } } kunmap_atomic(kaddr, KM_USER0); brelse(bh); } ret = n; - if (n > 0) - *cnop = ci[n - 1].ci_cno + 1; + if (n > 0) { + ci = (void *)ci - cisz; + *cnop = ci->ci_cno + 1; + } out: up_read(&NILFS_MDT(cpfile)->mi_sem); @@ -428,11 +430,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, - struct nilfs_cpinfo *ci, size_t nci) + void *buf, unsigned cisz, size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; + struct nilfs_cpinfo *ci = buf; __u64 curr = *cnop, next; unsigned long curr_blkoff, next_blkoff; void *kaddr; @@ -472,7 +475,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, if (unlikely(nilfs_checkpoint_invalid(cp) || !nilfs_checkpoint_snapshot(cp))) break; - nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]); + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); + ci = (void *)ci + cisz; + n++; next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); if (next == 0) break; /* reach end of the snapshot list */ @@ -511,13 +516,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, */ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, - struct nilfs_cpinfo *ci, size_t nci) + void *buf, unsigned cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: - return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci); + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); case NILFS_SNAPSHOT: - return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci); + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); default: return -EINVAL; } @@ -533,20 +538,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) struct nilfs_cpinfo ci; __u64 tcno = cno; ssize_t nci; - int ret; - nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1); + nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); if (nci < 0) return nci; else if (nci == 0 || ci.ci_cno != cno) return -ENOENT; - - /* cannot delete the latest checkpoint nor snapshots */ - ret = nilfs_cpinfo_snapshot(&ci); - if (ret < 0) - return ret; - else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1) - return -EPERM; + else if (nilfs_cpinfo_snapshot(&ci)) + return -EBUSY; return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); } diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 1a8a1008c342..788a45950197 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); int nilfs_cpfile_is_snapshot(struct inode *, __u64); int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); -ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, - struct nilfs_cpinfo *, size_t); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, + size_t); #endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index bb8a5818e7f1..0b2710e2d565 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) nilfs_palloc_abort_alloc_entry(dat, req); } -int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req) -{ - int ret; - - ret = nilfs_palloc_prepare_free_entry(dat, req); - if (ret < 0) - return ret; - ret = nilfs_dat_prepare_entry(dat, req, 0); - if (ret < 0) { - nilfs_palloc_abort_free_entry(dat, req); - return ret; - } - return 0; -} - void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; @@ -391,36 +376,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) return ret; } -ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo, +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, size_t nvi) { struct buffer_head *entry_bh; struct nilfs_dat_entry *entry; + struct nilfs_vinfo *vinfo = buf; __u64 first, last; void *kaddr; unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; int i, j, n, ret; for (i = 0; i < nvi; i += n) { - ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr, + ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr, 0, &entry_bh); if (ret < 0) return ret; kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); /* last virtual block number in this block */ - first = vinfo[i].vi_vblocknr; + first = vinfo->vi_vblocknr; do_div(first, entries_per_block); first *= entries_per_block; last = first + entries_per_block - 1; for (j = i, n = 0; - j < nvi && vinfo[j].vi_vblocknr >= first && - vinfo[j].vi_vblocknr <= last; - j++, n++) { + j < nvi && vinfo->vi_vblocknr >= first && + vinfo->vi_vblocknr <= last; + j++, n++, vinfo = (void *)vinfo + visz) { entry = nilfs_palloc_block_get_entry( - dat, vinfo[j].vi_vblocknr, entry_bh, kaddr); - vinfo[j].vi_start = le64_to_cpu(entry->de_start); - vinfo[j].vi_end = le64_to_cpu(entry->de_end); - vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr); + dat, vinfo->vi_vblocknr, entry_bh, kaddr); + vinfo->vi_start = le64_to_cpu(entry->de_start); + vinfo->vi_end = le64_to_cpu(entry->de_end); + vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr); } kunmap_atomic(kaddr, KM_USER0); brelse(entry_bh); diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d9560654a4b7..d328b81eead4 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); -ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); #endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index c6379e482781..342d9765df8d 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -25,6 +25,7 @@ #include "page.h" #include "direct.h" #include "alloc.h" +#include "dat.h" static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) { @@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, return 0; } +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, + __u64 key, __u64 *ptrp, + unsigned maxblocks) +{ + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; + struct inode *dat = NULL; + __u64 ptr, ptr2; + sector_t blocknr; + int ret, cnt; + + if (key > NILFS_DIRECT_KEY_MAX || + (ptr = nilfs_direct_get_ptr(direct, key)) == + NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + if (NILFS_BMAP_USE_VBN(bmap)) { + dat = nilfs_bmap_get_dat(bmap); + ret = nilfs_dat_translate(dat, ptr, &blocknr); + if (ret < 0) + return ret; + ptr = blocknr; + } + + maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + for (cnt = 1; cnt < maxblocks && + (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != + NILFS_BMAP_INVALID_PTR; + cnt++) { + if (dat) { + ret = nilfs_dat_translate(dat, ptr2, &blocknr); + if (ret < 0) + return ret; + ptr2 = blocknr; + } + if (ptr2 != ptr + cnt) + break; + } + *ptrp = ptr; + return cnt; +} + static __u64 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) { @@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, { int ret; - if (direct->d_ops->dop_find_target != NULL) - req->bpr_ptr = direct->d_ops->dop_find_target(direct, key); - ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap, - req); + if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) + req->bpr_ptr = nilfs_direct_find_target_v(direct, key); + ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); if (ret < 0) return ret; @@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct, bh = (struct buffer_head *)((unsigned long)ptr); set_buffer_nilfs_volatile(bh); - if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL) - direct->d_bmap.b_pops->bpop_commit_alloc_ptr( - &direct->d_bmap, req); + nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); nilfs_direct_set_ptr(direct, key, req->bpr_ptr); if (!nilfs_bmap_dirty(&direct->d_bmap)) nilfs_bmap_set_dirty(&direct->d_bmap); - if (direct->d_ops->dop_set_target != NULL) - direct->d_ops->dop_set_target(direct, key, req->bpr_ptr); + if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) + nilfs_direct_set_target_v(direct, key, req->bpr_ptr); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) @@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, { int ret; - if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) { - req->bpr_ptr = nilfs_direct_get_ptr(direct, key); - ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr( - &direct->d_bmap, req); - if (ret < 0) - return ret; - } - - stats->bs_nblocks = 1; - return 0; + req->bpr_ptr = nilfs_direct_get_ptr(direct, key); + ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); + if (!ret) + stats->bs_nblocks = 1; + return ret; } static void nilfs_direct_commit_delete(struct nilfs_direct *direct, union nilfs_bmap_ptr_req *req, __u64 key) { - if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL) - direct->d_bmap.b_pops->bpop_commit_end_ptr( - &direct->d_bmap, req); + nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); } @@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, } int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, - __u64 key, __u64 *keys, __u64 *ptrs, - int n, __u64 low, __u64 high) + __u64 key, __u64 *keys, __u64 *ptrs, int n) { struct nilfs_direct *direct; __le64 *dptrs; @@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, dptrs[i] = NILFS_BMAP_INVALID_PTR; } - nilfs_direct_init(bmap, low, high); - + nilfs_direct_init(bmap); return 0; } @@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct, if (!buffer_nilfs_volatile(bh)) { oldreq.bpr_ptr = ptr; newreq.bpr_ptr = ptr; - ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq, - &newreq); + ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, + &newreq); if (ret < 0) return ret; - nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq); + nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); set_buffer_nilfs_volatile(bh); nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); } else @@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct, static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, struct buffer_head *bh) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; - direct = (struct nilfs_direct *)bmap; - return (direct->d_ops->dop_propagate != NULL) ? - direct->d_ops->dop_propagate(direct, bh) : - 0; + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_direct_propagate_v(direct, bh) : 0; } static int nilfs_direct_assign_v(struct nilfs_direct *direct, @@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct, int ret; req.bpr_ptr = ptr; - ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( - &direct->d_bmap, &req); - if (ret < 0) + ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); + if (unlikely(ret < 0)) return ret; - direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap, - &req, blocknr); binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); @@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, return -EINVAL; } - return direct->d_ops->dop_assign(direct, key, ptr, bh, - blocknr, binfo); + return NILFS_BMAP_USE_VBN(bmap) ? + nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo); } static const struct nilfs_bmap_operations nilfs_direct_ops = { .bop_lookup = nilfs_direct_lookup, + .bop_lookup_contig = nilfs_direct_lookup_contig, .bop_insert = nilfs_direct_insert, .bop_delete = nilfs_direct_delete, .bop_clear = NULL, @@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = { }; -static const struct nilfs_direct_operations nilfs_direct_ops_v = { - .dop_find_target = nilfs_direct_find_target_v, - .dop_set_target = nilfs_direct_set_target_v, - .dop_propagate = nilfs_direct_propagate_v, - .dop_assign = nilfs_direct_assign_v, -}; - -static const struct nilfs_direct_operations nilfs_direct_ops_p = { - .dop_find_target = NULL, - .dop_set_target = NULL, - .dop_propagate = NULL, - .dop_assign = nilfs_direct_assign_p, -}; - -int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +int nilfs_direct_init(struct nilfs_bmap *bmap) { - struct nilfs_direct *direct; - - direct = (struct nilfs_direct *)bmap; bmap->b_ops = &nilfs_direct_ops; - bmap->b_low = low; - bmap->b_high = high; - switch (bmap->b_inode->i_ino) { - case NILFS_DAT_INO: - direct->d_ops = &nilfs_direct_ops_p; - break; - default: - direct->d_ops = &nilfs_direct_ops_v; - break; - } - return 0; } diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index 45d2c5cda812..a5ffd66e25d0 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -31,18 +31,6 @@ struct nilfs_direct; /** - * struct nilfs_direct_operations - direct mapping operation table - */ -struct nilfs_direct_operations { - __u64 (*dop_find_target)(const struct nilfs_direct *, __u64); - void (*dop_set_target)(struct nilfs_direct *, __u64, __u64); - int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *); - int (*dop_assign)(struct nilfs_direct *, __u64, __u64, - struct buffer_head **, sector_t, - union nilfs_binfo *); -}; - -/** * struct nilfs_direct_node - direct node * @dn_flags: flags * @dn_pad: padding @@ -55,13 +43,9 @@ struct nilfs_direct_node { /** * struct nilfs_direct - direct mapping * @d_bmap: bmap structure - * @d_ops: direct mapping operation table */ struct nilfs_direct { struct nilfs_bmap d_bmap; - - /* direct-mapping-specific members */ - const struct nilfs_direct_operations *d_ops; }; @@ -70,9 +54,9 @@ struct nilfs_direct { #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) -int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_direct_init(struct nilfs_bmap *); int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, - __u64 *, int, __u64, __u64); + __u64 *, int); #endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 19d2102b6a69..1b3c2bb20da9 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -52,8 +52,9 @@ #include "dat.h" #include "ifile.h" -static struct address_space_operations def_gcinode_aops = {}; -/* XXX need def_gcinode_iops/fops? */ +static struct address_space_operations def_gcinode_aops = { + .sync_page = block_sync_page, +}; /* * nilfs_gccache_submit_read_data() - add data buffer and submit read request diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 49ab4a49bb4f..2696d6b513b7 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -43,22 +43,23 @@ * * This function does not issue actual read request of the specified data * block. It is done by VFS. - * Bulk read for direct-io is not supported yet. (should be supported) */ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) { struct nilfs_inode_info *ii = NILFS_I(inode); - unsigned long blknum = 0; + __u64 blknum = 0; int err = 0, ret; struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; - /* This exclusion control is a workaround; should be revised */ - down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum); - up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ - if (ret == 0) { /* found */ + down_read(&NILFS_MDT(dat)->mi_sem); + ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); + up_read(&NILFS_MDT(dat)->mi_sem); + if (ret >= 0) { /* found */ map_bh(bh_result, inode->i_sb, blknum); + if (ret > 0) + bh_result->b_size = (ret << inode->i_blkbits); goto out; } /* data block was not found */ @@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, struct address_space_operations nilfs_aops = { .writepage = nilfs_writepage, .readpage = nilfs_readpage, - /* .sync_page = nilfs_sync_page, */ + .sync_page = block_sync_page, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, .readpages = nilfs_readpages, @@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = { /* .releasepage = nilfs_releasepage, */ .invalidatepage = block_invalidatepage, .direct_IO = nilfs_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, }; struct inode *nilfs_new_inode(struct inode *dir, int mode) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index d6759b92006f..6ea5f872e2de 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, down_read(&nilfs->ns_segctor_sem); ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, - nmembs); + size, nmembs); up_read(&nilfs->ns_segctor_sem); return ret; } @@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, int ret; down_read(&nilfs->ns_segctor_sem); - ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs); + ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size, + nmembs); up_read(&nilfs->ns_segctor_sem); return ret; } @@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, int ret; down_read(&nilfs->ns_segctor_sem); - ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs); + ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs); up_read(&nilfs->ns_segctor_sem); return ret; } @@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, return nmembs; } -static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs, - struct nilfs_argv *argv, void *buf) -{ - size_t nmembs = argv->v_nmembs; - struct nilfs_sb_info *sbi = nilfs->ns_writer; - int ret; - - if (unlikely(!sbi)) { - /* never happens because called for a writable mount */ - WARN_ON(1); - return -EROFS; - } - ret = nilfs_segctor_add_segments_to_be_freed( - NILFS_SC(sbi), buf, nmembs); - - return (ret < 0) ? ret : nmembs; -} - int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, struct nilfs_argv *argv, void **kbufs) { @@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, msg = "cannot mark copying blocks dirty"; goto failed; } - ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]); - if (ret < 0) { - /* - * can safely abort because this operation is atomic. - */ - msg = "cannot set segments to be freed"; - goto failed; - } return 0; failed: @@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, if (copy_from_user(&argv, argp, sizeof(argv))) return -EFAULT; - if (argv.v_size != membsz) + if (argv.v_size < membsz) return -EINVAL; ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index bb78745a0e30..3d3ddb3f5177 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -430,6 +430,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) static struct address_space_operations def_mdt_aops = { .writepage = nilfs_mdt_write_page, + .sync_page = block_sync_page, }; static struct inode_operations def_mdt_iops; @@ -449,7 +450,7 @@ struct inode * nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, ino_t ino, gfp_t gfp_mask) { - struct inode *inode = nilfs_alloc_inode(sb); + struct inode *inode = nilfs_alloc_inode_common(nilfs); if (!inode) return NULL; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index da6fc0bba2e5..edf6a59d9f2a 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -263,6 +263,7 @@ extern void nilfs_dirty_inode(struct inode *); extern struct dentry *nilfs_get_parent(struct dentry *); /* super.c */ +extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *); extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); extern void nilfs_error(struct super_block *, const char *, const char *, ...) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 57afa9d24061..d80cc71be749 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -28,7 +28,6 @@ #include "segment.h" #include "sufile.h" #include "page.h" -#include "seglist.h" #include "segbuf.h" /* @@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head) } } +struct nilfs_segment_entry { + struct list_head list; + __u64 segnum; +}; + +static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (unlikely(!ent)) + return -ENOMEM; + + ent->segnum = segnum; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, head); + return 0; +} + void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { @@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head) = list_entry(head->next, struct nilfs_segment_entry, list); list_del(&ent->list); - nilfs_free_segment_entry(ent); + kfree(ent); } } @@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, if (unlikely(err)) goto failed; - err = -ENOMEM; for (i = 1; i < 4; i++) { - ent = nilfs_alloc_segment_entry(segnum[i]); - if (unlikely(!ent)) + err = nilfs_segment_list_add(head, segnum[i]); + if (unlikely(err)) goto failed; - list_add_tail(&ent->list, head); } /* @@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, goto failed; } list_del(&ent->list); - nilfs_free_segment_entry(ent); + kfree(ent); } /* Allocate new segments for recovery */ @@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; - struct nilfs_segment_entry *ent; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; @@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, if (empty_seg++) goto super_root_found; /* found a valid super root */ - ent = nilfs_alloc_segment_entry(segnum); - if (unlikely(!ent)) { - ret = -ENOMEM; + ret = nilfs_segment_list_add(&segments, segnum); + if (unlikely(ret)) goto failed; - } - list_add_tail(&ent->list, &segments); seg_seq++; segnum = nextnum; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1e68821b4a9b..9e3fe17bb96b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -26,7 +26,6 @@ #include <linux/crc32.h> #include "page.h" #include "segbuf.h" -#include "seglist.h" static struct kmem_cache *nilfs_segbuf_cachep; @@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= (1 << BIO_RW_SYNCIO); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); res = nilfs_submit_seg_bio(wi, rw); if (unlikely(res)) goto failed_bio; diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h deleted file mode 100644 index d39df9144e99..000000000000 --- a/fs/nilfs2/seglist.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * seglist.h - expediential structure and routines to handle list of segments - * (would be removed in a future release) - * - * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> - * - */ -#ifndef _NILFS_SEGLIST_H -#define _NILFS_SEGLIST_H - -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> -#include "sufile.h" - -struct nilfs_segment_entry { - __u64 segnum; - -#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisonally. - It must be cancelled if - construction aborted */ - - unsigned flags; - struct list_head list; - struct buffer_head *bh_su; - struct nilfs_segment_usage *raw_su; -}; - - -void nilfs_dispose_segment_list(struct list_head *); - -static inline struct nilfs_segment_entry * -nilfs_alloc_segment_entry(__u64 segnum) -{ - struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); - - if (likely(ent)) { - ent->segnum = segnum; - ent->flags = 0; - ent->bh_su = NULL; - ent->raw_su = NULL; - INIT_LIST_HEAD(&ent->list); - } - return ent; -} - -static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent, - struct inode *sufile) -{ - return nilfs_sufile_get_segment_usage(sufile, ent->segnum, - &ent->raw_su, &ent->bh_su); -} - -static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent, - struct inode *sufile) -{ - if (!ent->bh_su) - return; - nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su); - ent->bh_su = NULL; - ent->raw_su = NULL; -} - -static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent) -{ - kfree(ent); -} - -#endif /* _NILFS_SEGLIST_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 22c7f65c2403..aa977549919e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -39,7 +39,6 @@ #include "sufile.h" #include "cpfile.h" #include "ifile.h" -#include "seglist.h" #include "segbuf.h" @@ -79,7 +78,8 @@ enum { /* State flags of collection */ #define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ #define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ -#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED) +#define NILFS_CF_SUFREED 0x0004 /* segment usages has been freed */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED) /* Operations depending on the construction mode and file type */ struct nilfs_sc_operations { @@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci) { return list_empty(&sci->sc_dirty_files) && !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && - list_empty(&sci->sc_cleaning_segments) && + sci->sc_nfreesegs == 0 && (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); } @@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head) } } -static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci, - struct inode *sufile) - -{ - struct list_head *head = &sci->sc_cleaning_segments; - struct nilfs_segment_entry *ent; - int err; - - list_for_each_entry(ent, head, list) { - if (!(ent->flags & NILFS_SLH_FREED)) - break; - err = nilfs_sufile_cancel_free(sufile, ent->segnum); - WARN_ON(err); /* do not happen */ - ent->flags &= ~NILFS_SLH_FREED; - } -} - -static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci, - struct inode *sufile) -{ - struct list_head *head = &sci->sc_cleaning_segments; - struct nilfs_segment_entry *ent; - int err; - - list_for_each_entry(ent, head, list) { - err = nilfs_sufile_free(sufile, ent->segnum); - if (unlikely(err)) - return err; - ent->flags |= NILFS_SLH_FREED; - } - return 0; -} - -static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci) -{ - nilfs_dispose_segment_list(&sci->sc_cleaning_segments); -} - static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, struct inode *inode, struct list_head *listp, @@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sbi->s_nilfs; struct list_head *head; struct nilfs_inode_info *ii; + size_t ndone; int err = 0; switch (sci->sc_stage.scnt) { @@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) break; sci->sc_stage.scnt++; /* Fall through */ case NILFS_ST_SUFILE: - err = nilfs_segctor_prepare_free_segments(sci, - nilfs->ns_sufile); - if (unlikely(err)) + err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs, + sci->sc_nfreesegs, &ndone); + if (unlikely(err)) { + nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, ndone, + NULL); break; + } + sci->sc_stage.flags |= NILFS_CF_SUFREED; + err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile, &nilfs_sc_file_ops); if (unlikely(err)) @@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci, { if (unlikely(err)) { nilfs_segctor_free_incomplete_segments(sci, nilfs); - nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + int ret; + + ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(ret); /* do not happen */ + } } nilfs_segctor_clear_segment_buffers(sci); } @@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE) break; - nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile); + if (sci->sc_stage.flags & NILFS_CF_SUFREED) { + err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, + sci->sc_freesegs, + sci->sc_nfreesegs, + NULL); + WARN_ON(err); /* do not happen */ + } nilfs_segctor_clear_segment_buffers(sci); err = nilfs_segctor_extend_segments(sci, nilfs, nadd); @@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_complete_write(sci); /* Commit segments */ - if (has_sr) { - nilfs_segctor_commit_free_segments(sci); + if (has_sr) nilfs_segctor_clear_metadata_dirty(sci); - } nilfs_segctor_end_construction(sci, nilfs, 0); @@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) /* assign bit 0 to data files */ } -int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci, - __u64 *segnum, size_t nsegs) -{ - struct nilfs_segment_entry *ent; - struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; - struct inode *sufile = nilfs->ns_sufile; - LIST_HEAD(list); - __u64 *pnum; - size_t i; - int err; - - for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) { - ent = nilfs_alloc_segment_entry(*pnum); - if (unlikely(!ent)) { - err = -ENOMEM; - goto failed; - } - list_add_tail(&ent->list, &list); - - err = nilfs_open_segment_entry(ent, sufile); - if (unlikely(err)) - goto failed; - - if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su))) - printk(KERN_WARNING "NILFS: unused segment is " - "requested to be cleaned (segnum=%llu)\n", - (unsigned long long)ent->segnum); - nilfs_close_segment_entry(ent, sufile); - } - list_splice(&list, sci->sc_cleaning_segments.prev); - return 0; - - failed: - nilfs_dispose_segment_list(&list); - return err; -} - -void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci) -{ - nilfs_dispose_segment_list(&sci->sc_cleaning_segments); -} - struct nilfs_segctor_wait_request { wait_queue_t wq; __u32 seq; @@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, err = nilfs_init_gcdat_inode(nilfs); if (unlikely(err)) goto out_unlock; + err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs); if (unlikely(err)) goto out_unlock; + sci->sc_freesegs = kbufs[4]; + sci->sc_nfreesegs = argv[4].v_nmembs; list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); for (;;) { @@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, } out_unlock: + sci->sc_freesegs = NULL; + sci->sc_nfreesegs = 0; nilfs_clear_gcdat_inode(nilfs); nilfs_transaction_unlock(sbi); return err; @@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) INIT_LIST_HEAD(&sci->sc_dirty_files); INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_gc_inodes); - INIT_LIST_HEAD(&sci->sc_cleaning_segments); INIT_LIST_HEAD(&sci->sc_copied_buffers); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; @@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); } - if (!list_empty(&sci->sc_cleaning_segments)) - nilfs_dispose_segment_list(&sci->sc_cleaning_segments); - WARN_ON(!list_empty(&sci->sc_segbufs)); down_write(&sbi->s_nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 476bdd5df5be..0d2a475a741b 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -90,8 +90,9 @@ struct nilfs_segsum_pointer { * @sc_nblk_inc: Block count of current generation * @sc_dirty_files: List of files to be written * @sc_gc_inodes: List of GC inodes having blocks to be written - * @sc_cleaning_segments: List of segments to be freed through construction * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data + * @sc_freesegs: array of segment numbers to be freed + * @sc_nfreesegs: number of segments on @sc_freesegs * @sc_dsync_inode: inode whose data pages are written for a sync operation * @sc_dsync_start: start byte offset of data pages * @sc_dsync_end: end byte offset of data pages (inclusive) @@ -131,9 +132,11 @@ struct nilfs_sc_info { struct list_head sc_dirty_files; struct list_head sc_gc_inodes; - struct list_head sc_cleaning_segments; struct list_head sc_copied_buffers; + __u64 *sc_freesegs; + size_t sc_nfreesegs; + struct nilfs_inode_info *sc_dsync_inode; loff_t sc_dsync_start; loff_t sc_dsync_end; @@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t); extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, void **); -extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *, - __u64 *, size_t); -extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *); - extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); @@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, extern int nilfs_recover_logical_segments(struct the_nilfs *, struct nilfs_sb_info *, struct nilfs_recovery_info *); +extern void nilfs_dispose_segment_list(struct list_head *); #endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 98e68677f045..37994d4a59cc 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -18,6 +18,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * Written by Koji Sato <koji@osrg.net>. + * Rivised by Ryusuke Konishi <ryusuke@osrg.net>. */ #include <linux/kernel.h> @@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, nilfs_mdt_mark_buffer_dirty(header_bh); } +/** + * nilfs_sufile_updatev - modify multiple segment usages at a time + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @create: creation flag + * @ndone: place to store number of modified segments on @segnumv + * @dofunc: primitive operation for the update + * + * Description: nilfs_sufile_updatev() repeatedly calls @dofunc + * against the given array of segments. The @dofunc is called with + * buffers of a header block and the sufile block in which the target + * segment usage entry is contained. If @ndone is given, the number + * of successfully modified segments from the head is stored in the + * place @ndone points to. + * + * Return Value: On success, zero is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Given segment usage is in hole block (may be returned if + * @create is zero) + * + * %-EINVAL - Invalid segment usage number + */ +int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, + int create, size_t *ndone, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + unsigned long blkoff, prev_blkoff; + __u64 *seg; + size_t nerr = 0, n = 0; + int ret = 0; + + if (unlikely(nsegs == 0)) + goto out; + + down_write(&NILFS_MDT(sufile)->mi_sem); + for (seg = segnumv; seg < segnumv + nsegs; seg++) { + if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING + "%s: invalid segment number: %llu\n", __func__, + (unsigned long long)*seg); + nerr++; + } + } + if (nerr > 0) { + ret = -EINVAL; + goto out_sem; + } + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + seg = segnumv; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (ret < 0) + goto out_header; + + for (;;) { + dofunc(sufile, *seg, header_bh, bh); + + if (++seg >= segnumv + nsegs) + break; + prev_blkoff = blkoff; + blkoff = nilfs_sufile_get_blkoff(sufile, *seg); + if (blkoff == prev_blkoff) + continue; + + /* get different block */ + brelse(bh); + ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh); + if (unlikely(ret < 0)) + goto out_header; + } + brelse(bh); + + out_header: + n = seg - segnumv; + brelse(header_bh); + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + out: + if (ndone) + *ndone = n; + return ret; +} + int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, void (*dofunc)(struct inode *, __u64, struct buffer_head *, @@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, * nilfs_sufile_get_suinfo - * @sufile: inode of segment usage file * @segnum: segment number to start looking - * @si: array of suinfo + * @buf: array of suinfo + * @sisz: byte size of suinfo * @nsi: size of suinfo array * * Description: @@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, * * %-ENOMEM - Insufficient amount of memory available. */ -ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, - struct nilfs_suinfo *si, size_t nsi) +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, + unsigned sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; + struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; void *kaddr; @@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, if (ret != -ENOENT) goto out; /* hole */ - memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); + memset(si, 0, sisz * n); + si = (void *)si + sisz * n; continue; } kaddr = kmap_atomic(su_bh->b_page, KM_USER0); su = nilfs_sufile_block_get_segment_usage( sufile, segnum, su_bh, kaddr); - for (j = 0; j < n; j++, su = (void *)su + susz) { - si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); - si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); - si[i + j].sui_flags = le32_to_cpu(su->su_flags) & + for (j = 0; j < n; + j++, su = (void *)su + susz, si = (void *)si + sisz) { + si->sui_lastmod = le64_to_cpu(su->su_lastmod); + si->sui_nblocks = le32_to_cpu(su->su_nblocks); + si->sui_flags = le32_to_cpu(su->su_flags) & ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) - si[i + j].sui_flags |= + si->sui_flags |= (1UL << NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_atomic(kaddr, KM_USER0); diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2e2efd4ade1..a2c4d76c3366 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64, struct buffer_head *); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); -ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, size_t); +int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); int nilfs_sufile_update(struct inode *, __u64, int, void (*dofunc)(struct inode *, __u64, struct buffer_head *, struct buffer_head *)); -void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, - struct buffer_head *); void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, struct buffer_head *); void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, struct buffer_head *); /** - * nilfs_sufile_cancel_free - - * @sufile: inode of segment usage file - * @segnum: segment number - * - * Description: - * - * Return Value: On success, 0 is returned. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - */ -static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) -{ - return nilfs_sufile_update(sufile, segnum, 0, - nilfs_sufile_do_cancel_free); -} - -/** * nilfs_sufile_scrap - make a segment garbage * @sufile: inode of segment usage file * @segnum: segment number to be freed @@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) } /** + * nilfs_sufile_freev - free segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of freed segments + */ +static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv, + size_t nsegs, size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_cancel_freev - reallocate freeing segments + * @sufile: inode of segment usage file + * @segnumv: array of segment numbers + * @nsegs: size of @segnumv array + * @ndone: place to store the number of cancelled segments + * + * Return Value: On success, 0 is returned. On error, a negative error codes + * is returned. + */ +static inline int nilfs_sufile_cancel_freev(struct inode *sufile, + __u64 *segnumv, size_t nsegs, + size_t *ndone) +{ + return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone, + nilfs_sufile_do_cancel_free); +} + +/** * nilfs_sufile_set_error - mark a segment as erroneous * @sufile: inode of segment usage file * @segnum: segment number diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1777a3467bd2..ab785f85aa50 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -133,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function, static struct kmem_cache *nilfs_inode_cachep; -struct inode *nilfs_alloc_inode(struct super_block *sb) +struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs) { struct nilfs_inode_info *ii; @@ -143,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->vfs_inode.i_version = 1; - nilfs_btnode_cache_init(&ii->i_btnode_cache); + nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi); return &ii->vfs_inode; } +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs); +} + void nilfs_destroy_inode(struct inode *inode) { kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index e4e5c78bcc93..8b8889825716 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -32,7 +32,6 @@ #include "cpfile.h" #include "sufile.h" #include "dat.h" -#include "seglist.h" #include "segbuf.h" diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 9b0efdad8910..477d37d83b31 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/kmod.h> #include <linux/spinlock.h> +#include <asm/byteorder.h> static struct nls_table default_table; static struct nls_table *tables = &default_table; @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = {0, /* end of table */} }; -int -utf8_mbtowc(wchar_t *p, const __u8 *s, int n) +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) { - long l; + unsigned long l; int c0, c, nc; const struct utf8_table *t; @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; - if (l < t->lval) + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; - *p = l; + *pu = (unicode_t) l; return nc; } - if (n <= nc) + if (len <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -72,90 +81,133 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) } return -1; } +EXPORT_SYMBOL(utf8_to_utf32); -int -utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) { - __u16 *op; - const __u8 *ip; - int size; - - op = pwcs; - ip = s; - while (*ip && n > 0) { - if (*ip & 0x80) { - size = utf8_mbtowc(op, ip, n); - if (size == -1) { - /* Ignore character and move on */ - ip++; - n--; - } else { - op++; - ip += size; - n -= size; - } - } else { - *op++ = *ip++; - n--; - } - } - return (op - pwcs); -} - -int -utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) -{ - long l; + unsigned long l; int c, nc; const struct utf8_table *t; - + if (!s) return 0; - - l = wc; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + nc = 0; for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { nc++; if (l <= t->lmask) { c = t->shift; - *s = t->cval | (l >> c); + *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; - *s = 0x80 | ((l >> c) & 0x3F); + *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } +EXPORT_SYMBOL(utf32_to_utf8); -int -utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) { - const __u16 *ip; - __u8 *op; + u16 *op; int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) { + /* Ignore character and move on */ + size = 1; + } else if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; op = s; - ip = pwcs; - while (*ip && maxlen > 0) { - if (*ip > 0x7f) { - size = utf8_wctomb(op, *ip, maxlen); + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); if (size == -1) { /* Ignore character and move on */ - maxlen--; } else { op += size; maxlen -= size; } } else { - *op++ = (__u8) *ip; + *op++ = (u8) u; + maxlen--; } - ip++; } - return (op - s); + return op - s; } +EXPORT_SYMBOL(utf16s_to_utf8s); int register_nls(struct nls_table * nls) { @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); -EXPORT_SYMBOL(utf8_mbtowc); -EXPORT_SYMBOL(utf8_mbstowcs); -EXPORT_SYMBOL(utf8_wctomb); -EXPORT_SYMBOL(utf8_wcstombs); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd977..0d60a44acacd 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { int n; - if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { *out = '?'; return -EINVAL; } @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; + unicode_t u; - if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { *uni = 0x003f; /* ? */ - n = -EINVAL; + return -EINVAL; } + *uni = (wchar_t) u; return n; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ea2605a58b8a..f234f3a4c8ca 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -15,7 +15,8 @@ struct inotify_inode_mark_entry { int wd; }; -extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); +extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group); extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 7ef75b83247e..47cd258fd24d 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -81,7 +81,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { - inotify_destroy_mark_entry(entry, group); + inotify_ignored_and_remove_idr(entry, group); } static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 982a412ac5bc..ff231ad23895 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -363,39 +363,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } /* - * When, for whatever reason, inotify is done with a mark (or what used to be a - * watch) we need to remove that watch from the idr and we need to send IN_IGNORED - * for the given wd. - * - * There is a bit of recursion here. The loop looks like: - * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> - * inotify_freeing_mark -> inotify_destory_mark_entry -> restart - * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets - * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) - * test below will not call back to fsnotify again. But even if that test wasn't - * there this would still be safe since fsnotify_destroy_mark_by_entry() is - * safe from recursion. + * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the + * internal reference help on the mark because it is in the idr. */ -void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) +void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_group *egroup; struct idr *idr; - spin_lock(&entry->lock); - egroup = entry->group; - - /* if egroup we aren't really done and something might still send events - * for this inode, on the callback we'll send the IN_IGNORED */ - if (egroup) { - spin_unlock(&entry->lock); - fsnotify_destroy_mark_by_entry(entry); - return; - } - spin_unlock(&entry->lock); - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); @@ -699,7 +677,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) fsnotify_get_mark(entry); spin_unlock(&group->inotify_data.idr_lock); - inotify_destroy_mark_entry(entry, group); + fsnotify_destroy_mark_by_entry(entry); fsnotify_put_mark(entry); out: diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 82c5085559c6..9938034762cc 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -27,6 +27,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/slab.h> +#include <linux/log2.h> #include "aops.h" #include "attrib.h" @@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) ntfs_debug("Index collation rule is 0x%x.", le32_to_cpu(ir->collation_rule)); ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) { + if (!is_power_of_2(ni->itype.index.block_size)) { ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " "two.", ni->itype.index.block_size); goto unm_err_out; diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index d7932e95b1fd..89b02985c054 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -26,6 +26,7 @@ #include <linux/highmem.h> #include <linux/buffer_head.h> #include <linux/bitops.h> +#include <linux/log2.h> #include "attrib.h" #include "aops.h" @@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi, logfile_log_page_size < NTFS_BLOCK_SIZE || logfile_system_page_size & (logfile_system_page_size - 1) || - logfile_log_page_size & (logfile_log_page_size - 1)) { + !is_power_of_2(logfile_log_page_size)) { ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); return false; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 678a067d9251..9edcde4974aa 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -475,6 +475,12 @@ struct ocfs2_path { #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) #define path_num_items(_path) ((_path)->p_tree_depth + 1) +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos); +static void ocfs2_adjust_rightmost_records(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); /* * Reset the actual path elements so that we can re-use the structure * to build another path. Generally, this involves freeing the buffer @@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) } /* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(inode, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path) + + handle->h_buffer_credits); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(inode, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(inode, handle, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; - u32 new_cpos; + u32 new_cpos, root_end; mlog_entry_void(); @@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, new_blocks = le16_to_cpu(el->l_tree_depth); + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + mlog(0, "adjust the cluster end from %u to %u\n", + root_end, new_cpos); + status = ocfs2_adjust_rightmost_branch(handle, inode, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + /* allocate the number of new eb blocks we need */ new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), GFP_KERNEL); @@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; - new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); - /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 2a947c44e594..a1163b8b417c 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -22,6 +22,9 @@ #include <linux/crc32.h> #include <linux/buffer_head.h> #include <linux/bitops.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/fs.h> #include <asm/byteorder.h> #include <cluster/masklog.h> @@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, ocfs2_hamming_fix(data, blocksize * 8, 0, fix); } + +/* + * Debugfs handling. + */ + +#ifdef CONFIG_DEBUG_FS + +static int blockcheck_u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); + +static struct dentry *blockcheck_debugfs_create(const char *name, + struct dentry *parent, + u64 *value) +{ + return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, + &blockcheck_fops); +} + +static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ + if (stats) { + debugfs_remove(stats->b_debug_check); + stats->b_debug_check = NULL; + debugfs_remove(stats->b_debug_failure); + stats->b_debug_failure = NULL; + debugfs_remove(stats->b_debug_recover); + stats->b_debug_recover = NULL; + debugfs_remove(stats->b_debug_dir); + stats->b_debug_dir = NULL; + } +} + +static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + int rc = -EINVAL; + + if (!stats) + goto out; + + stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + if (!stats->b_debug_dir) + goto out; + + stats->b_debug_check = + blockcheck_debugfs_create("blocks_checked", + stats->b_debug_dir, + &stats->b_check_count); + + stats->b_debug_failure = + blockcheck_debugfs_create("checksums_failed", + stats->b_debug_dir, + &stats->b_failure_count); + + stats->b_debug_recover = + blockcheck_debugfs_create("ecc_recoveries", + stats->b_debug_dir, + &stats->b_recover_count); + if (stats->b_debug_check && stats->b_debug_failure && + stats->b_debug_recover) + rc = 0; + +out: + if (rc) + ocfs2_blockcheck_debug_remove(stats); + return rc; +} +#else +static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return 0; +} + +static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +/* Always-called wrappers for starting and stopping the debugfs files */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) +{ + return ocfs2_blockcheck_debug_install(stats, parent); +} + +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) +{ + ocfs2_blockcheck_debug_remove(stats); +} + +static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_check_count++; + new_count = stats->b_check_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Block check count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_failure_count++; + new_count = stats->b_failure_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "Checksum failure count has wrapped\n"); +} + +static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats) +{ + u64 new_count; + + if (!stats) + return; + + spin_lock(&stats->b_lock); + stats->b_recover_count++; + new_count = stats->b_recover_count; + spin_unlock(&stats->b_lock); + + if (!new_count) + mlog(ML_NOTICE, "ECC recovery count has wrapped\n"); +} + + + +/* + * These are the low-level APIs for using the ocfs2_block_check structure. + */ + /* * This function generates check information for a block. * data is the block to be checked. bc is a pointer to the @@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize, * Again, the data passed in should be the on-disk endian. */ int ocfs2_block_check_validate(void *data, size_t blocksize, - struct ocfs2_block_check *bc) + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) { int rc = 0; struct ocfs2_block_check check; u32 crc, ecc; + ocfs2_blockcheck_inc_check(stats); + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); check.bc_ecc = le16_to_cpu(bc->bc_ecc); @@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, if (crc == check.bc_crc32e) goto out; + ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, /* And check the crc32 again */ crc = crc32_le(~0, data, blocksize); - if (crc == check.bc_crc32e) + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); goto out; + } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, * Again, the data passed in should be the on-disk endian. */ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, - struct ocfs2_block_check *bc) + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats) { int i, rc = 0; struct ocfs2_block_check check; @@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, if (!nr) return 0; + ocfs2_blockcheck_inc_check(stats); + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); check.bc_ecc = le16_to_cpu(bc->bc_ecc); @@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, if (crc == check.bc_crc32e) goto out; + ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, /* And check the crc32 again */ for (i = 0, crc = ~0; i < nr; i++) crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); - if (crc == check.bc_crc32e) + if (crc == check.bc_crc32e) { + ocfs2_blockcheck_inc_recover(stats); goto out; + } mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); @@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc) { int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); - if (ocfs2_meta_ecc(OCFS2_SB(sb))) - rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc); + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc, + &osb->osb_ecc_stats); return rc; } @@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, struct ocfs2_block_check *bc) { int rc = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); - if (ocfs2_meta_ecc(OCFS2_SB(sb))) - rc = ocfs2_block_check_validate_bhs(bhs, nr, bc); + if (ocfs2_meta_ecc(osb)) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc, + &osb->osb_ecc_stats); return rc; } diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index 70ec3feda32f..d4b69febf70a 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -21,6 +21,24 @@ #define OCFS2_BLOCKCHECK_H +/* Count errors and error correction from blockcheck.c */ +struct ocfs2_blockcheck_stats { + spinlock_t b_lock; + u64 b_check_count; /* Number of blocks we've checked */ + u64 b_failure_count; /* Number of failed checksums */ + u64 b_recover_count; /* Number of blocks fixed by ecc */ + + /* + * debugfs entries, used if this is passed to + * ocfs2_blockcheck_stats_debugfs_install() + */ + struct dentry *b_debug_dir; /* Parent of the debugfs files */ + struct dentry *b_debug_check; /* Exposes b_check_count */ + struct dentry *b_debug_failure; /* Exposes b_failure_count */ + struct dentry *b_debug_recover; /* Exposes b_recover_count */ +}; + + /* High level block API */ void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, struct ocfs2_block_check *bc); @@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, void ocfs2_block_check_compute(void *data, size_t blocksize, struct ocfs2_block_check *bc); int ocfs2_block_check_validate(void *data, size_t blocksize, - struct ocfs2_block_check *bc); + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, struct ocfs2_block_check *bc); int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, - struct ocfs2_block_check *bc); + struct ocfs2_block_check *bc, + struct ocfs2_blockcheck_stats *stats); + +/* Debug Initialization */ +int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); /* * Hamming code functions diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7e72a81bc2d4..696c32e50716 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -48,34 +48,33 @@ * only emit the appropriage printk() when the caller passes in a constant * mask, as is almost always the case. * - * All this bitmask nonsense is hidden from the /proc interface so that Joel - * doesn't have an aneurism. Reading the file gives a straight forward - * indication of which bits are on or off: - * ENTRY off - * EXIT off + * All this bitmask nonsense is managed from the files under + * /sys/fs/o2cb/logmask/. Reading the files gives a straightforward + * indication of which bits are allowed (allow) or denied (off/deny). + * ENTRY deny + * EXIT deny * TCP off * MSG off * SOCKET off - * ERROR off - * NOTICE on + * ERROR allow + * NOTICE allow * * Writing changes the state of a given bit and requires a strictly formatted * single write() call: * - * write(fd, "ENTRY on", 8); + * write(fd, "allow", 5); * - * would turn the entry bit on. "1" is also accepted in the place of "on", and - * "off" and "0" behave as expected. + * Echoing allow/deny/off string into the logmask files can flip the bits + * on or off as expected; here is the bash script for example: * - * Some trivial shell can flip all the bits on or off: + * log_mask="/sys/fs/o2cb/log_mask" + * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do + * echo allow >"$log_mask"/"$node" + * done * - * log_mask="/proc/fs/ocfs2_nodemanager/log_mask" - * cat $log_mask | ( - * while read bit status; do - * # $1 is "on" or "off", say - * echo "$bit $1" > $log_mask - * done - * ) + * The debugfs.ocfs2 tool can also flip the bits with the -l option: + * + * debugfs.ocfs2 -l TCP allow */ /* for task_struct */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 9fbe849f6344..334f231a422c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { - int ret, error = 0; + int ret; struct o2net_msg *msg = NULL; size_t veclen, caller_bytes = 0; struct kvec *vec = NULL; @@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_time(&nst); - ret = wait_event_interruptible(nn->nn_sc_wq, - o2net_tx_can_proceed(nn, &sc, &error)); - if (!ret && error) - ret = error; + wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret)); if (ret) goto out; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c5752305627c..b358f3bf896d 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, alloc = ocfs2_clusters_for_bytes(sb, bytes); dx_alloc = 0; + down_write(&oi->ip_alloc_sem); + if (ocfs2_supports_indexed_dirs(osb)) { credits += ocfs2_add_dir_index_credits(sb); @@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out; } - down_write(&oi->ip_alloc_sem); - /* * Prepare for worst case allocation scenario of two separate * extents in the unindexed tree. @@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_sem; + goto out; } if (vfs_dq_alloc_space_nodirty(dir, @@ -3172,10 +3172,8 @@ out_commit: ocfs2_commit_trans(osb, handle); -out_sem: - up_write(&oi->ip_alloc_sem); - out: + up_write(&oi->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); if (meta_ac) @@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, brelse(new_bh); new_bh = NULL; + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; goto do_extend; } + down_write(&OCFS2_I(dir)->ip_alloc_sem); + drop_alloc_sem = 1; dir_i_size = i_size_read(dir); mlog(0, "extending dir %llu (i_size = %lld)\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); @@ -3370,9 +3372,6 @@ do_extend: credits++; /* For attaching the new dirent block to the * dx_root */ - down_write(&OCFS2_I(dir)->ip_alloc_sem); - drop_alloc_sem = 1; - handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -3435,10 +3434,10 @@ bail_bh: *new_de_bh = new_bh; get_bh(*new_de_bh); bail: - if (drop_alloc_sem) - up_write(&OCFS2_I(dir)->ip_alloc_sem); if (handle) ocfs2_commit_trans(osb, handle); + if (drop_alloc_sem) + up_write(&OCFS2_I(dir)->ip_alloc_sem); if (data_ac) ocfs2_free_alloc_context(data_ac); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e15fc7d50827..6cdeaa76f27f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .get_osb = ocfs2_get_dentry_osb, .post_unlock = ocfs2_dentry_post_unlock, @@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan_lvb *lvb; + + ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + &ocfs2_orphan_scan_lops, osb); + lvb = ocfs2_dlm_lvb(&res->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; +} + void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp) { @@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode, mlog_exit_void(); } +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + lockres = &osb->osb_orphan_scan.os_lockres; + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + return status; + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION) + *seqno = be32_to_cpu(lvb->lvb_os_seqno); + return status; +} + +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex) +{ + struct ocfs2_lock_res *lockres; + struct ocfs2_orphan_scan_lvb *lvb; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + lockres = &osb->osb_orphan_scan.os_lockres; + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION; + lvb->lvb_os_seqno = cpu_to_be32(seqno); + ocfs2_cluster_unlock(osb, lockres, level); +} + int ocfs2_super_lock(struct ocfs2_super *osb, int ex) { @@ -2842,6 +2890,7 @@ local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; @@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb, ocfs2_lock_res_free(&osb->osb_super_lockres); ocfs2_lock_res_free(&osb->osb_rename_lockres); ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres); + ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres); ocfs2_cluster_disconnect(osb->cconn, hangup_pending); osb->cconn = NULL; @@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e1fd5721cd7f..31b90d7b8f51 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb { __be32 lvb_free_entry; }; +#define OCFS2_ORPHAN_LVB_VERSION 1 + +struct ocfs2_orphan_scan_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_os_seqno; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb, int ex); void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); +int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex); +void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex); + int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c2a87c885b73..07267e0da909 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file, if (err) goto bail; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto bail; + journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); @@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; - int locked[MAXQUOTAS] = {0, 0}; - int credits, qtype; - struct ocfs2_mem_dqinfo *oinfo; + int qtype; + struct dquot *transfer_from[MAXQUOTAS] = { }; + struct dquot *transfer_to[MAXQUOTAS] = { }; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - credits = OCFS2_INODE_UPDATE_CREDITS; + /* + * Gather pointers to quota structures so that allocation / + * freeing of quota structures happens here and not inside + * vfs_dq_transfer() where we have problems with lock ordering + */ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) + transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid, + USRQUOTA); + transfer_from[USRQUOTA] = dqget(sb, inode->i_uid, + USRQUOTA); + if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) { + status = -ESRCH; goto bail_unlock; - credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + - ocfs2_calc_qdel_credits(sb, USRQUOTA); - locked[USRQUOTA] = 1; + } } if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid && OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) + transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid, + GRPQUOTA); + transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid, + GRPQUOTA); + if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) { + status = -ESRCH; goto bail_unlock; - credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + - ocfs2_calc_qdel_credits(sb, GRPQUOTA); - locked[GRPQUOTA] = 1; + } } - handle = ocfs2_start_trans(osb, credits); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS + + 2 * ocfs2_quota_trans_credits(sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - for (qtype = 0; qtype < MAXQUOTAS; qtype++) { - if (!locked[qtype]) - continue; - oinfo = sb_dqinfo(sb, qtype)->dqi_priv; - ocfs2_unlock_global_qf(oinfo, 1); - } ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) @@ -1043,6 +1047,12 @@ bail_unlock_rw: bail: brelse(bh); + /* Release quota pointers in case we acquired them */ + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + dqput(transfer_to[qtype]); + dqput(transfer_from[qtype]); + } + if (!status && attr->ia_valid & ATTR_MODE) { status = ocfs2_acl_chmod(inode); if (status < 0) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a20a0f1e37fd..4a3b9e6b31ad 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -28,6 +28,8 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/time.h> +#include <linux/random.h> #define MLOG_MASK_PREFIX ML_JOURNAL #include <cluster/masklog.h> @@ -52,6 +54,8 @@ DEFINE_SPINLOCK(trans_inc_lock); +#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000 + static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, int node_num, int slot_num); @@ -1841,6 +1845,113 @@ bail: return status; } +/* + * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT. Add some + * randomness to the timeout to minimize multple nodes firing the timer at the + * same time. + */ +static inline unsigned long ocfs2_orphan_scan_timeout(void) +{ + unsigned long time; + + get_random_bytes(&time, sizeof(time)); + time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000); + return msecs_to_jiffies(time); +} + +/* + * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for + * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This + * is done to catch any orphans that are left over in orphan directories. + * + * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT + * seconds. It gets an EX lock on os_lockres and checks sequence number + * stored in LVB. If the sequence number has changed, it means some other + * node has done the scan. This node skips the scan and tracks the + * sequence number. If the sequence number didn't change, it means a scan + * hasn't happened. The node queues a scan and increments the + * sequence number in the LVB. + */ +void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + int status, i; + u32 seqno = 0; + + os = &osb->osb_orphan_scan; + + status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto out; + } + + if (os->os_seqno != seqno) { + os->os_seqno = seqno; + goto unlock; + } + + for (i = 0; i < osb->max_slots; i++) + ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, + NULL); + /* + * We queued a recovery on orphan slots, increment the sequence + * number and update LVB so other node will skip the scan for a while + */ + seqno++; + os->os_count++; + os->os_scantime = CURRENT_TIME; +unlock: + ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX); +out: + return; +} + +/* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ +void ocfs2_orphan_scan_work(struct work_struct *work) +{ + struct ocfs2_orphan_scan *os; + struct ocfs2_super *osb; + + os = container_of(work, struct ocfs2_orphan_scan, + os_orphan_scan_work.work); + osb = os->os_osb; + + mutex_lock(&os->os_lock); + ocfs2_queue_orphan_scan(osb); + schedule_delayed_work(&os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + mutex_unlock(&os->os_lock); +} + +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + mutex_lock(&os->os_lock); + cancel_delayed_work(&os->os_orphan_scan_work); + mutex_unlock(&os->os_lock); +} + +int ocfs2_orphan_scan_init(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_osb = osb; + os->os_count = 0; + os->os_scantime = CURRENT_TIME; + mutex_init(&os->os_lock); + + INIT_DELAYED_WORK(&os->os_orphan_scan_work, + ocfs2_orphan_scan_work); + schedule_delayed_work(&os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); + return 0; +} + struct ocfs2_orphan_filldir_priv { struct inode *head; struct ocfs2_super *osb; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index eb7b76331eb7..61045eeb3f6e 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, } /* Exported only for the journal struct init code in super.c. Do not call. */ +int ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); +void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); + void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1386281950db..18c1d9ec1c93 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -47,6 +47,9 @@ #include "ocfs2_fs.h" #include "ocfs2_lockid.h" +/* For struct ocfs2_blockcheck_stats */ +#include "blockcheck.h" + /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small @@ -151,6 +154,16 @@ struct ocfs2_lock_res { #endif }; +struct ocfs2_orphan_scan { + struct mutex os_lock; + struct ocfs2_super *os_osb; + struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ + struct delayed_work os_orphan_scan_work; + struct timespec os_scantime; /* time this node ran the scan */ + u32 os_count; /* tracks node specific scans */ + u32 os_seqno; /* tracks cluster wide scans */ +}; + struct ocfs2_dlm_debug { struct kref d_refcnt; struct dentry *d_locking_state; @@ -295,6 +308,7 @@ struct ocfs2_super struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; + struct ocfs2_blockcheck_stats osb_ecc_stats; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ @@ -341,6 +355,8 @@ struct ocfs2_super unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + struct ocfs2_orphan_scan osb_orphan_scan; + /* used to protect metaecc calculation check of xattr. */ spinlock_t osb_xattr_lock; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index a53ce87481bf..fcdba091af3d 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -48,6 +48,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, OCFS2_LOCK_TYPE_NFS_SYNC, + OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_NUM_LOCK_TYPES }; @@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_NFS_SYNC: c = 'Y'; break; + case OCFS2_LOCK_TYPE_ORPHAN_SCAN: + c = 'P'; + break; default: c = '\0'; } @@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 1ed0f7c86869..edfa60cd155c 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -421,6 +421,7 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; if (!dquot->dq_off) { /* No real quota entry? */ /* Upgrade to exclusive lock for allocation */ + ocfs2_qinfo_unlock(info, 0); err = ocfs2_qinfo_lock(info, 1); if (err < 0) goto out_qlock; @@ -435,7 +436,8 @@ int ocfs2_global_read_dquot(struct dquot *dquot) out_qlock: if (ex) ocfs2_qinfo_unlock(info, 1); - ocfs2_qinfo_unlock(info, 0); + else + ocfs2_qinfo_unlock(info, 0); out: if (err < 0) mlog_errno(err); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 07deec5e9721..5a460fa82553 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -444,10 +444,6 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); - status = ocfs2_lock_global_qf(oinfo, 1); - if (status < 0) - goto out; - list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { chunk = rchunk->rc_chunk; hbh = NULL; @@ -480,12 +476,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, type); goto out_put_bh; } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + mlog_errno(status); + goto out_put_dquot; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QSYNC_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto out_put_dquot; + goto out_drop_lock; } mutex_lock(&sb_dqopt(sb)->dqio_mutex); spin_lock(&dq_data_lock); @@ -523,6 +525,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, out_commit: mutex_unlock(&sb_dqopt(sb)->dqio_mutex); ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_drop_lock: + ocfs2_unlock_global_qf(oinfo, 1); out_put_dquot: dqput(dquot); out_put_bh: @@ -537,8 +541,6 @@ out_put_bh: if (status < 0) break; } - ocfs2_unlock_global_qf(oinfo, 1); -out: if (status < 0) free_recovery_list(&(rec->r_list[type])); mlog_exit(status); @@ -655,6 +657,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) struct ocfs2_quota_recovery *rec; int locked = 0; + /* We don't need the lock and we have to acquire quota file locks + * which will later depend on this lock */ + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); info->dqi_maxblimit = 0x7fffffffffffffffLL; info->dqi_maxilimit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); @@ -733,6 +738,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) goto out_err; } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); return 0; out_err: if (oinfo) { @@ -746,6 +752,7 @@ out_err: kfree(oinfo); } brelse(bh); + mutex_lock(&sb_dqopt(sb)->dqio_mutex); return -1; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 201b40a441fe..0d3ed7407a04 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -119,10 +119,12 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb); static int ocfs2_check_volume(struct ocfs2_super *osb); static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct buffer_head *bh, - u32 sectsize); + u32 sectsize, + struct ocfs2_blockcheck_stats *stats); static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, - int sector_size); + int sector_size, + struct ocfs2_blockcheck_stats *stats); static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, @@ -207,6 +209,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) int i; struct ocfs2_cluster_connection *cconn = osb->cconn; struct ocfs2_recovery_map *rm = osb->recovery_map; + struct ocfs2_orphan_scan *os; out += snprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", @@ -308,6 +311,13 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) i, osb->slot_recovery_generations[i]); } + os = &osb->osb_orphan_scan; + out += snprintf(buf + out, len - out, "Orphan Scan=> "); + out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + os->os_count, os->os_seqno); + out += snprintf(buf + out, len - out, " Last Scan: %lu seconds ago\n", + (get_seconds() - os->os_scantime.tv_sec)); + return out; } @@ -542,7 +552,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits, */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBD) +# if defined(CONFIG_LBDAF) BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. @@ -693,7 +703,8 @@ out: static int ocfs2_sb_probe(struct super_block *sb, struct buffer_head **bh, - int *sector_size) + int *sector_size, + struct ocfs2_blockcheck_stats *stats) { int status, tmpstat; struct ocfs1_vol_disk_hdr *hdr; @@ -759,7 +770,8 @@ static int ocfs2_sb_probe(struct super_block *sb, goto bail; } di = (struct ocfs2_dinode *) (*bh)->b_data; - status = ocfs2_verify_volume(di, *bh, blksize); + memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; brelse(*bh); @@ -965,6 +977,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; char nodestr[8]; + struct ocfs2_blockcheck_stats stats; mlog_entry("%p, %p, %i", sb, data, silent); @@ -974,13 +987,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } /* probe for superblock */ - status = ocfs2_sb_probe(sb, &bh, §or_size); + status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); goto read_super_error; } - status = ocfs2_initialize_super(sb, bh, sector_size); + status = ocfs2_initialize_super(sb, bh, sector_size, &stats); osb = OCFS2_SB(sb); if (status < 0) { mlog_errno(status); @@ -1090,6 +1103,18 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } + if (ocfs2_meta_ecc(osb)) { + status = ocfs2_blockcheck_stats_debugfs_install( + &osb->osb_ecc_stats, + osb->osb_debug_root); + if (status) { + mlog(ML_ERROR, + "Unable to create blockcheck statistics " + "files\n"); + goto read_super_error; + } + } + status = ocfs2_mount_volume(sb); if (osb->root_inode) inode = igrab(osb->root_inode); @@ -1760,13 +1785,8 @@ static int ocfs2_mount_volume(struct super_block *sb) } status = ocfs2_truncate_log_init(osb); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto leave; - } - - if (ocfs2_mount_local(osb)) - goto leave; leave: if (unlock_super) @@ -1796,6 +1816,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_truncate_log_shutdown(osb); + ocfs2_orphan_scan_stop(osb); + /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); @@ -1833,6 +1855,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) if (osb->cconn) ocfs2_dlm_shutdown(osb, hangup_needed); + ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); debugfs_remove(osb->osb_debug_root); if (hangup_needed) @@ -1880,7 +1903,8 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, - int sector_size) + int sector_size, + struct ocfs2_blockcheck_stats *stats) { int status; int i, cbits, bbits; @@ -1939,6 +1963,9 @@ static int ocfs2_initialize_super(struct super_block *sb, atomic_set(&osb->alloc_stats.bg_allocs, 0); atomic_set(&osb->alloc_stats.bg_extends, 0); + /* Copy the blockcheck stats from the superblock probe */ + osb->osb_ecc_stats = *stats; + ocfs2_init_node_maps(osb); snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", @@ -1951,6 +1978,13 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } + status = ocfs2_orphan_scan_init(osb); + if (status) { + mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n"); + mlog_errno(status); + goto bail; + } + init_waitqueue_head(&osb->checkpoint_event); atomic_set(&osb->needs_checkpoint, 0); @@ -2169,7 +2203,8 @@ bail: */ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct buffer_head *bh, - u32 blksz) + u32 blksz, + struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; @@ -2182,7 +2217,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, OCFS2_FEATURE_INCOMPAT_META_ECC) { status = ocfs2_block_check_validate(bh->b_data, bh->b_size, - &di->i_check); + &di->i_check, + stats); if (status) goto out; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 15631019dc63..ba320e250747 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3154,7 +3154,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); if (func) { ret = func(inode, bucket, para); - if (ret) + if (ret && ret != -ERANGE) mlog_errno(ret); /* Fall through to bucket_relse() */ } @@ -3261,7 +3261,8 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, ocfs2_list_xattr_bucket, &xl); if (ret) { - mlog_errno(ret); + if (ret != -ERANGE) + mlog_errno(ret); goto out; } diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 63d965193b22..11a7b5c68153 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -18,6 +18,7 @@ proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o proc-y += version.o +proc-y += softirqs.o proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o proc-$(CONFIG_NET) += proc_net.o proc-$(CONFIG_PROC_KCORE) += kcore.o diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e630c47d..3ce5ae9e3d2d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + task_lock(task); + if (task->mm) + oom_adjust = task->mm->oom_adj; + else + oom_adjust = OOM_DISABLE; + task_unlock(task); put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + task_lock(task); + if (!task->mm) { + task_unlock(task); + put_task_struct(task); + return -EINVAL; + } + if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { + task_unlock(task); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + task->mm->oom_adj = oom_adjust; + task_unlock(task); put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302af4c4..d5c410d47fae 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index e9983837d08d..2707c6c7a20f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -6,11 +6,13 @@ #include <linux/mmzone.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/hugetlb.h> #include <asm/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -32,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; + else + ppage = NULL; if (!ppage) pcount = 0; else pcount = page_mapcount(ppage); - if (put_user(pcount, out++)) { + if (put_user(pcount, out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } @@ -68,19 +72,122 @@ static const struct file_operations proc_kpagecount_operations = { /* These macros are used to decouple internal flags from exported ones */ -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* 11-20: new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_NOPAGE 20 + +/* kernel hacking assistances + * WARNING: subject to change, never rely on them! + */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} -#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) +static u64 get_uflags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + /* + * Caveats on high order pages: + * PG_buddy will only be set on the head page; SLUB/SLQB do the same + * for PG_slab; SLOB won't set PG_slab at all on compound pages. + */ + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -90,7 +197,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 kflags, uflags; pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); @@ -98,32 +204,18 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; - if (!ppage) - kflags = 0; else - kflags = ppage->flags; - - uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | - kpf_copy_bit(kflags, KPF_ERROR, PG_error) | - kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | - kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | - kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | - kpf_copy_bit(kflags, KPF_LRU, PG_lru) | - kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | - kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | - kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | - kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | - kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - - if (put_user(uflags, out++)) { + ppage = NULL; + + if (put_user(get_uflags(ppage), out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index fc6c3025befd..7ba79a54948c 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -195,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np, p = fixup_name(np, de, p); ent = proc_mkdir(p, de); - if (ent == 0) + if (ent == NULL) break; proc_device_tree_add_node(child, ent); } of_node_put(child); - for (pp = np->properties; pp != 0; pp = pp->next) { + for (pp = np->properties; pp != NULL; pp = pp->next) { p = pp->name; if (duplicate_name(de, p)) p = fixup_name(np, de, p); ent = __proc_device_tree_add_prop(de, pp, p); - if (ent == 0) + if (ent == NULL) break; } } @@ -221,10 +221,10 @@ void __init proc_device_tree_init(void) struct device_node *root; proc_device_tree = proc_mkdir("device-tree", NULL); - if (proc_device_tree == 0) + if (proc_device_tree == NULL) return; root = of_find_node_by_path("/"); - if (root == 0) { + if (root == NULL) { printk(KERN_ERR "/proc/device-tree: can't find root\n"); return; } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c new file mode 100644 index 000000000000..1807c2419f17 --- /dev/null +++ b/fs/proc/softirqs.c @@ -0,0 +1,44 @@ +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_printf(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_printf(p, "\n"); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%8s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_printf(p, "\n"); + } + return 0; +} + +static int softirqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_softirqs, NULL); +} + +static const struct file_operations proc_softirqs_operations = { + .open = softirqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_softirqs_init(void) +{ + proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + return 0; +} +module_init(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 81e4eb60972e..7cc726c6d70a 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v) cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; unsigned int per_irq_sum; @@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v) sum += kstat_irqs_cpu(j, i); } sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } } sum += arch_irq_stat(); @@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v) nr_running(), nr_iowait()); + seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_printf(p, " %u", per_softirq_sums[i]); + seq_printf(p, "\n"); + return 0; } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 5edcc3f92ba7..0872afa58d39 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = { static struct vmcore* __init get_new_element(void) { - struct vmcore *p; - - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p) - memset(p, 0, sizeof(*p)); - return p; + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } static u64 __init get_vmcore_size_elf64(char *elfptr) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 3a6b193d8444..0ff7566c767c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; - default: - printk(KERN_ERR "ramfs: bad mount option: %s\n", p); - return -EINVAL; + /* + * We might like to report bad mount options here; + * but traditionally ramfs has ignored all mount options, + * and as it is used as a !CONFIG_SHMEM simple substitute + * for tmpfs, better continue to ignore other mount options. + */ } } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4beb964a2a3e..128d3f7c8aa5 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h RFALSE(ih, "PAP-12210: ih must be 0"); - if (is_direntry_le_ih - (aux_ih = - B_N_PITEM_HEAD(tbS0, item_pos))) { + aux_ih = B_N_PITEM_HEAD(tbS0, item_pos); + if (is_direntry_le_ih(aux_ih)) { /* we append to directory item */ int entry_count; diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 381750a155f6..03d85cbf90bf 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, if (last_first == FIRST_TO_LAST) { /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); else { @@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, } } else { /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + ih = B_N_PITEM_HEAD(src, item_num); + if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, @@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, leaf_delete_items_entirely(cur_bi, first + 1, del_num - 1); - if (is_direntry_le_ih - (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) + ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1); + if (is_direntry_le_ih(ih)) /* the last item is directory */ /* len = numbers of directory entries in this item */ len = ih_entry_count(ih); diff --git a/fs/select.c b/fs/select.c index 0fe0e1469df3..d870237e42c7 100644 --- a/fs/select.c +++ b/fs/select.c @@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; + entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -362,6 +373,18 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; @@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) - mask = (*f_op->poll)(file, retval ? NULL : wait); + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; + wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; + wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; + wait = NULL; } } } @@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); + } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); diff --git a/fs/seq_file.c b/fs/seq_file.c index 7f40f30c55c5..6c959275f2d0 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s) } EXPORT_SYMBOL(seq_puts); +/** + * seq_write - write arbitrary data to buffer + * @seq: seq_file identifying the buffer to which data should be written + * @data: data address + * @len: number of bytes + * + * Return 0 on success, non-zero otherwise. + */ +int seq_write(struct seq_file *seq, const void *data, size_t len) +{ + if (seq->count + len < seq->size) { + memcpy(seq->buf + seq->count, data, len); + seq->count += len; + return 0; + } + seq->count = seq->size; + return -1; +} +EXPORT_SYMBOL(seq_write); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/fs/super.c b/fs/super.c index 83b47416d006..d40d53a22fb5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -545,24 +545,18 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { if (force) mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) { - unlock_kernel(); + else if (!fs_may_remount_ro(sb)) return -EBUSY; - } retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) { - unlock_kernel(); + if (retval < 0 && retval != -ENOSYS) return -EBUSY; - } } remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) { - unlock_kernel(); + if (retval) return retval; - } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index a3ba217fbe74..1d897ad808e0 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) + if (page) { error = sysfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); return NULL; } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index c7798079e644..4e50286a4cc3 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -15,13 +15,13 @@ #include <linux/pagemap.h> #include <linux/highmem.h> -#include <linux/smp_lock.h> #include <linux/swap.h> #include "sysv.h" static int sysv_readdir(struct file *, void *, filldir_t); const struct file_operations sysv_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = sysv_readdir, .fsync = simple_fsync, @@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); - lock_kernel(); - pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); if (pos >= inode->i_size) goto done; @@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) done: filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; - unlock_kernel(); return 0; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 479923456a54..9824743832a7 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -21,7 +21,6 @@ * the superblock. */ -#include <linux/smp_lock.h> #include <linux/highuid.h> #include <linux/slab.h> #include <linux/init.h> @@ -37,7 +36,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait) unsigned long time = get_seconds(), old_time; lock_super(sb); - lock_kernel(); /* * If we are going to write out the super block, @@ -52,7 +50,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait) mark_buffer_dirty(sbi->s_bh2); } - unlock_kernel(); unlock_super(sb); return 0; @@ -82,8 +79,6 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_kernel(); - if (sb->s_dirt) sysv_write_super(sb); @@ -99,8 +94,6 @@ static void sysv_put_super(struct super_block *sb) brelse(sbi->s_bh2); kfree(sbi); - - unlock_kernel(); } static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -275,7 +268,6 @@ int sysv_write_inode(struct inode *inode, int wait) return -EIO; } - lock_kernel(); raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); @@ -291,7 +283,6 @@ int sysv_write_inode(struct inode *inode, int wait) for (block = 0; block < 10+1+1+1; block++) write3byte(sbi, (u8 *)&si->i_data[block], &raw_inode->i_data[3*block]); - unlock_kernel(); mark_buffer_dirty(bh); if (wait) { sync_dirty_buffer(bh); @@ -315,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; sysv_truncate(inode); - lock_kernel(); sysv_free_inode(inode); - unlock_kernel(); } static struct kmem_cache *sysv_inode_cachep; diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index af1914462f02..eaf6d891d46f 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write) return nr_written; } - /** * run_gc - run garbage collector. * @c: UBIFS file-system description object @@ -628,7 +627,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c) * * This function releases budget corresponding to a dirty inode. It is usually * called when after the inode has been written to the media and marked as - * clean. + * clean. It also causes the "no space" flags to be cleared. */ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_inode *ui) @@ -636,6 +635,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_budget_req req; memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f55d523c52bb..552fb0111fff 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inode->i_nlink, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + * + * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and + * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not + * lock 'dirA->i_mutex', so this is possible. Both of the functions + * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes + * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this + * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' + * to the list of orphans. After this, 'vfs_link()' will link + * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, + * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode + * to the list of orphans. + */ + if (inode->i_nlink == 0) + return -ENOENT; + err = dbg_check_synced_i_size(inode); if (err) return err; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index e8e632a1dcdf..bc5857199ec2 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -293,13 +293,14 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) * * This function is called when the write-buffer timer expires. */ -static void wbuf_timer_callback_nolock(unsigned long data) +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) { - struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); wbuf->need_sync = 1; wbuf->c->need_wbuf_sync = 1; ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; } /** @@ -308,13 +309,12 @@ static void wbuf_timer_callback_nolock(unsigned long data) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { - ubifs_assert(!timer_pending(&wbuf->timer)); + ubifs_assert(!hrtimer_active(&wbuf->timer)); - if (!wbuf->timeout) + if (!ktime_to_ns(wbuf->softlimit)) return; - - wbuf->timer.expires = jiffies + wbuf->timeout; - add_timer(&wbuf->timer); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); } /** @@ -329,7 +329,7 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * should be canceled. */ wbuf->need_sync = 0; - del_timer(&wbuf->timer); + hrtimer_cancel(&wbuf->timer); } /** @@ -825,6 +825,7 @@ out: int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; + ktime_t hardlimit; wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); if (!wbuf->buf) @@ -845,14 +846,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); spin_lock_init(&wbuf->lock); - wbuf->c = c; - init_timer(&wbuf->timer); - wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->timer.data = (unsigned long)wbuf; - wbuf->timeout = DEFAULT_WBUF_TIMEOUT; wbuf->next_ino = 0; + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + /* + * Make write-buffer soft limit to be 20% of the hard limit. The + * write-buffer timer is allowed to expire any time between the soft + * and hard limits. + */ + hardlimit = ktime_set(DEFAULT_WBUF_TIMEOUT_SECS, 0); + wbuf->delta = (DEFAULT_WBUF_TIMEOUT_SECS * NSEC_PER_SEC) * 2 / 10; + wbuf->softlimit = ktime_sub_ns(hardlimit, wbuf->delta); + hrtimer_set_expires_range_ns(&wbuf->timer, wbuf->softlimit, + wbuf->delta); return 0; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 10662975d2ef..805605250f12 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -343,33 +343,15 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * * This function returns %1 if @offs was in the last write to the LEB whose data * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next min_io_size boundary (or a - * bit less than the common header size if min_io_size is one). + * for subsequent empty space starting from the next @c->min_io_size boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { - int empty_offs; - int check_len; + int empty_offs, check_len; uint8_t *p; - if (c->min_io_size == 1) { - check_len = c->leb_size - offs; - p = buf + check_len; - for (; check_len > 0; check_len--) - if (*--p != 0xff) - break; - /* - * 'check_len' is the size of the corruption which cannot be - * more than the size of 1 node if it was caused by an unclean - * unmount. - */ - if (check_len > UBIFS_MAX_NODE_SZ) - return 0; - return 1; - } - /* - * Round up to the next c->min_io_size boundary i.e. 'offs' is in the + * Round up to the next @c->min_io_size boundary i.e. @offs is in the * last wbuf written. After that should be empty space. */ empty_offs = ALIGN(offs + 1, c->min_io_size); @@ -392,7 +374,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) * * This function pads up to the next min_io_size boundary (if there is one) and * sets empty space to all 0xff. @buf, @offs and @len are updated to the next - * min_io_size boundary (if there is one). + * @c->min_io_size boundary. */ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, int *offs, int *len) @@ -402,11 +384,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); - if (c->min_io_size == 1) { - memset(*buf, 0xff, c->leb_size - *offs); - return; - } - ubifs_assert(!(*offs & 7)); empty_offs = ALIGN(*offs, c->min_io_size); pad_len = empty_offs - *offs; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3589eab02a2f..79fad43f3c57 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -361,6 +361,11 @@ static void ubifs_delete_inode(struct inode *inode) out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } clear_inode(inode); } @@ -792,7 +797,7 @@ static int alloc_wbufs(struct ubifs_info *c) * does not need to be synchronized by timer. */ c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; - c->jheads[GCHD].wbuf.timeout = 0; + c->jheads[GCHD].wbuf.softlimit = ktime_set(0, 0); return 0; } @@ -933,6 +938,27 @@ static const match_table_t tokens = { }; /** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + ubifs_msg("parse %s", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + +/** * ubifs_parse_options - parse mount parameters. * @c: UBIFS file-system description object * @options: parameters to parse @@ -1008,9 +1034,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, break; } default: - ubifs_err("unrecognized mount option \"%s\" " - "or missing value", p); - return -EINVAL; + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } } } @@ -1180,6 +1216,7 @@ static int mount_ubifs(struct ubifs_info *c) if (!ubifs_compr_present(c->default_compr)) { ubifs_err("'compressor \"%s\" is not compiled in", ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; goto out_free; } @@ -1656,7 +1693,7 @@ static void ubifs_remount_ro(struct ubifs_info *c) for (i = 0; i < c->jhead_cnt; i++) { ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); + hrtimer_cancel(&c->jheads[i].wbuf.timer); } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); @@ -1719,7 +1756,7 @@ static void ubifs_put_super(struct super_block *sb) if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) { ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); + hrtimer_cancel(&c->jheads[i].wbuf.timer); } /* @@ -1911,6 +1948,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); + c->vfs_sb = sb; c->highest_inum = UBIFS_FIRST_INO; c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; @@ -1937,18 +1975,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) err = bdi_init(&c->bdi); if (err) goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs"); + if (err) + goto out_bdi; err = ubifs_parse_options(c, data, 0); if (err) goto out_bdi; - c->vfs_sb = sb; - sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; - sb->s_dev = c->vi.cdev; sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; @@ -1993,16 +2031,9 @@ out_free: static int sb_test(struct super_block *sb, void *data) { dev_t *dev = data; + struct ubifs_info *c = sb->s_fs_info; - return sb->s_dev == *dev; -} - -static int sb_set(struct super_block *sb, void *data) -{ - dev_t *dev = data; - - sb->s_dev = *dev; - return 0; + return c->vi.cdev == *dev; } static int ubifs_get_sb(struct file_system_type *fs_type, int flags, @@ -2030,7 +2061,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); - sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out_close; @@ -2070,16 +2101,11 @@ out_close: return err; } -static void ubifs_kill_sb(struct super_block *sb) -{ - generic_shutdown_super(sb); -} - static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .get_sb = ubifs_get_sb, - .kill_sb = ubifs_kill_sb + .kill_sb = kill_anon_super, }; /* diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 0a8341e14088..1bf01d820066 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -95,8 +95,8 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Default write-buffer synchronization timeout (5 secs) */ -#define DEFAULT_WBUF_TIMEOUT (5 * HZ) +/* Default write-buffer synchronization timeout in seconds */ +#define DEFAULT_WBUF_TIMEOUT_SECS 5 /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -650,8 +650,10 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * and @softlimit + @delta) * @timer: write-buffer timer - * @timeout: timer expire interval in jiffies * @need_sync: it is set if its timer expired and needs sync * @next_ino: points to the next position of the following inode number * @inodes: stores the inode numbers of the nodes which are in wbuf @@ -678,8 +680,9 @@ struct ubifs_wbuf { int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - struct timer_list timer; - int timeout; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; int need_sync; int next_ino; ino_t *inodes; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 3d2512c21f05..7cf33379fd46 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks); - if (i_block < 0) { - ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { + if (i_block < direct_blocks) { offsets[n++] = i_block; } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = UFS_IND_BLOCK; @@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head lock_kernel(); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (fragment < 0) - goto abort_negative; if (fragment > ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) << uspi->s_fpbshift)) @@ -504,10 +500,6 @@ abort: unlock_kernel(); return err; -abort_negative: - ufs_warning(sb, "ufs_get_block", "block < 0"); - goto abort; - abort_too_big: ufs_warning(sb, "ufs_get_block", "block > big"); goto abort; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index f65a53f8752f..6127e24062d0 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -24,7 +24,7 @@ * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. */ -#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) # define XFS_BIG_BLKNOS 1 # define XFS_BIG_INUMS 1 #else diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 2e09efbca8db..a220d36f789b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -616,7 +616,7 @@ xfs_max_file_offset( */ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBD) +# if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); pagefactor = PAGE_CACHE_SIZE; bitshift = BITS_PER_LONG; |