156 files changed, 4144 insertions, 2036 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index ab5547ff29a1..38d695d66a0b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,7 +37,6 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
@@ -231,10 +230,8 @@ v9fs_umount_begin(struct super_block *sb)
 {
 	struct v9fs_session_info *v9ses;
 
-	lock_kernel();
 	v9ses = sb->s_fs_info;
 	v9fs_session_cancel(v9ses);
-	unlock_kernel();
 }
 
 static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 525da2e8f73b..d78e950402c1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -39,6 +39,13 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
+source "fs/ocfs2/Kconfig"
+source "fs/btrfs/Kconfig"
+
+endif # BLOCK
+
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EMBEDDED
 	default y
@@ -47,13 +54,6 @@ config FILE_LOCKING
           for filesystems like NFS and for the flock() system
           call. Disabling this option saves about 11k.
 
-source "fs/xfs/Kconfig"
-source "fs/gfs2/Kconfig"
-source "fs/ocfs2/Kconfig"
-source "fs/btrfs/Kconfig"
-
-endif # BLOCK
-
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -134,7 +134,7 @@ config TMPFS_POSIX_ACL
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-		   (S390 && 64BIT) || BROKEN
+		   (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index a6665f37f456..9cc18775b832 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -1,3 +1,6 @@
+#include <linux/fs.h>
+#include <linux/adfs_fs.h>
+
 /* Internal data structures for ADFS */
 
 #define ADFS_FREE_FRAG		 0
@@ -17,6 +20,58 @@
 struct buffer_head;
 
 /*
+ * adfs file system inode data in memory
+ */
+struct adfs_inode_info {
+	loff_t		mmu_private;
+	unsigned long	parent_id;	/* object id of parent		*/
+	__u32		loadaddr;	/* RISC OS load address		*/
+	__u32		execaddr;	/* RISC OS exec address		*/
+	unsigned int	filetype;	/* RISC OS file type		*/
+	unsigned int	attr;		/* RISC OS permissions		*/
+	unsigned int	stamped:1;	/* RISC OS file has date/time	*/
+	struct inode vfs_inode;		/* embedded VFS inode, see ADFS_I() */
+};
+
+/*
+ * Forward-declare this
+ */
+struct adfs_discmap;
+struct adfs_dir_ops;
+
+/*
+ * ADFS file system superblock data in memory
+ */
+struct adfs_sb_info {
+	struct adfs_discmap *s_map;	/* bh list containing map		 */
+	struct adfs_dir_ops *s_dir;	/* directory operations			 */
+
+	uid_t		s_uid;		/* owner uid				 */
+	gid_t		s_gid;		/* owner gid				 */
+	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm		 */
+	umode_t		s_other_mask;	/* ADFS other perm -> unix perm		 */
+
+	__u32		s_ids_per_zone;	/* max. no ids in one zone		 */
+	__u32		s_idlen;	/* length of ID in map			 */
+	__u32		s_map_size;	/* sector size of a map			 */
+	unsigned long	s_size;		/* total size (in blocks) of this fs	 */
+	signed int	s_map2blk;	/* shift left by this for map->sector	 */
+	unsigned int	s_log2sharesize;/* log2 share size			 */
+	__le32		s_version;	/* disc format version			 */
+	unsigned int	s_namelen;	/* maximum number of characters in name	 */
+};
+
+static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct adfs_inode_info *ADFS_I(struct inode *inode)
+{
+	return container_of(inode, struct adfs_inode_info, vfs_inode);
+}
+
+/*
  * Directory handling
  */
 struct adfs_dir {
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 4d4073447d1a..23aa52f548a0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,15 +9,7 @@
  *
  *  Common directory handling for ADFS
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>		/* for file_fsync() */
-
 #include "adfs.h"
 
 /*
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 31df6adf0de6..bafc71222e25 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -9,15 +9,7 @@
  *
  *  E and F format directory handling
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_f.h"
 
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 139e0f345f18..1796bb352d05 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -7,15 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/string.h>
-
 #include "adfs.h"
 #include "dir_fplus.h"
 
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 8224d54a2afb..005ea34d1758 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -19,10 +19,6 @@
  *
  *  adfs regular file handling primitives           
  */
-#include <linux/fs.h>
-#include <linux/buffer_head.h>			/* for file_fsync() */
-#include <linux/adfs_fs.h>
-
 #include "adfs.h"
 
 const struct file_operations adfs_file_operations = {
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 05b3a677201d..798cb071d132 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,17 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/mm.h>
 #include <linux/smp_lock.h>
-#include <linux/module.h>
 #include <linux/buffer_head.h>
-
 #include "adfs.h"
 
 /*
@@ -395,4 +386,3 @@ int adfs_write_inode(struct inode *inode, int wait)
 	unlock_kernel();
 	return ret;
 }
-MODULE_LICENSE("GPL");
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 568081b93f73..d1a5932bb0f1 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -7,14 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-
 #include <asm/unaligned.h>
-
 #include "adfs.h"
 
 /*
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 0ec5aaf47aa7..aad92f0a1048 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -8,26 +8,12 @@
  * published by the Free Software Foundation.
  */
 #include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/adfs_fs.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
-#include <linux/vfs.h>
 #include <linux/parser.h>
-#include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-#include <stdarg.h>
-
+#include <linux/statfs.h>
 #include "adfs.h"
 #include "dir_f.h"
 #include "dir_fplus.h"
@@ -534,3 +520,4 @@ static void __exit exit_adfs_fs(void)
 
 module_init(init_adfs_fs)
 module_exit(exit_adfs_fs)
+MODULE_LICENSE("GPL");
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 2d33a5f7d218..0dd4dafee10b 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <rxrpc/packet.h>
 #include "internal.h"
 #include "afs_fs.h"
 
@@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code)
 	case 0x2f6df24:		return -ENOLCK;
 	case 0x2f6df26:		return -ENOTEMPTY;
 	case 0x2f6df78:		return -EDQUOT;
+
+	case RXKADINCONSISTENCY: return -EPROTO;
+	case RXKADPACKETSHORT:	return -EPROTO;
+	case RXKADLEVELFAIL:	return -EKEYREJECTED;
+	case RXKADTICKETLEN:	return -EKEYREJECTED;
+	case RXKADOUTOFSEQUENCE: return -EPROTO;
+	case RXKADNOAUTH:	return -EKEYREJECTED;
+	case RXKADBADKEY:	return -EKEYREJECTED;
+	case RXKADBADTICKET:	return -EKEYREJECTED;
+	case RXKADUNKNOWNKEY:	return -EKEYREJECTED;
+	case RXKADEXPIRED:	return -EKEYEXPIRED;
+	case RXKADSEALEDINCON:	return -EKEYREJECTED;
+	case RXKADDATALEN:	return -EKEYREJECTED;
+	case RXKADILLEGALLEVEL:	return -EKEYREJECTED;
+
 	default:		return -EREMOTEIO;
 	}
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index ec2a7431e458..6e689208def2 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
 				goto out;
 			goto rotate;
 		case -ENOMEDIUM:
+		case -EKEYREJECTED:
+		case -EKEYEXPIRED:
 			goto out;
 		default:
 			ret = -EIO;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9367b6297d84..615d5496fe0f 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
 {
 	struct nls_table *nls = BEFS_SB(sb)->nls;
 	int i, o;
-	wchar_t uni;
+	unicode_t uni;
 	int unilen, utflen;
 	char *result;
 	/* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
 	for (i = o = 0; i < in_len; i += utflen, o += unilen) {
 
 		/* convert from UTF-8 to Unicode */
-		utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
-		if (utflen < 0) {
+		utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
+		if (utflen < 0)
 			goto conv_err;
-		}
 
 		/* convert from Unicode to nls */
+		if (uni > MAX_WCHAR_T)
+			goto conv_err;
 		unilen = nls->uni2char(uni, &result[o], in_len - o);
-		if (unilen < 0) {
+		if (unilen < 0)
 			goto conv_err;
-		}
 	}
 	result[o] = '\0';
 	*out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
 
 		/* convert from nls to unicode */
 		unilen = nls->char2uni(&in[i], in_len - i, &uni);
-		if (unilen < 0) {
+		if (unilen < 0)
 			goto conv_err;
-		}
 
 		/* convert from unicode to UTF-8 */
-		utflen = utf8_wctomb(&result[o], uni, 3);
-		if (utflen <= 0) {
+		utflen = utf32_to_utf8(uni, &result[o], 3);
+		if (utflen <= 0)
 			goto conv_err;
-		}
 	}
 
 	result[o] = '\0';
@@ -737,8 +735,6 @@ parse_options(char *options, befs_mount_options * opts)
 static void
 befs_put_super(struct super_block *sb)
 {
-	lock_kernel();
-
 	kfree(BEFS_SB(sb)->mount_opts.iocharset);
 	BEFS_SB(sb)->mount_opts.iocharset = NULL;
 
@@ -749,8 +745,6 @@ befs_put_super(struct super_block *sb)
 
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 /* Allocate private field of the superblock, fill it.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 40381df34869..9fa212b014a5 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1340,8 +1340,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1382,8 +1384,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 			psinfo->pr_psargs[i] = ' ';
 	psinfo->pr_psargs[len] = 0;
 
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fdb66faa24f1..20fbeced472b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1387,8 +1387,10 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
 	prstatus->pr_sighold = p->blocked.sig[0];
+	rcu_read_lock();
+	prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	prstatus->pr_pid = task_pid_vnr(p);
-	prstatus->pr_ppid = task_pid_vnr(p->real_parent);
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
@@ -1432,8 +1434,10 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 			psinfo->pr_psargs[i] = ' ';
 	psinfo->pr_psargs[len] = 0;
 
+	rcu_read_lock();
+	psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
 	psinfo->pr_pid = task_pid_vnr(p);
-	psinfo->pr_ppid = task_pid_vnr(p->real_parent);
 	psinfo->pr_pgrp = task_pgrp_vnr(p);
 	psinfo->pr_sid = task_session_vnr(p);
 
diff --git a/fs/bio.c b/fs/bio.c
index 5f80848c320c..24c914043532 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,7 +25,6 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #include <trace/events/block.h>
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d50d49d990a..d28d29c95f7c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,8 @@
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
+
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete.  This is used during reads to verify checksums, and it is used
@@ -1342,12 +1344,25 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	free_extent_map(em);
 }
 
+/*
+ * If this fails, caller must call bdi_destroy() to get rid of the
+ * bdi again.
+ */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-	bdi_init(bdi);
+	int err;
+
+	bdi->capabilities = BDI_CAP_MAP_COPY;
+	err = bdi_init(bdi);
+	if (err)
+		return err;
+
+	err = bdi_register(bdi, NULL, "btrfs-%d",
+				atomic_inc_return(&btrfs_bdi_num));
+	if (err)
+		return err;
+
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
-	bdi->state		= 0;
-	bdi->capabilities	= default_backing_dev_info.capabilities;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
 	bdi->unplug_io_data	= info;
 	bdi->congested_fn	= btrfs_congested_fn;
@@ -1569,7 +1584,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	setup_bdi(fs_info, &fs_info->bdi);
+	if (setup_bdi(fs_info, &fs_info->bdi))
+		goto fail_bdi;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
@@ -1946,8 +1962,8 @@ fail_iput:
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+fail_bdi:
 	bdi_destroy(&fs_info->bdi);
-
 fail:
 	kfree(extent_root);
 	kfree(tree_root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2e177d7f4bb9..4e83457ea253 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -543,13 +543,13 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 			btrfs_free_log(trans, root);
 			btrfs_update_reloc_root(trans, root);
 
-			if (root->commit_root == root->node)
-				continue;
-
-			free_extent_buffer(root->commit_root);
-			root->commit_root = btrfs_root_node(root);
+			if (root->commit_root != root->node) {
+				free_extent_buffer(root->commit_root);
+				root->commit_root = btrfs_root_node(root);
+				btrfs_set_root_node(&root->root_item,
+						    root->node);
+			}
 
-			btrfs_set_root_node(&root->root_item, root->node);
 			err = btrfs_update_root(trans, fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 0aac371bff0b..c5ded5ff72b5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -788,12 +788,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	if (put_user(compat_ptr(data), &sgio->usr_ptr))
 		return -EFAULT;
 
-	if (copy_in_user(&sgio->status, &sgio32->status,
-			 (4 * sizeof(unsigned char)) +
-			 (2 * sizeof(unsigned short)) +
-			 (3 * sizeof(int))))
-		return -EFAULT;
-
 	err = sys_ioctl(fd, cmd, (unsigned long) sgio);
 
 	if (err >= 0) {
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 33a90120f6ad..4d74fc72c195 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val)
 	return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
 
 /**
  * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value
@@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n");
 struct dentry *debugfs_create_u8(const char *name, mode_t mode,
 				 struct dentry *parent, u8 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_u8);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val)
 	return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
 
 /**
  * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value
@@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n");
 struct dentry *debugfs_create_u16(const char *name, mode_t mode,
 				  struct dentry *parent, u16 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_u16);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val)
 	return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
 
 /**
  * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value
@@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n");
 struct dentry *debugfs_create_u32(const char *name, mode_t mode,
 				 struct dentry *parent, u32 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_u32);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val)
 	return 0;
 }
 DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
 
 /**
  * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value
@@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n");
 struct dentry *debugfs_create_u64(const char *name, mode_t mode,
 				 struct dentry *parent, u64 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_u64);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n");
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
 
 /*
  * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
@@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
 struct dentry *debugfs_create_x8(const char *name, mode_t mode,
 				 struct dentry *parent, u8 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_x8);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
 struct dentry *debugfs_create_x16(const char *name, mode_t mode,
 				 struct dentry *parent, u16 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_x16);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
 struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 				 struct dentry *parent, u32 *value)
 {
+	/* if there are no write bits set, make read only */
+	if (!(mode & S_IWUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
+	/* if there are no read bits set, make write only */
+	if (!(mode & S_IRUGO))
+		return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
+
 	return debugfs_create_file(name, mode, parent, value, &fops_x32);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -419,7 +482,7 @@ static const struct file_operations fops_blob = {
 };
 
 /**
- * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob
+ * debugfs_create_blob - create a debugfs file that is used to read a binary blob
  * @name: a pointer to a string containing the name of the file to create.
  * @mode: the permission that the file should have
  * @parent: a pointer to the parent dentry for this file.  This should be a
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 0662ba6de85a..d22438ef7674 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		}
 		child = list_entry(parent->d_subdirs.next, struct dentry,
 				d_u.d_child);
+ next_sibling:
 
 		/*
 		 * If "child" isn't empty, walk down the tree and
@@ -417,6 +418,16 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		__debugfs_remove(child, parent);
 		if (parent->d_subdirs.next == &child->d_u.d_child) {
 			/*
+			 * Try the next sibling.
+			 */
+			if (child->d_u.d_child.next != &parent->d_subdirs) {
+				child = list_entry(child->d_u.d_child.next,
+						   struct dentry,
+						   d_u.d_child);
+				goto next_sibling;
+			}
+
+			/*
 			 * Avoid infinite loop if we fail to remove
 			 * one dentry.
 			 */
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b6a719a909f8..a2edb7913447 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 			continue;
 		__iget(inode);
 		spin_unlock(&inode_lock);
-		__invalidate_mapping_pages(inode->i_mapping, 0, -1, true);
+		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
 		spin_lock(&inode_lock);
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 49308a29798a..7ee6f7e3a608 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -5,12 +5,12 @@
  */
 
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations efs_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= efs_readdir,
 };
@@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
-	lock_kernel();
-
 	/* work out where this entry can be found */
 	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
 
@@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 
 	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
 out:
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index c3fb5f9c4a44..1511bf9e5f80 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -8,7 +8,6 @@
 
 #include <linux/buffer_head.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/exportfs.h>
 #include "efs.h"
 
@@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
 	efs_ino_t inodenum;
 	struct inode * inode = NULL;
 
-	lock_kernel();
 	inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
 	if (inodenum) {
 		inode = efs_iget(dir->i_sb, inodenum);
-		if (IS_ERR(inode)) {
-			unlock_kernel();
+		if (IS_ERR(inode))
 			return ERR_CAST(inode);
-		}
 	}
-	unlock_kernel();
 
 	return d_splice_alias(inode, dentry);
 }
@@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child)
 	struct dentry *parent = ERR_PTR(-ENOENT);
 	efs_ino_t ino;
 
-	lock_kernel();
 	ino = efs_find_entry(child->d_inode, "..", 2);
 	if (ino)
 		parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
-	unlock_kernel();
 
 	return parent;
 }
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 41911ec83aaf..75117d0dac2b 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -9,7 +9,6 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include "efs.h"
 
 static int efs_symlink_readpage(struct file *file, struct page *page)
@@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
   
 	err = -ENAMETOOLONG;
 	if (size > 2 * EFS_BLOCKSIZE)
-		goto fail_notlocked;
+		goto fail;
   
-	lock_kernel();
 	/* read first 512 bytes of link target */
 	err = -EIO;
 	bh = sb_bread(inode->i_sb, efs_bmap(inode, 0));
@@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 		brelse(bh);
 	}
 	link[size] = '\0';
-	unlock_kernel();
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
 	return 0;
 fail:
-	unlock_kernel();
-fail_notlocked:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5458e80fc558..085c5c063420 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -98,7 +98,7 @@ struct epoll_filefd {
 struct nested_call_node {
 	struct list_head llink;
 	void *cookie;
-	int cpu;
+	void *ctx;
 };
 
 /*
@@ -317,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
+ * @ctx: Caller context (CPU or task); nesting is tracked per context.
  *
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
 static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 			  int (*nproc)(void *, void *, int), void *priv,
-			  void *cookie)
+			  void *cookie, void *ctx)
 {
 	int error, call_nests = 0;
 	unsigned long flags;
-	int this_cpu = get_cpu();
 	struct list_head *lsthead = &ncalls->tasks_call_list;
 	struct nested_call_node *tncur;
 	struct nested_call_node tnode;
@@ -340,7 +340,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	 * very much limited.
 	 */
 	list_for_each_entry(tncur, lsthead, llink) {
-		if (tncur->cpu == this_cpu &&
+		if (tncur->ctx == ctx &&
 		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
 			/*
 			 * Ops ... loop detected or maximum nest level reached.
@@ -352,7 +352,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	}
 
 	/* Add the current task and cookie to the list */
-	tnode.cpu = this_cpu;
+	tnode.ctx = ctx;
 	tnode.cookie = cookie;
 	list_add(&tnode.llink, lsthead);
 
@@ -364,10 +364,9 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
 	/* Remove the current task from the list */
 	spin_lock_irqsave(&ncalls->lock, flags);
 	list_del(&tnode.llink);
- out_unlock:
+out_unlock:
 	spin_unlock_irqrestore(&ncalls->lock, flags);
 
-	put_cpu();
 	return error;
 }
 
@@ -408,8 +407,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
  */
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
+	int this_cpu = get_cpu();
+
 	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
-		       ep_poll_wakeup_proc, NULL, wq);
+		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
+
+	put_cpu();
 }
 
 /*
@@ -663,7 +666,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep);
+				   ep_poll_readyevents_proc, ep, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 003500498c22..6cde970b0a1a 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -450,7 +450,7 @@ ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 
 /* Releases the page */
 void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
-			struct page *page, struct inode *inode)
+		   struct page *page, struct inode *inode, int update_times)
 {
 	loff_t pos = page_offset(page) +
 			(char *) de - (char *) page_address(page);
@@ -465,7 +465,8 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
 	ext2_set_de_type(de, inode);
 	err = ext2_commit_chunk(page, pos, len);
 	ext2_put_page(page);
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (update_times)
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
 	EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
 	mark_inode_dirty(dir);
 }
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f2e5811936d0..d988a718aedb 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -111,7 +111,7 @@ extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *,
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
-extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
+extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 90ea17998a73..6524ecaebb7a 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -320,7 +320,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ext2_set_link(new_dir, new_de, new_page, old_inode);
+		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			drop_nlink(new_inode);
@@ -352,7 +352,8 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ext2_set_link(old_inode, dir_de, dir_page, new_dir);
+		if (old_dir != new_dir)
+			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		inode_dec_link_count(old_dir);
 	}
 	return 0;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index d81ef2fdb08e..e0c745451715 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -129,12 +129,15 @@ fail:
 static inline struct posix_acl *
 ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
 {
-	struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
+	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
 
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT3_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *i_acl;
+		if (acl != EXT3_ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
 
 	return acl;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b0248c6d5d4c..05dea8132fc0 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -820,7 +820,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext3_fsblk_t blk;
 
-			if (!verify_chain(chain, partial)) {
+			if (!verify_chain(chain, chain + depth - 1)) {
 				/*
 				 * Indirect block might be removed by
 				 * truncate while we were reading it.
@@ -2374,7 +2374,7 @@ void ext3_truncate(struct inode *inode)
 	struct page *page;
 
 	if (!ext3_can_truncate(inode))
-		return;
+		goto out_notrans;
 
 	if (inode->i_size == 0 && ext3_should_writeback_data(inode))
 		ei->i_state |= EXT3_STATE_FLUSH_ON_CLOSE;
@@ -2390,7 +2390,7 @@ void ext3_truncate(struct inode *inode)
 		page = grab_cache_page(mapping,
 				inode->i_size >> PAGE_CACHE_SHIFT);
 		if (!page)
-			return;
+			goto out_notrans;
 	}
 
 	handle = start_transaction(inode);
@@ -2401,7 +2401,7 @@ void ext3_truncate(struct inode *inode)
 			unlock_page(page);
 			page_cache_release(page);
 		}
-		return;		/* AKPM: return what? */
+		goto out_notrans;
 	}
 
 	last_block = (inode->i_size + blocksize-1)
@@ -2525,6 +2525,14 @@ out_stop:
 		ext3_orphan_del(handle, inode);
 
 	ext3_journal_stop(handle);
+	return;
+out_notrans:
+	/*
+	 * Delete the inode from orphan list so that it doesn't stay there
+	 * forever and trigger assertion on umount.
+	 */
+	if (inode->i_nlink)
+		ext3_orphan_del(NULL, inode);
 }
 
 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -3122,12 +3130,6 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 
 	rc = inode_setattr(inode, attr);
 
-	/* If inode_setattr's call to ext3_truncate failed to get a
-	 * transaction handle at all, we need to clean up the in-core
-	 * orphan list manually. */
-	if (inode->i_nlink)
-		ext3_orphan_del(NULL, inode);
-
 	if (!rc && (ia_valid & ATTR_MODE))
 		rc = ext3_acl_chmod(inode);
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8a0b26340b54..8359e7b3dc89 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -990,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			"CONFIG_LBDAF not enabled\n");
 		return -EINVAL;
 	}
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 26aa64dee6aa..601e881e6105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1812,7 +1812,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
 			" too large to mount safely\n", sb->s_id);
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
 					"enabled\n");
 		goto failed_mount;
 	}
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8a34710ecf40..8867b2a1e5fe 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..605aeed96d68 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -129,12 +129,15 @@ fail:
 static inline struct posix_acl *
 ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
 {
-	struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
+	struct posix_acl *acl = ACCESS_ONCE(*i_acl);
 
-	spin_lock(&inode->i_lock);
-	if (*i_acl != EXT4_ACL_NOT_CACHED)
-		acl = posix_acl_dup(*i_acl);
-	spin_unlock(&inode->i_lock);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *i_acl;
+		if (acl != EXT4_ACL_NOT_CACHED)
+			acl = posix_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
 
 	return acl;
 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7d5edc38c9..17b9998680e3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -352,6 +352,7 @@ struct ext4_new_group_data {
  /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
+#define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 
 /*
  * ioctl commands in 32 bit emulation
@@ -447,6 +448,15 @@ struct ext4_inode {
 	__le32  i_version_hi;	/* high 32 bits for 64-bit version */
 };
 
+struct move_extent {	/* argument block of EXT4_IOC_MOVE_EXT */
+	__u32 reserved;		/* should be zero */
+	__u32 donor_fd;		/* donor file descriptor */
+	__u64 orig_start;	/* logical start offset in block for orig */
+	__u64 donor_start;	/* logical start offset in block for donor */
+	__u64 len;		/* block length to be moved */
+	__u64 moved_len;	/* moved block length */
+};
+#define MAX_DEFRAG_SIZE         ((1UL<<31) - 1)
 
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -674,7 +684,6 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
-#define EXT4_MOUNT_ABORT		0x00200	/* Fatal error detected */
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
 #define EXT4_MOUNT_JOURNAL_DATA		0x00400	/* Write data to journal */
 #define EXT4_MOUNT_ORDERED_DATA		0x00800	/* Flush data before commit */
@@ -696,17 +705,10 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
 #define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 
-/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
-#ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT4_MOUNT_##opt
 #define test_opt(sb, opt)		(EXT4_SB(sb)->s_mount_opt & \
 					 EXT4_MOUNT_##opt)
-#else
-#define EXT2_MOUNT_NOLOAD		EXT4_MOUNT_NOLOAD
-#define EXT2_MOUNT_ABORT		EXT4_MOUNT_ABORT
-#define EXT2_MOUNT_DATA_FLAGS		EXT4_MOUNT_DATA_FLAGS
-#endif
 
 #define ext4_set_bit			ext2_set_bit
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
@@ -824,6 +826,13 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
+
+/*
+ * run-time mount flags
+ */
+#define EXT4_MF_MNTDIR_SAMPLED	0x0001
+#define EXT4_MF_FS_ABORTED	0x0002	/* Fatal error detected */
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -842,7 +851,8 @@ struct ext4_sb_info {
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head **s_group_desc;
-	unsigned long  s_mount_opt;
+	unsigned int s_mount_opt;
+	unsigned int s_mount_flags;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -853,6 +863,7 @@ struct ext4_sb_info {
 	int s_inode_size;
 	int s_first_ino;
 	unsigned int s_inode_readahead_blks;
+	unsigned int s_inode_goal;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
@@ -1305,7 +1316,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
 /* ialloc.c */
-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
+				    const struct qstr *qstr, __u32 goal);
 extern void ext4_free_inode(handle_t *, struct inode *);
 extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
@@ -1329,7 +1341,7 @@ extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
-		unsigned long, unsigned long, int, unsigned long *);
+		ext4_fsblk_t, unsigned long, int, unsigned long *);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
@@ -1647,6 +1659,11 @@ extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
+/* move_extent.c */
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+			     __u64 start_orig, __u64 start_donor,
+			     __u64 len, __u64 *moved_len);
+
 
 /*
  * Add new method to test wether block and inode bitmaps are properly
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec85bd48..20a84105a10b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 }
 
 extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
 extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
 						   int num,
 						   struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+				      struct ext4_extent *ex1,
+				      struct ext4_extent *ex2);
 extern int ext4_ext_try_to_merge(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2593f748c3a4..50322a09bd01 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;
 
@@ -1417,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
 	return err;
 }
 
-static int
+int
 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 				struct ext4_extent *ex2)
 {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c77246..3f1873fef1c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,6 +21,8 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/mount.h>
+#include <linux/path.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+/* On first open of a writable mount, record the mount point in the sb. */
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct vfsmount *mnt = filp->f_path.mnt;
+	struct path path;
+	char buf[64], *cp;
+
+	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+		     !(sb->s_flags & MS_RDONLY))) {
+		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+		/*
+		 * Sample the mount point for sysadmin convenience.  d_path()
+		 * may return a pointer past the start of buf, so copy only
+		 * the string: strncpy() stops at the NUL and zero-pads.
+		 */
+		memset(buf, 0, sizeof(buf));
+		path.mnt = mnt->mnt_parent;
+		path.dentry = mnt->mnt_mountpoint;
+		path_get(&path);
+		cp = d_path(&path, buf, sizeof(buf));
+		path_put(&path);
+		if (!IS_ERR(cp)) {
+			strncpy(sbi->s_es->s_last_mounted, cp,
+				sizeof(sbi->s_es->s_last_mounted));
+			sb->s_dirt = 1;
+		}
+	}
+	return generic_file_open(inode, filp);
+}
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.mmap		= ext4_file_mmap,
-	.open		= generic_file_open,
+	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
-#include <linux/marker.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
-	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
-		   inode->i_sb->s_id, datasync, inode->i_ino,
-		   dentry->d_parent->d_inode->i_ino);
+	trace_ext4_sync_file(file, dentry, datasync);
 
 	/*
 	 * data=writeback:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3743bd849bce..2f645732e3b7 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,14 @@
 #include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <asm/byteorder.h>
+
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 
+#include <trace/events/ext4.h>
+
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
  */
@@ -208,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	ino = inode->i_ino;
 	ext4_debug("freeing inode %lu\n", ino);
-	trace_mark(ext4_free_inode,
-		   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
-		   sb->s_id, inode->i_ino, inode->i_mode,
-		   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
-		   (unsigned long long) inode->i_blocks);
+	trace_ext4_free_inode(inode);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -471,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
  */
 
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-			    ext4_group_t *group, int mode)
+			    ext4_group_t *group, int mode,
+			    const struct qstr *qstr)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -486,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	struct ext4_group_desc *desc;
 	struct orlov_stats stats;
 	int flex_size = ext4_flex_bg_size(sbi);
+	struct dx_hash_info hinfo;
 
 	ngroups = real_ngroups;
 	if (flex_size > 1) {
@@ -507,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		int best_ndir = inodes_per_group;
 		int ret = -1;
 
-		get_random_bytes(&grp, sizeof(grp));
+		if (qstr) {
+			hinfo.hash_version = DX_HASH_HALF_MD4;
+			hinfo.seed = sbi->s_hash_seed;
+			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+			grp = hinfo.hash;
+		} else
+			get_random_bytes(&grp, sizeof(grp));
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
 			g = (parent_group + i) % ngroups;
@@ -650,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode);
+		return find_group_orlov(sb, parent, group, mode, 0);
 	}
 
 	/*
@@ -791,7 +798,8 @@ err_ret:
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
+			     const struct qstr *qstr, __u32 goal)
 {
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
@@ -815,14 +823,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 
 	sb = dir->i_sb;
 	ngroups = ext4_get_groups_count(sb);
-	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
-		   dir->i_ino, mode);
+	trace_ext4_request_inode(dir, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	if (!goal)
+		goal = sbi->s_inode_goal;
+
+	if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
+		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+		ret2 = 0;
+		goto got_group;
+	}
+
 	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
 		if (ret2 == -1) {
@@ -841,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
-			ret2 = find_group_orlov(sb, dir, &group, mode);
+			ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
 	} else
 		ret2 = find_group_other(sb, dir, &group, mode);
 
@@ -851,7 +868,7 @@ got_group:
 	if (ret2 == -1)
 		goto out;
 
-	for (i = 0; i < ngroups; i++) {
+	for (i = 0; i < ngroups; i++, ino = 0) {
 		err = -EIO;
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -863,8 +880,6 @@ got_group:
 		if (!inode_bitmap_bh)
 			goto fail;
 
-		ino = 0;
-
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
@@ -1047,8 +1062,7 @@ got:
 	}
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
-	trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
-		   sb->s_id, inode->i_ino, dir->i_ino, mode);
+	trace_ext4_allocate_inode(inode, dir, mode);
 	goto really_out;
 fail:
 	ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 875db944b22f..7c17ae275af4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
 #include "ext4_extents.h"
 
+#include <trace/events/ext4.h>
+
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -78,7 +81,7 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
  * If the handle isn't valid we're not journaling so there's nothing to do.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, ext4_fsblk_t blocknr)
+		struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 
@@ -90,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 	BUFFER_TRACE(bh, "enter");
 
 	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
-		  "data mode %lx\n",
+		  "data mode %x\n",
 		  bh, is_metadata, inode->i_mode,
 		  test_opt(inode->i_sb, DATA_FLAGS));
 
@@ -329,8 +332,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
  */
 
 static int ext4_block_to_path(struct inode *inode,
-			ext4_lblk_t i_block,
-			ext4_lblk_t offsets[4], int *boundary)
+			      ext4_lblk_t i_block,
+			      ext4_lblk_t offsets[4], int *boundary)
 {
 	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
 	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -362,9 +365,9 @@ static int ext4_block_to_path(struct inode *inode,
 		final = ptrs;
 	} else {
 		ext4_warning(inode->i_sb, "ext4_block_to_path",
-				"block %lu > max in inode %lu",
-				i_block + direct_blocks +
-				indirect_blocks + double_blocks, inode->i_ino);
+			     "block %lu > max in inode %lu",
+			     i_block + direct_blocks +
+			     indirect_blocks + double_blocks, inode->i_ino);
 	}
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -379,25 +382,25 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 
 	while (bref < p+max) {
 		blk = le32_to_cpu(*bref++);
-		if (blk && 
-		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 
+		if (blk &&
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
 			ext4_error(inode->i_sb, function,
 				   "invalid block reference %u "
 				   "in inode #%lu", blk, inode->i_ino);
- 			return -EIO;
- 		}
- 	}
- 	return 0;
+			return -EIO;
+		}
+	}
+	return 0;
 }
 
 
 #define ext4_check_indirect_blockref(inode, bh)                         \
-        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+	__ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
 			      EXT4_ADDR_PER_BLOCK((inode)->i_sb))
 
 #define ext4_check_inode_blockref(inode)                                \
-        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+	__ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
 			      EXT4_NDIR_BLOCKS)
 
 /**
@@ -447,7 +450,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 		bh = sb_getblk(sb, le32_to_cpu(p->key));
 		if (unlikely(!bh))
 			goto failure;
-                  
+
 		if (!bh_uptodate_or_lock(bh)) {
 			if (bh_submit_read(bh) < 0) {
 				put_bh(bh);
@@ -459,7 +462,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 				goto failure;
 			}
 		}
-		
+
 		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
@@ -552,7 +555,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
  *	returns it.
  */
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
-		Indirect *partial)
+				   Indirect *partial)
 {
 	/*
 	 * XXX need to get goal block from mballoc's data structures
@@ -574,7 +577,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
  *	direct and indirect blocks.
  */
 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
-		int blocks_to_boundary)
+				 int blocks_to_boundary)
 {
 	unsigned int count = 0;
 
@@ -610,9 +613,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
  *		direct blocks
  */
 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, ext4_fsblk_t goal,
-				int indirect_blks, int blks,
-				ext4_fsblk_t new_blocks[4], int *err)
+			     ext4_lblk_t iblock, ext4_fsblk_t goal,
+			     int indirect_blks, int blks,
+			     ext4_fsblk_t new_blocks[4], int *err)
 {
 	struct ext4_allocation_request ar;
 	int target, i;
@@ -683,10 +686,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	}
 	if (!*err) {
 		if (target == blks) {
-		/*
-		 * save the new block number
-		 * for the first direct block
-		 */
+			/*
+			 * save the new block number
+			 * for the first direct block
+			 */
 			new_blocks[index] = current_block;
 		}
 		blk_allocated += ar.len;
@@ -728,9 +731,9 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, int indirect_blks,
-				int *blks, ext4_fsblk_t goal,
-				ext4_lblk_t *offsets, Indirect *branch)
+			     ext4_lblk_t iblock, int indirect_blks,
+			     int *blks, ext4_fsblk_t goal,
+			     ext4_lblk_t *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
 	int i, n = 0;
@@ -777,7 +780,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 			 * the chain to point to the new allocated
 			 * data blocks numbers
 			 */
-			for (i=1; i < num; i++)
+			for (i = 1; i < num; i++)
 				*(branch[n].p + i) = cpu_to_le32(++current_block);
 		}
 		BUFFER_TRACE(bh, "marking uptodate");
@@ -820,7 +823,8 @@ failed:
  * chain to new block and return 0.
  */
 static int ext4_splice_branch(handle_t *handle, struct inode *inode,
-			ext4_lblk_t block, Indirect *where, int num, int blks)
+			      ext4_lblk_t block, Indirect *where, int num,
+			      int blks)
 {
 	int i;
 	int err = 0;
@@ -852,10 +856,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
-
-	inode->i_ctime = ext4_current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-
 	/* had we spliced it onto indirect block? */
 	if (where->bh) {
 		/*
@@ -874,8 +874,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 	} else {
 		/*
 		 * OK, we spliced it into the inode itself on a direct block.
-		 * Inode was dirtied above.
 		 */
+		ext4_mark_inode_dirty(handle, inode);
 		jbd_debug(5, "splicing direct\n");
 	}
 	return err;
@@ -921,9 +921,9 @@ err_out:
  * blocks.
  */
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
-				  ext4_lblk_t iblock, unsigned int maxblocks,
-				  struct buffer_head *bh_result,
-				  int flags)
+			       ext4_lblk_t iblock, unsigned int maxblocks,
+			       struct buffer_head *bh_result,
+			       int flags)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
@@ -939,7 +939,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, iblock, offsets,
-					&blocks_to_boundary);
+				   &blocks_to_boundary);
 
 	if (depth == 0)
 		goto out;
@@ -987,8 +987,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * Block out ext4_truncate while we alter the tree
 	 */
 	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
-					&count, goal,
-					offsets + (partial - chain), partial);
+				&count, goal,
+				offsets + (partial - chain), partial);
 
 	/*
 	 * The ext4_splice_branch call will free and forget any buffers
@@ -999,8 +999,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 	if (!err)
 		err = ext4_splice_branch(handle, inode, iblock,
-					partial, indirect_blks, count);
-	else 
+					 partial, indirect_blks, count);
+	else
 		goto cleanup;
 
 	set_buffer_new(bh_result);
@@ -1172,7 +1172,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block, 
+		int ret = check_block_validity(inode, block,
 					       bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
@@ -1254,7 +1254,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && buffer_mapped(bh)) {
-		int ret = check_block_validity(inode, block, 
+		int ret = check_block_validity(inode, block,
 					       bh->b_blocknr, retval);
 		if (ret != 0)
 			return ret;
@@ -1405,8 +1405,7 @@ static int walk_page_buffers(handle_t *handle,
 
 	for (bh = head, block_start = 0;
 	     ret == 0 && (bh != head || !block_start);
-	     block_start = block_end, bh = next)
-	{
+	     block_start = block_end, bh = next) {
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
@@ -1447,7 +1446,7 @@ static int walk_page_buffers(handle_t *handle,
  * write.
  */
 static int do_journal_get_write_access(handle_t *handle,
-					struct buffer_head *bh)
+				       struct buffer_head *bh)
 {
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
@@ -1455,27 +1454,24 @@ static int do_journal_get_write_access(handle_t *handle,
 }
 
 static int ext4_write_begin(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned flags,
-				struct page **pagep, void **fsdata)
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	int ret, needed_blocks;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
- 	pgoff_t index;
+	pgoff_t index;
 	unsigned from, to;
 
-	trace_mark(ext4_write_begin,
-		   "dev %s ino %lu pos %llu len %u flags %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, flags);
+	trace_ext4_write_begin(inode, pos, len, flags);
 	/*
 	 * Reserve one block more for addition to orphan list in case
 	 * we allocate blocks but write fails for some reason
 	 */
 	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
- 	index = pos >> PAGE_CACHE_SHIFT;
+	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
@@ -1523,7 +1519,7 @@ retry:
 		ext4_journal_stop(handle);
 		if (pos + len > inode->i_size) {
 			vmtruncate(inode, inode->i_size);
-			/* 
+			/*
 			 * If vmtruncate failed early the inode might
 			 * still be on the orphan list; we need to
 			 * make sure the inode is removed from the
@@ -1550,9 +1546,9 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 }
 
 static int ext4_generic_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+				  struct address_space *mapping,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page, void *fsdata)
 {
 	int i_size_changed = 0;
 	struct inode *inode = mapping->host;
@@ -1603,18 +1599,15 @@ static int ext4_generic_write_end(struct file *file,
  * buffers are managed internally.
  */
 static int ext4_ordered_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+				  struct address_space *mapping,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 
-	trace_mark(ext4_ordered_write_end,
-		   "dev %s ino %lu pos %llu len %u copied %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, copied);
+	trace_ext4_ordered_write_end(inode, pos, len, copied);
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
@@ -1636,7 +1629,7 @@ static int ext4_ordered_write_end(struct file *file,
 
 	if (pos + len > inode->i_size) {
 		vmtruncate(inode, inode->i_size);
-		/* 
+		/*
 		 * If vmtruncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
@@ -1650,18 +1643,15 @@ static int ext4_ordered_write_end(struct file *file,
 }
 
 static int ext4_writeback_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+				    struct address_space *mapping,
+				    loff_t pos, unsigned len, unsigned copied,
+				    struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 
-	trace_mark(ext4_writeback_write_end,
-		   "dev %s ino %lu pos %llu len %u copied %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, copied);
+	trace_ext4_writeback_write_end(inode, pos, len, copied);
 	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
@@ -1681,7 +1671,7 @@ static int ext4_writeback_write_end(struct file *file,
 
 	if (pos + len > inode->i_size) {
 		vmtruncate(inode, inode->i_size);
-		/* 
+		/*
 		 * If vmtruncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
@@ -1694,9 +1684,9 @@ static int ext4_writeback_write_end(struct file *file,
 }
 
 static int ext4_journalled_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+				     struct address_space *mapping,
+				     loff_t pos, unsigned len, unsigned copied,
+				     struct page *page, void *fsdata)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
@@ -1705,10 +1695,7 @@ static int ext4_journalled_write_end(struct file *file,
 	unsigned from, to;
 	loff_t new_i_size;
 
-	trace_mark(ext4_journalled_write_end,
-		   "dev %s ino %lu pos %llu len %u copied %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, copied);
+	trace_ext4_journalled_write_end(inode, pos, len, copied);
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
@@ -1747,7 +1734,7 @@ static int ext4_journalled_write_end(struct file *file,
 		ret = ret2;
 	if (pos + len > inode->i_size) {
 		vmtruncate(inode, inode->i_size);
-		/* 
+		/*
 		 * If vmtruncate failed early the inode might still be
 		 * on the orphan list; we need to make sure the inode
 		 * is removed from the orphan list in that case.
@@ -1854,7 +1841,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-						unsigned long offset)
+					     unsigned long offset)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
@@ -2554,9 +2541,7 @@ static int ext4_da_writepage(struct page *page,
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
 
-	trace_mark(ext4_da_writepage,
-		   "dev %s ino %lu page_index %lu",
-		   inode->i_sb->s_id, inode->i_ino, page->index);
+	trace_ext4_da_writepage(inode, page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2667,19 +2652,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
-	trace_mark(ext4_da_writepages,
-		   "dev %s ino %lu nr_t_write %ld "
-		   "pages_skipped %ld range_start %llu "
-		   "range_end %llu nonblocking %d "
-		   "for_kupdate %d for_reclaim %d "
-		   "for_writepages %d range_cyclic %d",
-		   inode->i_sb->s_id, inode->i_ino,
-		   wbc->nr_to_write, wbc->pages_skipped,
-		   (unsigned long long) wbc->range_start,
-		   (unsigned long long) wbc->range_end,
-		   wbc->nonblocking, wbc->for_kupdate,
-		   wbc->for_reclaim, wbc->for_writepages,
-		   wbc->range_cyclic);
+	trace_ext4_da_writepages(inode, wbc);
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2693,13 +2666,13 @@ static int ext4_da_writepages(struct address_space *mapping,
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
 	 * will obscure the real source of the problem.  We test
-	 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
 	 * the latter could be true if the filesystem is mounted
 	 * read-only, and in that case, ext4_da_writepages should
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
-	if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
 	/*
@@ -2845,14 +2818,7 @@ out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
-	trace_mark(ext4_da_writepage_result,
-		   "dev %s ino %lu ret %d pages_written %d "
-		   "pages_skipped %ld congestion %d "
-		   "more_io %d no_nrwrite_index_update %d",
-		   inode->i_sb->s_id, inode->i_ino, ret,
-		   pages_written, wbc->pages_skipped,
-		   wbc->encountered_congestion, wbc->more_io,
-		   wbc->no_nrwrite_index_update);
+	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
 }
 
@@ -2884,8 +2850,8 @@ static int ext4_nonda_switch(struct super_block *sb)
 }
 
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned flags,
-				struct page **pagep, void **fsdata)
+			       loff_t pos, unsigned len, unsigned flags,
+			       struct page **pagep, void **fsdata)
 {
 	int ret, retries = 0;
 	struct page *page;
@@ -2904,11 +2870,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 					len, flags, pagep, fsdata);
 	}
 	*fsdata = (void *)0;
-
-	trace_mark(ext4_da_write_begin,
-		   "dev %s ino %lu pos %llu len %u flags %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, flags);
+	trace_ext4_da_write_begin(inode, pos, len, flags);
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2959,7 +2921,7 @@ out:
  * when write to the end of file but not require block allocation
  */
 static int ext4_da_should_update_i_disksize(struct page *page,
-					 unsigned long offset)
+					    unsigned long offset)
 {
 	struct buffer_head *bh;
 	struct inode *inode = page->mapping->host;
@@ -2978,9 +2940,9 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 }
 
 static int ext4_da_write_end(struct file *file,
-				struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct page *page, void *fsdata)
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned copied,
+			     struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
@@ -3001,10 +2963,7 @@ static int ext4_da_write_end(struct file *file,
 		}
 	}
 
-	trace_mark(ext4_da_write_end,
-		   "dev %s ino %lu pos %llu len %u copied %u",
-		   inode->i_sb->s_id, inode->i_ino,
-		   (unsigned long long) pos, len, copied);
+	trace_ext4_da_write_end(inode, pos, len, copied);
 	start = pos & (PAGE_CACHE_SIZE - 1);
 	end = start + copied - 1;
 
@@ -3081,7 +3040,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	 * not strictly speaking necessary (and for users of
 	 * laptop_mode, not even desirable).  However, to do otherwise
 	 * would require replicating code paths in:
-	 * 
+	 *
 	 * ext4_da_writepages() ->
 	 *    write_cache_pages() ---> (via passed in callback function)
 	 *        __mpage_da_writepage() -->
@@ -3101,7 +3060,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
 	 * write out the pages, but rather only collect contiguous
 	 * logical block extents, call the multi-block allocator, and
 	 * then update the buffer heads with the block allocations.
-	 * 
+	 *
 	 * For now, though, we'll cheat by calling filemap_flush(),
 	 * which will map the blocks, and start the I/O, but not
 	 * actually wait for the I/O to complete.
@@ -3237,7 +3196,7 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
  *
  */
 static int __ext4_normal_writepage(struct page *page,
-				struct writeback_control *wbc)
+				   struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 
@@ -3249,15 +3208,13 @@ static int __ext4_normal_writepage(struct page *page,
 }
 
 static int ext4_normal_writepage(struct page *page,
-				struct writeback_control *wbc)
+				 struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
-	trace_mark(ext4_normal_writepage,
-		   "dev %s ino %lu page_index %lu",
-		   inode->i_sb->s_id, inode->i_ino, page->index);
+	trace_ext4_normal_writepage(inode, page);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -3287,7 +3244,7 @@ static int ext4_normal_writepage(struct page *page,
 }
 
 static int __ext4_journalled_writepage(struct page *page,
-				struct writeback_control *wbc)
+				       struct writeback_control *wbc)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
@@ -3337,15 +3294,13 @@ out:
 }
 
 static int ext4_journalled_writepage(struct page *page,
-				struct writeback_control *wbc)
+				     struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
-	trace_mark(ext4_journalled_writepage,
-		   "dev %s ino %lu page_index %lu",
-		   inode->i_sb->s_id, inode->i_ino, page->index);
+	trace_ext4_journalled_writepage(inode, page);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -3442,8 +3397,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-			const struct iovec *iov, loff_t offset,
-			unsigned long nr_segs)
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
@@ -3763,7 +3718,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
  *			(no partially truncated stuff there).  */
 
 static Indirect *ext4_find_shared(struct inode *inode, int depth,
-			ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
+				  ext4_lblk_t offsets[4], Indirect chain[4],
+				  __le32 *top)
 {
 	Indirect *partial, *p;
 	int k, err;
@@ -3819,8 +3775,10 @@ no_top:
  * than `count' because there can be holes in there.
  */
 static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t block_to_free,
-		unsigned long count, __le32 *first, __le32 *last)
+			      struct buffer_head *bh,
+			      ext4_fsblk_t block_to_free,
+			      unsigned long count, __le32 *first,
+			      __le32 *last)
 {
 	__le32 *p;
 	if (try_to_extend_transaction(handle, inode)) {
@@ -3837,10 +3795,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/*
-	 * Any buffers which are on the journal will be in memory. We find
-	 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
-	 * on them.  We've already detached each block from the file, so
-	 * bforget() in jbd2_journal_forget() should be safe.
+	 * Any buffers which are on the journal will be in memory. We
+	 * find them on the hash table so jbd2_journal_revoke() will
+	 * run jbd2_journal_forget() on them.  We've already detached
+	 * each block from the file, so bforget() in
+	 * jbd2_journal_forget() should be safe.
 	 *
 	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
 	 */
@@ -4212,7 +4171,7 @@ void ext4_truncate(struct inode *inode)
 				   (__le32*)partial->bh->b_data+addr_per_block,
 				   (chain+n-1) - partial);
 		BUFFER_TRACE(partial->bh, "call brelse");
-		brelse (partial->bh);
+		brelse(partial->bh);
 		partial--;
 	}
 do_indirects:
@@ -4453,8 +4412,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
 	if (flags & S_DIRSYNC)
 		ei->i_flags |= EXT4_DIRSYNC_FL;
 }
+
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
-					struct ext4_inode_info *ei)
+				  struct ext4_inode_info *ei)
 {
 	blkcnt_t i_blocks ;
 	struct inode *inode = &(ei->vfs_inode);
@@ -4569,7 +4529,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 					EXT4_GOOD_OLD_INODE_SIZE +
 					ei->i_extra_isize;
 			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
-				 ei->i_state |= EXT4_STATE_XATTR;
+				ei->i_state |= EXT4_STATE_XATTR;
 		}
 	} else
 		ei->i_extra_isize = 0;
@@ -4588,7 +4548,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 
 	ret = 0;
 	if (ei->i_file_acl &&
-	    ((ei->i_file_acl < 
+	    ((ei->i_file_acl <
 	      (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
 	       EXT4_SB(sb)->s_gdb_count)) ||
 	     (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
@@ -4603,15 +4563,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		     !ext4_inode_is_fast_symlink(inode)))
 			/* Validate extent which is part of inode */
 			ret = ext4_ext_check_inode(inode);
- 	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		   (S_ISLNK(inode->i_mode) &&
 		    !ext4_inode_is_fast_symlink(inode))) {
-	 	/* Validate block references which are part of inode */
+		/* Validate block references which are part of inode */
 		ret = ext4_check_inode_blockref(inode);
 	}
 	if (ret) {
- 		brelse(bh);
- 		goto bad_inode;
+		brelse(bh);
+		goto bad_inode;
 	}
 
 	if (S_ISREG(inode->i_mode)) {
@@ -4642,7 +4602,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	} else {
 		brelse(bh);
 		ret = -EIO;
-		ext4_error(inode->i_sb, __func__, 
+		ext4_error(inode->i_sb, __func__,
 			   "bogus i_mode (%o) for inode=%lu",
 			   inode->i_mode, inode->i_ino);
 		goto bad_inode;
@@ -4795,8 +4755,9 @@ static int ext4_do_update_inode(handle_t *handle,
 				cpu_to_le32(new_encode_dev(inode->i_rdev));
 			raw_inode->i_block[2] = 0;
 		}
-	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
-		raw_inode->i_block[block] = ei->i_data[block];
+	} else
+		for (block = 0; block < EXT4_N_BLOCKS; block++)
+			raw_inode->i_block[block] = ei->i_data[block];
 
 	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
 	if (ei->i_extra_isize) {
@@ -5150,7 +5111,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
  * Give this, we know that the caller already has write access to iloc->bh.
  */
 int ext4_mark_iloc_dirty(handle_t *handle,
-		struct inode *inode, struct ext4_iloc *iloc)
+			 struct inode *inode, struct ext4_iloc *iloc)
 {
 	int err = 0;
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 91e75f7a9e73..bb415408fdb6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/compat.h>
 #include <linux/smp_lock.h>
 #include <linux/mount.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
@@ -213,6 +214,41 @@ setversion_out:
 
 		return err;
 	}
+
+	case EXT4_IOC_MOVE_EXT: {
+		struct move_extent __user *uarg;
+		struct move_extent me;
+		struct file *donor_filp;
+		int err;
+
+		/* arg is the user-space struct move_extent for this request */
+		uarg = (struct move_extent __user *)arg;
+		if (copy_from_user(&me, uarg, sizeof(me)))
+			return -EFAULT;
+
+		donor_filp = fget(me.donor_fd);
+		if (!donor_filp)
+			return -EBADF;
+
+		/* Unprivileged: must own inode; both inodes need S_IRUSR */
+		if (!capable(CAP_DAC_OVERRIDE) &&
+		    (current->real_cred->fsuid != inode->i_uid ||
+		     !(inode->i_mode & S_IRUSR) ||
+		     !(donor_filp->f_dentry->d_inode->i_mode & S_IRUSR))) {
+			fput(donor_filp);
+			return -EACCES;
+		}
+
+		err = ext4_move_extents(filp, donor_filp, me.orig_start,
+					me.donor_start, me.len, &me.moved_len);
+		fput(donor_filp);
+
+		/* me (including moved_len) is copied back only on success */
+		if (!err && copy_to_user(uarg, &me, sizeof(me)))
+			return -EFAULT;
+		return err;
+	}
+
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ed8482e22c0e..519a0a686d94 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
  */
 
 #include "mballoc.h"
+#include <trace/events/ext4.h>
+
 /*
  * MUSTDO:
  *   - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
-
-
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
 #if BITS_PER_LONG == 64
@@ -2859,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
 			+ entry->start_blk
 			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
-			   sb->s_id, (unsigned long long) discard_block,
-			   entry->count);
+		trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
+					  entry->count);
 		sb_issue_discard(sb, discard_block, entry->count);
 
 		kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -3629,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 
 	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-	trace_mark(ext4_mb_new_inode_pa,
-		   "dev %s ino %lu pstart %llu len %u lstart %u",
-		   sb->s_id, ac->ac_inode->i_ino,
-		   pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_ext4_mb_new_inode_pa(ac, pa);
 
 	ext4_mb_use_inode_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3691,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_type = MB_GROUP_PA;
 
 	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-		 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
-	trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
-		   sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_ext4_mb_new_group_pa(ac, pa);
 
 	ext4_mb_use_group_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3783,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			ext4_mb_store_history(ac);
 		}
 
-		trace_mark(ext4_mb_release_inode_pa,
-			   "dev %s ino %lu block %llu count %u",
-			   sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
-			   next - bit);
+		trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
+					       next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3820,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	if (ac)
 		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 
-	trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
-		   sb->s_id, pa->pa_pstart, pa->pa_len);
+	trace_ext4_mb_release_group_pa(ac, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3889,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
 	INIT_LIST_HEAD(&list);
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+	if (ac)
+		ac->ac_sb = sb;
 repeat:
 	ext4_lock_group(sb, group);
 	list_for_each_entry_safe(pa, tmp,
@@ -3987,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode)
 	}
 
 	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
-	trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
-		   inode->i_ino);
+	trace_ext4_discard_preallocations(inode);
 
 	INIT_LIST_HEAD(&list);
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+	if (ac) {
+		ac->ac_sb = sb;
+		ac->ac_inode = inode;
+	}
 repeat:
 	/* first, collect all pa's in the inode */
 	spin_lock(&ei->i_prealloc_lock);
@@ -4276,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 
 	INIT_LIST_HEAD(&discard_list);
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+	if (ac)
+		ac->ac_sb = sb;
 
 	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4445,8 +4444,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 	int ret;
 	int freed = 0;
 
-	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
-		   sb->s_id, needed);
+	trace_ext4_mb_discard_preallocations(sb, needed);
 	for (i = 0; i < ngroups && needed > 0; i++) {
 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
 		freed += ret;
@@ -4475,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
-	trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
-		   "lblk %llu goal %llu lleft %llu lright %llu "
-		   "pleft %llu pright %llu ",
-		   sb->s_id, ar->flags, ar->len,
-		   ar->inode ? ar->inode->i_ino : 0,
-		   (unsigned long long) ar->logical,
-		   (unsigned long long) ar->goal,
-		   (unsigned long long) ar->lleft,
-		   (unsigned long long) ar->lright,
-		   (unsigned long long) ar->pleft,
-		   (unsigned long long) ar->pright);
+	trace_ext4_request_blocks(ar);
 
 	/*
 	 * For delayed allocation, we could skip the ENOSPC and
@@ -4521,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (!ac) {
+	if (ac) {
+		ac->ac_sb = sb;
+		ac->ac_inode = ar->inode;
+	} else {
 		ar->len = 0;
 		*errp = -ENOMEM;
 		goto out1;
@@ -4594,18 +4585,7 @@ out3:
 						reserv_blks);
 	}
 
-	trace_mark(ext4_allocate_blocks,
-		   "dev %s block %llu flags %u len %u ino %lu "
-		   "logical %llu goal %llu lleft %llu lright %llu "
-		   "pleft %llu pright %llu ",
-		   sb->s_id, (unsigned long long) block,
-		   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
-		   (unsigned long long) ar->logical,
-		   (unsigned long long) ar->goal,
-		   (unsigned long long) ar->lleft,
-		   (unsigned long long) ar->lright,
-		   (unsigned long long) ar->pleft,
-		   (unsigned long long) ar->pright);
+	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
 
 	return block;
 }
@@ -4709,7 +4689,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
  * Main entry point into mballoc to free blocks
  */
 void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count,
+			ext4_fsblk_t block, unsigned long count,
 			int metadata, unsigned long *freed)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -4735,15 +4715,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	    block + count > ext4_blocks_count(es)) {
 		ext4_error(sb, __func__,
 			    "Freeing blocks not in datazone - "
-			    "block = %lu, count = %lu", block, count);
+			    "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug("freeing block %lu\n", block);
-	trace_mark(ext4_free_blocks,
-		   "dev %s block %llu count %lu metadata %d ino %lu",
-		   sb->s_id, (unsigned long long) block, count, metadata,
-		   inode ? inode->i_ino : 0);
+	ext4_debug("freeing block %llu\n", block);
+	trace_ext4_free_blocks(inode, block, count, metadata);
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
@@ -4784,7 +4761,7 @@ do_more:
 
 		ext4_error(sb, __func__,
 			   "Freeing blocks in system zone - "
-			   "Block = %lu, count = %lu", block, count);
+			   "Block = %llu, count = %lu", block, count);
 		/* err = 0. ext4_std_error should be a no op */
 		goto error_return;
 	}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 75e34f69215b..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,7 +19,6 @@
 #include <linux/seq_file.h>
 #include <linux/version.h>
 #include <linux/blkdev.h>
-#include <linux/marker.h>
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f79852..313a50b39741 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
 	struct inode *tmp_inode = NULL;
 	struct list_blocks_struct lb;
 	unsigned long max_entries;
+	__u32 goal;
 
 	/*
 	 * If the filesystem does not support extents, or the inode
@@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
 		retval = PTR_ERR(handle);
 		return retval;
 	}
-	tmp_inode = ext4_new_inode(handle,
-				inode->i_sb->s_root->d_inode,
-				S_IFREG);
+	goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+		EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+				   S_IFREG, 0, goal);
 	if (IS_ERR(tmp_inode)) {
 		retval = -ENOMEM;
 		ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 000000000000..bbf2dd9404dc
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ *            Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "ext4.h"
+
+#define get_ext_path(path, inode, block, ret)		\
+	do {	/* ret is written only on failure */			\
+		path = ext4_ext_find_extent(inode, block, path);	\
+		if (IS_ERR(path)) {					\
+			ret = PTR_ERR(path);	/* errno for caller */	\
+			path = NULL;	/* callers test for NULL */	\
+		}							\
+	} while (0)
+
+/**
+ * copy_extent_status - Propagate the uninitialized flag between extents
+ *
+ * @src:	extent whose initialization status is read
+ * @dest:	extent whose length field is rewritten to carry that status
+ */
+static void
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+	if (!ext4_ext_is_uninitialized(src))
+		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+	else
+		ext4_ext_mark_uninitialized(dest);
+}
+
+/**
+ * mext_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode:	inode which is searched
+ * @path:	this will obtain data for the next extent
+ * @extent:	pointer to the next extent we have just gotten
+ *
+ * Advance the ext4_ext_path array (@path) to the extent that follows the
+ * current one and store it in @extent; the leaf entry of @path (->p_ext)
+ * is updated to point at the same extent. Return 0 on success, 1 if
+ * @path already refers to the last extent, or -EIO if an index or leaf
+ * block could not be read.
+ */
+static int
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+		      struct ext4_extent **extent)
+{
+	int ppos, leaf_ppos = path->p_depth;
+
+	ppos = leaf_ppos;
+	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+		/* leaf block: the next extent is in the same leaf */
+		*extent = ++path[ppos].p_ext;
+		return 0;
+	}
+
+	while (--ppos >= 0) {	/* walk up until a level has a next index */
+		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+		    path[ppos].p_idx) {
+			int cur_ppos = ppos;
+
+			/* index block: step to the next index entry */
+			path[ppos].p_idx++;
+			path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+			if (path[ppos+1].p_bh)
+				brelse(path[ppos+1].p_bh);
+			path[ppos+1].p_bh =
+				sb_bread(inode->i_sb, path[ppos].p_block);
+			if (!path[ppos+1].p_bh)
+				return -EIO;
+			path[ppos+1].p_hdr =
+				ext_block_hdr(path[ppos+1].p_bh);
+
+			/* Halfway index blocks: descend via first entries */
+			while (++cur_ppos < leaf_ppos) {
+				path[cur_ppos].p_idx =
+					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+				path[cur_ppos].p_block =
+					idx_pblock(path[cur_ppos].p_idx);
+				if (path[cur_ppos+1].p_bh)
+					brelse(path[cur_ppos+1].p_bh);
+				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+					path[cur_ppos].p_block);
+				if (!path[cur_ppos+1].p_bh)
+					return -EIO;
+				path[cur_ppos+1].p_hdr =
+					ext_block_hdr(path[cur_ppos+1].p_bh);
+			}
+
+			/* leaf block: start at its first extent */
+			path[leaf_ppos].p_ext = *extent =
+				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+			return 0;
+		}
+	}
+	/* No level had a further entry: @path is at the last extent */
+	return 1;
+}
+
+/**
+ * mext_double_down_read - Acquire two inodes' read semaphore
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+ */
+static void
+mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct inode *first = orig_inode, *second = donor_inode;
+
+	BUG_ON(orig_inode == NULL || donor_inode == NULL);
+
+	/*
+	 * Order by inode number rather than by address, because C doesn't
+	 * guarantee pointers from different objects can be compared. The
+	 * second i_data_sem is annotated as nested, like the i_mutex pair.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		first = donor_inode;
+		second = orig_inode;
+	}
+
+	down_read(&EXT4_I(first)->i_data_sem);
+	down_read_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * mext_double_down_write - Acquire two inodes' write semaphore
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ */
+static void
+mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct inode *first = orig_inode, *second = donor_inode;
+
+	BUG_ON(orig_inode == NULL || donor_inode == NULL);
+
+	/*
+	 * Order by inode number rather than by address, because C doesn't
+	 * guarantee pointers from different objects can be compared. The
+	 * second i_data_sem is annotated as nested, like the i_mutex pair.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		first = donor_inode;
+		second = orig_inode;
+	}
+
+	down_write(&EXT4_I(first)->i_data_sem);
+	down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+}
+
+/**
+ * mext_double_up_read - Release two inodes' read semaphore
+ *
+ * @orig_inode:		original inode; its i_data_sem is released first
+ * @donor_inode:	donor inode; its i_data_sem is released second
+ * Drop the read side of i_data_sem on both inodes (orig, then donor).
+ */
+static void
+mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	up_read(&EXT4_I(orig_inode)->i_data_sem);
+	up_read(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_double_up_write - Release two inodes' write semaphore
+ *
+ * @orig_inode:		original inode; its i_data_sem is released first
+ * @donor_inode:	donor inode; its i_data_sem is released second
+ * Drop the write side of i_data_sem on both inodes (orig, then donor).
+ */
+static void
+mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_insert_across_blocks - Insert extents across leaf block
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @o_start:		first original extent to be changed
+ * @o_end:		last original extent to be changed
+ * @start_ext:		first new extent to be inserted
+ * @new_ext:		middle of new extent to be inserted
+ * @end_ext:		last new extent to be inserted
+ *
+ * Rewrite the original extents in place, then add @new_ext (and @end_ext
+ * if it needs its own slot) with ext4_ext_insert_extent(). Return 0 on
+ * success, otherwise the error from the path lookup or from the insert.
+ */
+static int
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	ext4_lblk_t eblock = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+	int err = 0;
+
+	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+		if (o_start == o_end) {
+
+			/*       start_ext   new_ext    end_ext
+			 * donor |---------|-----------|--------|
+			 * orig  |------------------------------|
+			 */
+			end_flag = 1;
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * donor |---------|----------|---------|
+			 * orig  |---------------|--------------|
+			 */
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		}
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (start_ext->ee_len && new_ext->ee_len &&
+		   !end_ext->ee_len && o_start == o_end) {
+
+		/*	 start_ext	new_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (!start_ext->ee_len && new_ext->ee_len &&
+		   end_ext->ee_len && o_start == o_end) {
+
+		/*	  new_ext	end_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+		/*
+		 * Set 0 to the extent block if new_ext was
+		 * the first block.
+		 */
+		if (new_ext->ee_block)
+			eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		ext4_debug("ext4 move extent: Unexpected insert case\n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		get_ext_path(orig_path, orig_inode, eblock, err);
+		if (orig_path == NULL)
+			goto out;
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					     orig_path, new_ext);
+		if (err)
+			goto out;
+	}
+
+	if (end_flag) {
+		get_ext_path(orig_path, orig_inode,
+				      le32_to_cpu(end_ext->ee_block) - 1, err);
+		if (orig_path == NULL)
+			goto out;
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					     orig_path, end_ext);
+		if (err)
+			goto out;
+	}
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+
+	return err;
+}
+
+/**
+ * mext_insert_inside_block - Insert new extent to the extent block
+ *
+ * @o_start:		first original extent to be moved
+ * @o_end:		last original extent to be moved
+ * @start_ext:		first new extent to be inserted
+ * @new_ext:		middle of new extent to be inserted
+ * @end_ext:		last new extent to be inserted
+ * @eh:			extent header of target leaf block
+ * @range_to_move:	used to decide how to insert extent
+ *
+ * Write the new extents into the leaf starting at @o_start, first shifting
+ * the entries after @o_end by @range_to_move slots to make or close a gap.
+ */
+static void
+mext_insert_inside_block(struct ext4_extent *o_start,
+			      struct ext4_extent *o_end,
+			      struct ext4_extent *start_ext,
+			      struct ext4_extent *new_ext,
+			      struct ext4_extent *end_ext,
+			      struct ext4_extent_header *eh,
+			      int range_to_move)
+{
+	int i = 0;
+	unsigned long len;
+
+	/* Shift the extents that follow o_end (len is a byte count) */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+			(unsigned long)(o_end + 1);
+		memmove(o_end + 1 + range_to_move, o_end + 1, len);
+	}
+
+	/* Start entry: only the length of the slot at o_start is rewritten */
+	if (start_ext->ee_len)
+		o_start[i++].ee_len = start_ext->ee_len;
+
+	/* New entry: copied whole into the next slot */
+	if (new_ext->ee_len) {
+		o_start[i] = *new_ext;
+		ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+	}
+
+	/* End entry: copied whole into the slot after that */
+	if (end_ext->ee_len)
+		o_start[i] = *end_ext;
+
+	/* Adjust the total entries counter on the extent block */
+	le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+
+/**
+ * mext_insert_extents - Insert new extent
+ *
+ * @handle:	journal handle
+ * @orig_inode:	original inode
+ * @orig_path:	path indicates first extent to be changed
+ * @o_start:	first original extent to be changed
+ * @o_end:	last original extent to be changed
+ * @start_ext:	first new extent to be inserted
+ * @new_ext:	middle of new extent to be inserted
+ * @end_ext:	last new extent to be inserted
+ *
+ * Call the function to insert extents. If we cannot add more extents into
+ * the leaf block, we call mext_insert_across_blocks() to create a
+ * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
+ * on success, or a negative error value on failure.
+ */
+static int
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+			 struct ext4_ext_path *orig_path,
+			 struct ext4_extent *o_start,
+			 struct ext4_extent *o_end,
+			 struct ext4_extent *start_ext,
+			 struct ext4_extent *new_ext,
+			 struct ext4_extent *end_ext)
+{
+	struct  ext4_extent_header *eh;
+	unsigned long need_slots, slots_range;
+	int	range_to_move, depth, ret;
+
+	/*
+	 * Count one slot for each of start_extent, new_extent and
+	 * end_extent that has a non-zero length.
+	 */
+	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
+		(new_ext->ee_len ? 1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+		/ sizeof(struct ext4_extent);
+
+	/* Slots to add (positive) or drop (negative) after o_end */
+	range_to_move = need_slots - slots_range;
+	depth = orig_path->p_depth;
+	orig_path += depth;
+	eh = orig_path->p_hdr;
+
+	if (depth) {
+		/* Register the leaf buffer to the journal before changing it */
+		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+		if (ret)
+			return ret;
+	}
+
+	/* Expansion: more slots are needed than the leaf has free */
+	if (range_to_move > 0 &&
+		(range_to_move > le16_to_cpu(eh->eh_max)
+			- le16_to_cpu(eh->eh_entries))) {
+
+		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+					o_end, start_ext, new_ext, end_ext);
+		if (ret < 0)
+			return ret;
+	} else
+		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+						end_ext, eh, range_to_move);
+
+	if (depth) {
+		ret = ext4_handle_dirty_metadata(handle, orig_inode,
+						 orig_path->p_bh);
+		if (ret)
+			return ret;
+	} else {	/* depth 0: no buffer was journaled, dirty the inode */
+		ret = ext4_mark_inode_dirty(handle, orig_inode);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @orig_path:		path indicates first extent to be changed
+ * @dext:		donor extent
+ * @from:		start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents: the one that takes the inserted
+ * extent, and the remainders of the original extent on either side of it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+		     ext4_lblk_t *from)
+{
+	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_lblk_t new_ext_end;
+	ext4_fsblk_t new_phys_end;
+	int oext_alen, new_ext_alen, end_ext_alen;
+	int depth = ext_depth(orig_inode);
+	int ret;
+
+	o_start = o_end = oext = orig_path[depth].p_ext;
+	oext_alen = ext4_ext_get_actual_len(oext);
+	start_ext.ee_len = end_ext.ee_len = 0;
+
+	new_ext.ee_block = cpu_to_le32(*from);
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	new_ext.ee_len = dext->ee_len;
+	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+	new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
+
+	/*
+	 * Case: new_ext starts inside oext, after its first block
+	 * oext      |--------|
+	 * new_ext      |--|
+	 * start_ext |--|
+	 */
+	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block) + oext_alen) {
+		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+					       le32_to_cpu(oext->ee_block));
+		copy_extent_status(oext, &start_ext);
+	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+		prev_ext = oext - 1;
+		/*
+		 * We can merge new_ext into the previous extent,
+		 * if these are contiguous and of the same extent type.
+		 */
+		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+					       &new_ext)) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le16(
+				ext4_ext_get_actual_len(prev_ext) +
+				new_ext_alen);
+			copy_extent_status(prev_ext, &start_ext);
+			new_ext.ee_len = 0;
+		}
+	}
+
+	/*
+	 * Case: new_ext must not extend past the last block of oext
+	 * oext      |-----------|
+	 * new_ext       |-------|
+	 */
+	BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+
+	/*
+	 * Case: new_ext ends before the last block of the original extent
+	 * oext    |---------------|
+	 * new_ext |-----------|
+	 * end_ext             |---|
+	 */
+	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+		end_ext.ee_len =
+			cpu_to_le16(le32_to_cpu(oext->ee_block) +
+			oext_alen - 1 - new_ext_end);
+		copy_extent_status(oext, &end_ext);
+		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+		ext4_ext_store_pblock(&end_ext,
+			(ext_pblock(o_end) + oext_alen - end_ext_alen));
+		end_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+			oext_alen - end_ext_alen);
+	}
+
+	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+				o_end, &start_ext, &new_ext, &end_ext);
+	return ret;
+}
+
+/**
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
+ *
+ * @tmp_dext:		the extent that will belong to the original inode
+ * @tmp_oext:		the extent that will belong to the donor inode
+ * @orig_off:		block offset of original inode
+ * @donor_off:		block offset of donor inode
+ * @max_count:		the maximum length of extents
+ */
+static void
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+			      struct ext4_extent *tmp_oext,
+			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+			      ext4_lblk_t max_count)
+{
+	ext4_lblk_t diff, orig_diff;
+	struct ext4_extent dext_old, oext_old;
+
+	dext_old = *tmp_dext;
+	oext_old = *tmp_oext;
+
+	/* Trim the head of tmp_dext so that it starts at donor_off. */
+	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+	ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+	tmp_dext->ee_block =
+			cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
+	tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+
+	if (max_count < ext4_ext_get_actual_len(tmp_dext))
+		tmp_dext->ee_len = cpu_to_le16(max_count);
+
+	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+	ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+
+	/* Adjust extent length if donor extent is larger than orig */
+	if (ext4_ext_get_actual_len(tmp_dext) >
+	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+						orig_diff);
+
+	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+	copy_extent_status(&oext_old, tmp_dext);
+	copy_extent_status(&dext_old, tmp_oext);
+}
+
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @from:		block offset of orig_inode
+ * @count:		block count to be replaced
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ *    dummy extents.
+ * 2. Change the block information of original inode to point at the
+ *    donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ *    original inode blocks in the dummy extents.
+ *
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+			   struct inode *donor_inode, ext4_lblk_t from,
+			   ext4_lblk_t count)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	struct ext4_ext_path *donor_path = NULL;
+	struct ext4_extent *oext, *dext;
+	struct ext4_extent tmp_dext, tmp_oext;
+	ext4_lblk_t orig_off = from, donor_off = from;
+	int err = 0;
+	int depth;
+	int replaced_count = 0;
+	int dext_alen;
+
+	mext_double_down_write(orig_inode, donor_inode); /* held until "out" */
+
+	/* Get the original extent for the block "orig_off" */
+	get_ext_path(orig_path, orig_inode, orig_off, err);
+	if (orig_path == NULL)
+		goto out;
+
+	/* Get the donor extent for the head */
+	get_ext_path(donor_path, donor_inode, donor_off, err);
+	if (donor_path == NULL)
+		goto out;
+	depth = ext_depth(orig_inode);
+	oext = orig_path[depth].p_ext;
+	tmp_oext = *oext;
+
+	depth = ext_depth(donor_inode);
+	dext = donor_path[depth].p_ext;
+	tmp_dext = *dext;
+
+	mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+				      donor_off, count);
+
+	/* Loop for the donor extents */
+	while (1) {
+		/* The extent for donor must be found. */
+		BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+
+		/* Set donor extent to orig extent */
+		err = mext_leaf_block(handle, orig_inode,
+					   orig_path, &tmp_dext, &orig_off);
+		if (err < 0)
+			goto out;
+
+		/* Set orig extent to donor extent */
+		err = mext_leaf_block(handle, donor_inode,
+					   donor_path, &tmp_oext, &donor_off);
+		if (err < 0)
+			goto out;
+
+		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+		replaced_count += dext_alen;
+		donor_off += dext_alen;
+		orig_off += dext_alen;
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		if (orig_path)
+			ext4_ext_drop_refs(orig_path);
+		get_ext_path(orig_path, orig_inode, orig_off, err);
+		if (orig_path == NULL)
+			goto out;
+		depth = ext_depth(orig_inode);
+		oext = orig_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) +
+				ext4_ext_get_actual_len(oext) <= orig_off) {
+			err = 0;	/* found extent ends before orig_off */
+			goto out;
+		}
+		tmp_oext = *oext;
+
+		if (donor_path)
+			ext4_ext_drop_refs(donor_path);
+		get_ext_path(donor_path, donor_inode,
+				      donor_off, err);
+		if (donor_path == NULL)
+			goto out;
+		depth = ext_depth(donor_inode);
+		dext = donor_path[depth].p_ext;
+		if (le32_to_cpu(dext->ee_block) +
+				ext4_ext_get_actual_len(dext) <= donor_off) {
+			err = 0;	/* found extent ends before donor_off */
+			goto out;
+		}
+		tmp_dext = *dext;
+
+		mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+					      donor_off,
+					      count - replaced_count);
+	}
+
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+	if (donor_path) {
+		ext4_ext_drop_refs(donor_path);
+		kfree(donor_path);
+	}
+
+	mext_double_up_write(orig_inode, donor_inode);
+	return err;
+}
+
+/**
+ * move_extent_par_page - Move extent data per page
+ *
+ * @o_filp:			file structure of original file
+ * @donor_inode:		donor inode
+ * @orig_page_offset:		page index on original file
+ * @data_offset_in_page:	block index where data swapping starts
+ * @block_len_in_page:		the number of blocks to be swapped
+ * @uninit:			orig extent is uninitialized or not
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return 0
+ * on success, or a negative error value on failure.
+ */
+static int
+move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+		  pgoff_t orig_page_offset, int data_offset_in_page,
+		  int block_len_in_page, int uninit)
+{
+	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct address_space *mapping = orig_inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page = NULL;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	ext4_lblk_t orig_blk_offset;
+	long long offs = orig_page_offset << PAGE_CACHE_SHIFT; /* reset below */
+	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+	unsigned int w_flags = 0;
+	unsigned int tmp_data_len, data_len;
+	void *fsdata;
+	int ret, i, jblocks;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and donor_inode may each change different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+	handle = ext4_journal_start(orig_inode, jblocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	orig_blk_offset = orig_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	/*
+	 * If the orig extent is an uninitialized one,
+	 * it's not necessary to force the page into memory
+	 * and then force it to be written out again.
+	 * Just swap data blocks between orig and donor.
+	 */
+	if (uninit) {
+		ret = mext_replace_branches(handle, orig_inode,
+						 donor_inode, orig_blk_offset,
+						 block_len_in_page);
+
+		/* Clear the inode cache not to refer to the old data */
+		ext4_ext_invalidate_cache(orig_inode);
+		ext4_ext_invalidate_cache(donor_inode);
+		goto out2;
+	}
+
+	offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+	/* Calculate data_len */
+	if ((orig_blk_offset + block_len_in_page - 1) ==
+	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+		/* Replace the last block */
+		tmp_data_len = orig_inode->i_size & (blocksize - 1);
+		/*
+		 * If tmp_data_len equals zero, i_size is a multiple of
+		 * blocksize. So we set the full block size instead.
+		 */
+		if (tmp_data_len == 0)
+			tmp_data_len = blocksize;
+
+		data_len = tmp_data_len +
+			((block_len_in_page - 1) << orig_inode->i_blkbits);
+	} else {
+		data_len = block_len_in_page << orig_inode->i_blkbits;
+	}
+
+	ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+				 &page, &fsdata);
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		/* NOTE(review): the readpage() result is not checked */
+		mapping->a_ops->readpage(o_filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple move extent processes.
+	 * It needs to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+
+	ret = mext_replace_branches(handle, orig_inode, donor_inode,
+					 orig_blk_offset, block_len_in_page);
+	if (ret < 0)
+		goto out;
+
+	/* Clear the inode cache not to refer to the old data */
+	ext4_ext_invalidate_cache(orig_inode);
+	ext4_ext_invalidate_cache(donor_inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+	bh = page_buffers(page);
+	for (i = 0; i < data_offset_in_page; i++)	/* skip to first bh */
+		bh = bh->b_this_page;
+
+	for (i = 0; i < block_len_in_page; i++) {	/* map each block */
+		ret = ext4_get_block(orig_inode,
+				(sector_t)(orig_blk_offset + i), bh, 0);
+		if (ret < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+			       page, fsdata);
+	page = NULL;	/* skip the page cleanup under "out" */
+
+out:
+	if (unlikely(page)) {
+		if (PageLocked(page))
+			unlock_page(page);
+		page_cache_release(page);
+	}
+out2:
+	ext4_journal_stop(handle);
+
+	return ret < 0 ? ret : 0;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @orig_start:		logical start offset in block for orig
+ * @donor_start:	logical start offset in block for donor
+ * @len:		the number of blocks to be moved
+ * @moved_len:		moved block length
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other. @len is clamped to the smaller file's size
+ * in blocks. Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+			  struct inode *donor_inode, __u64 orig_start,
+			  __u64 donor_start, __u64 *len, __u64 moved_len)
+{
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+	__u64 orig_blocks, donor_blocks;	/* i_size rounded up to blocks */
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Ext4 move extent does not support swapfile */
+	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+		ext4_debug("ext4 move extent: The argument files should "
+			"not be swapfile [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Files should be in the same ext4 FS */
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* orig and donor should be different file */
+	if (orig_inode->i_ino == donor_inode->i_ino) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Ext4 move extent supports only extent based file */
+	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+		ext4_debug("ext4 move extent: orig file is not extents "
+			"based file [ino:orig %lu]\n", orig_inode->i_ino);
+		return -EOPNOTSUPP;
+	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+		ext4_debug("ext4 move extent: donor file is not extents "
+			"based file [ino:donor %lu]\n", donor_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+		ext4_debug("ext4 move extent: File size is 0 byte\n");
+		return -EINVAL;
+	}
+
+	/* Start offset should be same */
+	if (orig_start != donor_start) {
+		ext4_debug("ext4 move extent: orig and donor's start "
+			"offset are not same [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (moved_len) {
+		ext4_debug("ext4 move extent: moved_len should be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if ((orig_start > MAX_DEFRAG_SIZE) ||
+	    (donor_start > MAX_DEFRAG_SIZE) ||
+	    (*len > MAX_DEFRAG_SIZE) ||
+	    (orig_start + *len > MAX_DEFRAG_SIZE))  {
+		ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
+			"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (orig_inode->i_size > donor_inode->i_size) {
+		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= donor_blocks) {
+			ext4_debug("ext4 move extent: orig start offset [%llu] "
+				"should be less than donor file blocks [%llu] "
+				"[ino:orig %lu, donor %lu]\n", orig_start,
+				donor_blocks, orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+		if (orig_start + *len > donor_blocks) {
+			ext4_debug("ext4 move extent: End offset [%llu] should "
+				"be less than donor file blocks [%llu]. "
+				"So adjust length from %llu to %llu "
+				"[ino:orig %lu, donor %lu]\n", orig_start + *len,
+				donor_blocks, *len, donor_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = donor_blocks - orig_start;
+		}
+	} else {
+		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= orig_blocks) {
+			ext4_debug("ext4 move extent: start offset [%llu] "
+				"should be less than original file blocks [%llu] "
+				"[ino:orig %lu, donor %lu]\n", orig_start,
+				orig_blocks, orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+		if (orig_start + *len > orig_blocks) {
+			ext4_debug("ext4 move extent: Adjust length from %llu "
+				"to %llu. Because it should be less than "
+				"original file blocks [ino:orig %lu, donor %lu]\n",
+				*len, orig_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = orig_blocks - orig_start;
+		}
+	}
+
+	if (!*len) {
+		ext4_debug("ext4 move extent: len should not be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ *
+ * Take both i_mutex locks in ascending i_ino order (from fs/inode.c).
+ */
+static void
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+	struct inode *parent = inode1, *child = inode2;
+
+	if (!inode1 || !inode2 || inode1 == inode2) {
+		/* Fewer than two distinct inodes: lock the one we have */
+		parent = inode1 ? inode1 : inode2;
+		if (parent)
+			mutex_lock(&parent->i_mutex);
+		return;
+	}
+
+	if (inode1->i_ino >= inode2->i_ino) {
+		parent = inode2;
+		child = inode1;
+	}
+	mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&child->i_mutex, I_MUTEX_CHILD);
+}
+
+/**
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1:     the inode that is released first (skipped if NULL)
+ * @inode2:     the inode that is released second (skipped if NULL)
+ *
+ * If both arguments are the same inode, its i_mutex is released only once.
+ */
+
+static void
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1)
+		mutex_unlock(&inode1->i_mutex);
+
+	if (inode2 && inode2 != inode1)
+		mutex_unlock(&inode2->i_mutex);
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp:		file structure of the original file
+ * @d_filp:		file structure of the donor file
+ * @orig_start:		start offset in block for orig
+ * @donor_start:	start offset in block for donor
+ * @len:		the number of blocks to be moved
+ * @moved_len:		moved block length
+ *
+ * On success this function returns 0 and the moved block length is set
+ * in moved_len; otherwise it returns an error value.
+ *
+ * Note: ext4_move_extents() proceeds in the following order.
+ * 1:ext4_move_extents() calculates the last block number of the range
+ *   to be moved from the start block number (orig_start) and the number
+ *   of blocks to be moved (len) specified as arguments.
+ *   If orig_start points to a hole, ext_cur (current extent),
+ *   holecheck_path and orig_path are advanced to the next extent after
+ *   the hole.
+ * 2:Repeat steps 3 to 5 until holecheck_path reaches the last extent or
+ *   ext_cur starts beyond block_end, the last logical block number.
+ * 3:To get the length of the contiguous area, call mext_next_extent()
+ *   repeatedly on ext_cur (initially taken from holecheck_path) until a
+ *   non-contiguous extent is found, the start logical block number exceeds
+ *   block_end, or the last extent is reached.
+ * 4:Exchange the original inode data with donor inode data
+ *   from orig_page_offset to seq_end_page.
+ *   The start indexes of data are specified as arguments.
+ *   That of the original inode is orig_page_offset,
+ *   and that of the donor inode is also orig_page_offset
+ *   (To easily handle the blocksize != pagesize case, the offset for the
+ *   donor inode is in block units).
+ * 5:Update holecheck_path and orig_path to point to the next extent to
+ *   process, then return to step 2.
+ * 6:Release holecheck_path and orig_path. moved_len then holds the number
+ *   of moved blocks (it must equal len on success), which is useful for
+ *   the caller to calculate the file offset for starting the next move
+ *   extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+		 __u64 orig_start, __u64 donor_start, __u64 len,
+		 __u64 *moved_len)
+{
+	struct inode *orig_inode = o_filp->f_dentry->d_inode;
+	struct inode *donor_inode = d_filp->f_dentry->d_inode;
+	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_start = orig_start;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	ext4_lblk_t rest_blocks;
+	pgoff_t orig_page_offset = 0, seq_end_page;
+	int ret, depth, last_extent = 0;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	int data_offset_in_page;
+	int block_len_in_page;
+	int uninit;
+
+	/* protect orig and donor against a truncate */
+	mext_inode_double_lock(orig_inode, donor_inode);
+
+	mext_double_down_read(orig_inode, donor_inode);
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+					donor_start, &len, *moved_len);
+	mext_double_up_read(orig_inode, donor_inode);
+	if (ret)
+		goto out2;
+
+	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+	block_end = block_start + len - 1;
+	if (file_end < block_end)
+		len -= block_end - file_end;
+
+	get_ext_path(orig_path, orig_inode, block_start, ret);
+	if (orig_path == NULL)
+		goto out2;
+
+	/* Get path structure to check the hole */
+	get_ext_path(holecheck_path, orig_inode, block_start, ret);
+	if (holecheck_path == NULL)
+		goto out;
+
+	depth = ext_depth(orig_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Get proper extent whose ee_block is beyond block_start
+	 * if block_start was within the hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		last_extent = mext_next_extent(orig_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = mext_next_extent(orig_inode, orig_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_start = block_start;
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		ext4_debug("ext4 move extent: The specified range of file "
+							"may be the hole\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = mext_next_extent(orig_inode, holecheck_path,
+						&ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+		/*
+		 * Extend the length of contiguous block (seq_blocks)
+		 * if extents are contiguous.
+		 */
+		if (ext4_can_extents_be_merged(orig_inode,
+					       ext_prev, ext_cur) &&
+		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+		    !last_extent)
+			continue;
+
+		/* Is the original extent uninitialized? */
+		uninit = ext4_ext_is_uninitialized(ext_prev);
+
+		data_offset_in_page = seq_start % blocks_per_page;
+
+		/*
+		 * Calculate data blocks count that should be swapped
+		 * at the first page.
+		 */
+		if (data_offset_in_page + seq_blocks > blocks_per_page) {
+			/* Swapped blocks are across pages */
+			block_len_in_page =
+					blocks_per_page - data_offset_in_page;
+		} else {
+			/* Swapped blocks are in a page */
+			block_len_in_page = seq_blocks;
+		}
+
+		orig_page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+		rest_blocks = seq_blocks;
+
+		/* Discard preallocations of two inodes */
+		down_write(&EXT4_I(orig_inode)->i_data_sem);
+		ext4_discard_preallocations(orig_inode);
+		up_write(&EXT4_I(orig_inode)->i_data_sem);
+
+		down_write(&EXT4_I(donor_inode)->i_data_sem);
+		ext4_discard_preallocations(donor_inode);
+		up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+		while (orig_page_offset <= seq_end_page) {
+
+			/* Swap original branches with new branches */
+			ret = move_extent_par_page(o_filp, donor_inode,
+						orig_page_offset,
+						data_offset_in_page,
+						block_len_in_page, uninit);
+			if (ret < 0)
+				goto out;
+			orig_page_offset++;
+			/* Count how many blocks we have exchanged */
+			*moved_len += block_len_in_page;
+			BUG_ON(*moved_len > len);
+
+			data_offset_in_page = 0;
+			rest_blocks -= block_len_in_page;
+			if (rest_blocks > blocks_per_page)
+				block_len_in_page = blocks_per_page;
+			else
+				block_len_in_page = rest_blocks;
+		}
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		get_ext_path(holecheck_path, orig_inode,
+				      seq_start, ret);
+		if (holecheck_path == NULL)
+			break;
+		depth = holecheck_path->p_depth;
+
+		/* Decrease buffer counter */
+		if (orig_path)
+			ext4_ext_drop_refs(orig_path);
+		get_ext_path(orig_path, orig_inode, seq_start, ret);
+		if (orig_path == NULL)
+			break;
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = ext4_ext_get_actual_len(ext_cur);
+		seq_blocks = 0;
+
+	}
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+out2:
+	mext_inode_double_unlock(orig_inode, donor_inode);
+
+	if (ret)
+		return ret;
+
+	/* All of the specified blocks must have been exchanged on success */
+	BUG_ON(*moved_len != len);
+
+	return 0;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07eb6649e4fa..de04013d16ff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1782,7 +1782,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2264,7 +2265,8 @@ retry:
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
-	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+			       &dentry->d_name, 0);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 27eb289eea37..68b0351fc647 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
+			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 012c4251397e..8bb9e2d3e4b8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -37,7 +37,6 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/ctype.h>
-#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -47,6 +46,9 @@
 #include "xattr.h"
 #include "acl.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
 static int default_mb_history_length = 1000;
 
 module_param_named(default_mb_history_length, default_mb_history_length,
@@ -301,7 +303,7 @@ static void ext4_handle_error(struct super_block *sb)
 	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
-		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
@@ -414,7 +416,7 @@ void ext4_abort(struct super_block *sb, const char *function,
 	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
-	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+	EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 	if (EXT4_SB(sb)->s_journal)
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
@@ -1474,7 +1476,7 @@ set_qf_format:
 			break;
 #endif
 		case Opt_abort:
-			set_opt(sbi->s_mount_opt, ABORT);
+			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
 			break;
 		case Opt_nobarrier:
 			clear_opt(sbi->s_mount_opt, BARRIER);
@@ -1653,7 +1655,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
-				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+				"bpg=%lu, ipg=%lu, mo=%04x]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
 			EXT4_BLOCKS_PER_GROUP(sb),
@@ -1957,7 +1959,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LBD is not enabled implies the inode
+		 * CONFIG_LBDAF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -2000,7 +2002,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD not enabled implies that
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
 		 * the inode i_block field represents total file blocks in
 		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
 		 */
@@ -2204,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2216,6 +2219,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
 	ATTR_LIST(inode_readahead_blks),
+	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
 	ATTR_LIST(mb_max_to_scan),
 	ATTR_LIST(mb_min_to_scan),
@@ -2436,13 +2440,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBD
+		 * mount if kernel is build with CONFIG_LBDAF
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
 			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LBD");
+					"without CONFIG_LBDAF");
 			goto failed_mount;
 		}
 	}
@@ -2566,7 +2570,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			" too large to mount safely");
 		if (sizeof(sector_t) < 8)
-			ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
+			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
 		goto failed_mount;
 	}
 
@@ -3346,7 +3350,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	int ret = 0;
 	tid_t target;
 
-	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
+	trace_ext4_sync_fs(sb, wait);
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
 			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
@@ -3450,7 +3454,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
+	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
 		ext4_abort(sb, __func__, "Abort forced by user");
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3465,7 +3469,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > ext4_blocks_count(es)) {
-		if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
 			err = -EROFS;
 			goto restore_opts;
 		}
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index b42602298087..923990e4f16e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 	while (*fclus < cluster) {
 		/* prevent the infinite loop of cluster chain */
 		if (*fclus > limit) {
-			fat_fs_panic(sb, "%s: detected the cluster chain loop"
+			fat_fs_error(sb, "%s: detected the cluster chain loop"
 				     " (i_pos %lld)", __func__,
 				     MSDOS_I(inode)->i_pos);
 			nr = -EIO;
@@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
 		if (nr < 0)
 			goto out;
 		else if (nr == FAT_ENT_FREE) {
-			fat_fs_panic(sb, "%s: invalid cluster chain"
+			fat_fs_error(sb, "%s: invalid cluster chain"
 				     " (i_pos %lld)", __func__,
 				     MSDOS_I(inode)->i_pos);
 			nr = -EIO;
@@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 	if (ret < 0)
 		return ret;
 	else if (ret == FAT_ENT_EOF) {
-		fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)",
+		fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
 			     __func__, MSDOS_I(inode)->i_pos);
 		return -EIO;
 	}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index f3500294eec5..38ff75a0fe22 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -22,6 +22,19 @@
 #include <asm/uaccess.h>
 #include "fat.h"
 
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
 static inline loff_t fat_make_i_pos(struct super_block *sb,
 				    struct buffer_head *bh,
 				    struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
 				unsigned char *buf, int size)
 {
 	if (sbi->options.utf8)
-		return utf8_wcstombs(buf, uni, size);
+		return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+				UTF16_HOST_ENDIAN, buf, size);
 	else
 		return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
 				   sbi->nls_io);
@@ -325,19 +339,6 @@ parse_long:
 }
 
 /*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-
-/*
  * Return values: negative -> error, 0 -> not found, positive -> found,
  * value is the total amount of slots, including the shortname entry.
  */
@@ -1334,7 +1335,7 @@ found:
 			goto error_remove;
 		}
 		if (dir->i_size & (sbi->cluster_size - 1)) {
-			fat_fs_panic(sb, "Odd directory size");
+			fat_fs_error(sb, "Odd directory size");
 			dir->i_size = (dir->i_size + sbi->cluster_size - 1)
 				& ~((loff_t)sbi->cluster_size - 1);
 		}
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e4d88527b5dd..adb0e72a176d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -17,6 +17,10 @@
 #define VFAT_SFN_CREATE_WIN95	0x0100 /* emulate win95 rule for create */
 #define VFAT_SFN_CREATE_WINNT	0x0200 /* emulate winnt rule for create */
 
+#define FAT_ERRORS_CONT		1      /* ignore error and continue */
+#define FAT_ERRORS_PANIC	2      /* panic on error */
+#define FAT_ERRORS_RO		3      /* remount r/o on error */
+
 struct fat_mount_options {
 	uid_t fs_uid;
 	gid_t fs_gid;
@@ -26,6 +30,7 @@ struct fat_mount_options {
 	char *iocharset;          /* Charset used for filename input/display */
 	unsigned short shortname; /* flags for shortname display/create rule */
 	unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+	unsigned char errors;	  /* On error: continue, panic, remount-ro */
 	unsigned short allow_utime;/* permission for setting the [am]time */
 	unsigned quiet:1,         /* set = fake successful chmods and chowns */
 		 showexec:1,      /* set = only set x bit for com/exe/bat */
@@ -316,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 		            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 618f5305c2e4..a81037721a6f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -348,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
 
 	if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
 		fatent_brelse(fatent);
-		fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry);
+		fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
 		return -EIO;
 	}
 
@@ -560,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			err = cluster;
 			goto error;
 		} else if (cluster == FAT_ENT_FREE) {
-			fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF",
+			fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
 				     __func__);
 			err = -EIO;
 			goto error;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e955a56b4e5e..b28ea646ff60 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -18,106 +18,112 @@
 #include <linux/security.h>
 #include "fat.h"
 
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
-		      unsigned int cmd, unsigned long arg)
+static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
+	u32 attr;
+
+	mutex_lock(&inode->i_mutex);
+	attr = fat_make_attrs(inode);
+	mutex_unlock(&inode->i_mutex);
+
+	return put_user(attr, user_attr);
+}
+
+static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-	u32 __user *user_attr = (u32 __user *)arg;
+	int is_dir = S_ISDIR(inode->i_mode);
+	u32 attr, oldattr;
+	struct iattr ia;
+	int err;
 
-	switch (cmd) {
-	case FAT_IOCTL_GET_ATTRIBUTES:
-	{
-		u32 attr;
+	err = get_user(attr, user_attr);
+	if (err)
+		goto out;
 
-		mutex_lock(&inode->i_mutex);
-		attr = fat_make_attrs(inode);
-		mutex_unlock(&inode->i_mutex);
+	mutex_lock(&inode->i_mutex);
+	err = mnt_want_write(file->f_path.mnt);
+	if (err)
+		goto out_unlock_inode;
 
-		return put_user(attr, user_attr);
+	/*
+	 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
+	 * prevents the user from turning us into a VFAT
+	 * longname entry.  Also, we obviously can't set
+	 * any of the NTFS attributes in the high 24 bits.
+	 */
+	attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
+	/* Merge in ATTR_VOLUME and ATTR_DIR */
+	attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
+		(is_dir ? ATTR_DIR : 0);
+	oldattr = fat_make_attrs(inode);
+
+	/* Equivalent to a chmod() */
+	ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+	ia.ia_ctime = current_fs_time(inode->i_sb);
+	if (is_dir)
+		ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+	else {
+		ia.ia_mode = fat_make_mode(sbi, attr,
+			S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
 	}
-	case FAT_IOCTL_SET_ATTRIBUTES:
-	{
-		u32 attr, oldattr;
-		int err, is_dir = S_ISDIR(inode->i_mode);
-		struct iattr ia;
 
-		err = get_user(attr, user_attr);
-		if (err)
-			return err;
+	/* The root directory has no attributes */
+	if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
+		err = -EINVAL;
+		goto out_drop_write;
+	}
 
-		mutex_lock(&inode->i_mutex);
-
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
-			goto up_no_drop_write;
-
-		/*
-		 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
-		 * prevents the user from turning us into a VFAT
-		 * longname entry.  Also, we obviously can't set
-		 * any of the NTFS attributes in the high 24 bits.
-		 */
-		attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
-		/* Merge in ATTR_VOLUME and ATTR_DIR */
-		attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
-			(is_dir ? ATTR_DIR : 0);
-		oldattr = fat_make_attrs(inode);
-
-		/* Equivalent to a chmod() */
-		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
-		ia.ia_ctime = current_fs_time(inode->i_sb);
-		if (is_dir)
-			ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
-		else {
-			ia.ia_mode = fat_make_mode(sbi, attr,
-				S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
-		}
+	if (sbi->options.sys_immutable &&
+	    ((attr | oldattr) & ATTR_SYS) &&
+	    !capable(CAP_LINUX_IMMUTABLE)) {
+		err = -EPERM;
+		goto out_drop_write;
+	}
 
-		/* The root directory has no attributes */
-		if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
-			err = -EINVAL;
-			goto up;
-		}
+	/*
+	 * The security check is questionable...  We single
+	 * out the RO attribute for checking by the security
+	 * module, just because it maps to a file mode.
+	 */
+	err = security_inode_setattr(file->f_path.dentry, &ia);
+	if (err)
+		goto out_drop_write;
 
-		if (sbi->options.sys_immutable) {
-			if ((attr | oldattr) & ATTR_SYS) {
-				if (!capable(CAP_LINUX_IMMUTABLE)) {
-					err = -EPERM;
-					goto up;
-				}
-			}
-		}
+	/* This MUST be done before doing anything irreversible... */
+	err = fat_setattr(file->f_path.dentry, &ia);
+	if (err)
+		goto out_drop_write;
+
+	fsnotify_change(file->f_path.dentry, ia.ia_valid);
+	if (sbi->options.sys_immutable) {
+		if (attr & ATTR_SYS)
+			inode->i_flags |= S_IMMUTABLE;
+		else	/* clear only S_IMMUTABLE, keep the other i_flags */
+			inode->i_flags &= ~S_IMMUTABLE;
+	}
 
-		/*
-		 * The security check is questionable...  We single
-		 * out the RO attribute for checking by the security
-		 * module, just because it maps to a file mode.
-		 */
-		err = security_inode_setattr(filp->f_path.dentry, &ia);
-		if (err)
-			goto up;
-
-		/* This MUST be done before doing anything irreversible... */
-		err = fat_setattr(filp->f_path.dentry, &ia);
-		if (err)
-			goto up;
-
-		fsnotify_change(filp->f_path.dentry, ia.ia_valid);
-		if (sbi->options.sys_immutable) {
-			if (attr & ATTR_SYS)
-				inode->i_flags |= S_IMMUTABLE;
-			else
-				inode->i_flags &= S_IMMUTABLE;
-		}
+	fat_save_attrs(inode, attr);
+	mark_inode_dirty(inode);
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
+out_unlock_inode:
+	mutex_unlock(&inode->i_mutex);
+out:
+	return err;
+}
 
-		fat_save_attrs(inode, attr);
-		mark_inode_dirty(inode);
-up:
-		mnt_drop_write(filp->f_path.mnt);
-up_no_drop_write:
-		mutex_unlock(&inode->i_mutex);
-		return err;
-	}
+int fat_generic_ioctl(struct inode *inode, struct file *filp,
+		      unsigned int cmd, unsigned long arg)
+{
+	u32 __user *user_attr = (u32 __user *)arg;
+
+	switch (cmd) {
+	case FAT_IOCTL_GET_ATTRIBUTES:
+		return fat_ioctl_get_attributes(inode, user_attr);
+	case FAT_IOCTL_SET_ATTRIBUTES:
+		return fat_ioctl_set_attributes(filp, user_attr);
 	default:
 		return -ENOTTY;	/* Inappropriate ioctl for device */
 	}
@@ -225,7 +231,7 @@ static int fat_free(struct inode *inode, int skip)
 			fatent_brelse(&fatent);
 			return 0;
 		} else if (ret == FAT_ENT_FREE) {
-			fat_fs_panic(sb,
+			fat_fs_error(sb,
 				     "%s: invalid cluster chain (i_pos %lld)",
 				     __func__, MSDOS_I(inode)->i_pos);
 			ret = -EIO;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 51a5ecf9000a..8970d8c49bb0 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 		return 0;
 
 	if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
-		fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
+		fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
 			MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
 		return -EIO;
 	}
@@ -856,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",flush");
 	if (opts->tz_utc)
 		seq_puts(m, ",tz=UTC");
+	if (opts->errors == FAT_ERRORS_CONT)
+		seq_puts(m, ",errors=continue");
+	else if (opts->errors == FAT_ERRORS_PANIC)
+		seq_puts(m, ",errors=panic");
+	else
+		seq_puts(m, ",errors=remount-ro");
 
 	return 0;
 }
@@ -868,7 +874,8 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
+	Opt_err_panic, Opt_err_ro, Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -891,6 +898,11 @@ static const match_table_t fat_tokens = {
 	{Opt_showexec, "showexec"},
 	{Opt_debug, "debug"},
 	{Opt_immutable, "sys_immutable"},
+	{Opt_flush, "flush"},
+	{Opt_tz_utc, "tz=UTC"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_obsolate, "conv=binary"},
 	{Opt_obsolate, "conv=text"},
 	{Opt_obsolate, "conv=auto"},
@@ -902,8 +914,6 @@ static const match_table_t fat_tokens = {
 	{Opt_obsolate, "cvf_format=%20s"},
 	{Opt_obsolate, "cvf_options=%100s"},
 	{Opt_obsolate, "posix"},
-	{Opt_flush, "flush"},
-	{Opt_tz_utc, "tz=UTC"},
 	{Opt_err, NULL},
 };
 static const match_table_t msdos_tokens = {
@@ -956,7 +966,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 
 	opts->fs_uid = current_uid();
 	opts->fs_gid = current_gid();
-	opts->fs_fmask = current_umask();
+	opts->fs_fmask = opts->fs_dmask = current_umask();
 	opts->allow_utime = -1;
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
@@ -973,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->numtail = 1;
 	opts->usefree = opts->nocase = 0;
 	opts->tz_utc = 0;
+	opts->errors = FAT_ERRORS_RO;
 	*debug = 0;
 
 	if (!options)
@@ -1065,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_tz_utc:
 			opts->tz_utc = 1;
 			break;
+		case Opt_err_cont:
+			opts->errors = FAT_ERRORS_CONT;
+			break;
+		case Opt_err_panic:
+			opts->errors = FAT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			opts->errors = FAT_ERRORS_RO;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebcc1496..a6c20473dfd7 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -12,14 +12,19 @@
 #include "fat.h"
 
 /*
- * fat_fs_panic reports a severe file system problem and sets the file system
- * read-only. The file system can be made writable again by remounting it.
+ * fat_fs_error reports a file system problem that might indicate data
+ * corruption/inconsistency. Depending on the 'errors' mount option,
+ * panic() is called, or an error message is printed and nothing else is
+ * done, or the filesystem is set read-only (default behavior).
+ * In case the file system is set read-only, it can be made writable
+ * again by remounting it.
  */
-void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+void fat_fs_error(struct super_block *s, const char *fmt, ...)
 {
+	struct fat_mount_options *opts = &MSDOS_SB(s)->options;
 	va_list args;
 
-	printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id);
+	printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
 
 	printk(KERN_ERR "    ");
 	va_start(args, fmt);
@@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
 	va_end(args);
 	printk("\n");
 
-	if (!(s->s_flags & MS_RDONLY)) {
+	if (opts->errors == FAT_ERRORS_PANIC)
+		panic("    FAT fs panic from previous error\n");
+	else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
 		s->s_flags |= MS_RDONLY;
 		printk(KERN_ERR "    File system has been set read-only\n");
 	}
 }
-
-EXPORT_SYMBOL_GPL(fat_fs_panic);
+EXPORT_SYMBOL_GPL(fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
@@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 			mark_inode_dirty(inode);
 	}
 	if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-		fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+		fat_fs_error(sb, "clusters badly computed (%d != %llu)",
 			     new_fclus,
 			     (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
 		fat_cache_inval_inode(inode);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 20f522861355..82f88733b681 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -608,7 +608,7 @@ error_inode:
 		sinfo.bh = NULL;
 	}
 	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
+		fat_fs_error(new_dir->i_sb,
 			     "%s: Filesystem corrupted (i_pos %lld)",
 			     __func__, sinfo.i_pos);
 	}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b50ecbe97f83..73471b7ecc8c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 	if (utf8) {
 		int name_len = strlen(name);
 
-		*outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+		*outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
 
 		/*
 		 * We stripped '.'s before and set len appropriately,
-		 * but utf8_mbstowcs doesn't care about len
+		 * but utf8s_to_utf16s doesn't care about len
 		 */
 		*outlen -= (name_len - len);
 
@@ -1030,7 +1030,7 @@ error_inode:
 		sinfo.bh = NULL;
 	}
 	if (corrupt < 0) {
-		fat_fs_panic(new_dir->i_sb,
+		fat_fs_error(new_dir->i_sb,
 			     "%s: Filesystem corrupted (i_pos %lld)",
 			     __func__, sinfo.i_pos);
 	}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 1ad703150dee..a040b764f8e3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -198,15 +198,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 }
 
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
-                     uid_t uid, uid_t euid, int force)
+                     int force)
 {
 	write_lock_irq(&filp->f_owner.lock);
 	if (force || !filp->f_owner.pid) {
 		put_pid(filp->f_owner.pid);
 		filp->f_owner.pid = get_pid(pid);
 		filp->f_owner.pid_type = type;
-		filp->f_owner.uid = uid;
-		filp->f_owner.euid = euid;
+
+		if (pid) {
+			const struct cred *cred = current_cred();
+			filp->f_owner.uid = cred->uid;
+			filp->f_owner.euid = cred->euid;
+		}
 	}
 	write_unlock_irq(&filp->f_owner.lock);
 }
@@ -214,14 +218,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
 int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
 		int force)
 {
-	const struct cred *cred = current_cred();
 	int err;
-	
+
 	err = security_file_set_fowner(filp);
 	if (err)
 		return err;
 
-	f_modown(filp, pid, type, cred->uid, cred->euid, force);
+	f_modown(filp, pid, type, force);
 	return 0;
 }
 EXPORT_SYMBOL(__f_setown);
@@ -247,7 +250,7 @@ EXPORT_SYMBOL(f_setown);
 
 void f_delown(struct file *filp)
 {
-	f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1);
+	f_modown(filp, NULL, PIDTYPE_PID, 1);
 }
 
 pid_t f_getown(struct file *filp)
@@ -425,14 +428,20 @@ static inline int sigio_perm(struct task_struct *p,
 }
 
 static void send_sigio_to_task(struct task_struct *p,
-			       struct fown_struct *fown, 
+			       struct fown_struct *fown,
 			       int fd,
 			       int reason)
 {
-	if (!sigio_perm(p, fown, fown->signum))
+	/*
+	 * F_SETSIG can change ->signum lockless in parallel, make
+	 * sure we read it once and use the same value throughout.
+	 */
+	int signum = ACCESS_ONCE(fown->signum);
+
+	if (!sigio_perm(p, fown, signum))
 		return;
 
-	switch (fown->signum) {
+	switch (signum) {
 		siginfo_t si;
 		default:
 			/* Queue a rt signal with the appropriate fd as its
@@ -441,7 +450,7 @@ static void send_sigio_to_task(struct task_struct *p,
 			   delivered even if we can't queue.  Failure to
 			   queue in this case _should_ be reported; we fall
 			   back to SIGIO in that case. --sct */
-			si.si_signo = fown->signum;
+			si.si_signo = signum;
 			si.si_errno = 0;
 		        si.si_code  = reason;
 			/* Make sure we are called with one of the POLL_*
@@ -453,7 +462,7 @@ static void send_sigio_to_task(struct task_struct *p,
 			else
 				si.si_band = band_table[reason - POLL_IN];
 			si.si_fd    = fd;
-			if (!group_send_sig_info(fown->signum, &si, p))
+			if (!group_send_sig_info(signum, &si, p))
 				break;
 		/* fall-through: fall back on the old plain SIGIO signal */
 		case 0:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 40308e98c6a4..caf049146ca2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -321,7 +321,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
-	if (!(inode->i_state & I_FREEING)) {
+	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 		if (!(inode->i_state & I_DIRTY) &&
 		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
@@ -492,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			break;
 		}
 
-		if (inode->i_state & I_NEW) {
+		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
@@ -523,7 +523,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		if (current_is_pdflush() && !writeback_acquire(bdi))
 			break;
 
-		BUG_ON(inode->i_state & I_FREEING);
+		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0df55a52929..d8673ccf90b7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -19,7 +19,6 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
-#include <linux/smp_lock.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -260,9 +259,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 static void fuse_umount_begin(struct super_block *sb)
 {
-	lock_kernel();
 	fuse_abort_conn(get_fuse_conn_super(sb));
-	unlock_kernel();
 }
 
 static void fuse_send_destroy(struct fuse_conn *fc)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cad957cdb1e5..5971359d2090 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LBD)
+	depends on EXPERIMENTAL && (64BIT || LBDAF)
 	select DLM if GFS2_FS_LOCKING_DLM
 	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
 	select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 286f38dfc6c0..001f8d3118f2 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -70,9 +70,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
 	res = get_user(block, p);
 	if (res)
 		return res;
-	lock_kernel();
 	res = mapping->a_ops->bmap(mapping, block);
-	unlock_kernel();
 	return put_user(res, p);
 }
 
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 2f0dc5a14633..8ba5441063be 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -195,9 +195,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
 		 * Do not report hidden files if so instructed, or associated
 		 * files unless instructed to do so
 		 */
-		if ((sbi->s_hide == 'y' &&
-				(de->flags[-sbi->s_high_sierra] & 1)) ||
-				(sbi->s_showassoc =='n' &&
+		if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
+		    (!sbi->s_showassoc &&
 				(de->flags[-sbi->s_high_sierra] & 4))) {
 			filp->f_pos += de_len;
 			continue;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 068b34b5a107..58a7963e168a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -141,13 +141,17 @@ static const struct dentry_operations isofs_dentry_ops[] = {
 };
 
 struct iso9660_options{
-	char map;
-	char rock;
+	unsigned int rock:1;
+	unsigned int cruft:1;
+	unsigned int hide:1;
+	unsigned int showassoc:1;
+	unsigned int nocompress:1;
+	unsigned int overriderockperm:1;
+	unsigned int uid_set:1;
+	unsigned int gid_set:1;
+	unsigned int utf8:1;
+	unsigned char map;
 	char joliet;
-	char cruft;
-	char hide;
-	char showassoc;
-	char nocompress;
 	unsigned char check;
 	unsigned int blocksize;
 	mode_t fmode;
@@ -155,7 +159,6 @@ struct iso9660_options{
 	gid_t gid;
 	uid_t uid;
 	char *iocharset;
-	unsigned char utf8;
 	/* LVE */
 	s32 session;
 	s32 sbsector;
@@ -312,7 +315,7 @@ enum {
 	Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore,
 	Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet,
 	Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err,
-	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
+	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, Opt_overriderockperm,
 };
 
 static const match_table_t tokens = {
@@ -340,6 +343,7 @@ static const match_table_t tokens = {
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%u"},
 	{Opt_dmode, "dmode=%u"},
+	{Opt_overriderockperm, "overriderockperm"},
 	{Opt_block, "block=%u"},
 	{Opt_ignore, "conv=binary"},
 	{Opt_ignore, "conv=b"},
@@ -359,24 +363,22 @@ static int parse_options(char *options, struct iso9660_options *popt)
 	int option;
 
 	popt->map = 'n';
-	popt->rock = 'y';
-	popt->joliet = 'y';
-	popt->cruft = 'n';
-	popt->hide = 'n';
-	popt->showassoc = 'n';
+	popt->rock = 1;
+	popt->joliet = 1;
+	popt->cruft = 0;
+	popt->hide = 0;
+	popt->showassoc = 0;
 	popt->check = 'u';		/* unset */
 	popt->nocompress = 0;
 	popt->blocksize = 1024;
-	popt->fmode = popt->dmode = S_IRUGO | S_IXUGO; /*
-					 * r-x for all.  The disc could
-					 * be shared with DOS machines so
-					 * virtually anything could be
-					 * a valid executable.
-					 */
+	popt->fmode = popt->dmode = ISOFS_INVALID_MODE;
+	popt->uid_set = 0;
+	popt->gid_set = 0;
 	popt->gid = 0;
 	popt->uid = 0;
 	popt->iocharset = NULL;
 	popt->utf8 = 0;
+	popt->overriderockperm = 0;
 	popt->session=-1;
 	popt->sbsector=-1;
 	if (!options)
@@ -393,20 +395,20 @@ static int parse_options(char *options, struct iso9660_options *popt)
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_norock:
-			popt->rock = 'n';
+			popt->rock = 0;
 			break;
 		case Opt_nojoliet:
-			popt->joliet = 'n';
+			popt->joliet = 0;
 			break;
 		case Opt_hide:
-			popt->hide = 'y';
+			popt->hide = 1;
 			break;
 		case Opt_unhide:
 		case Opt_showassoc:
-			popt->showassoc = 'y';
+			popt->showassoc = 1;
 			break;
 		case Opt_cruft:
-			popt->cruft = 'y';
+			popt->cruft = 1;
 			break;
 		case Opt_utf8:
 			popt->utf8 = 1;
@@ -450,11 +452,13 @@ static int parse_options(char *options, struct iso9660_options *popt)
 			if (match_int(&args[0], &option))
 				return 0;
 			popt->uid = option;
+			popt->uid_set = 1;
 			break;
 		case Opt_gid:
 			if (match_int(&args[0], &option))
 				return 0;
 			popt->gid = option;
+			popt->gid_set = 1;
 			break;
 		case Opt_mode:
 			if (match_int(&args[0], &option))
@@ -466,6 +470,9 @@ static int parse_options(char *options, struct iso9660_options *popt)
 				return 0;
 			popt->dmode = option;
 			break;
+		case Opt_overriderockperm:
+			popt->overriderockperm = 1;
+			break;
 		case Opt_block:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -650,7 +657,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 					goto out_freebh;
 
 				sbi->s_high_sierra = 1;
-				opt.rock = 'n';
+				opt.rock = 0;
 				h_pri = (struct hs_primary_descriptor *)vdp;
 				goto root_found;
 			}
@@ -673,7 +680,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 
 root_found:
 
-	if (joliet_level && (pri == NULL || opt.rock == 'n')) {
+	if (joliet_level && (pri == NULL || !opt.rock)) {
 		/* This is the case of Joliet with the norock mount flag.
 		 * A disc with both Joliet and Rock Ridge is handled later
 		 */
@@ -802,22 +809,31 @@ root_found:
 	s->s_op = &isofs_sops;
 	s->s_export_op = &isofs_export_ops;
 	sbi->s_mapping = opt.map;
-	sbi->s_rock = (opt.rock == 'y' ? 2 : 0);
+	sbi->s_rock = (opt.rock ? 2 : 0);
 	sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/
 	sbi->s_cruft = opt.cruft;
 	sbi->s_hide = opt.hide;
 	sbi->s_showassoc = opt.showassoc;
 	sbi->s_uid = opt.uid;
 	sbi->s_gid = opt.gid;
+	sbi->s_uid_set = opt.uid_set;
+	sbi->s_gid_set = opt.gid_set;
 	sbi->s_utf8 = opt.utf8;
 	sbi->s_nocompress = opt.nocompress;
+	sbi->s_overriderockperm = opt.overriderockperm;
 	/*
 	 * It would be incredibly stupid to allow people to mark every file
 	 * on the disk as suid, so we merely allow them to set the default
 	 * permissions.
 	 */
-	sbi->s_fmode = opt.fmode & 0777;
-	sbi->s_dmode = opt.dmode & 0777;
+	if (opt.fmode != ISOFS_INVALID_MODE)
+		sbi->s_fmode = opt.fmode & 0777;
+	else
+		sbi->s_fmode = ISOFS_INVALID_MODE;
+	if (opt.dmode != ISOFS_INVALID_MODE)
+		sbi->s_dmode = opt.dmode & 0777;
+	else
+		sbi->s_dmode = ISOFS_INVALID_MODE;
 
 	/*
 	 * Read the root inode, which _may_ result in changing
@@ -1095,18 +1111,6 @@ static const struct address_space_operations isofs_aops = {
 	.bmap = _isofs_bmap
 };
 
-static inline void test_and_set_uid(uid_t *p, uid_t value)
-{
-	if (value)
-		*p = value;
-}
-
-static inline void test_and_set_gid(gid_t *p, gid_t value)
-{
-        if (value)
-                *p = value;
-}
-
 static int isofs_read_level3_size(struct inode *inode)
 {
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -1261,7 +1265,10 @@ static int isofs_read_inode(struct inode *inode)
 	ei->i_file_format = isofs_file_normal;
 
 	if (de->flags[-high_sierra] & 2) {
-		inode->i_mode = sbi->s_dmode | S_IFDIR;
+		if (sbi->s_dmode != ISOFS_INVALID_MODE)
+			inode->i_mode = S_IFDIR | sbi->s_dmode;
+		else
+			inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 		inode->i_nlink = 1;	/*
 					 * Set to 1.  We know there are 2, but
 					 * the find utility tries to optimize
@@ -1270,8 +1277,16 @@ static int isofs_read_inode(struct inode *inode)
 					 * do it the hard way.
 					 */
 	} else {
-		/* Everybody gets to read the file. */
-		inode->i_mode = sbi->s_fmode | S_IFREG;
+		if (sbi->s_fmode != ISOFS_INVALID_MODE) {
+			inode->i_mode = S_IFREG | sbi->s_fmode;
+		} else {
+			/*
+			 * Set default permissions: r-x for all.  The disc
+			 * could be shared with DOS machines so virtually
+			 * anything could be a valid executable.
+			 */
+			inode->i_mode = S_IFREG | S_IRUGO | S_IXUGO;
+		}
 		inode->i_nlink = 1;
 	}
 	inode->i_uid = sbi->s_uid;
@@ -1300,7 +1315,7 @@ static int isofs_read_inode(struct inode *inode)
 	 * this CDROM was mounted with the cruft option.
 	 */
 
-	if (sbi->s_cruft == 'y')
+	if (sbi->s_cruft)
 		inode->i_size &= 0x00ffffff;
 
 	if (de->interleave[0]) {
@@ -1346,9 +1361,18 @@ static int isofs_read_inode(struct inode *inode)
 	if (!high_sierra) {
 		parse_rock_ridge_inode(de, inode);
 		/* if we want uid/gid set, override the rock ridge setting */
-		test_and_set_uid(&inode->i_uid, sbi->s_uid);
-		test_and_set_gid(&inode->i_gid, sbi->s_gid);
+		if (sbi->s_uid_set)
+			inode->i_uid = sbi->s_uid;
+		if (sbi->s_gid_set)
+			inode->i_gid = sbi->s_gid;
 	}
+	/* Now set final access rights if overriding rock ridge setting */
+	if (S_ISDIR(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_dmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFDIR | sbi->s_dmode;
+	if (S_ISREG(inode->i_mode) && sbi->s_overriderockperm &&
+	    sbi->s_fmode != ISOFS_INVALID_MODE)
+		inode->i_mode = S_IFREG | sbi->s_fmode;
 
 	/* Install the inode operations vector */
 	if (S_ISREG(inode->i_mode)) {
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index ccbf72faf27a..7d33de84f52a 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -35,21 +35,20 @@ struct isofs_sb_info {
 	unsigned long s_log_zone_size;
 	unsigned long s_max_size;
 	
-	unsigned char s_high_sierra; /* A simple flag */
-	unsigned char s_mapping;
 	int           s_rock_offset; /* offset of SUSP fields within SU area */
-	unsigned char s_rock;
 	unsigned char s_joliet_level;
-	unsigned char s_utf8;
-	unsigned char s_cruft; /* Broken disks with high
-				  byte of length containing
-				  junk */
-	unsigned char s_unhide;
-	unsigned char s_nosuid;
-	unsigned char s_nodev;
-	unsigned char s_nocompress;
-	unsigned char s_hide;
-	unsigned char s_showassoc;
+	unsigned char s_mapping;
+	unsigned int  s_high_sierra:1;
+	unsigned int  s_rock:2;
+	unsigned int  s_utf8:1;
+	unsigned int  s_cruft:1; /* Broken disks with high byte of length
+				  * containing junk */
+	unsigned int  s_nocompress:1;
+	unsigned int  s_hide:1;
+	unsigned int  s_showassoc:1;
+	unsigned int  s_overriderockperm:1;
+	unsigned int  s_uid_set:1;
+	unsigned int  s_gid_set:1;
 
 	mode_t s_fmode;
 	mode_t s_dmode;
@@ -58,6 +57,8 @@ struct isofs_sb_info {
 	struct nls_table *s_nls_iocharset; /* Native language support table */
 };
 
+#define ISOFS_INVALID_MODE ((mode_t) -1)
+
 static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
 	return (op - ascii);
 }
 
-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
-	const __u8 *ip;
-	__u8 *op;
-	int size;
-	__u16 c;
-
-	op = s;
-	ip = pwcs;
-	while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
-		c = (*ip << 8) | ip[1];
-		if (c > 0x7f) {
-			size = utf8_wctomb(op, c, maxlen);
-			if (size == -1) {
-				/* Ignore character and move on */
-				maxlen--;
-			} else {
-				op += size;
-				maxlen -= size;
-			}
-		} else {
-			*op++ = (__u8) c;
-		}
-		ip += 2;
-		inlen--;
-	}
-	return (op - s);
-}
-
 int
 get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
 {
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
 	nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
 
 	if (utf8) {
-		len = wcsntombs_be(outname, de->name,
-				de->name_len[0] >> 1, PAGE_SIZE);
+		len = utf16s_to_utf8s((const wchar_t *) de->name,
+				de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+				outname, PAGE_SIZE);
 	} else {
 		len = uni16_to_x8(outname, (__be16 *) de->name,
 				de->name_len[0] >> 1, nls);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 8299889a835e..eaa831311c9c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -142,9 +142,9 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
 		 */
 		match = 0;
 		if (dlen > 0 &&
-			(sbi->s_hide =='n' ||
+			(!sbi->s_hide ||
 				(!(de->flags[-sbi->s_high_sierra] & 1))) &&
-			(sbi->s_showassoc =='y' ||
+			(sbi->s_showassoc ||
 				(!(de->flags[-sbi->s_high_sierra] & 4)))) {
 			match = (isofs_cmp(dentry, dpnt, dlen) == 0);
 		}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ed886e6db399..73242ba7c7b1 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1686,35 +1686,6 @@ out:
 	return;
 }
 
-/*
- * journal_try_to_free_buffers() could race with journal_commit_transaction()
- * The latter might still hold the a count on buffers when inspecting
- * them on t_syncdata_list or t_locked_list.
- *
- * journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * tryinf to free that buffer.
- *
- * Called with journal->j_state_lock held.
- */
-static void journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-	transaction_t *transaction = NULL;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	log_wait_commit(journal, tid);
-}
-
 /**
  * int journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -1786,25 +1757,6 @@ int journal_try_to_free_buffers(journal_t *journal,
 
 	ret = try_to_free_buffers(page);
 
-	/*
-	 * There are a number of places where journal_try_to_free_buffers()
-	 * could race with journal_commit_transaction(), the later still
-	 * holds the reference to the buffers to free while processing them.
-	 * try_to_free_buffers() failed to free those buffers. Some of the
-	 * caller of releasepage() request page buffers to be dropped, otherwise
-	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
-	 *
-	 * So, if the caller of try_to_release_page() wants the synchronous
-	 * behaviour(i.e make sure buffers are dropped upon return),
-	 * let's wait for the current transaction to finish flush of
-	 * dirty data buffers, then try to free those buffers again,
-	 * with the journal locked.
-	 */
-	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-		journal_wait_for_transaction_sync_data(journal);
-		ret = try_to_free_buffers(page);
-	}
-
 busy:
 	return ret;
 }
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 17159cacbd9e..5d70b3e6d49b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,9 +20,9 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <trace/events/jbd2.h>
 
 /*
  * Unlink a buffer from a transaction checkpoint list.
@@ -358,8 +358,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
-	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
-		   journal->j_devname, result);
+	trace_jbd2_checkpoint(journal, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0b7d3b8226fd..7b4088b2364d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,7 +16,6 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
-#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -26,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <trace/events/jbd2.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -253,6 +253,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 		 * block allocation  with delalloc. We need to write
 		 * only allocated blocks here.
 		 */
+		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 		err = journal_submit_inode_data_buffers(mapping);
 		if (!ret)
 			ret = err;
@@ -394,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
-	trace_mark(jbd2_start_commit, "dev %s transaction %d",
-		   journal->j_devname, commit_transaction->t_tid);
+	trace_jbd2_start_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
@@ -409,6 +409,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	if (commit_transaction->t_synchronous_commit)
 		write_op = WRITE_SYNC_PLUG;
+	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -484,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 */
 	jbd2_journal_switch_revoke_table(journal);
 
+	trace_jbd2_commit_flushing(journal, commit_transaction);
 	stats.u.run.rs_flushing = jiffies;
 	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 					       stats.u.run.rs_flushing);
@@ -520,6 +522,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_COMMIT;
 	spin_unlock(&journal->j_state_lock);
 
+	trace_jbd2_commit_logging(journal, commit_transaction);
 	stats.u.run.rs_logging = jiffies;
 	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 						 stats.u.run.rs_logging);
@@ -1054,9 +1057,7 @@ restart_loop:
 	if (journal->j_commit_callback)
 		journal->j_commit_callback(journal, commit_transaction);
 
-	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, commit_transaction->t_tid,
-		   journal->j_tail_sequence);
+	trace_jbd2_end_commit(journal, commit_transaction);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 	if (to_free)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 62be7d294ec2..18bfd5dab642 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -38,6 +38,10 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/math64.h>
+#include <linux/hash.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/jbd2.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -2377,6 +2381,71 @@ static void __exit journal_exit(void)
 	jbd2_journal_destroy_caches();
 }
 
+/* 
+ * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 
+ * tracing infrastructure to map a dev_t to a device name.
+ *
+ * The caller should use rcu_read_lock() in order to make sure the
+ * device name stays valid until it's done with it.  We use
+ * rcu_read_lock() as well to make sure we're safe in case the caller
+ * gets sloppy, and because rcu_read_lock() is cheap and can be safely
+ * nested.
+ */
+struct devname_cache {
+	struct rcu_head	rcu;
+	dev_t		device;
+	char		devname[BDEVNAME_SIZE];
+};
+#define CACHE_SIZE_BITS 6
+static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
+static DEFINE_SPINLOCK(devname_cache_lock);
+
+static void free_devcache(struct rcu_head *rcu)
+{
+	kfree(rcu);
+}
+
+const char *jbd2_dev_to_name(dev_t device)
+{
+	int	i = hash_32(device, CACHE_SIZE_BITS);
+	char	*ret;
+	struct block_device *bd;
+
+	rcu_read_lock();
+	if (devcache[i] && devcache[i]->device == device) {
+		ret = devcache[i]->devname;
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+
+	spin_lock(&devname_cache_lock);
+	if (devcache[i]) {
+		if (devcache[i]->device == device) {
+			ret = devcache[i]->devname;
+			spin_unlock(&devname_cache_lock);
+			return ret;
+		}
+		call_rcu(&devcache[i]->rcu, free_devcache);
+	}
+	devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
+	if (!devcache[i]) {
+		spin_unlock(&devname_cache_lock);
+		return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+	}
+	devcache[i]->device = device;
+	bd = bdget(device);
+	if (bd) {
+		bdevname(bd, devcache[i]->devname);
+		bdput(bd);
+	} else
+		__bdevname(device, devcache[i]->devname);
+	ret = devcache[i]->devname;
+	spin_unlock(&devname_cache_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_dev_to_name);
+
 MODULE_LICENSE("GPL");
 module_init(journal_init);
 module_exit(journal_exit);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 996ffda06bf3..494501edba6b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1547,36 +1547,6 @@ out:
 	return;
 }
 
-/*
- * jbd2_journal_try_to_free_buffers() could race with
- * jbd2_journal_commit_transaction(). The later might still hold the
- * reference count to the buffers when inspecting them on
- * t_syncdata_list or t_locked_list.
- *
- * jbd2_journal_try_to_free_buffers() will call this function to
- * wait for the current transaction to finish syncing data buffers, before
- * try to free that buffer.
- *
- * Called with journal->j_state_lock hold.
- */
-static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
-{
-	transaction_t *transaction;
-	tid_t tid;
-
-	spin_lock(&journal->j_state_lock);
-	transaction = journal->j_committing_transaction;
-
-	if (!transaction) {
-		spin_unlock(&journal->j_state_lock);
-		return;
-	}
-
-	tid = transaction->t_tid;
-	spin_unlock(&journal->j_state_lock);
-	jbd2_log_wait_commit(journal, tid);
-}
-
 /**
  * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
@@ -1649,25 +1619,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
 
 	ret = try_to_free_buffers(page);
 
-	/*
-	 * There are a number of places where jbd2_journal_try_to_free_buffers()
-	 * could race with jbd2_journal_commit_transaction(), the later still
-	 * holds the reference to the buffers to free while processing them.
-	 * try_to_free_buffers() failed to free those buffers. Some of the
-	 * caller of releasepage() request page buffers to be dropped, otherwise
-	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
-	 *
-	 * So, if the caller of try_to_release_page() wants the synchronous
-	 * behaviour(i.e make sure buffers are dropped upon return),
-	 * let's wait for the current transaction to finish flush of
-	 * dirty data buffers, then try to free those buffers again,
-	 * with the journal locked.
-	 */
-	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
-		jbd2_journal_wait_for_transaction_sync_data(journal);
-		ret = try_to_free_buffers(page);
-	}
-
 busy:
 	return ret;
 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index bbbd5f202e37..41d6045dbeb0 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 		}
 		XADaddress(xp, xaddr);
 		XADlength(xp, xlen);
+		XADoffset(xp, prev);
 		/*
 		 * only preserve the abnr flag within the xad flags
 		 * of the returned hint.
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3aebe322271a..6ac693faae49 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -12,13 +12,14 @@
 /* bitmap.c contains the code that handles the inode and block bitmaps */
 
 #include "minix.h"
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
 #include <linux/sched.h>
 
 static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
 
+static DEFINE_SPINLOCK(bitmap_lock);
+
 static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
 {
 	unsigned i, j, sum = 0;
@@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block)
 		return;
 	}
 	bh = sbi->s_zmap[zone];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_block (%s:%lu): bit already cleared\n",
 		       sb->s_id, block);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	return;
 }
@@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode)
 		struct buffer_head *bh = sbi->s_zmap[i];
 		int j;
 
-		lock_kernel();
+		spin_lock(&bitmap_lock);
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
 		if (j < bits_per_zone) {
 			minix_set_bit(j, bh->b_data);
-			unlock_kernel();
+			spin_unlock(&bitmap_lock);
 			mark_buffer_dirty(bh);
 			j += i * bits_per_zone + sbi->s_firstdatazone-1;
 			if (j < sbi->s_firstdatazone || j >= sbi->s_nzones)
 				break;
 			return j;
 		}
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 	}
 	return 0;
 }
@@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode)
 	minix_clear_inode(inode);	/* clear on-disk copy */
 
 	bh = sbi->s_imap[ino];
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	if (!minix_test_and_clear_bit(bit, bh->b_data))
 		printk("minix_free_inode: bit %lu already cleared\n", bit);
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
  out:
 	clear_inode(inode);		/* clear in-memory copy */
@@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 	j = bits_per_zone;
 	bh = NULL;
 	*error = -ENOSPC;
-	lock_kernel();
+	spin_lock(&bitmap_lock);
 	for (i = 0; i < sbi->s_imap_blocks; i++) {
 		bh = sbi->s_imap[i];
 		j = minix_find_first_zero_bit(bh->b_data, bits_per_zone);
@@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 			break;
 	}
 	if (!bh || j >= bits_per_zone) {
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		iput(inode);
 		return NULL;
 	}
 	if (minix_test_and_set_bit(j, bh->b_data)) {	/* shouldn't happen */
-		unlock_kernel();
+		spin_unlock(&bitmap_lock);
 		printk("minix_new_inode: bit already set\n");
 		iput(inode);
 		return NULL;
 	}
-	unlock_kernel();
+	spin_unlock(&bitmap_lock);
 	mark_buffer_dirty(bh);
 	j += i * bits_per_zone;
 	if (!j || j > sbi->s_ninodes) {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index e5f206467e40..d407e7a0b6fe 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -11,7 +11,6 @@
 #include "minix.h"
 #include <linux/buffer_head.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 
 typedef struct minix_dir_entry minix_dirent;
@@ -20,6 +19,7 @@ typedef struct minix3_dir_entry minix3_dirent;
 static int minix_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations minix_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
 	.fsync		= simple_fsync,
@@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	char *name;
 	__u32 inumber;
 
-	lock_kernel();
-
 	pos = (pos + chunk_size-1) & ~(chunk_size-1);
 	if (pos >= inode->i_size)
 		goto done;
@@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
 done:
 	filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f91a23693597..74ea82d72164 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb)
 	int i;
 	struct minix_sb_info *sbi = minix_sb(sb);
 
-	lock_kernel();
-
 	if (!(sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
 			sbi->s_ms->s_state = sbi->s_mount_state;
@@ -50,8 +48,6 @@ static void minix_put_super(struct super_block *sb)
 	kfree(sbi->s_imap);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static struct kmem_cache * minix_inode_cachep;
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
 
 		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
 			int k;
+			unicode_t u;
 
-			k = utf8_mbtowc(&ec, iname, iname_end - iname);
-			if (k < 0)
+			k = utf8_to_utf32(iname, iname_end - iname, &u);
+			if (k < 0 || u > MAX_WCHAR_T)
 				return -EINVAL;
 			iname += k;
+			ec = u;
 		} else {
 			if (*iname == NCP_ESC) {
 				int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
 		if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
 			int k;
 
-			k = utf8_wctomb(iname, ec, iname_end - iname);
+			k = utf32_to_utf8(ec, iname, iname_end - iname);
 			if (k < 0) {
 				err = -ENAMETOOLONG;
 				goto quit;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index a2ab2529b5ca..ceda50aad73c 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server,
 	cpu = get_cpu();
 	iostats = per_cpu_ptr(server->io_stats, cpu);
 	iostats->events[stat]++;
-	put_cpu_no_resched();
+	put_cpu();
 }
 
 static inline void nfs_inc_stats(const struct inode *inode,
@@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server,
 	cpu = get_cpu();
 	iostats = per_cpu_ptr(server->io_stats, cpu);
 	iostats->bytes[stat] += addend;
-	put_cpu_no_resched();
+	put_cpu();
 }
 
 static inline void nfs_add_stats(const struct inode *inode,
@@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode,
 	cpu = get_cpu();
 	iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu);
 	iostats->fscache[stat] += addend;
-	put_cpu_no_resched();
+	put_cpu();
 }
 #endif
 
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 064279e33bbb..36df60b6d8a4 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -31,21 +31,26 @@
 #include "dat.h"
 #include "alloc.h"
 
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
+{ /* non-static helper: resolves the DAT inode through bmap->b_inode */
+	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+}
+
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 			       __u64 *ptrp)
 {
-	__u64 ptr;
+	sector_t blocknr;
 	int ret;
 
 	down_read(&bmap->b_sem);
 	ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
 	if (ret < 0)
 		goto out;
-	if (bmap->b_pops->bpop_translate != NULL) {
-		ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
-		if (ret < 0)
-			goto out;
-		*ptrp = ptr;
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
+					  &blocknr);
+		if (!ret)
+			*ptrp = blocknr;
 	}
 
  out:
@@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
 	return ret;
 }
 
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
+			     unsigned maxblocks)
+{ /* locked wrapper around the bmap's bop_lookup_contig operation */
+	int ret;
+
+	down_read(&bmap->b_sem); /* hold b_sem for reading across the lookup */
+	ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
+	up_read(&bmap->b_sem);
+	return ret; /* passed through unchanged from bop_lookup_contig */
+}
 
 /**
  * nilfs_bmap_lookup - find a record
@@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 			if (n < 0)
 				return n;
 			ret = nilfs_btree_convert_and_insert(
-				bmap, key, ptr, keys, ptrs, n,
-				NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
+				bmap, key, ptr, keys, ptrs, n);
 			if (ret == 0)
 				bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
 
@@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
 			if (n < 0)
 				return n;
 			ret = nilfs_direct_delete_and_convert(
-				bmap, key, keys, ptrs, n,
-				NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
+				bmap, key, keys, ptrs, n);
 			if (ret == 0)
 				bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
 
@@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
 		mark_inode_dirty(bmap->b_inode);
 }
 
-int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
-			 struct buffer_head **bhp)
-{
-	return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-				ptr, 0, bhp, 0);
-}
-
-void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
-			  struct buffer_head *bh)
-{
-	brelse(bh);
-}
-
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
-			     struct buffer_head **bhp)
-{
-	int ret;
-
-	ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-			       ptr, 0, bhp, 1);
-	if (ret < 0)
-		return ret;
-	set_buffer_nilfs_volatile(*bhp);
-	return 0;
-}
-
-void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
-			     struct buffer_head *bh)
-{
-	nilfs_btnode_delete(bh);
-}
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 			      const struct buffer_head *bh)
 {
@@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
 		return NILFS_BMAP_INVALID_PTR;
 }
 
-static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
-{
-	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
-}
-
 #define NILFS_BMAP_GROUP_DIV	8
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 {
@@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 		(entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
 
-static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
+				 union nilfs_bmap_ptr_req *req)
 {
 	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
 
-static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
+				 union nilfs_bmap_ptr_req *req)
 {
 	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
 
-static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-				     union nilfs_bmap_ptr_req *req)
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
+			      union nilfs_bmap_ptr_req *req)
 {
 	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
 
-static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
+int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
+		       sector_t blocknr)
 {
-	return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
-}
-
-static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req,
-				      sector_t blocknr)
-{
-	nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-			       blocknr);
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
+	int ret;
 
-static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
-				     union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+	ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
+	if (likely(!ret))
+		nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
+	return ret;
 }
 
-static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-				    union nilfs_bmap_ptr_req *req)
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
+			     union nilfs_bmap_ptr_req *req)
 {
 	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
 
-static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-				    union nilfs_bmap_ptr_req *req)
-{
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
-}
-
-static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
-				       union nilfs_bmap_ptr_req *req)
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
+			     union nilfs_bmap_ptr_req *req)
 {
-	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
+	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
+			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
 
-static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-				   union nilfs_bmap_ptr_req *req)
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
+			    union nilfs_bmap_ptr_req *req)
 {
 	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
@@ -566,128 +529,44 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
 	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
 }
 
-int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *oldreq,
-			      union nilfs_bmap_ptr_req *newreq)
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
+				union nilfs_bmap_ptr_req *oldreq,
+				union nilfs_bmap_ptr_req *newreq)
 {
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 	int ret;
 
-	ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+	ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
 	if (ret < 0)
 		return ret;
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+	ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
 	if (ret < 0)
-		bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+		nilfs_dat_abort_end(dat, &oldreq->bpr_req);
 
 	return ret;
 }
 
-void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
-			      union nilfs_bmap_ptr_req *oldreq,
-			      union nilfs_bmap_ptr_req *newreq)
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
+				union nilfs_bmap_ptr_req *oldreq,
+				union nilfs_bmap_ptr_req *newreq)
 {
-	bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
-			     union nilfs_bmap_ptr_req *oldreq,
-			     union nilfs_bmap_ptr_req *newreq)
-{
-	bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
+	nilfs_dat_commit_end(dat, &oldreq->bpr_req,
+			     bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
+	nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
 }
 
-static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
-				  __u64 *ptrp)
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
+			       union nilfs_bmap_ptr_req *oldreq,
+			       union nilfs_bmap_ptr_req *newreq)
 {
-	sector_t blocknr;
-	int ret;
-
-	ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
-	if (ret < 0)
-		return ret;
-	if (ptrp != NULL)
-		*ptrp = blocknr;
-	return 0;
-}
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
 
-static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* ignore target ptr */
-	req->bpr_ptr = bmap->b_last_allocated_ptr++;
-	return 0;
+	nilfs_dat_abort_end(dat, &oldreq->bpr_req);
+	nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
 }
 
-static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
-				      union nilfs_bmap_ptr_req *req)
-{
-	/* do nothing */
-}
-
-static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
-				     union nilfs_bmap_ptr_req *req)
-{
-	bmap->b_last_allocated_ptr--;
-}
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
-	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr	=	nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr	=	nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr	=	nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr	=	nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr	=	nilfs_bmap_commit_end_v,
-	.bpop_abort_end_ptr	=	nilfs_bmap_abort_end_v,
-
-	.bpop_translate		=	nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
-	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_v,
-	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_v,
-	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_v,
-	.bpop_prepare_start_ptr	=	nilfs_bmap_prepare_start_v,
-	.bpop_commit_start_ptr	=	nilfs_bmap_commit_start_v,
-	.bpop_abort_start_ptr	=	nilfs_bmap_abort_start_v,
-	.bpop_prepare_end_ptr	=	nilfs_bmap_prepare_end_v,
-	.bpop_commit_end_ptr	=	nilfs_bmap_commit_end_vmdt,
-	.bpop_abort_end_ptr	=	nilfs_bmap_abort_end_v,
-
-	.bpop_translate		=	nilfs_bmap_translate_v,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
-	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_p,
-	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_p,
-	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_p,
-	.bpop_prepare_start_ptr	=	NULL,
-	.bpop_commit_start_ptr	=	NULL,
-	.bpop_abort_start_ptr	=	NULL,
-	.bpop_prepare_end_ptr	=	NULL,
-	.bpop_commit_end_ptr	=	NULL,
-	.bpop_abort_end_ptr	=	NULL,
-
-	.bpop_translate		=	NULL,
-};
-
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
-	.bpop_prepare_alloc_ptr	=	NULL,
-	.bpop_commit_alloc_ptr	=	NULL,
-	.bpop_abort_alloc_ptr	=	NULL,
-	.bpop_prepare_start_ptr	=	NULL,
-	.bpop_commit_start_ptr	=	NULL,
-	.bpop_abort_start_ptr	=	NULL,
-	.bpop_prepare_end_ptr	=	NULL,
-	.bpop_commit_end_ptr	=	NULL,
-	.bpop_abort_end_ptr	=	NULL,
-
-	.bpop_translate		=	NULL,
-};
-
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 
 /**
@@ -714,31 +593,26 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
 	switch (bmap->b_inode->i_ino) {
 	case NILFS_DAT_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_p;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_P;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
 		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
 		break;
 	case NILFS_CPFILE_INO:
 	case NILFS_SUFILE_INO:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 		break;
 	default:
-		bmap->b_pops = &nilfs_bmap_ptr_ops_v;
-		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
+		bmap->b_last_allocated_key = 0;
 		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 		break;
 	}
 
 	return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
-		nilfs_btree_init(bmap,
-				 NILFS_BMAP_LARGE_LOW,
-				 NILFS_BMAP_LARGE_HIGH) :
-		nilfs_direct_init(bmap,
-				  NILFS_BMAP_SMALL_LOW,
-				  NILFS_BMAP_SMALL_HIGH);
+		nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
 }
 
 /**
@@ -764,7 +638,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
 	memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
 	init_rwsem(&bmap->b_sem);
 	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-	bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+	bmap->b_ptr_type = NILFS_BMAP_PTR_U;
 	bmap->b_last_allocated_key = 0;
 	bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
 	bmap->b_state = 0;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 4f2708abb1ba..b2890cdcef12 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -64,6 +64,8 @@ struct nilfs_bmap_stats {
  */
 struct nilfs_bmap_operations {
 	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+	int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+				 unsigned);
 	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
 	int (*bop_delete)(struct nilfs_bmap *, __u64);
 	void (*bop_clear)(struct nilfs_bmap *);
@@ -86,34 +88,6 @@ struct nilfs_bmap_operations {
 };
 
 
-/**
- * struct nilfs_bmap_ptr_operations - bmap ptr operation table
- */
-struct nilfs_bmap_ptr_operations {
-	int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
-				      union nilfs_bmap_ptr_req *,
-				      sector_t);
-	void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
-				     union nilfs_bmap_ptr_req *);
-	int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
-				    union nilfs_bmap_ptr_req *);
-	void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
-				   union nilfs_bmap_ptr_req *);
-
-	int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
-};
-
-
 #define NILFS_BMAP_SIZE		(NILFS_INODE_BMAP_SIZE * sizeof(__le64))
 #define NILFS_BMAP_KEY_BIT	(sizeof(unsigned long) * 8 /* CHAR_BIT */)
 #define NILFS_BMAP_NEW_PTR_INIT	\
@@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
  * @b_sem: semaphore
  * @b_inode: owner of bmap
  * @b_ops: bmap operation table
- * @b_pops: bmap ptr operation table
- * @b_low: low watermark of conversion
- * @b_high: high watermark of conversion
  * @b_last_allocated_key: last allocated key for data block
  * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
  * @b_state: state
  */
 struct nilfs_bmap {
@@ -146,14 +118,22 @@ struct nilfs_bmap {
 	struct rw_semaphore b_sem;
 	struct inode *b_inode;
 	const struct nilfs_bmap_operations *b_ops;
-	const struct nilfs_bmap_ptr_operations *b_pops;
-	__u64 b_low;
-	__u64 b_high;
 	__u64 b_last_allocated_key;
 	__u64 b_last_allocated_ptr;
+	int b_ptr_type;
 	int b_state;
 };
 
+/* pointer type */
+#define NILFS_BMAP_PTR_P	0	/* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS	1	/* virtual block number (single
+					   version) */
+#define NILFS_BMAP_PTR_VM	2	/* virtual block number (has multiple
+					   versions) */
+#define NILFS_BMAP_PTR_U	(-1)	/* never perform pointer operations */
+
+#define NILFS_BMAP_USE_VBN(bmap)	((bmap)->b_ptr_type > 0) /* PTR_VS or PTR_VM */
+
 /* state */
 #define NILFS_BMAP_DIRTY	0x00000001
 
@@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
 int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
@@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 /*
  * Internal use only
  */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
+			      union nilfs_bmap_ptr_req *);
 
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{ /* first step of allocating a pointer for @req; 0 or the _v result */
+	if (NILFS_BMAP_USE_VBN(bmap)) /* virtual block numbers: use the _v helper */
+		return nilfs_bmap_prepare_alloc_v(bmap, req);
+	/* otherwise ignore the target ptr and hand out the next sequential one */
+	req->bpr_ptr = bmap->b_last_allocated_ptr++;
+	return 0;
+}
+
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+					       union nilfs_bmap_ptr_req *req)
+{ /* commit step of a pointer allocation; a no-op unless VBNs are used */
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_alloc_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+					      union nilfs_bmap_ptr_req *req)
+{ /* roll back what nilfs_bmap_prepare_alloc_ptr() did for @req */
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_alloc_v(bmap, req);
+	else
+		bmap->b_last_allocated_ptr--; /* undo the prepare step's ++ */
+}
+
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{ /* non-VBN bmaps have nothing to prepare and report success (0) */
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_bmap_prepare_end_v(bmap, req) : 0;
+}
+
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+					     union nilfs_bmap_ptr_req *req)
+{ /* commit step of ending a pointer; a no-op unless VBNs are used */
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_commit_end_v(bmap, req);
+}
+
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+					    union nilfs_bmap_ptr_req *req)
+{ /* abort step of ending a pointer; a no-op unless VBNs are used */
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_bmap_abort_end_v(bmap, req);
+}
+
+int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
+		       sector_t);
 int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
 int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
 
@@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-int nilfs_bmap_prepare_update(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *,
-			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update(struct nilfs_bmap *,
-			      union nilfs_bmap_ptr_req *,
-			      union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update(struct nilfs_bmap *,
-			     union nilfs_bmap_ptr_req *,
-			     union nilfs_bmap_ptr_req *);
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
+				union nilfs_bmap_ptr_req *,
+				union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
+				union nilfs_bmap_ptr_req *,
+				union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
+			       union nilfs_bmap_ptr_req *,
+			       union nilfs_bmap_ptr_req *);
 
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
 
 
-int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
-			 struct buffer_head **);
-void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
-			     struct buffer_head **);
-void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
-
-
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
 {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cc07b2c30e0..7e0b61be212e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
 
-static struct address_space_operations def_btnode_aops;
+static struct address_space_operations def_btnode_aops = {
+	.sync_page		= block_sync_page, /* only callback set; the rest stay NULL */
+};
 
-void nilfs_btnode_cache_init(struct address_space *btnc)
+void nilfs_btnode_cache_init(struct address_space *btnc,
+			     struct backing_dev_info *bdi)
 {
 	btnc->host = NULL;  /* can safely set to host inode ? */
 	btnc->flags = 0;
 	mapping_set_gfp_mask(btnc, GFP_NOFS);
 	btnc->assoc_mapping = NULL;
-	btnc->backing_dev_info = &default_backing_dev_info;
+	btnc->backing_dev_info = bdi;
 	btnc->a_ops = &def_btnode_aops;
 }
 
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 35faa86444a7..3e2275172ed6 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt {
 };
 
 void nilfs_btnode_cache_init_once(struct address_space *);
-void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
 			      struct buffer_head **, int);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 6b37a2767293..aa412724b64e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -29,6 +29,7 @@
 #include "btnode.h"
 #include "btree.h"
 #include "alloc.h"
+#include "dat.h"
 
 /**
  * struct nilfs_btree_path - A path on which B-tree operations are executed
@@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	     level < NILFS_BTREE_LEVEL_MAX;
 	     level++) {
 		if (path[level].bp_bh != NULL) {
-			nilfs_bmap_put_block(&btree->bt_bmap,
-					     path[level].bp_bh);
+			brelse(path[level].bp_bh);
 			path[level].bp_bh = NULL;
 		}
 		/* sib_bh is released or deleted by prepare or commit
@@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
 	}
 }
 
-
 /*
  * B-tree node operations
  */
+static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+				 struct buffer_head **bhp)
+{ /* nilfs_btnode_get() on the owner's btnode cache, last argument 0 */
+	struct address_space *btnc = /* NOTE(review): cast drops const and treats btree as a bmap */
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	return nilfs_btnode_get(btnc, ptr, 0, bhp, 0); /* presumably 0 = do not create; confirm in btnode.c */
+}
+
+static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+				     __u64 ptr, struct buffer_head **bhp)
+{ /* like nilfs_btree_get_block() but passes 1 as the last argument */
+	struct address_space *btnc = /* NOTE(review): cast drops const and treats btree as a bmap */
+		&NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+	int ret;
+
+	ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1); /* presumably 1 = create; confirm in btnode.c */
+	if (!ret) /* only touch *bhp when the get succeeded */
+		set_buffer_nilfs_volatile(*bhp);
+	return ret;
+}
 
 static inline int
 nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
@@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level >= minlevel; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
 	path[level].bp_index = index;
 
 	for (level--; level > 0; level--) {
-		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
-					   &path[level].bp_bh);
+		ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
 		if (ret < 0)
 			return ret;
 		node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
 	return ret;
 }
 
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+				     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{ /* returns the length of the contiguous run starting at @key, or an error */
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	struct inode *dat = NULL; /* stays NULL unless the bmap uses VBNs */
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int level = NILFS_BTREE_LEVEL_NODE_MIN;
+	int ret, cnt, index, maxlevel;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	if (ret < 0)
+		goto out;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			goto out;
+		ptr = blocknr; /* compare translated block numbers from here on */
+	}
+	cnt = 1; /* the entry for @key itself */
+	if (cnt == maxblocks)
+		goto end;
+
+	maxlevel = nilfs_btree_height(btree) - 1;
+	node = nilfs_btree_get_node(btree, path, level);
+	index = path[level].bp_index + 1; /* entry right after the one found */
+	for (;;) {
+		while (index < nilfs_btree_node_get_nchildren(btree, node)) {
+			if (nilfs_btree_node_get_key(btree, node, index) !=
+			    key + cnt)
+				goto end; /* gap in the keys: the run ends here */
+			ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+			if (dat) {
+				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+				if (ret < 0)
+					goto out;
+				ptr2 = blocknr;
+			}
+			if (ptr2 != ptr + cnt || ++cnt == maxblocks) /* cnt grows only if adjacent */
+				goto end;
+			index++;
+			continue;
+		}
+		if (level == maxlevel) /* no level above this one to consult */
+			break;
+
+		/* look-up right sibling node */
+		node = nilfs_btree_get_node(btree, path, level + 1);
+		index = path[level + 1].bp_index + 1;
+		if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
+		    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+			break;
+		ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+		path[level + 1].bp_index = index; /* record the new parent slot */
+
+		brelse(path[level].bp_bh); /* release the node that was fully scanned */
+		path[level].bp_bh = NULL;
+		ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+		if (ret < 0)
+			goto out;
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		index = 0; /* resume scanning at the sibling's first entry */
+		path[level].bp_index = index;
+	}
+ end: /* success: *ptrp gets the first block number, ret the run length */
+	*ptrp = ptr;
+	ret = cnt;
+ out: /* common exit; ret is negative when reached through an error path */
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
 static void nilfs_btree_promote_key(struct nilfs_btree *btree,
 				    struct nilfs_btree_path *path,
 				    int level, __u64 key)
@@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
 				nilfs_btree_node_get_key(btree, node, 0));
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index += lnchildren;
 		path[level + 1].bp_index--;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -= n;
 	}
@@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
 	path[level + 1].bp_index--;
 
 	if (move) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 		path[level].bp_index -=
 			nilfs_btree_node_get_nchildren(btree, node);
 		path[level + 1].bp_index++;
 	} else {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		brelse(path[level].bp_bh);
 		path[level].bp_bh = path[level].bp_sib_bh;
 		path[level].bp_sib_bh = NULL;
 	} else {
@@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
 		*keyp = nilfs_btree_node_get_key(btree, right, 0);
 		*ptrp = path[level].bp_newreq.bpr_ptr;
 
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		brelse(path[level].bp_sib_bh);
 		path[level].bp_sib_bh = NULL;
 	}
 
@@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 	level = NILFS_BTREE_LEVEL_DATA;
 
 	/* allocate a new ptr for data block */
-	if (btree->bt_ops->btop_find_target != NULL)
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
 		path[level].bp_newreq.bpr_ptr =
-			btree->bt_ops->btop_find_target(btree, path, key);
+			nilfs_btree_find_target_v(btree, path, key);
 
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_data;
 
@@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		if (pindex > 0) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 				stats->bs_nblocks++;
 				goto out;
 			} else
-				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+				brelse(bh);
 		}
 
 		/* right sibling */
@@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_child_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 				stats->bs_nblocks++;
 				goto out;
 			} else
-				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+				brelse(bh);
 		}
 
 		/* split */
 		path[level].bp_newreq.bpr_ptr =
 			path[level - 1].bp_newreq.bpr_ptr + 1;
-		ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-			&btree->bt_bmap, &path[level].bp_newreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+						   &path[level].bp_newreq);
 		if (ret < 0)
 			goto err_out_child_node;
-		ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
-					       path[level].bp_newreq.bpr_ptr,
-					       &bh);
+		ret = nilfs_btree_get_new_block(btree,
+						path[level].bp_newreq.bpr_ptr,
+						&bh);
 		if (ret < 0)
 			goto err_out_curr_node;
 
@@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* grow */
 	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
-		&btree->bt_bmap, &path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 	if (ret < 0)
 		goto err_out_child_node;
-	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
-				       path[level].bp_newreq.bpr_ptr, &bh);
+	ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
+					&bh);
 	if (ret < 0)
 		goto err_out_curr_node;
 
@@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						    &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
  err_out_child_node:
 	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
-		nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
-			&btree->bt_bmap, &path[level].bp_newreq);
+		nilfs_btnode_delete(path[level].bp_sib_bh);
+		nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
+					   &path[level].bp_newreq);
 
 	}
 
-	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
-						       &path[level].bp_newreq);
+	nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
  err_out_data:
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
 
 	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
 	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, ptr);
+	if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
+		nilfs_btree_set_target_v(btree, key, ptr);
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
-			btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
-				&btree->bt_bmap, &path[level - 1].bp_newreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
+					    &path[level - 1].bp_newreq);
 		path[level].bp_op(btree, path, level, &key, &ptr);
 	}
 
@@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
 	nilfs_btree_promote_key(btree, path, level + 1,
 				nilfs_btree_node_get_key(btree, node, 0));
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += n;
 }
@@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
 				nilfs_btree_node_get_key(btree, right, 0));
 	path[level + 1].bp_index--;
 
-	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	brelse(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 }
 
@@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = path[level].bp_sib_bh;
 	path[level].bp_sib_bh = NULL;
 	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
@@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
 	unlock_buffer(path[level].bp_bh);
 	unlock_buffer(path[level].bp_sib_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	nilfs_btnode_delete(path[level].bp_sib_bh);
 	path[level].bp_sib_bh = NULL;
 	path[level + 1].bp_index++;
 }
@@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
 	nilfs_btree_node_move_left(btree, root, child, n);
 	unlock_buffer(path[level].bp_bh);
 
-	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	nilfs_btnode_delete(path[level].bp_bh);
 	path[level].bp_bh = NULL;
 }
 
@@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 		path[level].bp_oldreq.bpr_ptr =
 			nilfs_btree_node_get_ptr(btree, node,
 						 path[level].bp_index);
-		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
-			if (ret < 0)
-				goto err_out_child_node;
-		}
+		ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+						 &path[level].bp_oldreq);
+		if (ret < 0)
+			goto err_out_child_node;
 
 		if (nilfs_btree_node_get_nchildren(btree, node) >
 		    nilfs_btree_node_nchildren_min(btree, node)) {
@@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* left sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex - 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 			/* right sibling */
 			sibptr = nilfs_btree_node_get_ptr(btree, parent,
 							  pindex + 1);
-			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
-						   &bh);
+			ret = nilfs_btree_get_block(btree, sibptr, &bh);
 			if (ret < 0)
 				goto err_out_curr_node;
 			sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 	node = nilfs_btree_get_root(btree);
 	path[level].bp_oldreq.bpr_ptr =
 		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
-	if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-		ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
-			&btree->bt_bmap, &path[level].bp_oldreq);
-		if (ret < 0)
-			goto err_out_child_node;
-	}
+
+	ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
+					 &path[level].bp_oldreq);
+	if (ret < 0)
+		goto err_out_child_node;
+
 	/* child of the root node is deleted */
 	path[level].bp_op = nilfs_btree_do_delete;
 	stats->bs_nblocks++;
@@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
 
 	/* error */
  err_out_curr_node:
-	if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-		btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-			&btree->bt_bmap, &path[level].bp_oldreq);
+	nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
  err_out_child_node:
 	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
-		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
-		if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		brelse(path[level].bp_sib_bh);
+		nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
+					 &path[level].bp_oldreq);
 	}
 	*levelp = level;
 	stats->bs_nblocks = 0;
@@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
 	int level;
 
 	for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-		if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
-			btree->bt_bmap.b_pops->bpop_commit_end_ptr(
-				&btree->bt_bmap, &path[level].bp_oldreq);
+		nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
+					  &path[level].bp_oldreq);
 		path[level].bp_op(btree, path, level, NULL, NULL);
 	}
 
@@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 		if (nchildren > 1)
 			return 0;
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-		ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+		ret = nilfs_btree_get_block(btree, ptr, &bh);
 		if (ret < 0)
 			return ret;
 		node = (struct nilfs_btree_node *)bh->b_data;
@@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
 	nextmaxkey = (nchildren > 1) ?
 		nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
-	return (maxkey == key) && (nextmaxkey < bmap->b_low);
+	return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 
 static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
@@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 		nchildren = nilfs_btree_node_get_nchildren(btree, root);
 		WARN_ON(nchildren > 1);
 		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-		ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+		ret = nilfs_btree_get_block(btree, ptr, &bh);
 		if (ret < 0)
 			return ret;
 		node = (struct nilfs_btree_node *)bh->b_data;
@@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
 	}
 
 	if (bh != NULL)
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 	return nitems;
 }
@@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* for data */
 	/* cannot find near ptr */
-	if (btree->bt_ops->btop_find_target != NULL)
-		dreq->bpr_ptr
-			= btree->bt_ops->btop_find_target(btree, NULL, key);
-	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
+
+	ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
 	if (ret < 0)
 		return ret;
 
@@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 	stats->bs_nblocks++;
 	if (nreq != NULL) {
 		nreq->bpr_ptr = dreq->bpr_ptr + 1;
-		ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+		ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
 		if (ret < 0)
 			goto err_out_dreq;
 
-		ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+		ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
 		if (ret < 0)
 			goto err_out_nreq;
 
@@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
 
 	/* error */
  err_out_nreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, nreq);
  err_out_dreq:
-	bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+	nilfs_bmap_abort_alloc_ptr(bmap, dreq);
 	stats->bs_nblocks = 0;
 	return ret;
 
@@ -1624,7 +1708,7 @@ static void
 nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 				      __u64 key, __u64 ptr,
 				      const __u64 *keys, const __u64 *ptrs,
-				      int n, __u64 low, __u64 high,
+				      int n,
 				      union nilfs_bmap_ptr_req *dreq,
 				      union nilfs_bmap_ptr_req *nreq,
 				      struct buffer_head *bh)
@@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 
 	/* convert and insert */
 	btree = (struct nilfs_btree *)bmap;
-	nilfs_btree_init(bmap, low, high);
+	nilfs_btree_init(bmap);
 	if (nreq != NULL) {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
-		}
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, nreq);
 
 		/* create child node at level 1 */
 		lock_buffer(bh);
@@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 			nilfs_bmap_set_dirty(bmap);
 
 		unlock_buffer(bh);
-		nilfs_bmap_put_block(bmap, bh);
+		brelse(bh);
 
 		/* create root node at level 2 */
 		node = nilfs_btree_get_root(btree);
@@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
 				      2, 1, &keys[0], &tmpptr);
 	} else {
-		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
-			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+		nilfs_bmap_commit_alloc_ptr(bmap, dreq);
 
 		/* create root node at level 1 */
 		node = nilfs_btree_get_root(btree);
@@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 			nilfs_bmap_set_dirty(bmap);
 	}
 
-	if (btree->bt_ops->btop_set_target != NULL)
-		btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(bmap))
+		nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
 }
 
 /**
@@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
  * @keys:
  * @ptrs:
  * @n:
- * @low:
- * @high:
  */
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 				   __u64 key, __u64 ptr,
-				   const __u64 *keys, const __u64 *ptrs,
-				   int n, __u64 low, __u64 high)
+				   const __u64 *keys, const __u64 *ptrs, int n)
 {
 	struct buffer_head *bh;
 	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
@@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
 	if (ret < 0)
 		return ret;
 	nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
-					      low, high, di, ni, bh);
+					      di, ni, bh);
 	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
 	return 0;
 }
@@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 		nilfs_btree_node_get_ptr(btree, parent,
 					 path[level + 1].bp_index);
 	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-	ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
-					&path[level].bp_oldreq,
-					&path[level].bp_newreq);
+	ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
+					  &path[level].bp_oldreq,
+					  &path[level].bp_newreq);
 	if (ret < 0)
 		return ret;
 
@@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
 			&path[level].bp_ctxt);
 		if (ret < 0) {
-			nilfs_bmap_abort_update(&btree->bt_bmap,
-						&path[level].bp_oldreq,
-						&path[level].bp_newreq);
+			nilfs_bmap_abort_update_v(&btree->bt_bmap,
+						  &path[level].bp_oldreq,
+						  &path[level].bp_newreq);
 			return ret;
 		}
 	}
@@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 {
 	struct nilfs_btree_node *parent;
 
-	nilfs_bmap_commit_update(&btree->bt_bmap,
-				 &path[level].bp_oldreq,
-				 &path[level].bp_newreq);
+	nilfs_bmap_commit_update_v(&btree->bt_bmap,
+				   &path[level].bp_oldreq,
+				   &path[level].bp_newreq);
 
 	if (buffer_nilfs_node(path[level].bp_bh)) {
 		nilfs_btnode_commit_change_key(
@@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
 				       struct nilfs_btree_path *path,
 				       int level)
 {
-	nilfs_bmap_abort_update(&btree->bt_bmap,
-				&path[level].bp_oldreq,
-				&path[level].bp_newreq);
+	nilfs_bmap_abort_update_v(&btree->bt_bmap,
+				  &path[level].bp_oldreq,
+				  &path[level].bp_newreq);
 	if (buffer_nilfs_node(path[level].bp_bh))
 		nilfs_btnode_abort_change_key(
 			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_propagate_v(btree, path, level, bh) :
+		nilfs_btree_propagate_p(btree, path, level, bh);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
 	ptr = nilfs_btree_node_get_ptr(btree, parent,
 				       path[level + 1].bp_index);
 	req.bpr_ptr = ptr;
-	ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
-							       &req);
-	if (ret < 0)
+	ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
+	if (unlikely(ret < 0))
 		return ret;
-	btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
-							&req, blocknr);
 
 	key = nilfs_btree_node_get_key(btree, parent,
 				       path[level + 1].bp_index);
@@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
 		goto out;
 	}
 
-	ret = btree->bt_ops->btop_assign(btree, path, level, bh,
-					    blocknr, binfo);
+	ret = NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+		nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 
  out:
 	nilfs_btree_clear_path(btree, path);
@@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 		WARN_ON(ret == -ENOENT);
 		goto out;
 	}
-	ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+	ret = nilfs_btree_get_block(btree, ptr, &bh);
 	if (ret < 0) {
 		WARN_ON(ret == -ENOENT);
 		goto out;
@@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 	if (!buffer_dirty(bh))
 		nilfs_btnode_mark_dirty(bh);
-	nilfs_bmap_put_block(&btree->bt_bmap, bh);
+	brelse(bh);
 	if (!nilfs_bmap_dirty(&btree->bt_bmap))
 		nilfs_bmap_set_dirty(&btree->bt_bmap);
 
@@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 
 static const struct nilfs_bmap_operations nilfs_btree_ops = {
 	.bop_lookup		=	nilfs_btree_lookup,
+	.bop_lookup_contig	=	nilfs_btree_lookup_contig,
 	.bop_insert		=	nilfs_btree_insert,
 	.bop_delete		=	nilfs_btree_delete,
 	.bop_clear		=	NULL,
@@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
 
 static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_lookup		=	NULL,
+	.bop_lookup_contig	=	NULL,
 	.bop_insert		=	NULL,
 	.bop_delete		=	NULL,
 	.bop_clear		=	NULL,
@@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
 	.bop_gather_data	=	NULL,
 };
 
-static const struct nilfs_btree_operations nilfs_btree_ops_v = {
-	.btop_find_target	=	nilfs_btree_find_target_v,
-	.btop_set_target	=	nilfs_btree_set_target_v,
-	.btop_propagate		=	nilfs_btree_propagate_v,
-	.btop_assign		=	nilfs_btree_assign_v,
-};
-
-static const struct nilfs_btree_operations nilfs_btree_ops_p = {
-	.btop_find_target	=	NULL,
-	.btop_set_target	=	NULL,
-	.btop_propagate		=	nilfs_btree_propagate_p,
-	.btop_assign		=	nilfs_btree_assign_p,
-};
-
-int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_btree_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_btree *btree;
-
-	btree = (struct nilfs_btree *)bmap;
 	bmap->b_ops = &nilfs_btree_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		btree->bt_ops = &nilfs_btree_ops_p;
-		break;
-	default:
-		btree->bt_ops = &nilfs_btree_ops_v;
-		break;
-	}
-
 	return 0;
 }
 
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
-	bmap->b_low = NILFS_BMAP_LARGE_LOW;
-	bmap->b_high = NILFS_BMAP_LARGE_HIGH;
 	bmap->b_ops = &nilfs_btree_ops_gc;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4766deb52fb1..0e72bbbc6b64 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 
 /**
- * struct nilfs_btree_operations - B-tree operation table
- */
-struct nilfs_btree_operations {
-	__u64 (*btop_find_target)(const struct nilfs_btree *,
-				  const struct nilfs_btree_path *, __u64);
-	void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
-
-	struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
-
-	int (*btop_propagate)(struct nilfs_btree *,
-			      struct nilfs_btree_path *,
-			      int,
-			      struct buffer_head *);
-	int (*btop_assign)(struct nilfs_btree *,
-			   struct nilfs_btree_path *,
-			   int,
-			   struct buffer_head **,
-			   sector_t,
-			   union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_btree_node - B-tree node
  * @bn_flags: flags
  * @bn_level: level
@@ -80,13 +58,9 @@ struct nilfs_btree_node {
 /**
  * struct nilfs_btree - B-tree structure
  * @bt_bmap: bmap base structure
- * @bt_ops: B-tree operation table
  */
 struct nilfs_btree {
 	struct nilfs_bmap bt_bmap;
-
-	/* B-tree-specific members */
-	const struct nilfs_btree_operations *bt_ops;
 };
 
 
@@ -108,10 +82,9 @@ struct nilfs_btree {
 
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
-int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
-				   const __u64 *, const __u64 *,
-				   int, __u64, __u64);
+				   const __u64 *, const __u64 *, int);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 
 #endif	/* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index cadd36b14d07..7d49813f66d6 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 		return -EINVAL;
 	}
 
-	/* cannot delete the latest checkpoint */
-	if (start == nilfs_mdt_cno(cpfile) - 1)
-		return -EPERM;
-
 	down_write(&NILFS_MDT(cpfile)->mi_sem);
 
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
@@ -384,9 +380,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	struct buffer_head *bh;
 	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
 	__u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
@@ -410,17 +407,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 		kaddr = kmap_atomic(bh->b_page, KM_USER0);
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
-			if (!nilfs_checkpoint_invalid(cp))
-				nilfs_cpfile_checkpoint_to_cpinfo(
-					cpfile, cp, &ci[n++]);
+			if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
+								  ci);
+				ci = (void *)ci + cisz;
+				n++;
+			}
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(bh);
 	}
 
 	ret = n;
-	if (n > 0)
-		*cnop = ci[n - 1].ci_cno + 1;
+	if (n > 0) {
+		ci = (void *)ci - cisz;
+		*cnop = ci->ci_cno + 1;
+	}
 
  out:
 	up_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -428,11 +430,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-					  struct nilfs_cpinfo *ci, size_t nci)
+					  void *buf, unsigned cisz, size_t nci)
 {
 	struct buffer_head *bh;
 	struct nilfs_cpfile_header *header;
 	struct nilfs_checkpoint *cp;
+	struct nilfs_cpinfo *ci = buf;
 	__u64 curr = *cnop, next;
 	unsigned long curr_blkoff, next_blkoff;
 	void *kaddr;
@@ -472,7 +475,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 		if (unlikely(nilfs_checkpoint_invalid(cp) ||
 			     !nilfs_checkpoint_snapshot(cp)))
 			break;
-		nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+		nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+		ci = (void *)ci + cisz;
+		n++;
 		next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
 		if (next == 0)
 			break; /* reach end of the snapshot list */
@@ -511,13 +516,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
  */
 
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-				struct nilfs_cpinfo *ci, size_t nci)
+				void *buf, unsigned cisz, size_t nci)
 {
 	switch (mode) {
 	case NILFS_CHECKPOINT:
-		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
 	case NILFS_SNAPSHOT:
-		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
 	default:
 		return -EINVAL;
 	}
@@ -533,20 +538,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
 	struct nilfs_cpinfo ci;
 	__u64 tcno = cno;
 	ssize_t nci;
-	int ret;
 
-	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
 	if (nci < 0)
 		return nci;
 	else if (nci == 0 || ci.ci_cno != cno)
 		return -ENOENT;
-
-	/* cannot delete the latest checkpoint nor snapshots */
-	ret = nilfs_cpinfo_snapshot(&ci);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
-		return -EPERM;
+	else if (nilfs_cpinfo_snapshot(&ci))
+		return -EBUSY;
 
 	return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
 }
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 1a8a1008c342..788a45950197 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
-				struct nilfs_cpinfo *, size_t);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
+				size_t);
 
 #endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index bb8a5818e7f1..0b2710e2d565 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	nilfs_palloc_abort_alloc_entry(dat, req);
 }
 
-int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-	int ret;
-
-	ret = nilfs_palloc_prepare_free_entry(dat, req);
-	if (ret < 0)
-		return ret;
-	ret = nilfs_dat_prepare_entry(dat, req, 0);
-	if (ret < 0) {
-		nilfs_palloc_abort_free_entry(dat, req);
-		return ret;
-	}
-	return 0;
-}
-
 void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 {
 	struct nilfs_dat_entry *entry;
@@ -391,36 +376,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	return ret;
 }
 
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
 			    size_t nvi)
 {
 	struct buffer_head *entry_bh;
 	struct nilfs_dat_entry *entry;
+	struct nilfs_vinfo *vinfo = buf;
 	__u64 first, last;
 	void *kaddr;
 	unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
 	int i, j, n, ret;
 
 	for (i = 0; i < nvi; i += n) {
-		ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+		ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
 						   0, &entry_bh);
 		if (ret < 0)
 			return ret;
 		kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
 		/* last virtual block number in this block */
-		first = vinfo[i].vi_vblocknr;
+		first = vinfo->vi_vblocknr;
 		do_div(first, entries_per_block);
 		first *= entries_per_block;
 		last = first + entries_per_block - 1;
 		for (j = i, n = 0;
-		     j < nvi && vinfo[j].vi_vblocknr >= first &&
-			     vinfo[j].vi_vblocknr <= last;
-		     j++, n++) {
+		     j < nvi && vinfo->vi_vblocknr >= first &&
+			     vinfo->vi_vblocknr <= last;
+		     j++, n++, vinfo = (void *)vinfo + visz) {
 			entry = nilfs_palloc_block_get_entry(
-				dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
-			vinfo[j].vi_start = le64_to_cpu(entry->de_start);
-			vinfo[j].vi_end = le64_to_cpu(entry->de_end);
-			vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+				dat, vinfo->vi_vblocknr, entry_bh, kaddr);
+			vinfo->vi_start = le64_to_cpu(entry->de_start);
+			vinfo->vi_end = le64_to_cpu(entry->de_end);
+			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
 		}
 		kunmap_atomic(kaddr, KM_USER0);
 		brelse(entry_bh);
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d9560654a4b7..d328b81eead4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 
 #endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index c6379e482781..342d9765df8d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -25,6 +25,7 @@
 #include "page.h"
 #include "direct.h"
 #include "alloc.h"
+#include "dat.h"
 
 static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
 {
@@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
 	return 0;
 }
 
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+				      __u64 key, __u64 *ptrp,
+				      unsigned maxblocks)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct inode *dat = NULL;
+	__u64 ptr, ptr2;
+	sector_t blocknr;
+	int ret, cnt;
+
+	if (key > NILFS_DIRECT_KEY_MAX ||
+	    (ptr = nilfs_direct_get_ptr(direct, key)) ==
+	    NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	if (NILFS_BMAP_USE_VBN(bmap)) {
+		dat = nilfs_bmap_get_dat(bmap);
+		ret = nilfs_dat_translate(dat, ptr, &blocknr);
+		if (ret < 0)
+			return ret;
+		ptr = blocknr;
+	}
+
+	maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+	for (cnt = 1; cnt < maxblocks &&
+		     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+		     NILFS_BMAP_INVALID_PTR;
+	     cnt++) {
+		if (dat) {
+			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+			if (ret < 0)
+				return ret;
+			ptr2 = blocknr;
+		}
+		if (ptr2 != ptr + cnt)
+			break;
+	}
+	*ptrp = ptr;
+	return cnt;
+}
+
 static __u64
 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
 {
@@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_ops->dop_find_target != NULL)
-		req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
-	ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
-							       req);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
+	ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
 	if (ret < 0)
 		return ret;
 
@@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
 	bh = (struct buffer_head *)((unsigned long)ptr);
 	set_buffer_nilfs_volatile(bh);
 
-	if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
 
 	if (!nilfs_bmap_dirty(&direct->d_bmap))
 		nilfs_bmap_set_dirty(&direct->d_bmap);
 
-	if (direct->d_ops->dop_set_target != NULL)
-		direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+	if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
+		nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
 {
 	int ret;
 
-	if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-		req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-		ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
-			&direct->d_bmap, req);
-		if (ret < 0)
-			return ret;
-	}
-
-	stats->bs_nblocks = 1;
-	return 0;
+	req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
+	if (!ret)
+		stats->bs_nblocks = 1;
+	return ret;
 }
 
 static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
 				       union nilfs_bmap_ptr_req *req,
 				       __u64 key)
 {
-	if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
-		direct->d_bmap.b_pops->bpop_commit_end_ptr(
-			&direct->d_bmap, req);
+	nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
 	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
 }
 
@@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 }
 
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
-				    __u64 key, __u64 *keys, __u64 *ptrs,
-				    int n, __u64 low, __u64 high)
+				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
 	struct nilfs_direct *direct;
 	__le64 *dptrs;
@@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 			dptrs[i] = NILFS_BMAP_INVALID_PTR;
 	}
 
-	nilfs_direct_init(bmap, low, high);
-
+	nilfs_direct_init(bmap);
 	return 0;
 }
 
@@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.bpr_ptr = ptr;
 		newreq.bpr_ptr = ptr;
-		ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
-						&newreq);
+		ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
+						  &newreq);
 		if (ret < 0)
 			return ret;
-		nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+		nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
 		set_buffer_nilfs_volatile(bh);
 		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
 	} else
@@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct;
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 
-	direct = (struct nilfs_direct *)bmap;
-	return (direct->d_ops->dop_propagate != NULL) ?
-		direct->d_ops->dop_propagate(direct, bh) :
-		0;
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_propagate_v(direct, bh) : 0;
 }
 
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
@@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 	int ret;
 
 	req.bpr_ptr = ptr;
-	ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
-		&direct->d_bmap, &req);
-	if (ret < 0)
+	ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
+	if (unlikely(ret < 0))
 		return ret;
-	direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
-						     &req, blocknr);
 
 	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
 	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 		return -EINVAL;
 	}
 
-	return direct->d_ops->dop_assign(direct, key, ptr, bh,
-					 blocknr, binfo);
+	return NILFS_BMAP_USE_VBN(bmap) ?
+		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
 	.bop_lookup		=	nilfs_direct_lookup,
+	.bop_lookup_contig	=	nilfs_direct_lookup_contig,
 	.bop_insert		=	nilfs_direct_insert,
 	.bop_delete		=	nilfs_direct_delete,
 	.bop_clear		=	NULL,
@@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
 };
 
 
-static const struct nilfs_direct_operations nilfs_direct_ops_v = {
-	.dop_find_target	=	nilfs_direct_find_target_v,
-	.dop_set_target		=	nilfs_direct_set_target_v,
-	.dop_propagate		=	nilfs_direct_propagate_v,
-	.dop_assign		=	nilfs_direct_assign_v,
-};
-
-static const struct nilfs_direct_operations nilfs_direct_ops_p = {
-	.dop_find_target	=	NULL,
-	.dop_set_target		=	NULL,
-	.dop_propagate		=	NULL,
-	.dop_assign		=	nilfs_direct_assign_p,
-};
-
-int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+int nilfs_direct_init(struct nilfs_bmap *bmap)
 {
-	struct nilfs_direct *direct;
-
-	direct = (struct nilfs_direct *)bmap;
 	bmap->b_ops = &nilfs_direct_ops;
-	bmap->b_low = low;
-	bmap->b_high = high;
-	switch (bmap->b_inode->i_ino) {
-	case NILFS_DAT_INO:
-		direct->d_ops = &nilfs_direct_ops_p;
-		break;
-	default:
-		direct->d_ops = &nilfs_direct_ops_v;
-		break;
-	}
-
 	return 0;
 }
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 45d2c5cda812..a5ffd66e25d0 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -31,18 +31,6 @@
 struct nilfs_direct;
 
 /**
- * struct nilfs_direct_operations - direct mapping operation table
- */
-struct nilfs_direct_operations {
-	__u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
-	void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
-	int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
-	int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
-			  struct buffer_head **, sector_t,
-			  union nilfs_binfo *);
-};
-
-/**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
  * @dn_pad: padding
@@ -55,13 +43,9 @@ struct nilfs_direct_node {
 /**
  * struct nilfs_direct - direct mapping
  * @d_bmap: bmap structure
- * @d_ops: direct mapping operation table
  */
 struct nilfs_direct {
 	struct nilfs_bmap d_bmap;
-
-	/* direct-mapping-specific members */
-	const struct nilfs_direct_operations *d_ops;
 };
 
 
@@ -70,9 +54,9 @@ struct nilfs_direct {
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
 
 
-int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_init(struct nilfs_bmap *);
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
-				    __u64 *, int, __u64, __u64);
+				    __u64 *, int);
 
 
 #endif	/* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 19d2102b6a69..1b3c2bb20da9 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,8 +52,9 @@
 #include "dat.h"
 #include "ifile.h"
 
-static struct address_space_operations def_gcinode_aops = {};
-/* XXX need def_gcinode_iops/fops? */
+static struct address_space_operations def_gcinode_aops = {
+	.sync_page		= block_sync_page,
+};
 
 /*
  * nilfs_gccache_submit_read_data() - add data buffer and submit read request
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 49ab4a49bb4f..2696d6b513b7 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -43,22 +43,23 @@
  *
  * This function does not issue actual read request of the specified data
  * block. It is done by VFS.
- * Bulk read for direct-io is not supported yet. (should be supported)
  */
 int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		    struct buffer_head *bh_result, int create)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	unsigned long blknum = 0;
+	__u64 blknum = 0;
 	int err = 0, ret;
 	struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+	unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
 
-	/* This exclusion control is a workaround; should be revised */
-	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
-	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
-	if (ret == 0) {	/* found */
+	down_read(&NILFS_MDT(dat)->mi_sem);
+	ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
+	up_read(&NILFS_MDT(dat)->mi_sem);
+	if (ret >= 0) {	/* found */
 		map_bh(bh_result, inode->i_sb, blknum);
+		if (ret > 0)
+			bh_result->b_size = (ret << inode->i_blkbits);
 		goto out;
 	}
 	/* data block was not found */
@@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 struct address_space_operations nilfs_aops = {
 	.writepage		= nilfs_writepage,
 	.readpage		= nilfs_readpage,
-	/* .sync_page		= nilfs_sync_page, */
+	.sync_page		= block_sync_page,
 	.writepages		= nilfs_writepages,
 	.set_page_dirty		= nilfs_set_page_dirty,
 	.readpages		= nilfs_readpages,
@@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = {
 	/* .releasepage		= nilfs_releasepage, */
 	.invalidatepage		= block_invalidatepage,
 	.direct_IO		= nilfs_direct_IO,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6759b92006f..6ea5f872e2de 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 
 	down_read(&nilfs->ns_segctor_sem);
 	ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
-				      nmembs);
+				      size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+	ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+				      nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 	int ret;
 
 	down_read(&nilfs->ns_segctor_sem);
-	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+	ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
 	up_read(&nilfs->ns_segctor_sem);
 	return ret;
 }
@@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
 	return nmembs;
 }
 
-static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
-				     struct nilfs_argv *argv, void *buf)
-{
-	size_t nmembs = argv->v_nmembs;
-	struct nilfs_sb_info *sbi = nilfs->ns_writer;
-	int ret;
-
-	if (unlikely(!sbi)) {
-		/* never happens because called for a writable mount */
-		WARN_ON(1);
-		return -EROFS;
-	}
-	ret = nilfs_segctor_add_segments_to_be_freed(
-		NILFS_SC(sbi), buf, nmembs);
-
-	return (ret < 0) ? ret : nmembs;
-}
-
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 				       struct nilfs_argv *argv, void **kbufs)
 {
@@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
 		msg = "cannot mark copying blocks dirty";
 		goto failed;
 	}
-	ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
-	if (ret < 0) {
-		/*
-		 * can safely abort because this operation is atomic.
-		 */
-		msg = "cannot set segments to be freed";
-		goto failed;
-	}
 	return 0;
 
  failed:
@@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 	if (copy_from_user(&argv, argp, sizeof(argv)))
 		return -EFAULT;
 
-	if (argv.v_size != membsz)
+	if (argv.v_size < membsz)
 		return -EINVAL;
 
 	ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index bb78745a0e30..3d3ddb3f5177 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -430,6 +430,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 
 static struct address_space_operations def_mdt_aops = {
 	.writepage		= nilfs_mdt_write_page,
+	.sync_page		= block_sync_page,
 };
 
 static struct inode_operations def_mdt_iops;
@@ -449,7 +450,7 @@ struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
 		     ino_t ino, gfp_t gfp_mask)
 {
-	struct inode *inode = nilfs_alloc_inode(sb);
+	struct inode *inode = nilfs_alloc_inode_common(nilfs);
 
 	if (!inode)
 		return NULL;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index da6fc0bba2e5..edf6a59d9f2a 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -263,6 +263,7 @@ extern void nilfs_dirty_inode(struct inode *);
 extern struct dentry *nilfs_get_parent(struct dentry *);
 
 /* super.c */
+extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 57afa9d24061..d80cc71be749 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -28,7 +28,6 @@
 #include "segment.h"
 #include "sufile.h"
 #include "page.h"
-#include "seglist.h"
 #include "segbuf.h"
 
 /*
@@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head)
 	}
 }
 
+struct nilfs_segment_entry {
+	struct list_head	list;
+	__u64			segnum;
+};
+
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{
+	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+
+	if (unlikely(!ent))
+		return -ENOMEM;
+
+	ent->segnum = segnum;
+	INIT_LIST_HEAD(&ent->list);
+	list_add_tail(&ent->list, head);
+	return 0;
+}
+
 void nilfs_dispose_segment_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
@@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
 			= list_entry(head->next,
 				     struct nilfs_segment_entry, list);
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 }
 
@@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	if (unlikely(err))
 		goto failed;
 
-	err = -ENOMEM;
 	for (i = 1; i < 4; i++) {
-		ent = nilfs_alloc_segment_entry(segnum[i]);
-		if (unlikely(!ent))
+		err = nilfs_segment_list_add(head, segnum[i]);
+		if (unlikely(err))
 			goto failed;
-		list_add_tail(&ent->list, head);
 	}
 
 	/*
@@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 				goto failed;
 		}
 		list_del(&ent->list);
-		nilfs_free_segment_entry(ent);
+		kfree(ent);
 	}
 
 	/* Allocate new segments for recovery */
@@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
-	struct nilfs_segment_entry *ent;
 	LIST_HEAD(segments);
 	int empty_seg = 0, scan_newer = 0;
 	int ret;
@@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		if (empty_seg++)
 			goto super_root_found; /* found a valid super root */
 
-		ent = nilfs_alloc_segment_entry(segnum);
-		if (unlikely(!ent)) {
-			ret = -ENOMEM;
+		ret = nilfs_segment_list_add(&segments, segnum);
+		if (unlikely(ret))
 			goto failed;
-		}
-		list_add_tail(&ent->list, &segments);
 
 		seg_seq++;
 		segnum = nextnum;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e68821b4a9b..9e3fe17bb96b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -26,7 +26,6 @@
 #include <linux/crc32.h>
 #include "page.h"
 #include "segbuf.h"
-#include "seglist.h"
 
 
 static struct kmem_cache *nilfs_segbuf_cachep;
@@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
-		rw |= (1 << BIO_RW_SYNCIO);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 		res = nilfs_submit_seg_bio(wi, rw);
 		if (unlikely(res))
 			goto failed_bio;
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
deleted file mode 100644
index d39df9144e99..000000000000
--- a/fs/nilfs2/seglist.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * seglist.h - expediential structure and routines to handle list of segments
- *             (would be removed in a future release)
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-#ifndef _NILFS_SEGLIST_H
-#define _NILFS_SEGLIST_H
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
-#include "sufile.h"
-
-struct nilfs_segment_entry {
-	__u64			segnum;
-
-#define NILFS_SLH_FREED		0x0001	/* The segment was freed provisonally.
-					   It must be cancelled if
-					   construction aborted */
-
-	unsigned		flags;
-	struct list_head	list;
-	struct buffer_head     *bh_su;
-	struct nilfs_segment_usage *raw_su;
-};
-
-
-void nilfs_dispose_segment_list(struct list_head *);
-
-static inline struct nilfs_segment_entry *
-nilfs_alloc_segment_entry(__u64 segnum)
-{
-	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
-
-	if (likely(ent)) {
-		ent->segnum = segnum;
-		ent->flags = 0;
-		ent->bh_su = NULL;
-		ent->raw_su = NULL;
-		INIT_LIST_HEAD(&ent->list);
-	}
-	return ent;
-}
-
-static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
-					   struct inode *sufile)
-{
-	return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
-					      &ent->raw_su, &ent->bh_su);
-}
-
-static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
-					     struct inode *sufile)
-{
-	if (!ent->bh_su)
-		return;
-	nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
-	ent->bh_su = NULL;
-	ent->raw_su = NULL;
-}
-
-static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
-{
-	kfree(ent);
-}
-
-#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 22c7f65c2403..aa977549919e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -39,7 +39,6 @@
 #include "sufile.h"
 #include "cpfile.h"
 #include "ifile.h"
-#include "seglist.h"
 #include "segbuf.h"
 
 
@@ -79,7 +78,8 @@ enum {
 /* State flags of collection */
 #define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
-#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED)
+#define NILFS_CF_SUFREED	0x0004	/* segment usages have been freed */
+#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
 
 /* Operations depending on the construction mode and file type */
 struct nilfs_sc_operations {
@@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 {
 	return list_empty(&sci->sc_dirty_files) &&
 		!test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
-		list_empty(&sci->sc_cleaning_segments) &&
+		sci->sc_nfreesegs == 0 &&
 		(!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
 }
 
@@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
 	}
 }
 
-static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		if (!(ent->flags & NILFS_SLH_FREED))
-			break;
-		err = nilfs_sufile_cancel_free(sufile, ent->segnum);
-		WARN_ON(err); /* do not happen */
-		ent->flags &= ~NILFS_SLH_FREED;
-	}
-}
-
-static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
-					       struct inode *sufile)
-{
-	struct list_head *head = &sci->sc_cleaning_segments;
-	struct nilfs_segment_entry *ent;
-	int err;
-
-	list_for_each_entry(ent, head, list) {
-		err = nilfs_sufile_free(sufile, ent->segnum);
-		if (unlikely(err))
-			return err;
-		ent->flags |= NILFS_SLH_FREED;
-	}
-	return 0;
-}
-
-static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
 				       struct inode *inode,
 				       struct list_head *listp,
@@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct list_head *head;
 	struct nilfs_inode_info *ii;
+	size_t ndone;
 	int err = 0;
 
 	switch (sci->sc_stage.scnt) {
@@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			break;
 		sci->sc_stage.scnt++;  /* Fall through */
 	case NILFS_ST_SUFILE:
-		err = nilfs_segctor_prepare_free_segments(sci,
-							  nilfs->ns_sufile);
-		if (unlikely(err))
+		err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
+					 sci->sc_nfreesegs, &ndone);
+		if (unlikely(err)) {
+			nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+						  sci->sc_freesegs, ndone,
+						  NULL);
 			break;
+		}
+		sci->sc_stage.flags |= NILFS_CF_SUFREED;
+
 		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
 					      &nilfs_sc_file_ops);
 		if (unlikely(err))
@@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
 {
 	if (unlikely(err)) {
 		nilfs_segctor_free_incomplete_segments(sci, nilfs);
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			int ret;
+
+			ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(ret); /* do not happen */
+		}
 	}
 	nilfs_segctor_clear_segment_buffers(sci);
 }
@@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
-		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+							sci->sc_freesegs,
+							sci->sc_nfreesegs,
+							NULL);
+			WARN_ON(err); /* do not happen */
+		}
 		nilfs_segctor_clear_segment_buffers(sci);
 
 		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
@@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 		nilfs_segctor_complete_write(sci);
 
 		/* Commit segments */
-		if (has_sr) {
-			nilfs_segctor_commit_free_segments(sci);
+		if (has_sr)
 			nilfs_segctor_clear_metadata_dirty(sci);
-		}
 
 		nilfs_segctor_end_construction(sci, nilfs, 0);
 
@@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 					/* assign bit 0 to data files */
 }
 
-int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
-					   __u64 *segnum, size_t nsegs)
-{
-	struct nilfs_segment_entry *ent;
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-	struct inode *sufile = nilfs->ns_sufile;
-	LIST_HEAD(list);
-	__u64 *pnum;
-	size_t i;
-	int err;
-
-	for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
-		ent = nilfs_alloc_segment_entry(*pnum);
-		if (unlikely(!ent)) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		list_add_tail(&ent->list, &list);
-
-		err = nilfs_open_segment_entry(ent, sufile);
-		if (unlikely(err))
-			goto failed;
-
-		if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
-			printk(KERN_WARNING "NILFS: unused segment is "
-			       "requested to be cleaned (segnum=%llu)\n",
-			       (unsigned long long)ent->segnum);
-		nilfs_close_segment_entry(ent, sufile);
-	}
-	list_splice(&list, sci->sc_cleaning_segments.prev);
-	return 0;
-
- failed:
-	nilfs_dispose_segment_list(&list);
-	return err;
-}
-
-void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
-{
-	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
-
 struct nilfs_segctor_wait_request {
 	wait_queue_t	wq;
 	__u32		seq;
@@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	err = nilfs_init_gcdat_inode(nilfs);
 	if (unlikely(err))
 		goto out_unlock;
+
 	err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
 	if (unlikely(err))
 		goto out_unlock;
 
+	sci->sc_freesegs = kbufs[4];
+	sci->sc_nfreesegs = argv[4].v_nmembs;
 	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
 
 	for (;;) {
@@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	}
 
  out_unlock:
+	sci->sc_freesegs = NULL;
+	sci->sc_nfreesegs = 0;
 	nilfs_clear_gcdat_inode(nilfs);
 	nilfs_transaction_unlock(sbi);
 	return err;
@@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
 	INIT_LIST_HEAD(&sci->sc_dirty_files);
 	INIT_LIST_HEAD(&sci->sc_segbufs);
 	INIT_LIST_HEAD(&sci->sc_gc_inodes);
-	INIT_LIST_HEAD(&sci->sc_cleaning_segments);
 	INIT_LIST_HEAD(&sci->sc_copied_buffers);
 
 	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 		nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
 	}
 
-	if (!list_empty(&sci->sc_cleaning_segments))
-		nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-
 	WARN_ON(!list_empty(&sci->sc_segbufs));
 
 	down_write(&sbi->s_nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 476bdd5df5be..0d2a475a741b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -90,8 +90,9 @@ struct nilfs_segsum_pointer {
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
  * @sc_gc_inodes: List of GC inodes having blocks to be written
- * @sc_cleaning_segments: List of segments to be freed through construction
  * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
+ * @sc_freesegs: array of segment numbers to be freed
+ * @sc_nfreesegs: number of segments on @sc_freesegs
  * @sc_dsync_inode: inode whose data pages are written for a sync operation
  * @sc_dsync_start: start byte offset of data pages
  * @sc_dsync_end: end byte offset of data pages (inclusive)
@@ -131,9 +132,11 @@ struct nilfs_sc_info {
 
 	struct list_head	sc_dirty_files;
 	struct list_head	sc_gc_inodes;
-	struct list_head	sc_cleaning_segments;
 	struct list_head	sc_copied_buffers;
 
+	__u64		       *sc_freesegs;
+	size_t			sc_nfreesegs;
+
 	struct nilfs_inode_info *sc_dsync_inode;
 	loff_t			sc_dsync_start;
 	loff_t			sc_dsync_end;
@@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
 				void **);
 
-extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
-						  __u64 *, size_t);
-extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
-
 extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
 
@@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
 extern int nilfs_recover_logical_segments(struct the_nilfs *,
 					  struct nilfs_sb_info *,
 					  struct nilfs_recovery_info *);
+extern void nilfs_dispose_segment_list(struct list_head *);
 
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 98e68677f045..37994d4a59cc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  * Written by Koji Sato <koji@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
 
 #include <linux/kernel.h>
@@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 	nilfs_mdt_mark_buffer_dirty(header_bh);
 }
 
+/**
+ * nilfs_sufile_updatev - modify multiple segment usages at a time
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @create: creation flag
+ * @ndone: place to store number of modified segments on @segnumv
+ * @dofunc: primitive operation for the update
+ *
+ * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
+ * against the given array of segments.  The @dofunc is called with
+ * buffers of a header block and the sufile block in which the target
+ * segment usage entry is contained.  If @ndone is given, the number
+ * of successfully modified segments from the head is stored in the
+ * place @ndone points to.
+ *
+ * Return Value: On success, zero is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Given segment usage is in hole block (may be returned if
+ *            @create is zero)
+ *
+ * %-EINVAL - Invalid segment usage number
+ */
+int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
+			 int create, size_t *ndone,
+			 void (*dofunc)(struct inode *, __u64,
+					struct buffer_head *,
+					struct buffer_head *))
+{
+	struct buffer_head *header_bh, *bh;
+	unsigned long blkoff, prev_blkoff;
+	__u64 *seg;
+	size_t nerr = 0, n = 0;
+	int ret = 0;
+
+	if (unlikely(nsegs == 0))
+		goto out;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	for (seg = segnumv; seg < segnumv + nsegs; seg++) {
+		if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
+			printk(KERN_WARNING
+			       "%s: invalid segment number: %llu\n", __func__,
+			       (unsigned long long)*seg);
+			nerr++;
+		}
+	}
+	if (nerr > 0) {
+		ret = -EINVAL;
+		goto out_sem;
+	}
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	seg = segnumv;
+	blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+	ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+	if (ret < 0)
+		goto out_header;
+
+	for (;;) {
+		dofunc(sufile, *seg, header_bh, bh);
+
+		if (++seg >= segnumv + nsegs)
+			break;
+		prev_blkoff = blkoff;
+		blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+		if (blkoff == prev_blkoff)
+			continue;
+
+		/* get different block */
+		brelse(bh);
+		ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+		if (unlikely(ret < 0))
+			goto out_header;
+	}
+	brelse(bh);
+
+ out_header:
+	n = seg - segnumv;
+	brelse(header_bh);
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+ out:
+	if (ndone)
+		*ndone = n;
+	return ret;
+}
+
 int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
 			void (*dofunc)(struct inode *, __u64,
 				       struct buffer_head *,
@@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
  * nilfs_sufile_get_suinfo -
  * @sufile: inode of segment usage file
  * @segnum: segment number to start looking
- * @si: array of suinfo
+ * @buf: array of suinfo
+ * @sisz: byte size of suinfo
  * @nsi: size of suinfo array
  *
  * Description:
@@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
  *
  * %-ENOMEM - Insufficient amount of memory available.
  */
-ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
-				struct nilfs_suinfo *si, size_t nsi)
+ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
+				unsigned sisz, size_t nsi)
 {
 	struct buffer_head *su_bh;
 	struct nilfs_segment_usage *su;
+	struct nilfs_suinfo *si = buf;
 	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
 	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
 	void *kaddr;
@@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
 			if (ret != -ENOENT)
 				goto out;
 			/* hole */
-			memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
+			memset(si, 0, sisz * n);
+			si = (void *)si + sisz * n;
 			continue;
 		}
 
 		kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
-		for (j = 0; j < n; j++, su = (void *)su + susz) {
-			si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
-			si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
-			si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
+		for (j = 0; j < n;
+		     j++, su = (void *)su + susz, si = (void *)si + sisz) {
+			si->sui_lastmod = le64_to_cpu(su->su_lastmod);
+			si->sui_nblocks = le32_to_cpu(su->su_nblocks);
+			si->sui_flags = le32_to_cpu(su->su_flags) &
 				~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
 			if (nilfs_segment_is_active(nilfs, segnum + j))
-				si[i + j].sui_flags |=
+				si->sui_flags |=
 					(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
 		}
 		kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2e2efd4ade1..a2c4d76c3366 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
 				    struct buffer_head *);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
 int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
 				size_t);
 
+int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
+			 void (*dofunc)(struct inode *, __u64,
+					struct buffer_head *,
+					struct buffer_head *));
 int nilfs_sufile_update(struct inode *, __u64, int,
 			void (*dofunc)(struct inode *, __u64,
 				       struct buffer_head *,
 				       struct buffer_head *));
-void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
-				 struct buffer_head *);
 void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
 			   struct buffer_head *);
 void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
 			  struct buffer_head *);
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+				 struct buffer_head *);
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
 			       struct buffer_head *);
 
 /**
- * nilfs_sufile_cancel_free -
- * @sufile: inode of segment usage file
- * @segnum: segment number
- *
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
-{
-	return nilfs_sufile_update(sufile, segnum, 0,
-				   nilfs_sufile_do_cancel_free);
-}
-
-/**
  * nilfs_sufile_scrap - make a segment garbage
  * @sufile: inode of segment usage file
  * @segnum: segment number to be freed
@@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
 }
 
 /**
+ * nilfs_sufile_freev - free segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of freed segments
+ */
+static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
+				     size_t nsegs, size_t *ndone)
+{
+	return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+				    nilfs_sufile_do_free);
+}
+
+/**
+ * nilfs_sufile_cancel_freev - reallocate freeing segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of cancelled segments
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error code
+ * is returned.
+ */
+static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
+					    __u64 *segnumv, size_t nsegs,
+					    size_t *ndone)
+{
+	return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+				    nilfs_sufile_do_cancel_free);
+}
+
+/**
  * nilfs_sufile_set_error - mark a segment as erroneous
  * @sufile: inode of segment usage file
  * @segnum: segment number
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1777a3467bd2..ab785f85aa50 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -133,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 
 static struct kmem_cache *nilfs_inode_cachep;
 
-struct inode *nilfs_alloc_inode(struct super_block *sb)
+struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
 	struct nilfs_inode_info *ii;
 
@@ -143,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
 	ii->i_bh = NULL;
 	ii->i_state = 0;
 	ii->vfs_inode.i_version = 1;
-	nilfs_btnode_cache_init(&ii->i_btnode_cache);
+	nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
 	return &ii->vfs_inode;
 }
 
+struct inode *nilfs_alloc_inode(struct super_block *sb)
+{
+	return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
+}
+
 void nilfs_destroy_inode(struct inode *inode)
 {
 	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index e4e5c78bcc93..8b8889825716 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -32,7 +32,6 @@
 #include "cpfile.h"
 #include "sufile.h"
 #include "dat.h"
-#include "seglist.h"
 #include "segbuf.h"
 
 
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 9b0efdad8910..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/kmod.h>
 #include <linux/spinlock.h>
+#include <asm/byteorder.h>
 
 static struct nls_table default_table;
 static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
     {0,						       /* end of table    */}
 };
 
-int
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+#define UNICODE_MAX	0x0010ffff
+#define PLANE_SIZE	0x00010000
+
+#define SURROGATE_MASK	0xfffff800
+#define SURROGATE_PAIR	0x0000d800
+#define SURROGATE_LOW	0x00000400
+#define SURROGATE_BITS	0x000003ff
+
+int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 {
-	long l;
+	unsigned long l;
 	int c0, c, nc;
 	const struct utf8_table *t;
   
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
 		nc++;
 		if ((c0 & t->cmask) == t->cval) {
 			l &= t->lmask;
-			if (l < t->lval)
+			if (l < t->lval || l > UNICODE_MAX ||
+					(l & SURROGATE_MASK) == SURROGATE_PAIR)
 				return -1;
-			*p = l;
+			*pu = (unicode_t) l;
 			return nc;
 		}
-		if (n <= nc)
+		if (len <= nc)
 			return -1;
 		s++;
 		c = (*s ^ 0x80) & 0xFF;
@@ -72,90 +81,133 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
 	}
 	return -1;
 }
+EXPORT_SYMBOL(utf8_to_utf32);
 
-int
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
 {
-	__u16 *op;
-	const __u8 *ip;
-	int size;
-
-	op = pwcs;
-	ip = s;
-	while (*ip && n > 0) {
-		if (*ip & 0x80) {
-			size = utf8_mbtowc(op, ip, n);
-			if (size == -1) {
-				/* Ignore character and move on */
-				ip++;
-				n--;
-			} else {
-				op++;
-				ip += size;
-				n -= size;
-			}
-		} else {
-			*op++ = *ip++;
-			n--;
-		}
-	}
-	return (op - pwcs);
-}
-
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
-	long l;
+	unsigned long l;
 	int c, nc;
 	const struct utf8_table *t;
-  
+
 	if (!s)
 		return 0;
-  
-	l = wc;
+
+	l = u;
+	if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+		return -1;
+
 	nc = 0;
 	for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
 		nc++;
 		if (l <= t->lmask) {
 			c = t->shift;
-			*s = t->cval | (l >> c);
+			*s = (u8) (t->cval | (l >> c));
 			while (c > 0) {
 				c -= 6;
 				s++;
-				*s = 0x80 | ((l >> c) & 0x3F);
+				*s = (u8) (0x80 | ((l >> c) & 0x3F));
 			}
 			return nc;
 		}
 	}
 	return -1;
 }
+EXPORT_SYMBOL(utf32_to_utf8);
 
-int
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
+int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
 {
-	const __u16 *ip;
-	__u8 *op;
+	u16 *op;
 	int size;
+	unicode_t u;
+
+	op = pwcs;
+	while (*s && len > 0) {
+		if (*s & 0x80) {
+			size = utf8_to_utf32(s, len, &u);
+			if (size < 0) {
+				/* Ignore character and move on */
+				size = 1;
+			} else if (u >= PLANE_SIZE) {
+				u -= PLANE_SIZE;
+				*op++ = (wchar_t) (SURROGATE_PAIR |
+						((u >> 10) & SURROGATE_BITS));
+				*op++ = (wchar_t) (SURROGATE_PAIR |
+						SURROGATE_LOW |
+						(u & SURROGATE_BITS));
+			} else {
+				*op++ = (wchar_t) u;
+			}
+			s += size;
+			len -= size;
+		} else {
+			*op++ = *s++;
+			len--;
+		}
+	}
+	return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		return c;
+	case UTF16_LITTLE_ENDIAN:
+		return __le16_to_cpu(c);
+	case UTF16_BIG_ENDIAN:
+		return __be16_to_cpu(c);
+	}
+}
+
+int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
+		u8 *s, int maxlen)
+{
+	u8 *op;
+	int size;
+	unsigned long u, v;
 
 	op = s;
-	ip = pwcs;
-	while (*ip && maxlen > 0) {
-		if (*ip > 0x7f) {
-			size = utf8_wctomb(op, *ip, maxlen);
+	while (len > 0 && maxlen > 0) {
+		u = get_utf16(*pwcs, endian);
+		if (!u)
+			break;
+		pwcs++;
+		len--;
+		if (u > 0x7f) {
+			if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+				if (u & SURROGATE_LOW) {
+					/* Ignore character and move on */
+					continue;
+				}
+				if (len <= 0)
+					break;
+				v = get_utf16(*pwcs, endian);
+				if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+						!(v & SURROGATE_LOW)) {
+					/* Ignore character and move on */
+					continue;
+				}
+				u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+						+ (v & SURROGATE_BITS);
+				pwcs++;
+				len--;
+			}
+			size = utf32_to_utf8(u, op, maxlen);
 			if (size == -1) {
 				/* Ignore character and move on */
-				maxlen--;
 			} else {
 				op += size;
 				maxlen -= size;
 			}
 		} else {
-			*op++ = (__u8) *ip;
+			*op++ = (u8) u;
+			maxlen--;
 		}
-		ip++;
 	}
-	return (op - s);
+	return op - s;
 }
+EXPORT_SYMBOL(utf16s_to_utf8s);
 
 int register_nls(struct nls_table * nls)
 {
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
 EXPORT_SYMBOL(unload_nls);
 EXPORT_SYMBOL(load_nls);
 EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);
 
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 {
 	int n;
 
-	if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+	if (boundlen <= 0)
+		return -ENAMETOOLONG;
+
+	n = utf32_to_utf8(uni, out, boundlen);
+	if (n < 0) {
 		*out = '?';
 		return -EINVAL;
 	}
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
 {
 	int n;
+	unicode_t u;
 
-	if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+	n = utf8_to_utf32(rawstring, boundlen, &u);
+	if (n < 0 || u > MAX_WCHAR_T) {
 		*uni = 0x003f;	/* ? */
-		n = -EINVAL;
+		return -EINVAL;
 	}
+	*uni = (wchar_t) u;
 	return n;
 }
 
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ea2605a58b8a..f234f3a4c8ca 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -15,7 +15,8 @@ struct inotify_inode_mark_entry {
 	int wd;
 };
 
-extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+					   struct fsnotify_group *group);
 extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7ef75b83247e..47cd258fd24d 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -81,7 +81,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 
 static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
 {
-	inotify_destroy_mark_entry(entry, group);
+	inotify_ignored_and_remove_idr(entry, group);
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 982a412ac5bc..ff231ad23895 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -363,39 +363,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 }
 
 /*
- * When, for whatever reason, inotify is done with a mark (or what used to be a
- * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
- * for the given wd.
- *
- * There is a bit of recursion here.  The loop looks like:
- * 	inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
- *	inotify_freeing_mark -> inotify_destory_mark_entry -> restart
- * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
- * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
- * test below will not call back to fsnotify again.  But even if that test wasn't
- * there this would still be safe since fsnotify_destroy_mark_by_entry() is
- * safe from recursion.
+ * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
+ * internal reference held on the mark because it is in the idr.
  */
-void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark_entry *ientry;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	struct fsnotify_group *egroup;
 	struct idr *idr;
 
-	spin_lock(&entry->lock);
-	egroup = entry->group;
-
-	/* if egroup we aren't really done and something might still send events
-	 * for this inode, on the callback we'll send the IN_IGNORED */
-	if (egroup) {
-		spin_unlock(&entry->lock);
-		fsnotify_destroy_mark_by_entry(entry);
-		return;
-	}
-	spin_unlock(&entry->lock);
-
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
 	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
@@ -699,7 +677,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 	fsnotify_get_mark(entry);
 	spin_unlock(&group->inotify_data.idr_lock);
 
-	inotify_destroy_mark_entry(entry, group);
+	fsnotify_destroy_mark_by_entry(entry);
 	fsnotify_put_mark(entry);
 
 out:
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 82c5085559c6..9938034762cc 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/slab.h>
+#include <linux/log2.h>
 
 #include "aops.h"
 #include "attrib.h"
@@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	ntfs_debug("Index collation rule is 0x%x.",
 			le32_to_cpu(ir->collation_rule));
 	ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-	if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) {
+	if (!is_power_of_2(ni->itype.index.block_size)) {
 		ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
 				"two.", ni->itype.index.block_size);
 		goto unm_err_out;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index d7932e95b1fd..89b02985c054 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -26,6 +26,7 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/log2.h>
 
 #include "attrib.h"
 #include "aops.h"
@@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi,
 			logfile_log_page_size < NTFS_BLOCK_SIZE ||
 			logfile_system_page_size &
 			(logfile_system_page_size - 1) ||
-			logfile_log_page_size & (logfile_log_page_size - 1)) {
+			!is_power_of_2(logfile_log_page_size)) {
 		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
 		return false;
 	}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 678a067d9251..9edcde4974aa 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -475,6 +475,12 @@ struct ocfs2_path {
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
 
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+			   u32 cpos);
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+					   handle_t *handle,
+					   struct ocfs2_path *path,
+					   struct ocfs2_extent_rec *insert_rec);
 /*
  * Reset the actual path elements so that we can re-use the structure
  * to build another path. Generally, this involves freeing the buffer
@@ -1013,6 +1019,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 }
 
 /*
+ * Change range of the branches in the rightmost path according to the leaf
+ * extent block's rightmost record.
+ */
+static int ocfs2_adjust_rightmost_branch(handle_t *handle,
+					 struct inode *inode,
+					 struct ocfs2_extent_tree *et)
+{
+	int status;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		status = -ENOMEM;
+		return status;
+	}
+
+	status = ocfs2_find_path(inode, path, UINT_MAX);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_extend_trans(handle, path_num_items(path) +
+				    handle->h_buffer_credits);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_path(inode, handle, path);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+	rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+
+	ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+
+out:
+	ocfs2_free_path(path);
+	return status;
+}
+
+/*
  * Add an entire tree branch to our inode. eb_bh is the extent block
  * to start at, if we don't want to start the branch at the dinode
  * structure.
@@ -1038,7 +1092,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
-	u32 new_cpos;
+	u32 new_cpos, root_end;
 
 	mlog_entry_void();
 
@@ -1055,6 +1109,27 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	new_blocks = le16_to_cpu(el->l_tree_depth);
 
+	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
+
+	/*
+	 * If there is a gap between the root end and the real end
+	 * of the rightmost leaf block, we need to remove the gap
+	 * between new_cpos and root_end first so that the tree
+	 * is consistent after we add a new branch (it will start
+	 * from new_cpos).
+	 */
+	if (root_end > new_cpos) {
+		mlog(0, "adjust the cluster end from %u to %u\n",
+		     root_end, new_cpos);
+		status = ocfs2_adjust_rightmost_branch(handle, inode, et);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	/* allocate the number of new eb blocks we need */
 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
 			     GFP_KERNEL);
@@ -1071,9 +1146,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
-	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
-
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 	 * linked with the rest of the tree.
 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2a947c44e594..a1163b8b417c 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -22,6 +22,9 @@
 #include <linux/crc32.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/fs.h>
 #include <asm/byteorder.h>
 
 #include <cluster/masklog.h>
@@ -222,6 +225,155 @@ void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
 	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
 }
 
+
+/*
+ * Debugfs handling.
+ */
+
+#ifdef CONFIG_DEBUG_FS
+
+static int blockcheck_u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
+
+static struct dentry *blockcheck_debugfs_create(const char *name,
+						struct dentry *parent,
+						u64 *value)
+{
+	return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
+				   &blockcheck_fops);
+}
+
+static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	if (stats) {
+		debugfs_remove(stats->b_debug_check);
+		stats->b_debug_check = NULL;
+		debugfs_remove(stats->b_debug_failure);
+		stats->b_debug_failure = NULL;
+		debugfs_remove(stats->b_debug_recover);
+		stats->b_debug_recover = NULL;
+		debugfs_remove(stats->b_debug_dir);
+		stats->b_debug_dir = NULL;
+	}
+}
+
+static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+					  struct dentry *parent)
+{
+	int rc = -EINVAL;
+
+	if (!stats)
+		goto out;
+
+	stats->b_debug_dir = debugfs_create_dir("blockcheck", parent);
+	if (!stats->b_debug_dir)
+		goto out;
+
+	stats->b_debug_check =
+		blockcheck_debugfs_create("blocks_checked",
+					  stats->b_debug_dir,
+					  &stats->b_check_count);
+
+	stats->b_debug_failure =
+		blockcheck_debugfs_create("checksums_failed",
+					  stats->b_debug_dir,
+					  &stats->b_failure_count);
+
+	stats->b_debug_recover =
+		blockcheck_debugfs_create("ecc_recoveries",
+					  stats->b_debug_dir,
+					  &stats->b_recover_count);
+	if (stats->b_debug_check && stats->b_debug_failure &&
+	    stats->b_debug_recover)
+		rc = 0;
+
+out:
+	if (rc)
+		ocfs2_blockcheck_debug_remove(stats);
+	return rc;
+}
+#else
+static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
+						 struct dentry *parent)
+{
+	return 0;
+}
+
+static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
+{
+}
+#endif  /* CONFIG_DEBUG_FS */
+
+/* Always-called wrappers for starting and stopping the debugfs files */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent)
+{
+	return ocfs2_blockcheck_debug_install(stats, parent);
+}
+
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats)
+{
+	ocfs2_blockcheck_debug_remove(stats);
+}
+
+static void ocfs2_blockcheck_inc_check(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_check_count++;
+	new_count = stats->b_check_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Block check count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_failure(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_failure_count++;
+	new_count = stats->b_failure_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "Checksum failure count has wrapped\n");
+}
+
+static void ocfs2_blockcheck_inc_recover(struct ocfs2_blockcheck_stats *stats)
+{
+	u64 new_count;
+
+	if (!stats)
+		return;
+
+	spin_lock(&stats->b_lock);
+	stats->b_recover_count++;
+	new_count = stats->b_recover_count;
+	spin_unlock(&stats->b_lock);
+
+	if (!new_count)
+		mlog(ML_NOTICE, "ECC recovery count has wrapped\n");
+}
+
+
+
+/*
+ * These are the low-level APIs for using the ocfs2_block_check structure.
+ */
+
 /*
  * This function generates check information for a block.
  * data is the block to be checked.  bc is a pointer to the
@@ -266,12 +418,15 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc)
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats)
 {
 	int rc = 0;
 	struct ocfs2_block_check check;
 	u32 crc, ecc;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -282,6 +437,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -292,8 +448,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 
 	/* And check the crc32 again */
 	crc = crc32_le(~0, data, blocksize);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -366,7 +524,8 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
  * Again, the data passed in should be the on-disk endian.
  */
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc)
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats)
 {
 	int i, rc = 0;
 	struct ocfs2_block_check check;
@@ -377,6 +536,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (!nr)
 		return 0;
 
+	ocfs2_blockcheck_inc_check(stats);
+
 	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
 	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
 
@@ -388,6 +549,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	ocfs2_blockcheck_inc_failure(stats);
 	mlog(ML_ERROR,
 	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -416,8 +578,10 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 	/* And check the crc32 again */
 	for (i = 0, crc = ~0; i < nr; i++)
 		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-	if (crc == check.bc_crc32e)
+	if (crc == check.bc_crc32e) {
+		ocfs2_blockcheck_inc_recover(stats);
 		goto out;
+	}
 
 	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
 	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
@@ -448,9 +612,11 @@ int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc,
+						&osb->osb_ecc_stats);
 
 	return rc;
 }
@@ -468,9 +634,11 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 				struct ocfs2_block_check *bc)
 {
 	int rc = 0;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
-		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+	if (ocfs2_meta_ecc(osb))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc,
+						    &osb->osb_ecc_stats);
 
 	return rc;
 }
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
index 70ec3feda32f..d4b69febf70a 100644
--- a/fs/ocfs2/blockcheck.h
+++ b/fs/ocfs2/blockcheck.h
@@ -21,6 +21,24 @@
 #define OCFS2_BLOCKCHECK_H
 
 
+/* Count errors and error correction from blockcheck.c */
+struct ocfs2_blockcheck_stats {
+	spinlock_t b_lock;
+	u64 b_check_count;	/* Number of blocks we've checked */
+	u64 b_failure_count;	/* Number of failed checksums */
+	u64 b_recover_count;	/* Number of blocks fixed by ecc */
+
+	/*
+	 * debugfs entries, used if this is passed to
+	 * ocfs2_blockcheck_stats_debugfs_install()
+	 */
+	struct dentry *b_debug_dir;	/* Parent of the debugfs  files */
+	struct dentry *b_debug_check;	/* Exposes b_check_count */
+	struct dentry *b_debug_failure;	/* Exposes b_failure_count */
+	struct dentry *b_debug_recover;	/* Exposes b_recover_count */
+};
+
+
 /* High level block API */
 void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
 			    struct ocfs2_block_check *bc);
@@ -37,11 +55,18 @@ int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
 void ocfs2_block_check_compute(void *data, size_t blocksize,
 			       struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate(void *data, size_t blocksize,
-			       struct ocfs2_block_check *bc);
+			       struct ocfs2_block_check *bc,
+			       struct ocfs2_blockcheck_stats *stats);
 void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
 				   struct ocfs2_block_check *bc);
 int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
-				   struct ocfs2_block_check *bc);
+				   struct ocfs2_block_check *bc,
+				   struct ocfs2_blockcheck_stats *stats);
+
+/* Debug Initialization */
+int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats,
+					   struct dentry *parent);
+void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats);
 
 /*
  * Hamming code functions
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7e72a81bc2d4..696c32e50716 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -48,34 +48,33 @@
  * only emit the appropriage printk() when the caller passes in a constant
  * mask, as is almost always the case.
  *
- * All this bitmask nonsense is hidden from the /proc interface so that Joel
- * doesn't have an aneurism.  Reading the file gives a straight forward
- * indication of which bits are on or off:
- * 	ENTRY off
- * 	EXIT off
+ * All this bitmask nonsense is managed from the files under
+ * /sys/fs/o2cb/logmask/.  Reading the files gives a straightforward
+ * indication of which bits are allowed (allow) or denied (off/deny).
+ * 	ENTRY deny
+ * 	EXIT deny
  * 	TCP off
  * 	MSG off
  * 	SOCKET off
- * 	ERROR off
- * 	NOTICE on
+ * 	ERROR allow
+ * 	NOTICE allow
  *
  * Writing changes the state of a given bit and requires a strictly formatted
  * single write() call:
  *
- * 	write(fd, "ENTRY on", 8);
+ * 	write(fd, "allow", 5);
  *
- * would turn the entry bit on.  "1" is also accepted in the place of "on", and
- * "off" and "0" behave as expected.
+ * Echoing allow/deny/off string into the logmask files can flip the bits
+ * on or off as expected; here is the bash script for example:
  *
- * Some trivial shell can flip all the bits on or off:
+ * log_mask="/sys/fs/o2cb/logmask"
+ * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
+ *	echo allow >"$log_mask"/"$node"
+ * done
  *
- * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
- * cat $log_mask | (
- * 	while read bit status; do
- * 		# $1 is "on" or "off", say
- * 		echo "$bit $1" > $log_mask
- * 	done
- * )
+ * The debugfs.ocfs2 tool can also flip the bits with the -l option:
+ *
+ * debugfs.ocfs2 -l TCP allow
  */
 
 /* for task_struct */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 9fbe849f6344..334f231a422c 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -974,7 +974,7 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 			   size_t caller_veclen, u8 target_node, int *status)
 {
-	int ret, error = 0;
+	int ret;
 	struct o2net_msg *msg = NULL;
 	size_t veclen, caller_bytes = 0;
 	struct kvec *vec = NULL;
@@ -1015,10 +1015,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
 
 	o2net_set_nst_sock_time(&nst);
 
-	ret = wait_event_interruptible(nn->nn_sc_wq,
-				       o2net_tx_can_proceed(nn, &sc, &error));
-	if (!ret && error)
-		ret = error;
+	wait_event(nn->nn_sc_wq, o2net_tx_can_proceed(nn, &sc, &ret));
 	if (ret)
 		goto out;
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c5752305627c..b358f3bf896d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2900,6 +2900,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 	dx_alloc = 0;
 
+	down_write(&oi->ip_alloc_sem);
+
 	if (ocfs2_supports_indexed_dirs(osb)) {
 		credits += ocfs2_add_dir_index_credits(sb);
 
@@ -2940,8 +2942,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	down_write(&oi->ip_alloc_sem);
-
 	/*
 	 * Prepare for worst case allocation scenario of two separate
 	 * extents in the unindexed tree.
@@ -2953,7 +2953,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_sem;
+		goto out;
 	}
 
 	if (vfs_dq_alloc_space_nodirty(dir,
@@ -3172,10 +3172,8 @@ out_commit:
 
 	ocfs2_commit_trans(osb, handle);
 
-out_sem:
-	up_write(&oi->ip_alloc_sem);
-
 out:
+	up_write(&oi->ip_alloc_sem);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	if (meta_ac)
@@ -3322,11 +3320,15 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		brelse(new_bh);
 		new_bh = NULL;
 
+		down_write(&OCFS2_I(dir)->ip_alloc_sem);
+		drop_alloc_sem = 1;
 		dir_i_size = i_size_read(dir);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 		goto do_extend;
 	}
 
+	down_write(&OCFS2_I(dir)->ip_alloc_sem);
+	drop_alloc_sem = 1;
 	dir_i_size = i_size_read(dir);
 	mlog(0, "extending dir %llu (i_size = %lld)\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -3370,9 +3372,6 @@ do_extend:
 		credits++; /* For attaching the new dirent block to the
 			    * dx_root */
 
-	down_write(&OCFS2_I(dir)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -3435,10 +3434,10 @@ bail_bh:
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
-	if (drop_alloc_sem)
-		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
+	if (drop_alloc_sem)
+		up_write(&OCFS2_I(dir)->ip_alloc_sem);
 
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e15fc7d50827..6cdeaa76f27f 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -248,6 +248,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.get_osb	= ocfs2_get_dentry_osb,
 	.post_unlock	= ocfs2_dentry_post_unlock,
@@ -637,6 +641,19 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_nfs_sync_lops, osb);
 }
 
+static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
+					    struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan_lvb *lvb;
+
+	ocfs2_lock_res_init_once(res);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
+	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
+				   &ocfs2_orphan_scan_lops, osb);
+	lvb = ocfs2_dlm_lvb(&res->l_lksb);
+	lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+}
+
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp)
 {
@@ -2352,6 +2369,37 @@ void ocfs2_inode_unlock(struct inode *inode,
 	mlog_exit_void();
 }
 
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex)
+{
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_orphan_scan_lvb *lvb;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	int status = 0;
+
+	lockres = &osb->osb_orphan_scan.os_lockres;
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0)
+		return status;
+
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	if (lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
+		*seqno = be32_to_cpu(lvb->lvb_os_seqno);
+	return status;
+}
+
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex)
+{
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_orphan_scan_lvb *lvb;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	lockres = &osb->osb_orphan_scan.os_lockres;
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
+	lvb->lvb_os_seqno = cpu_to_be32(seqno);
+	ocfs2_cluster_unlock(osb, lockres, level);
+}
+
 int ocfs2_super_lock(struct ocfs2_super *osb,
 		     int ex)
 {
@@ -2842,6 +2890,7 @@ local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
 	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
 
 	osb->cconn = conn;
 
@@ -2878,6 +2927,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	ocfs2_lock_res_free(&osb->osb_super_lockres);
 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
 	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
+	ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
 
 	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
 	osb->cconn = NULL;
@@ -3061,6 +3111,7 @@ static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
 	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
 	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
+	ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
 }
 
 int ocfs2_drop_inode_locks(struct inode *inode)
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e1fd5721cd7f..31b90d7b8f51 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -62,6 +62,14 @@ struct ocfs2_qinfo_lvb {
 	__be32	lvb_free_entry;
 };
 
+#define OCFS2_ORPHAN_LVB_VERSION 1
+
+struct ocfs2_orphan_scan_lvb {
+	__u8	lvb_version;	/* OCFS2_ORPHAN_LVB_VERSION when valid */
+	__u8	lvb_reserved[3];
+	__be32	lvb_os_seqno;	/* big-endian scan sequence number */
+};
+
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
@@ -113,6 +121,9 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 		     int ex);
 void ocfs2_super_unlock(struct ocfs2_super *osb,
 			int ex);
+int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno, int ex);
+void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno, int ex);
+
 int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c2a87c885b73..07267e0da909 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -187,6 +187,9 @@ static int ocfs2_sync_file(struct file *file,
 	if (err)
 		goto bail;
 
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		goto bail;
+
 	journal = osb->journal->j_journal;
 	err = jbd2_journal_force_commit(journal);
 
@@ -894,9 +897,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *bh = NULL;
 	handle_t *handle = NULL;
-	int locked[MAXQUOTAS] = {0, 0};
-	int credits, qtype;
-	struct ocfs2_mem_dqinfo *oinfo;
+	int qtype;
+	struct dquot *transfer_from[MAXQUOTAS] = { };
+	struct dquot *transfer_to[MAXQUOTAS] = { };
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	           dentry->d_name.len, dentry->d_name.name);
@@ -969,30 +972,37 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-		credits = OCFS2_INODE_UPDATE_CREDITS;
+		/*
+		 * Gather pointers to quota structures so that allocation /
+		 * freeing of quota structures happens here and not inside
+		 * vfs_dq_transfer() where we have problems with lock ordering
+		 */
 		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
-			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
-			status = ocfs2_lock_global_qf(oinfo, 1);
-			if (status < 0)
+			transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
+						      USRQUOTA);
+			transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
+							USRQUOTA);
+			if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
+				status = -ESRCH;
 				goto bail_unlock;
-			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
-				ocfs2_calc_qdel_credits(sb, USRQUOTA);
-			locked[USRQUOTA] = 1;
+			}
 		}
 		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
 		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
 		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
-			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
-			status = ocfs2_lock_global_qf(oinfo, 1);
-			if (status < 0)
+			transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
+						      GRPQUOTA);
+			transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
+							GRPQUOTA);
+			if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
+				status = -ESRCH;
 				goto bail_unlock;
-			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
-				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
-			locked[GRPQUOTA] = 1;
+			}
 		}
-		handle = ocfs2_start_trans(osb, credits);
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
+					   2 * ocfs2_quota_trans_credits(sb));
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
 			mlog_errno(status);
@@ -1030,12 +1040,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
-	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
-		if (!locked[qtype])
-			continue;
-		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
-		ocfs2_unlock_global_qf(oinfo, 1);
-	}
 	ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
 	if (size_change)
@@ -1043,6 +1047,12 @@ bail_unlock_rw:
 bail:
 	brelse(bh);
 
+	/* Release quota pointers in case we acquired them */
+	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+		dqput(transfer_to[qtype]);
+		dqput(transfer_from[qtype]);
+	}
+
 	if (!status && attr->ia_valid & ATTR_MODE) {
 		status = ocfs2_acl_chmod(inode);
 		if (status < 0)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a20a0f1e37fd..4a3b9e6b31ad 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -28,6 +28,8 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/kthread.h>
+#include <linux/time.h>
+#include <linux/random.h>
 
 #define MLOG_MASK_PREFIX ML_JOURNAL
 #include <cluster/masklog.h>
@@ -52,6 +54,8 @@
 
 DEFINE_SPINLOCK(trans_inc_lock);
 
+#define ORPHAN_SCAN_SCHEDULE_TIMEOUT 300000
+
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
 			      int node_num, int slot_num);
@@ -1841,6 +1845,113 @@ bail:
 	return status;
 }
 
+/*
+ * Scan timer should get fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT ms. Add up to
+ * 5 seconds of randomness to the timeout to minimize multiple nodes firing the
+ * timer at the same time.  The result is returned in jiffies.
+ */
+static inline unsigned long ocfs2_orphan_scan_timeout(void)
+{
+	unsigned long time;
+
+	get_random_bytes(&time, sizeof(time));
+	time = ORPHAN_SCAN_SCHEDULE_TIMEOUT + (time % 5000);
+	return msecs_to_jiffies(time);
+}
+
+/*
+ * ocfs2_queue_orphan_scan calls ocfs2_queue_recovery_completion for
+ * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
+ * is done to catch any orphans that are left over in orphan directories.
+ *
+ * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
+ * milliseconds.  It gets an EX lock on os_lockres and checks sequence number
+ * stored in LVB. If the sequence number has changed, it means some other
+ * node has done the scan.  This node skips the scan and tracks the
+ * sequence number.  If the sequence number didn't change, it means a scan
+ * hasn't happened.  The node queues a scan and increments the
+ * sequence number in the LVB.
+ */
+void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+	int status, i;
+	u32 seqno = 0;
+
+	os = &osb->osb_orphan_scan;
+
+	status = ocfs2_orphan_scan_lock(osb, &seqno, DLM_LOCK_EX);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		goto out;
+	}
+
+	if (os->os_seqno != seqno) {
+		os->os_seqno = seqno;
+		goto unlock;
+	}
+
+	for (i = 0; i < osb->max_slots; i++)
+		ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
+						NULL);
+	/*
+	 * We queued a recovery on orphan slots, increment the sequence
+	 * number and update LVB so other node will skip the scan for a while
+	 */
+	seqno++;
+	os->os_count++;
+	os->os_scantime = CURRENT_TIME;
+unlock:
+	ocfs2_orphan_scan_unlock(osb, seqno, DLM_LOCK_EX);
+out:
+	return;
+}
+
+/* Worker fired about every ORPHAN_SCAN_SCHEDULE_TIMEOUT ms; re-arms itself */
+void ocfs2_orphan_scan_work(struct work_struct *work)
+{
+	struct ocfs2_orphan_scan *os;
+	struct ocfs2_super *osb;
+
+	os = container_of(work, struct ocfs2_orphan_scan,
+			  os_orphan_scan_work.work);
+	osb = os->os_osb;
+
+	mutex_lock(&os->os_lock);
+	ocfs2_queue_orphan_scan(osb);
+	schedule_delayed_work(&os->os_orphan_scan_work,
+			      ocfs2_orphan_scan_timeout());
+	mutex_unlock(&os->os_lock);
+}
+
+/* Stop the periodic orphan scan and wait for a running pass to finish. */
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb)
+{
+	/*
+	 * The work takes os_lock and re-arms itself, so it must be cancelled
+	 * synchronously and without os_lock held (else we deadlock on it).
+	 */
+	cancel_delayed_work_sync(&osb->osb_orphan_scan.os_orphan_scan_work);
+}
+
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_orphan_scan *os;
+
+	os = &osb->osb_orphan_scan;
+	os->os_osb = osb;
+	os->os_count = 0;
+	os->os_scantime = CURRENT_TIME;
+	mutex_init(&os->os_lock);
+
+	INIT_DELAYED_WORK(&os->os_orphan_scan_work,
+			  ocfs2_orphan_scan_work);
+	schedule_delayed_work(&os->os_orphan_scan_work,
+			      ocfs2_orphan_scan_timeout());
+	return 0;
+}
+
 struct ocfs2_orphan_filldir_priv {
 	struct inode		*head;
 	struct ocfs2_super	*osb;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index eb7b76331eb7..61045eeb3f6e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,6 +144,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 }
 
 /* Exported only for the journal struct init code in super.c. Do not call. */
+int ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
+
 void ocfs2_complete_recovery(struct work_struct *work);
 void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1386281950db..18c1d9ec1c93 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -47,6 +47,9 @@
 #include "ocfs2_fs.h"
 #include "ocfs2_lockid.h"
 
+/* For struct ocfs2_blockcheck_stats */
+#include "blockcheck.h"
+
 /* Most user visible OCFS2 inodes will have very few pieces of
  * metadata, but larger files (including bitmaps, etc) must be taken
  * into account when designing an access scheme. We allow a small
@@ -151,6 +154,16 @@ struct ocfs2_lock_res {
 #endif
 };
 
+struct ocfs2_orphan_scan {
+	struct mutex 		os_lock;	/* held across a scan pass */
+	struct ocfs2_super 	*os_osb;	/* back pointer, set at init */
+	struct ocfs2_lock_res 	os_lockres;     /* lock to synchronize scans */
+	struct delayed_work 	os_orphan_scan_work;  /* periodic scan work */
+	struct timespec		os_scantime;  /* time this node ran the scan */
+	u32			os_count;      /* tracks node specific scans */
+	u32  			os_seqno;       /* tracks cluster wide scans */
+};
+
 struct ocfs2_dlm_debug {
 	struct kref d_refcnt;
 	struct dentry *d_locking_state;
@@ -295,6 +308,7 @@ struct ocfs2_super
 	struct ocfs2_dinode *local_alloc_copy;
 	struct ocfs2_quota_recovery *quota_rec;
 
+	struct ocfs2_blockcheck_stats osb_ecc_stats;
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
 
@@ -341,6 +355,8 @@ struct ocfs2_super
 	unsigned int			*osb_orphan_wipes;
 	wait_queue_head_t		osb_wipe_event;
 
+	struct ocfs2_orphan_scan	osb_orphan_scan;
+
 	/* used to protect metaecc calculation check of xattr. */
 	spinlock_t osb_xattr_lock;
 
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index a53ce87481bf..fcdba091af3d 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -48,6 +48,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_FLOCK,
 	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_LOCK_TYPE_NFS_SYNC,
+	OCFS2_LOCK_TYPE_ORPHAN_SCAN,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -85,6 +86,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_NFS_SYNC:
 			c = 'Y';
 			break;
+		case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
+			c = 'P';
+			break;
 		default:
 			c = '\0';
 	}
@@ -104,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
 	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
+	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 1ed0f7c86869..edfa60cd155c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -421,6 +421,7 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
 	if (!dquot->dq_off) {	/* No real quota entry? */
 		/* Upgrade to exclusive lock for allocation */
+		ocfs2_qinfo_unlock(info, 0);
 		err = ocfs2_qinfo_lock(info, 1);
 		if (err < 0)
 			goto out_qlock;
@@ -435,7 +436,8 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
 out_qlock:
 	if (ex)
 		ocfs2_qinfo_unlock(info, 1);
-	ocfs2_qinfo_unlock(info, 0);
+	else
+		ocfs2_qinfo_unlock(info, 0);
 out:
 	if (err < 0)
 		mlog_errno(err);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 07deec5e9721..5a460fa82553 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -444,10 +444,6 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 
 	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
 
-	status = ocfs2_lock_global_qf(oinfo, 1);
-	if (status < 0)
-		goto out;
-
 	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
 		chunk = rchunk->rc_chunk;
 		hbh = NULL;
@@ -480,12 +476,18 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 				     type);
 				goto out_put_bh;
 			}
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_put_dquot;
+			}
+
 			handle = ocfs2_start_trans(OCFS2_SB(sb),
 						   OCFS2_QSYNC_CREDITS);
 			if (IS_ERR(handle)) {
 				status = PTR_ERR(handle);
 				mlog_errno(status);
-				goto out_put_dquot;
+				goto out_drop_lock;
 			}
 			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
 			spin_lock(&dq_data_lock);
@@ -523,6 +525,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 out_commit:
 			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_drop_lock:
+			ocfs2_unlock_global_qf(oinfo, 1);
 out_put_dquot:
 			dqput(dquot);
 out_put_bh:
@@ -537,8 +541,6 @@ out_put_bh:
 		if (status < 0)
 			break;
 	}
-	ocfs2_unlock_global_qf(oinfo, 1);
-out:
 	if (status < 0)
 		free_recovery_list(&(rec->r_list[type]));
 	mlog_exit(status);
@@ -655,6 +657,9 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	struct ocfs2_quota_recovery *rec;
 	int locked = 0;
 
+	/* We don't need the lock and we have to acquire quota file locks
+	 * which will later depend on this lock */
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
 	info->dqi_maxblimit = 0x7fffffffffffffffLL;
 	info->dqi_maxilimit = 0x7fffffffffffffffLL;
 	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
@@ -733,6 +738,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 		goto out_err;
 	}
 
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
 	return 0;
 out_err:
 	if (oinfo) {
@@ -746,6 +752,7 @@ out_err:
 		kfree(oinfo);
 	}
 	brelse(bh);
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
 	return -1;
 }
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 201b40a441fe..0d3ed7407a04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -119,10 +119,12 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 			       struct buffer_head *bh,
-			       u32 sectsize);
+			       u32 sectsize,
+			       struct ocfs2_blockcheck_stats *stats);
 static int ocfs2_initialize_super(struct super_block *sb,
 				  struct buffer_head *bh,
-				  int sector_size);
+				  int sector_size,
+				  struct ocfs2_blockcheck_stats *stats);
 static int ocfs2_get_sector(struct super_block *sb,
 			    struct buffer_head **bh,
 			    int block,
@@ -207,6 +209,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 	int i;
 	struct ocfs2_cluster_connection *cconn = osb->cconn;
 	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	struct ocfs2_orphan_scan *os;
 
 	out += snprintf(buf + out, len - out,
 			"%10s => Id: %-s  Uuid: %-s  Gen: 0x%X  Label: %-s\n",
@@ -308,6 +311,13 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 				i, osb->slot_recovery_generations[i]);
 	}
 
+	os = &osb->osb_orphan_scan;
+	out += snprintf(buf + out, len - out, "Orphan Scan=> ");
+	out += snprintf(buf + out, len - out, "Local: %u  Global: %u ",
+			os->os_count, os->os_seqno);
+	out += snprintf(buf + out, len - out, " Last Scan: %lu seconds ago\n",
+			(get_seconds() - os->os_scantime.tv_sec));
+
 	return out;
 }
 
@@ -542,7 +552,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
 	BUILD_BUG_ON(sizeof(sector_t) != 8);
 	/*
 	 * We might be limited by page cache size.
@@ -693,7 +703,8 @@ out:
 
 static int ocfs2_sb_probe(struct super_block *sb,
 			  struct buffer_head **bh,
-			  int *sector_size)
+			  int *sector_size,
+			  struct ocfs2_blockcheck_stats *stats)
 {
 	int status, tmpstat;
 	struct ocfs1_vol_disk_hdr *hdr;
@@ -759,7 +770,8 @@ static int ocfs2_sb_probe(struct super_block *sb,
 			goto bail;
 		}
 		di = (struct ocfs2_dinode *) (*bh)->b_data;
-		status = ocfs2_verify_volume(di, *bh, blksize);
+		memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+		status = ocfs2_verify_volume(di, *bh, blksize, stats);
 		if (status >= 0)
 			goto bail;
 		brelse(*bh);
@@ -965,6 +977,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	struct ocfs2_super *osb = NULL;
 	struct buffer_head *bh = NULL;
 	char nodestr[8];
+	struct ocfs2_blockcheck_stats stats;
 
 	mlog_entry("%p, %p, %i", sb, data, silent);
 
@@ -974,13 +987,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* probe for superblock */
-	status = ocfs2_sb_probe(sb, &bh, &sector_size);
+	status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
 	if (status < 0) {
 		mlog(ML_ERROR, "superblock probe failed!\n");
 		goto read_super_error;
 	}
 
-	status = ocfs2_initialize_super(sb, bh, sector_size);
+	status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
 	osb = OCFS2_SB(sb);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1090,6 +1103,18 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 		goto read_super_error;
 	}
 
+	if (ocfs2_meta_ecc(osb)) {
+		status = ocfs2_blockcheck_stats_debugfs_install(
+						&osb->osb_ecc_stats,
+						osb->osb_debug_root);
+		if (status) {
+			mlog(ML_ERROR,
+			     "Unable to create blockcheck statistics "
+			     "files\n");
+			goto read_super_error;
+		}
+	}
+
 	status = ocfs2_mount_volume(sb);
 	if (osb->root_inode)
 		inode = igrab(osb->root_inode);
@@ -1760,13 +1785,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	}
 
 	status = ocfs2_truncate_log_init(osb);
-	if (status < 0) {
+	if (status < 0)
 		mlog_errno(status);
-		goto leave;
-	}
-
-	if (ocfs2_mount_local(osb))
-		goto leave;
 
 leave:
 	if (unlock_super)
@@ -1796,6 +1816,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_truncate_log_shutdown(osb);
 
+	ocfs2_orphan_scan_stop(osb);
+
 	/* This will disable recovery and flush any recovery work. */
 	ocfs2_recovery_exit(osb);
 
@@ -1833,6 +1855,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	if (osb->cconn)
 		ocfs2_dlm_shutdown(osb, hangup_needed);
 
+	ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
 	debugfs_remove(osb->osb_debug_root);
 
 	if (hangup_needed)
@@ -1880,7 +1903,8 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 
 static int ocfs2_initialize_super(struct super_block *sb,
 				  struct buffer_head *bh,
-				  int sector_size)
+				  int sector_size,
+				  struct ocfs2_blockcheck_stats *stats)
 {
 	int status;
 	int i, cbits, bbits;
@@ -1939,6 +1963,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	atomic_set(&osb->alloc_stats.bg_allocs, 0);
 	atomic_set(&osb->alloc_stats.bg_extends, 0);
 
+	/* Copy the blockcheck stats from the superblock probe */
+	osb->osb_ecc_stats = *stats;
+
 	ocfs2_init_node_maps(osb);
 
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
@@ -1951,6 +1978,13 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
+	status = ocfs2_orphan_scan_init(osb);
+	if (status) {
+		mlog(ML_ERROR, "Unable to initialize delayed orphan scan\n");
+		mlog_errno(status);
+		goto bail;
+	}
+
 	init_waitqueue_head(&osb->checkpoint_event);
 	atomic_set(&osb->needs_checkpoint, 0);
 
@@ -2169,7 +2203,8 @@ bail:
  */
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 			       struct buffer_head *bh,
-			       u32 blksz)
+			       u32 blksz,
+			       struct ocfs2_blockcheck_stats *stats)
 {
 	int status = -EAGAIN;
 
@@ -2182,7 +2217,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		    OCFS2_FEATURE_INCOMPAT_META_ECC) {
 			status = ocfs2_block_check_validate(bh->b_data,
 							    bh->b_size,
-							    &di->i_check);
+							    &di->i_check,
+							    stats);
 			if (status)
 				goto out;
 		}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 15631019dc63..ba320e250747 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3154,7 +3154,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, bucket, para);
-			if (ret)
+			if (ret && ret != -ERANGE)
 				mlog_errno(ret);
 			/* Fall through to bucket_relse() */
 		}
@@ -3261,7 +3261,8 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 						  ocfs2_list_xattr_bucket,
 						  &xl);
 		if (ret) {
-			mlog_errno(ret);
+			if (ret != -ERANGE)
+				mlog_errno(ret);
 			goto out;
 		}
 
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 63d965193b22..11a7b5c68153 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -18,6 +18,7 @@ proc-y	+= meminfo.o
 proc-y	+= stat.o
 proc-y	+= uptime.o
 proc-y	+= version.o
+proc-y	+= softirqs.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1539e630c47d..3ce5ae9e3d2d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 
 	if (!task)
 		return -ESRCH;
-	oom_adjust = task->oomkilladj;
+	task_lock(task);
+	if (task->mm)
+		oom_adjust = task->mm->oom_adj;
+	else
+		oom_adjust = OOM_DISABLE;
+	task_unlock(task);
 	put_task_struct(task);
 
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file->f_path.dentry->d_inode);
 	if (!task)
 		return -ESRCH;
-	if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
+	task_lock(task);
+	if (!task->mm) {
+		task_unlock(task);
+		put_task_struct(task);
+		return -EINVAL;
+	}
+	if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
+		task_unlock(task);
 		put_task_struct(task);
 		return -EACCES;
 	}
-	task->oomkilladj = oom_adjust;
+	task->mm->oom_adj = oom_adjust;
+	task_unlock(task);
 	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c6b0302af4c4..d5c410d47fae 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"Inactive(anon): %8lu kB\n"
 		"Active(file):   %8lu kB\n"
 		"Inactive(file): %8lu kB\n"
-#ifdef CONFIG_UNEVICTABLE_LRU
 		"Unevictable:    %8lu kB\n"
 		"Mlocked:        %8lu kB\n"
-#endif
 #ifdef CONFIG_HIGHMEM
 		"HighTotal:      %8lu kB\n"
 		"HighFree:       %8lu kB\n"
@@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(pages[LRU_INACTIVE_ANON]),
 		K(pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_FILE]),
-#ifdef CONFIG_UNEVICTABLE_LRU
 		K(pages[LRU_UNEVICTABLE]),
 		K(global_page_state(NR_MLOCK)),
-#endif
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e9983837d08d..2707c6c7a20f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -6,11 +6,13 @@
 #include <linux/mmzone.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/hugetlb.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
 #define KPMSIZE sizeof(u64)
 #define KPMMASK (KPMSIZE - 1)
+
 /* /proc/kpagecount - an array exposing page counts
  *
  * Each entry is a u64 representing the corresponding
@@ -32,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		ppage = NULL;
 		if (pfn_valid(pfn))
 			ppage = pfn_to_page(pfn);
-		pfn++;
+		else
+			ppage = NULL;
 		if (!ppage)
 			pcount = 0;
 		else
 			pcount = page_mapcount(ppage);
 
-		if (put_user(pcount, out++)) {
+		if (put_user(pcount, out)) {
 			ret = -EFAULT;
 			break;
 		}
 
+		pfn++;
+		out++;
 		count -= KPMSIZE;
 	}
 
@@ -68,19 +72,122 @@ static const struct file_operations proc_kpagecount_operations = {
 
 /* These macros are used to decouple internal flags from exported ones */
 
-#define KPF_LOCKED     0
-#define KPF_ERROR      1
-#define KPF_REFERENCED 2
-#define KPF_UPTODATE   3
-#define KPF_DIRTY      4
-#define KPF_LRU        5
-#define KPF_ACTIVE     6
-#define KPF_SLAB       7
-#define KPF_WRITEBACK  8
-#define KPF_RECLAIM    9
-#define KPF_BUDDY     10
+#define KPF_LOCKED		0
+#define KPF_ERROR		1
+#define KPF_REFERENCED		2
+#define KPF_UPTODATE		3
+#define KPF_DIRTY		4
+#define KPF_LRU			5
+#define KPF_ACTIVE		6
+#define KPF_SLAB		7
+#define KPF_WRITEBACK		8
+#define KPF_RECLAIM		9
+#define KPF_BUDDY		10
+
+/* 11-20: new additions in 2.6.31 */
+#define KPF_MMAP		11
+#define KPF_ANON		12
+#define KPF_SWAPCACHE		13
+#define KPF_SWAPBACKED		14
+#define KPF_COMPOUND_HEAD	15
+#define KPF_COMPOUND_TAIL	16
+#define KPF_HUGE		17
+#define KPF_UNEVICTABLE		18
+#define KPF_NOPAGE		20
+
+/* kernel hacking aids
+ * WARNING: subject to change, never rely on them!
+ */
+#define KPF_RESERVED		32
+#define KPF_MLOCKED		33
+#define KPF_MAPPEDTODISK	34
+#define KPF_PRIVATE		35
+#define KPF_PRIVATE_2		36
+#define KPF_OWNER_PRIVATE	37
+#define KPF_ARCH		38
+#define KPF_UNCACHED		39
+
+static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
+{
+	return ((kflags >> kbit) & 1) << ubit;
+}
 
-#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
+static u64 get_uflags(struct page *page)
+{
+	u64 k;
+	u64 u;
+
+	/*
+	 * pseudo flag: KPF_NOPAGE
+	 * it differentiates a memory hole from a page with no flags
+	 */
+	if (!page)
+		return 1 << KPF_NOPAGE;
+
+	k = page->flags;
+	u = 0;
+
+	/*
+	 * pseudo flags for the well known (anonymous) memory mapped pages
+	 *
+	 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
+	 * simple test in page_mapped() is not enough.
+	 */
+	if (!PageSlab(page) && page_mapped(page))
+		u |= 1 << KPF_MMAP;
+	if (PageAnon(page))
+		u |= 1 << KPF_ANON;
+
+	/*
+	 * compound pages: export both head/tail info
+	 * they together define a compound page's start/end pos and order
+	 */
+	if (PageHead(page))
+		u |= 1 << KPF_COMPOUND_HEAD;
+	if (PageTail(page))
+		u |= 1 << KPF_COMPOUND_TAIL;
+	if (PageHuge(page))
+		u |= 1 << KPF_HUGE;
+
+	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
+
+	/*
+	 * Caveats on high order pages:
+	 * PG_buddy will only be set on the head page; SLUB/SLQB do the same
+	 * for PG_slab; SLOB won't set PG_slab at all on compound pages.
+	 */
+	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
+	u |= kpf_copy_bit(k, KPF_BUDDY,		PG_buddy);
+
+	u |= kpf_copy_bit(k, KPF_ERROR,		PG_error);
+	u |= kpf_copy_bit(k, KPF_DIRTY,		PG_dirty);
+	u |= kpf_copy_bit(k, KPF_UPTODATE,	PG_uptodate);
+	u |= kpf_copy_bit(k, KPF_WRITEBACK,	PG_writeback);
+
+	u |= kpf_copy_bit(k, KPF_LRU,		PG_lru);
+	u |= kpf_copy_bit(k, KPF_REFERENCED,	PG_referenced);
+	u |= kpf_copy_bit(k, KPF_ACTIVE,	PG_active);
+	u |= kpf_copy_bit(k, KPF_RECLAIM,	PG_reclaim);
+
+	u |= kpf_copy_bit(k, KPF_SWAPCACHE,	PG_swapcache);
+	u |= kpf_copy_bit(k, KPF_SWAPBACKED,	PG_swapbacked);
+
+	u |= kpf_copy_bit(k, KPF_UNEVICTABLE,	PG_unevictable);
+	u |= kpf_copy_bit(k, KPF_MLOCKED,	PG_mlocked);
+
+#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
+	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
+#endif
+
+	u |= kpf_copy_bit(k, KPF_RESERVED,	PG_reserved);
+	u |= kpf_copy_bit(k, KPF_MAPPEDTODISK,	PG_mappedtodisk);
+	u |= kpf_copy_bit(k, KPF_PRIVATE,	PG_private);
+	u |= kpf_copy_bit(k, KPF_PRIVATE_2,	PG_private_2);
+	u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE,	PG_owner_priv_1);
+	u |= kpf_copy_bit(k, KPF_ARCH,		PG_arch_1);
+
+	return u;
+};
 
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
 			     size_t count, loff_t *ppos)
@@ -90,7 +197,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 	unsigned long src = *ppos;
 	unsigned long pfn;
 	ssize_t ret = 0;
-	u64 kflags, uflags;
 
 	pfn = src / KPMSIZE;
 	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
@@ -98,32 +204,18 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		ppage = NULL;
 		if (pfn_valid(pfn))
 			ppage = pfn_to_page(pfn);
-		pfn++;
-		if (!ppage)
-			kflags = 0;
 		else
-			kflags = ppage->flags;
-
-		uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
-			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
-			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
-			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
-			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
-			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
-			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
-			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
-			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
-			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
-			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
-
-		if (put_user(uflags, out++)) {
+			ppage = NULL;
+
+		if (put_user(get_uflags(ppage), out)) {
 			ret = -EFAULT;
 			break;
 		}
 
+		pfn++;
+		out++;
 		count -= KPMSIZE;
 	}
 
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index fc6c3025befd..7ba79a54948c 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,20 +195,20 @@ void proc_device_tree_add_node(struct device_node *np,
 			p = fixup_name(np, de, p);
 
 		ent = proc_mkdir(p, de);
-		if (ent == 0)
+		if (ent == NULL)
 			break;
 		proc_device_tree_add_node(child, ent);
 	}
 	of_node_put(child);
 
-	for (pp = np->properties; pp != 0; pp = pp->next) {
+	for (pp = np->properties; pp != NULL; pp = pp->next) {
 		p = pp->name;
 
 		if (duplicate_name(de, p))
 			p = fixup_name(np, de, p);
 
 		ent = __proc_device_tree_add_prop(de, pp, p);
-		if (ent == 0)
+		if (ent == NULL)
 			break;
 	}
 }
@@ -221,10 +221,10 @@ void __init proc_device_tree_init(void)
 	struct device_node *root;
 
 	proc_device_tree = proc_mkdir("device-tree", NULL);
-	if (proc_device_tree == 0)
+	if (proc_device_tree == NULL)
 		return;
 	root = of_find_node_by_path("/");
-	if (root == 0) {
+	if (root == NULL) {
 		printk(KERN_ERR "/proc/device-tree: can't find root\n");
 		return;
 	}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
new file mode 100644
index 000000000000..1807c2419f17
--- /dev/null
+++ b/fs/proc/softirqs.c
@@ -0,0 +1,44 @@
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/softirqs  ... display the number of softirqs
+ */
+static int show_softirqs(struct seq_file *p, void *v)
+{
+	int i, j;
+
+	seq_printf(p, "                ");
+	for_each_possible_cpu(i)
+		seq_printf(p, "CPU%-8d", i);
+	seq_printf(p, "\n");
+
+	for (i = 0; i < NR_SOFTIRQS; i++) {
+		seq_printf(p, "%8s:", softirq_to_name[i]);
+		for_each_possible_cpu(j)
+			seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+		seq_printf(p, "\n");
+	}
+	return 0;
+}
+
+static int softirqs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_softirqs, NULL);
+}
+
+static const struct file_operations proc_softirqs_operations = {
+	.open		= softirqs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_softirqs_init(void)
+{
+	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
+	return 0;
+}
+module_init(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81e4eb60972e..7cc726c6d70a 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -29,6 +29,8 @@ static int show_stat(struct seq_file *p, void *v)
 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
 	cputime64_t guest;
 	u64 sum = 0;
+	u64 sum_softirq = 0;
+	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
 	unsigned int per_irq_sum;
 
@@ -53,6 +55,13 @@ static int show_stat(struct seq_file *p, void *v)
 			sum += kstat_irqs_cpu(j, i);
 		}
 		sum += arch_irq_stat_cpu(i);
+
+		for (j = 0; j < NR_SOFTIRQS; j++) {
+			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
+
+			per_softirq_sums[j] += softirq_stat;
+			sum_softirq += softirq_stat;
+		}
 	}
 	sum += arch_irq_stat();
 
@@ -115,6 +124,12 @@ static int show_stat(struct seq_file *p, void *v)
 		nr_running(),
 		nr_iowait());
 
+	seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
+
+	for (i = 0; i < NR_SOFTIRQS; i++)
+		seq_printf(p, " %u", per_softirq_sums[i]);
+	seq_printf(p, "\n");
+
 	return 0;
 }
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 5edcc3f92ba7..0872afa58d39 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -166,12 +166,7 @@ static const struct file_operations proc_vmcore_operations = {
 
 static struct vmcore* __init get_new_element(void)
 {
-	struct vmcore *p;
-
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
-	if (p)
-		memset(p, 0, sizeof(*p));
-	return p;
+	return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
 }
 
 static u64 __init get_vmcore_size_elf64(char *elfptr)
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3a6b193d8444..0ff7566c767c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
 				return -EINVAL;
 			opts->mode = option & S_IALLUGO;
 			break;
-		default:
-			printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
-			return -EINVAL;
+		/*
+		 * We might like to report bad mount options here;
+		 * but traditionally ramfs has ignored all mount options,
+		 * and as it is used as a !CONFIG_SHMEM simple substitute
+		 * for tmpfs, better continue to ignore other mount options.
+		 */
 		}
 	}
 
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 4beb964a2a3e..128d3f7c8aa5 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -1270,9 +1270,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item h
 
 					RFALSE(ih, "PAP-12210: ih must be 0");
 
-					if (is_direntry_le_ih
-					    (aux_ih =
-					     B_N_PITEM_HEAD(tbS0, item_pos))) {
+					aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
+					if (is_direntry_le_ih(aux_ih)) {
 						/* we append to directory item */
 
 						int entry_count;
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 381750a155f6..03d85cbf90bf 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -390,7 +390,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 
 	if (last_first == FIRST_TO_LAST) {
 		/* if ( if item in position item_num in buffer SOURCE is directory item ) */
-		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+		ih = B_N_PITEM_HEAD(src, item_num);
+		if (is_direntry_le_ih(ih))
 			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
 					      item_num, 0, cpy_bytes);
 		else {
@@ -418,7 +419,8 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 		}
 	} else {
 		/*  if ( if item in position item_num in buffer SOURCE is directory item ) */
-		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+		ih = B_N_PITEM_HEAD(src, item_num);
+		if (is_direntry_le_ih(ih))
 			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
 					      item_num,
 					      I_ENTRY_COUNT(ih) - cpy_bytes,
@@ -774,8 +776,8 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
 			leaf_delete_items_entirely(cur_bi, first + 1,
 						   del_num - 1);
 
-			if (is_direntry_le_ih
-			    (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1)))
+			ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
+			if (is_direntry_le_ih(ih))
 				/* the last item is directory  */
 				/* len = numbers of directory entries in this item */
 				len = ih_entry_count(ih);
diff --git a/fs/select.c b/fs/select.c
index 0fe0e1469df3..d870237e42c7 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 	return table->entry++;
 }
 
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
 	struct poll_wqueues *pwq = wait->private;
 	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	return default_wake_function(&dummy_wait, mode, sync, key);
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_table_entry *entry;
+
+	entry = container_of(wait, struct poll_table_entry, wait);
+	if (key && !((unsigned long)key & entry->key))
+		return 0;
+	return __pollwake(wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
@@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
+	entry->key = p->key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
@@ -362,6 +373,18 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
+static inline void wait_key_set(poll_table *wait, unsigned long in,
+				unsigned long out, unsigned long bit)
+{
+	if (wait) {
+		wait->key = POLLEX_SET;
+		if (in & bit)
+			wait->key |= POLLIN_SET;
+		if (out & bit)
+			wait->key |= POLLOUT_SET;
+	}
+}
+
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
 	ktime_t expire, *to = NULL;
@@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				if (file) {
 					f_op = file->f_op;
 					mask = DEFAULT_POLLMASK;
-					if (f_op && f_op->poll)
-						mask = (*f_op->poll)(file, retval ? NULL : wait);
+					if (f_op && f_op->poll) {
+						wait_key_set(wait, in, out, bit);
+						mask = (*f_op->poll)(file, wait);
+					}
 					fput_light(file, fput_needed);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLOUT_SET) && (out & bit)) {
 						res_out |= bit;
 						retval++;
+						wait = NULL;
 					}
 					if ((mask & POLLEX_SET) && (ex & bit)) {
 						res_ex |= bit;
 						retval++;
+						wait = NULL;
 					}
 				}
 			}
@@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 		mask = POLLNVAL;
 		if (file != NULL) {
 			mask = DEFAULT_POLLMASK;
-			if (file->f_op && file->f_op->poll)
+			if (file->f_op && file->f_op->poll) {
+				if (pwait)
+					pwait->key = pollfd->events |
+							POLLERR | POLLHUP;
 				mask = file->f_op->poll(file, pwait);
+			}
 			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
 			fput_light(file, fput_needed);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 7f40f30c55c5..6c959275f2d0 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -640,6 +640,26 @@ int seq_puts(struct seq_file *m, const char *s)
 }
 EXPORT_SYMBOL(seq_puts);
 
+/**
+ * seq_write - write arbitrary data to buffer
+ * @seq: seq_file identifying the buffer to which data should be written
+ * @data: data address
+ * @len: number of bytes
+ *
+ * Return 0 on success, non-zero otherwise.
+ */
+int seq_write(struct seq_file *seq, const void *data, size_t len)
+{
+	if (seq->count + len < seq->size) {
+		memcpy(seq->buf + seq->count, data, len);
+		seq->count += len;
+		return 0;
+	}
+	seq->count = seq->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_write);
+
 struct list_head *seq_list_start(struct list_head *head, loff_t pos)
 {
 	struct list_head *lh;
diff --git a/fs/super.c b/fs/super.c
index 83b47416d006..d40d53a22fb5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -545,24 +545,18 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
 		if (force)
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb)) {
-			unlock_kernel();
+		else if (!fs_may_remount_ro(sb))
 			return -EBUSY;
-		}
 		retval = vfs_dq_off(sb, 1);
-		if (retval < 0 && retval != -ENOSYS) {
-			unlock_kernel();
+		if (retval < 0 && retval != -ENOSYS)
 			return -EBUSY;
-		}
 	}
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval) {
-			unlock_kernel();
+		if (retval)
 			return retval;
-		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a3ba217fbe74..1d897ad808e0 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page)
+	if (page) {
 		error = sysfs_getlink(dentry, (char *) page); 
+		if (error < 0)
+			free_page((unsigned long)page);
+	}
 	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
 	return NULL;
 }
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index c7798079e644..4e50286a4cc3 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -15,13 +15,13 @@
 
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include "sysv.h"
 
 static int sysv_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations sysv_dir_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
 	.fsync		= simple_fsync,
@@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 
-	lock_kernel();
-
 	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
 		goto done;
@@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 
 done:
 	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
-	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 479923456a54..9824743832a7 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -21,7 +21,6 @@
  *  the superblock.
  */
 
-#include <linux/smp_lock.h>
 #include <linux/highuid.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -37,7 +36,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
 	unsigned long time = get_seconds(), old_time;
 
 	lock_super(sb);
-	lock_kernel();
 
 	/*
 	 * If we are going to write out the super block,
@@ -52,7 +50,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
 		mark_buffer_dirty(sbi->s_bh2);
 	}
 
-	unlock_kernel();
 	unlock_super(sb);
 
 	return 0;
@@ -82,8 +79,6 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
-	lock_kernel();
-
 	if (sb->s_dirt)
 		sysv_write_super(sb);
 
@@ -99,8 +94,6 @@ static void sysv_put_super(struct super_block *sb)
 		brelse(sbi->s_bh2);
 
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -275,7 +268,6 @@ int sysv_write_inode(struct inode *inode, int wait)
 		return -EIO;
 	}
 
-	lock_kernel();
 	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
 	raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
 	raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -291,7 +283,6 @@ int sysv_write_inode(struct inode *inode, int wait)
 	for (block = 0; block < 10+1+1+1; block++)
 		write3byte(sbi, (u8 *)&si->i_data[block],
 			&raw_inode->i_data[3*block]);
-	unlock_kernel();
 	mark_buffer_dirty(bh);
 	if (wait) {
                 sync_dirty_buffer(bh);
@@ -315,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	inode->i_size = 0;
 	sysv_truncate(inode);
-	lock_kernel();
 	sysv_free_inode(inode);
-	unlock_kernel();
 }
 
 static struct kmem_cache *sysv_inode_cachep;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index af1914462f02..eaf6d891d46f 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write)
 	return nr_written;
 }
 
-
 /**
  * run_gc - run garbage collector.
  * @c: UBIFS file-system description object
@@ -628,7 +627,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
  *
  * This function releases budget corresponding to a dirty inode. It is usually
  * called when after the inode has been written to the media and marked as
- * clean.
+ * clean. It also causes the "no space" flags to be cleared.
  */
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 				      struct ubifs_inode *ui)
@@ -636,6 +635,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 	struct ubifs_budget_req req;
 
 	memset(&req, 0, sizeof(struct ubifs_budget_req));
+	/* The "no space" flags will be cleared because dd_growth is > 0 */
 	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f55d523c52bb..552fb0111fff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 		inode->i_nlink, dir->i_ino);
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+
+	/*
+	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
+	 * otherwise has the potential to corrupt the orphan inode list.
+	 *
+	 * Indeed, consider a scenario when 'vfs_unlink(dirA/fileA)' and
+	 * 'vfs_link(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
+	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
+	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
+	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
+	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
+	 * to the list of orphans. After this, 'vfs_link()' will link
+	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
+	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
+	 * to the list of orphans.
+	 */
+	 if (inode->i_nlink == 0)
+		 return -ENOENT;
+
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e8e632a1dcdf..bc5857199ec2 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -293,13 +293,14 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
  *
  * This function is called when the write-buffer timer expires.
  */
-static void wbuf_timer_callback_nolock(unsigned long data)
+static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
 {
-	struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+	struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
 
 	wbuf->need_sync = 1;
 	wbuf->c->need_wbuf_sync = 1;
 	ubifs_wake_up_bgt(wbuf->c);
+	return HRTIMER_NORESTART;
 }
 
 /**
@@ -308,13 +309,12 @@ static void wbuf_timer_callback_nolock(unsigned long data)
  */
 static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 {
-	ubifs_assert(!timer_pending(&wbuf->timer));
+	ubifs_assert(!hrtimer_active(&wbuf->timer));
 
-	if (!wbuf->timeout)
+	if (!ktime_to_ns(wbuf->softlimit))
 		return;
-
-	wbuf->timer.expires = jiffies + wbuf->timeout;
-	add_timer(&wbuf->timer);
+	hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta,
+			       HRTIMER_MODE_REL);
 }
 
 /**
@@ -329,7 +329,7 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 	 * should be canceled.
 	 */
 	wbuf->need_sync = 0;
-	del_timer(&wbuf->timer);
+	hrtimer_cancel(&wbuf->timer);
 }
 
 /**
@@ -825,6 +825,7 @@ out:
 int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
 {
 	size_t size;
+	ktime_t hardlimit;
 
 	wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
 	if (!wbuf->buf)
@@ -845,14 +846,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
 	wbuf->sync_callback = NULL;
 	mutex_init(&wbuf->io_mutex);
 	spin_lock_init(&wbuf->lock);
-
 	wbuf->c = c;
-	init_timer(&wbuf->timer);
-	wbuf->timer.function = wbuf_timer_callback_nolock;
-	wbuf->timer.data = (unsigned long)wbuf;
-	wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
 	wbuf->next_ino = 0;
 
+	hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	wbuf->timer.function = wbuf_timer_callback_nolock;
+	/*
+	 * Make the write-buffer soft limit 20% lower than the hard limit. The
+	 * write-buffer timer is allowed to expire any time between the soft
+	 * and hard limits.
+	 */
+	hardlimit = ktime_set(DEFAULT_WBUF_TIMEOUT_SECS, 0);
+	wbuf->delta = (DEFAULT_WBUF_TIMEOUT_SECS * NSEC_PER_SEC) * 2 / 10;
+	wbuf->softlimit = ktime_sub_ns(hardlimit, wbuf->delta);
+	hrtimer_set_expires_range_ns(&wbuf->timer,  wbuf->softlimit,
+				     wbuf->delta);
 	return 0;
 }
 
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 10662975d2ef..805605250f12 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -343,33 +343,15 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
  *
  * This function returns %1 if @offs was in the last write to the LEB whose data
  * is in @buf, otherwise %0 is returned.  The determination is made by checking
- * for subsequent empty space starting from the next min_io_size boundary (or a
- * bit less than the common header size if min_io_size is one).
+ * for subsequent empty space starting from the next @c->min_io_size boundary.
  */
 static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
 {
-	int empty_offs;
-	int check_len;
+	int empty_offs, check_len;
 	uint8_t *p;
 
-	if (c->min_io_size == 1) {
-		check_len = c->leb_size - offs;
-		p = buf + check_len;
-		for (; check_len > 0; check_len--)
-			if (*--p != 0xff)
-				break;
-		/*
-		 * 'check_len' is the size of the corruption which cannot be
-		 * more than the size of 1 node if it was caused by an unclean
-		 * unmount.
-		 */
-		if (check_len > UBIFS_MAX_NODE_SZ)
-			return 0;
-		return 1;
-	}
-
 	/*
-	 * Round up to the next c->min_io_size boundary i.e. 'offs' is in the
+	 * Round up to the next @c->min_io_size boundary i.e. @offs is in the
 	 * last wbuf written. After that should be empty space.
 	 */
 	empty_offs = ALIGN(offs + 1, c->min_io_size);
@@ -392,7 +374,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
  *
  * This function pads up to the next min_io_size boundary (if there is one) and
  * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
- * min_io_size boundary (if there is one).
+ * @c->min_io_size boundary.
  */
 static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
 		      int *offs, int *len)
@@ -402,11 +384,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
 	lnum = lnum;
 	dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
 
-	if (c->min_io_size == 1) {
-		memset(*buf, 0xff, c->leb_size - *offs);
-		return;
-	}
-
 	ubifs_assert(!(*offs & 7));
 	empty_offs = ALIGN(*offs, c->min_io_size);
 	pad_len = empty_offs - *offs;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3589eab02a2f..79fad43f3c57 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -361,6 +361,11 @@ static void ubifs_delete_inode(struct inode *inode)
 out:
 	if (ui->dirty)
 		ubifs_release_dirty_inode_budget(c, ui);
+	else {
+		/* We've deleted something - clear the "no space" flags */
+		c->nospace = c->nospace_rp = 0;
+		smp_wmb();
+	}
 	clear_inode(inode);
 }
 
@@ -792,7 +797,7 @@ static int alloc_wbufs(struct ubifs_info *c)
 	 * does not need to be synchronized by timer.
 	 */
 	c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
-	c->jheads[GCHD].wbuf.timeout = 0;
+	c->jheads[GCHD].wbuf.softlimit = ktime_set(0, 0);
 
 	return 0;
 }
@@ -933,6 +938,27 @@ static const match_table_t tokens = {
 };
 
 /**
+ * parse_standard_option - parse a standard mount option.
+ * @option: the option to parse
+ *
+ * Normally, standard mount options like "sync" are passed to file-systems as
+ * flags. However, when a "rootflags=" kernel boot parameter is used, they may
+ * be present in the options string. This function tries to deal with this
+ * situation and parse standard options. Returns 0 if the option was not
+ * recognized, and the corresponding integer flag if it was.
+ *
+ * UBIFS is only interested in the "sync" option, so do not check for anything
+ * else.
+ */
+static int parse_standard_option(const char *option)
+{
+	ubifs_msg("parse %s", option);
+	if (!strcmp(option, "sync"))
+		return MS_SYNCHRONOUS;
+	return 0;
+}
+
+/**
  * ubifs_parse_options - parse mount parameters.
  * @c: UBIFS file-system description object
  * @options: parameters to parse
@@ -1008,9 +1034,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 			break;
 		}
 		default:
-			ubifs_err("unrecognized mount option \"%s\" "
-				  "or missing value", p);
-			return -EINVAL;
+		{
+			unsigned long flag;
+			struct super_block *sb = c->vfs_sb;
+
+			flag = parse_standard_option(p);
+			if (!flag) {
+				ubifs_err("unrecognized mount option \"%s\" "
+					  "or missing value", p);
+				return -EINVAL;
+			}
+			sb->s_flags |= flag;
+			break;
+		}
 		}
 	}
 
@@ -1180,6 +1216,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (!ubifs_compr_present(c->default_compr)) {
 		ubifs_err("'compressor \"%s\" is not compiled in",
 			  ubifs_compr_name(c->default_compr));
+		err = -ENOTSUPP;
 		goto out_free;
 	}
 
@@ -1656,7 +1693,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 
 	for (i = 0; i < c->jhead_cnt; i++) {
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		del_timer_sync(&c->jheads[i].wbuf.timer);
+		hrtimer_cancel(&c->jheads[i].wbuf.timer);
 	}
 
 	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
@@ -1719,7 +1756,7 @@ static void ubifs_put_super(struct super_block *sb)
 		if (c->jheads)
 			for (i = 0; i < c->jhead_cnt; i++) {
 				ubifs_wbuf_sync(&c->jheads[i].wbuf);
-				del_timer_sync(&c->jheads[i].wbuf.timer);
+				hrtimer_cancel(&c->jheads[i].wbuf.timer);
 			}
 
 		/*
@@ -1911,6 +1948,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	INIT_LIST_HEAD(&c->orph_list);
 	INIT_LIST_HEAD(&c->orph_new);
 
+	c->vfs_sb = sb;
 	c->highest_inum = UBIFS_FIRST_INO;
 	c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
 
@@ -1937,18 +1975,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	err  = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
+	err = bdi_register(&c->bdi, NULL, "ubifs");
+	if (err)
+		goto out_bdi;
 
 	err = ubifs_parse_options(c, data, 0);
 	if (err)
 		goto out_bdi;
 
-	c->vfs_sb = sb;
-
 	sb->s_fs_info = c;
 	sb->s_magic = UBIFS_SUPER_MAGIC;
 	sb->s_blocksize = UBIFS_BLOCK_SIZE;
 	sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
-	sb->s_dev = c->vi.cdev;
 	sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
 	if (c->max_inode_sz > MAX_LFS_FILESIZE)
 		sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
@@ -1993,16 +2031,9 @@ out_free:
 static int sb_test(struct super_block *sb, void *data)
 {
 	dev_t *dev = data;
+	struct ubifs_info *c = sb->s_fs_info;
 
-	return sb->s_dev == *dev;
-}
-
-static int sb_set(struct super_block *sb, void *data)
-{
-	dev_t *dev = data;
-
-	sb->s_dev = *dev;
-	return 0;
+	return c->vi.cdev == *dev;
 }
 
 static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
@@ -2030,7 +2061,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 
 	dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
 
-	sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
+	sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev);
 	if (IS_ERR(sb)) {
 		err = PTR_ERR(sb);
 		goto out_close;
@@ -2070,16 +2101,11 @@ out_close:
 	return err;
 }
 
-static void ubifs_kill_sb(struct super_block *sb)
-{
-	generic_shutdown_super(sb);
-}
-
 static struct file_system_type ubifs_fs_type = {
 	.name    = "ubifs",
 	.owner   = THIS_MODULE,
 	.get_sb  = ubifs_get_sb,
-	.kill_sb = ubifs_kill_sb
+	.kill_sb = kill_anon_super,
 };
 
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0a8341e14088..1bf01d820066 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -95,8 +95,8 @@
  */
 #define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
 
-/* Default write-buffer synchronization timeout (5 secs) */
-#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
+/* Default write-buffer synchronization timeout in seconds */
+#define DEFAULT_WBUF_TIMEOUT_SECS 5
 
 /* Maximum possible inode number (only 32-bit inodes are supported now) */
 #define MAX_INUM 0xFFFFFFFF
@@ -650,8 +650,10 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
  * @io_mutex: serializes write-buffer I/O
  * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
  *        fields
+ * @softlimit: soft write-buffer timeout interval
+ * @delta: hard and soft timeouts delta (the timer expire interval is
+ *         @softlimit and @softlimit + @delta)
  * @timer: write-buffer timer
- * @timeout: timer expire interval in jiffies
  * @need_sync: it is set if its timer expired and needs sync
  * @next_ino: points to the next position of the following inode number
  * @inodes: stores the inode numbers of the nodes which are in wbuf
@@ -678,8 +680,9 @@ struct ubifs_wbuf {
 	int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
 	struct mutex io_mutex;
 	spinlock_t lock;
-	struct timer_list timer;
-	int timeout;
+	ktime_t softlimit;
+	unsigned long long delta;
+	struct hrtimer timer;
 	int need_sync;
 	int next_ino;
 	ino_t *inodes;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3d2512c21f05..7cf33379fd46 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -56,9 +56,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 
 
 	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
-	if (i_block < 0) {
-		ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
-	} else if (i_block < direct_blocks) {
+	if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 	} else if ((i_block -= direct_blocks) < indirect_blocks) {
 		offsets[n++] = UFS_IND_BLOCK;
@@ -440,8 +438,6 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
 	lock_kernel();
 
 	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
-	if (fragment < 0)
-		goto abort_negative;
 	if (fragment >
 	    ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
 	     << uspi->s_fpbshift))
@@ -504,10 +500,6 @@ abort:
 	unlock_kernel();
 	return err;
 
-abort_negative:
-	ufs_warning(sb, "ufs_get_block", "block < 0");
-	goto abort;
-
 abort_too_big:
 	ufs_warning(sb, "ufs_get_block", "block > big");
 	goto abort;
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index f65a53f8752f..6127e24062d0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -24,7 +24,7 @@
  * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
  * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
  */
-#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
 # define XFS_BIG_BLKNOS	1
 # define XFS_BIG_INUMS	1
 #else
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2e09efbca8db..a220d36f789b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -616,7 +616,7 @@ xfs_max_file_offset(
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
 	ASSERT(sizeof(sector_t) == 8);
 	pagefactor = PAGE_CACHE_SIZE;
 	bitshift = BITS_PER_LONG;