613 files changed, 33150 insertions, 21168 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 047c791427aa..24eb01087b6d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -30,8 +30,8 @@
 #include <linux/parser.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
-#include <net/9p/transport.h>
 #include <net/9p/client.h>
+#include <net/9p/transport.h>
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 
@@ -55,7 +55,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_debug, "debug=%x"},
 	{Opt_dfltuid, "dfltuid=%u"},
 	{Opt_dfltgid, "dfltgid=%u"},
@@ -234,7 +234,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (!v9ses->clnt->dotu)
 		v9ses->flags &= ~V9FS_EXTENDED;
 
-	v9ses->maxdata = v9ses->clnt->msize;
+	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
 	/* for legacy mode, fall back to V9FS_ACCESS_ANY */
 	if (!v9fs_extended(v9ses) &&
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 57997fa14e69..c295ba786edd 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -46,9 +46,11 @@ extern struct dentry_operations v9fs_cached_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *);
+void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
+void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 void v9fs_dentry_release(struct dentry *);
 int v9fs_uflags2omode(int uflags, int extended);
+
+ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 97d3aed57983..6fcb1e7095cf 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,7 +38,6 @@
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
-#include "fid.h"
 
 /**
  * v9fs_vfs_readpage - read an entire page in from 9P
@@ -53,14 +52,12 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	int retval;
 	loff_t offset;
 	char *buffer;
-	struct p9_fid *fid;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
-	fid = filp->private_data;
 	buffer = kmap(page);
 	offset = page_offset(page);
 
-	retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE);
+	retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
 	if (retval < 0)
 		goto done;
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea9..873cd31baa47 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -45,7 +45,7 @@
  *
  */
 
-static inline int dt_type(struct p9_stat *mistat)
+static inline int dt_type(struct p9_wstat *mistat)
 {
 	unsigned long perm = mistat->mode;
 	int rettype = DT_REG;
@@ -69,32 +69,58 @@ static inline int dt_type(struct p9_stat *mistat)
 static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	int over;
+	struct p9_wstat st;
+	int err;
 	struct p9_fid *fid;
-	struct v9fs_session_info *v9ses;
-	struct inode *inode;
-	struct p9_stat *st;
+	int buflen;
+	char *statbuf;
+	int n, i = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	inode = filp->f_path.dentry->d_inode;
-	v9ses = v9fs_inode2v9ses(inode);
 	fid = filp->private_data;
-	while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) {
-		if (IS_ERR(st))
-			return PTR_ERR(st);
 
-		over = filldir(dirent, st->name.str, st->name.len, filp->f_pos,
-			v9fs_qid2ino(&st->qid), dt_type(st));
+	buflen = fid->clnt->msize - P9_IOHDRSZ;
+	statbuf = kmalloc(buflen, GFP_KERNEL);
+	if (!statbuf)
+		return -ENOMEM;
 
-		if (over)
+	while (1) {
+		err = v9fs_file_readn(filp, statbuf, NULL, buflen,
+								fid->rdir_fpos);
+		if (err <= 0)
 			break;
 
-		filp->f_pos += st->size;
-		kfree(st);
-		st = NULL;
+		n = err;
+		while (i < n) {
+			err = p9stat_read(statbuf + i, buflen-i, &st,
+							fid->clnt->dotu);
+			if (err) {
+				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+				err = -EIO;
+				p9stat_free(&st);
+				goto free_and_exit;
+			}
+
+			i += st.size+2;
+			fid->rdir_fpos += st.size+2;
+
+			over = filldir(dirent, st.name, strlen(st.name),
+			    filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
+
+			filp->f_pos += st.size+2;
+
+			p9stat_free(&st);
+
+			if (over) {
+				err = 0;
+				goto free_and_exit;
+			}
+		}
 	}
 
-	kfree(st);
-	return 0;
+free_and_exit:
+	kfree(statbuf);
+	return err;
 }
 
 
@@ -119,6 +145,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
+	.llseek = generic_file_llseek,
 	.readdir = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 52944d2249a4..68bf2af6c389 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -120,23 +120,72 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 }
 
 /**
- * v9fs_file_read - read from a file
+ * v9fs_file_readn - read from a file
  * @filp: file pointer to read
  * @data: data buffer to read data into
+ * @udata: user data buffer to read data into
  * @count: size of buffer
  * @offset: offset at which to read data
  *
  */
+
+ssize_t
+v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+	       u64 offset)
+{
+	int n, total;
+	struct p9_fid *fid = filp->private_data;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
+					(long long unsigned) offset, count);
+
+	n = 0;
+	total = 0;
+	do {
+		n = p9_client_read(fid, data, udata, offset, count);
+		if (n <= 0)
+			break;
+
+		if (data)
+			data += n;
+		if (udata)
+			udata += n;
+
+		offset += n;
+		count -= n;
+		total += n;
+	} while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+
+	if (n < 0)
+		total = n;
+
+	return total;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
 static ssize_t
-v9fs_file_read(struct file *filp, char __user * data, size_t count,
+v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	       loff_t * offset)
 {
 	int ret;
 	struct p9_fid *fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
-	ret = p9_client_uread(fid, data, *offset, count);
+
+	if (count > (fid->clnt->msize - P9_IOHDRSZ))
+		ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
+	else
+		ret = p9_client_read(fid, NULL, udata, *offset, count);
+
 	if (ret > 0)
 		*offset += ret;
 
@@ -156,19 +205,38 @@ static ssize_t
 v9fs_file_write(struct file *filp, const char __user * data,
 		size_t count, loff_t * offset)
 {
-	int ret;
+	int n, rsize, total = 0;
 	struct p9_fid *fid;
+	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	int origin = *offset;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		(int)count, (int)*offset);
 
 	fid = filp->private_data;
-	ret = p9_client_uwrite(fid, data, *offset, count);
-	if (ret > 0) {
-		invalidate_inode_pages2_range(inode->i_mapping, *offset,
-								*offset+ret);
-		*offset += ret;
+	clnt = fid->clnt;
+
+	rsize = fid->iounit;
+	if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+		rsize = clnt->msize - P9_IOHDRSZ;
+
+	do {
+		if (count < rsize)
+			rsize = count;
+
+		n = p9_client_write(fid, NULL, data+total, *offset+total,
+									rsize);
+		if (n <= 0)
+			break;
+		count -= n;
+		total += n;
+	} while (count > 0);
+
+	if (total > 0) {
+		invalidate_inode_pages2_range(inode->i_mapping, origin,
+								origin+total);
+		*offset += total;
 	}
 
 	if (*offset > inode->i_size) {
@@ -176,7 +244,10 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
 	}
 
-	return ret;
+	if (n < 0)
+		return n;
+
+	return total;
 }
 
 static const struct file_operations v9fs_cached_file_operations = {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c95295c65045..8314d3f43b71 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -334,7 +334,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 {
 	int err, umode;
 	struct inode *ret;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	ret = NULL;
 	st = p9_client_stat(fid);
@@ -417,6 +417,8 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	struct p9_fid *dfid, *ofid, *fid;
 	struct inode *inode;
 
+	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+
 	err = 0;
 	ofid = NULL;
 	fid = NULL;
@@ -424,6 +426,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dfid = v9fs_fid_clone(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
+		P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
 		dfid = NULL;
 		goto error;
 	}
@@ -432,18 +435,22 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = p9_client_walk(dfid, 0, NULL, 1);
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		ofid = NULL;
 		goto error;
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
-	if (err < 0)
+	if (err < 0) {
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
 		goto error;
+	}
 
 	/* now walk from the parent so we can get unopened fid */
 	fid = p9_client_walk(dfid, 1, &name, 0);
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
+		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		fid = NULL;
 		goto error;
 	} else
@@ -453,6 +460,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
+		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
 		goto error;
 	}
 
@@ -626,8 +634,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	return NULL;
 
 error:
-	if (fid)
-		p9_client_clunk(fid);
+	p9_client_clunk(fid);
 
 	return ERR_PTR(result);
 }
@@ -735,7 +742,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
@@ -816,10 +823,9 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
  */
 
 void
-v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
+v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	struct super_block *sb)
 {
-	int n;
 	char ext[32];
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
@@ -843,11 +849,7 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
 		int major = -1;
 		int minor = -1;
 
-		n = stat->extension.len;
-		if (n > sizeof(ext)-1)
-			n = sizeof(ext)-1;
-		memmove(ext, stat->extension.str, n);
-		ext[n] = 0;
+		strncpy(ext, stat->extension, sizeof(ext));
 		sscanf(ext, "%c %u %u", &type, &major, &minor);
 		switch (type) {
 		case 'c':
@@ -858,10 +860,11 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
 			break;
 		default:
 			P9_DPRINTK(P9_DEBUG_ERROR,
-				"Unknown special type %c (%.*s)\n", type,
-				stat->extension.len, stat->extension.str);
+				"Unknown special type %c %s\n", type,
+				stat->extension);
 		};
 		inode->i_rdev = MKDEV(major, minor);
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
 	} else
 		inode->i_rdev = 0;
 
@@ -905,7 +908,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
-	struct p9_stat *st;
+	struct p9_wstat *st;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
 	retval = -EPERM;
@@ -927,15 +930,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	}
 
 	/* copy extension buffer into buffer */
-	if (st->extension.len < buflen)
-		buflen = st->extension.len + 1;
-
-	memmove(buffer, st->extension.str, buflen - 1);
-	buffer[buflen-1] = 0;
+	strncpy(buffer, st->extension, buflen);
 
 	P9_DPRINTK(P9_DEBUG_VFS,
-		"%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len,
-		st->extension.str, buffer);
+		"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
 
 	retval = buflen;
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf59c3960494..d6cb1a0ca724 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -111,7 +111,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct inode *inode = NULL;
 	struct dentry *root = NULL;
 	struct v9fs_session_info *v9ses = NULL;
-	struct p9_stat *st = NULL;
+	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
 	uid_t uid = current->fsuid;
 	gid_t gid = current->fsgid;
@@ -161,10 +161,14 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 
 	sb->s_root = root;
 	root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+
 	v9fs_stat2inode(st, root->d_inode, sb);
+
 	v9fs_fid_add(root, fid);
+	p9stat_free(st);
 	kfree(st);
 
+P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
 	return simple_set_mnt(mnt, sb);
 
 release_sb:
diff --git a/fs/Kconfig b/fs/Kconfig
index 37db79a2ff95..522469a7eca3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,61 +6,9 @@ menu "File systems"
 
 if BLOCK
 
-config EXT2_FS
-	tristate "Second extended fs support"
-	help
-	  Ext2 is a standard Linux file system for hard disks.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext2.
-
-	  If unsure, say Y.
-
-config EXT2_FS_XATTR
-	bool "Ext2 extended attributes"
-	depends on EXT2_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config EXT2_FS_POSIX_ACL
-	bool "Ext2 POSIX Access Control Lists"
-	depends on EXT2_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT2_FS_SECURITY
-	bool "Ext2 Security Labels"
-	depends on EXT2_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext2 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-	bool "Ext2 execute in place support"
-	depends on EXT2_FS && MMU
-	help
-	  Execute in place can be used on memory-backed block devices. If you
-	  enable this option, you can select to mount block devices which are
-	  capable of this feature without using the page cache.
-
-	  If you do not use a block device that is capable of using this,
-	  or if unsure, say N.
+source "fs/ext2/Kconfig"
+source "fs/ext3/Kconfig"
+source "fs/ext4/Kconfig"
 
 config FS_XIP
 # execute in place
@@ -68,211 +16,16 @@ config FS_XIP
 	depends on EXT2_FS_XIP
 	default y
 
-config EXT3_FS
-	tristate "Ext3 journalling file system support"
-	select JBD
-	help
-	  This is the journalling version of the Second extended file system
-	  (often called ext3), the de facto standard Linux file system
-	  (method to organize files on a storage device) for hard disks.
-
-	  The journalling code included in this driver means you do not have
-	  to run e2fsck (file system checker) on your file systems after a
-	  crash.  The journal keeps track of any changes that were being made
-	  at the time the system crashed, and can ensure that your file system
-	  is consistent without the need for a lengthy check.
-
-	  Other than adding the journal to the file system, the on-disk format
-	  of ext3 is identical to ext2.  It is possible to freely switch
-	  between using the ext3 driver and the ext2 driver, as long as the
-	  file system has been cleanly unmounted, or e2fsck is run on the file
-	  system.
-
-	  To add a journal on an existing ext2 file system or change the
-	  behavior of ext3 file systems, you can use the tune2fs utility ("man
-	  tune2fs").  To modify attributes of files and directories on ext3
-	  file systems, use chattr ("man chattr").  You need to be using
-	  e2fsprogs version 1.20 or later in order to create ext3 journals
-	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext3.
-
-config EXT3_FS_XATTR
-	bool "Ext3 extended attributes"
-	depends on EXT3_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
-	bool "Ext3 POSIX Access Control Lists"
-	depends on EXT3_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
-	bool "Ext3 Security Labels"
-	depends on EXT3_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext3 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config EXT4DEV_FS
-	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	select JBD2
-	select CRC16
-	help
-	  Ext4dev is a predecessor filesystem of the next generation
-	  extended fs ext4, based on ext3 filesystem code. It will be
-	  renamed ext4 fs later, once ext4dev is mature and stabilized.
-
-	  Unlike the change from ext2 filesystem to ext3 filesystem,
-	  the on-disk format of ext4dev is not the same as ext3 any more:
-	  it is based on extent maps and it supports 48-bit physical block
-	  numbers. These combined on-disk format changes will allow
-	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
-	  a hard limit that ext3 cannot overcome without changing the
-	  on-disk format.
-
-	  Other than extent maps and 48-bit block numbers, ext4dev also is
-	  likely to have other new features such as persistent preallocation,
-	  high resolution time stamps, and larger file support etc.  These
-	  features will be added to ext4dev gradually.
-
-	  To compile this file system support as a module, choose M here. The
-	  module will be called ext4dev.
-
-	  If unsure, say N.
-
-config EXT4DEV_FS_XATTR
-	bool "Ext4dev extended attributes"
-	depends on EXT4DEV_FS
-	default y
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-	  You need this for POSIX ACL support on ext4dev/ext4.
-
-config EXT4DEV_FS_POSIX_ACL
-	bool "Ext4dev POSIX Access Control Lists"
-	depends on EXT4DEV_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the POSIX ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config EXT4DEV_FS_SECURITY
-	bool "Ext4dev Security Labels"
-	depends on EXT4DEV_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ext4dev/ext4 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JBD
-	tristate
-	help
-	  This is a generic journalling layer for block devices.  It is
-	  currently used by the ext3 and OCFS2 file systems, but it could
-	  also be used to add journal support to other file systems or block
-	  devices such as RAID or LVM.
-
-	  If you are using the ext3 or OCFS2 file systems, you need to
-	  say Y here. If you are not using ext3 OCFS2 then you will probably
-	  want to say N.
-
-	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
-	  you cannot compile this code as a module.
-
-config JBD_DEBUG
-	bool "JBD (ext3) debugging support"
-	depends on JBD && DEBUG_FS
-	help
-	  If you are using the ext3 journaled file system (or potentially any
-	  other file system/device using JBD), this option allows you to
-	  enable debugging output while the system is running, in order to
-	  help track down any problems you are having.  By default the
-	  debugging output will be turned off.
-
-	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
-	  number between 1 and 5, the higher the number, the more debugging
-	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
-
-config JBD2
-	tristate
-	select CRC32
-	help
-	  This is a generic journaling layer for block devices that support
-	  both 32-bit and 64-bit block numbers.  It is currently used by
-	  the ext4dev/ext4 filesystem, but it could also be used to add
-	  journal support to other file systems or block devices such
-	  as RAID or LVM.
-
-	  If you are using ext4dev/ext4, you need to say Y here. If you are not
-	  using ext4dev/ext4 then you will probably want to say N.
-
-	  To compile this device as a module, choose M here. The module will be
-	  called jbd2.  If you are compiling ext4dev/ext4 into the kernel,
-	  you cannot compile this code as a module.
-
-config JBD2_DEBUG
-	bool "JBD2 (ext4dev/ext4) debugging support"
-	depends on JBD2 && DEBUG_FS
-	help
-	  If you are using the ext4dev/ext4 journaled file system (or
-	  potentially any other filesystem/device using JBD2), this option
-	  allows you to enable debugging output while the system is running,
-	  in order to help track down any problems you are having.
-	  By default, the debugging output will be turned off.
-
-	  If you select Y here, then you will be able to turn on debugging
-	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
-	  number between 1 and 5. The higher the number, the more debugging
-	  output is generated.  To turn debugging off again, do
-	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+source "fs/jbd/Kconfig"
+source "fs/jbd2/Kconfig"
 
 config FS_MBCACHE
 # Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+	default y if EXT2_FS=y && EXT2_FS_XATTR
+	default y if EXT3_FS=y && EXT3_FS_XATTR
+	default y if EXT4_FS=y && EXT4_FS_XATTR
+	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
 config REISERFS_FS
 	tristate "Reiserfs support"
@@ -419,6 +172,14 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+config FILE_LOCKING
+	bool "Enable POSIX file locking API" if EMBEDDED
+	default y
+	help
+	  This option enables standard file locking support, required
+          for filesystems like NFS and for the flock() system
+          call. Disabling this option saves about 11k.
+
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 
@@ -426,7 +187,7 @@ config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on NET && SYSFS
 	select CONFIGFS_FS
-	select JBD
+	select JBD2
 	select CRC32
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
@@ -497,6 +258,16 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
+config OCFS2_COMPAT_JBD
+	bool "Use JBD for compatibility"
+	depends on OCFS2_FS
+	default n
+	select JBD
+	help
+	  The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
+	  is backwards compatible with JBD.  It is safe to say N here.
+	  However, if you really want to use the original JBD, say Y here.
+
 endif # BLOCK
 
 config DNOTIFY
@@ -633,7 +404,7 @@ config AUTOFS4_FS
 	  N here.
 
 config FUSE_FS
-	tristate "Filesystem in Userspace support"
+	tristate "FUSE (Filesystem in Userspace) support"
 	help
 	  With FUSE it is possible to implement a fully functional filesystem
 	  in a userspace program.
@@ -902,65 +673,7 @@ endif # BLOCK
 
 menu "Pseudo filesystems"
 
-config PROC_FS
-	bool "/proc file system support" if EMBEDDED
-	default y
-	help
-	  This is a virtual file system providing information about the status
-	  of the system. "Virtual" means that it doesn't take up any space on
-	  your hard disk: the files are created on the fly by the kernel when
-	  you try to access them. Also, you cannot read the files with older
-	  version of the program less: you need to use more or cat.
-
-	  It's totally cool; for example, "cat /proc/interrupts" gives
-	  information about what the different IRQs are used for at the moment
-	  (there is a small number of Interrupt ReQuest lines in your computer
-	  that are used by the attached devices to gain the CPU's attention --
-	  often a source of trouble if two devices are mistakenly configured
-	  to use the same IRQ). The program procinfo to display some
-	  information about your system gathered from the /proc file system.
-
-	  Before you can use the /proc file system, it has to be mounted,
-	  meaning it has to be given a location in the directory hierarchy.
-	  That location should be /proc. A command such as "mount -t proc proc
-	  /proc" or the equivalent line in /etc/fstab does the job.
-
-	  The /proc file system is explained in the file
-	  <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
-	  ("man 5 proc").
-
-	  This option will enlarge your kernel by about 67 KB. Several
-	  programs depend on this, so everyone should say Y here.
-
-config PROC_KCORE
-	bool "/proc/kcore support" if !ARM
-	depends on PROC_FS && MMU
-
-config PROC_VMCORE
-        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && CRASH_DUMP
-	default y
-        help
-        Exports the dump image of crashed kernel in ELF format.
-
-config PROC_SYSCTL
-	bool "Sysctl support (/proc/sys)" if EMBEDDED
-	depends on PROC_FS
-	select SYSCTL
-	default y
-	---help---
-	  The sysctl interface provides a means of dynamically changing
-	  certain kernel parameters and variables on the fly without requiring
-	  a recompile of the kernel or reboot of the system.  The primary
-	  interface is through /proc/sys.  If you say Y here a tree of
-	  modifiable sysctl entries will be generated beneath the
-          /proc/sys directory. They are explained in the files
-	  in <file:Documentation/sysctl/>.  Note that enabling this
-	  option will enlarge the kernel by at least 8 KB.
-
-	  As it is generally a good thing, you should say Y here unless
-	  building a kernel for install/rescue disks or your system is very
-	  limited in memory.
+source "fs/proc/Kconfig"
 
 config SYSFS
 	bool "sysfs file system support" if EMBEDDED
@@ -1194,195 +907,7 @@ config EFS_FS
 	  To compile the EFS file system support as a module, choose M here: the
 	  module will be called efs.
 
-config JFFS2_FS
-	tristate "Journalling Flash File System v2 (JFFS2) support"
-	select CRC32
-	depends on MTD
-	help
-	  JFFS2 is the second generation of the Journalling Flash File System
-	  for use on diskless embedded devices. It provides improved wear
-	  levelling, compression and support for hard links. You cannot use
-	  this on normal block devices, only on 'MTD' devices.
-
-	  Further information on the design and implementation of JFFS2 is
-	  available at <http://sources.redhat.com/jffs2/>.
-
-config JFFS2_FS_DEBUG
-	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
-	depends on JFFS2_FS
-	default "0"
-	help
-	  This controls the amount of debugging messages produced by the JFFS2
-	  code. Set it to zero for use in production systems. For evaluation,
-	  testing and debugging, it's advisable to set it to one. This will
-	  enable a few assertions and will print debugging messages at the
-	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
-	  is unlikely to be useful - it enables extra debugging in certain
-	  areas which at one point needed debugging, but when the bugs were
-	  located and fixed, the detailed messages were relegated to level 2.
-
-	  If reporting bugs, please try to have available a full dump of the
-	  messages at debug level 1 while the misbehaviour was occurring.
-
-config JFFS2_FS_WRITEBUFFER
-	bool "JFFS2 write-buffering support"
-	depends on JFFS2_FS
-	default y
-	help
-	  This enables the write-buffering support in JFFS2.
-
-	  This functionality is required to support JFFS2 on the following
-	  types of flash devices:
-	    - NAND flash
-	    - NOR flash with transparent ECC
-	    - DataFlash
-
-config JFFS2_FS_WBUF_VERIFY
-	bool "Verify JFFS2 write-buffer reads"
-	depends on JFFS2_FS_WRITEBUFFER
-	default n
-	help
-	  This causes JFFS2 to read back every page written through the
-	  write-buffer, and check for errors.
-
-config JFFS2_SUMMARY
-	bool "JFFS2 summary support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  This feature makes it possible to use summary information
-	  for faster filesystem mount.
-
-	  The summary information can be inserted into a filesystem image
-	  by the utility 'sumtool'.
-
-	  If unsure, say 'N'.
-
-config JFFS2_FS_XATTR
-	bool "JFFS2 XATTR support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config JFFS2_FS_POSIX_ACL
-	bool "JFFS2 POSIX Access Control Lists"
-	depends on JFFS2_FS_XATTR
-	default y
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config JFFS2_FS_SECURITY
-	bool "JFFS2 Security Labels"
-	depends on JFFS2_FS_XATTR
-	default y
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the jffs2 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JFFS2_COMPRESSION_OPTIONS
-	bool "Advanced compression options for JFFS2"
-	depends on JFFS2_FS
-	default n
-	help
-	  Enabling this option allows you to explicitly choose which
-	  compression modules, if any, are enabled in JFFS2. Removing
-	  compressors can mean you cannot read existing file systems,
-	  and enabling experimental compressors can mean that you
-	  write a file system which cannot be read by a standard kernel.
-
-	  If unsure, you should _definitely_ say 'N'.
-
-config JFFS2_ZLIB
-	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
-	select ZLIB_INFLATE
-	select ZLIB_DEFLATE
-	depends on JFFS2_FS
-	default y
-	help
-	  Zlib is designed to be a free, general-purpose, legally unencumbered,
-	  lossless data-compression library for use on virtually any computer
-	  hardware and operating system. See <http://www.gzip.org/zlib/> for
-	  further information.
-
-	  Say 'Y' if unsure.
-
-config JFFS2_LZO
-	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
-	select LZO_COMPRESS
-	select LZO_DECOMPRESS
-	depends on JFFS2_FS
-	default n
-	help
-	  minilzo-based compression. Generally works better than Zlib.
-
-	  This feature was added in July, 2007. Say 'N' if you need
-	  compatibility with older bootloaders or kernels.
-
-config JFFS2_RTIME
-	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default y
-	help
-	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
-
-config JFFS2_RUBIN
-	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default n
-	help
-	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
-
-choice
-	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
-	default JFFS2_CMODE_PRIORITY
-	depends on JFFS2_FS
-	help
-	  You can set here the default compression mode of JFFS2 from
-	  the available compression modes. Don't touch if unsure.
-
-config JFFS2_CMODE_NONE
-	bool "no compression"
-	help
-	  Uses no compression.
-
-config JFFS2_CMODE_PRIORITY
-	bool "priority"
-	help
-	  Tries the compressors in a predefined order and chooses the first
-	  successful one.
-
-config JFFS2_CMODE_SIZE
-	bool "size (EXPERIMENTAL)"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result.
-
-config JFFS2_CMODE_FAVOURLZO
-	bool "Favour LZO"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result but gives some preference to LZO (which has faster
-	  decompression) at the expense of size.
-
-endchoice
-
+source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
 
@@ -1441,6 +966,19 @@ config MINIX_FS
 	  partition (the one containing the directory /) cannot be compiled as
 	  a module.
 
+config OMFS_FS
+	tristate "SonicBlue Optimized MPEG File System support"
+	depends on BLOCK
+	select CRC_ITU_T
+	help
+	  This is the proprietary file system used by the Rio Karma music
+	  player and ReplayTV DVR.  Despite the name, this filesystem is not
+	  more efficient than a standard FS for MPEG files, in fact likely
+	  the opposite is true.  Say Y if you have either of these devices
+	  and wish to mount its disk.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called omfs.  If unsure, say N.
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
@@ -1810,6 +1348,28 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
+config SUNRPC_REGISTER_V4
+	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	default n
+	help
+	  Sun added support for registering RPC services at an IPv6
+	  address by creating two new versions of the rpcbind protocol
+	  (RFC 1833).
+
+	  This option enables support in the kernel RPC server for
+	  registering kernel RPC services via version 4 of the rpcbind
+	  protocol.  If you enable this option, you must run a portmapper
+	  daemon that supports rpcbind protocol version 4.
+
+	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+	  requires that you enable this option and use a portmapper that
+	  supports rpcbind version 4.
+
+	  If unsure, say N to get traditional behavior (register kernel
+	  RPC services using only rpcbind version 2).  Distributions
+	  using the legacy Linux portmapper daemon must say N here.
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
@@ -1904,149 +1464,7 @@ config SMB_NLS_REMOTE
 
 	  smbmount from samba 2.2.0 or later supports this.
 
-config CIFS
-	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
-	depends on INET
-	select NLS
-	help
-	  This is the client VFS module for the Common Internet File System
-	  (CIFS) protocol which is the successor to the Server Message Block 
-	  (SMB) protocol, the native file sharing mechanism for most early
-	  PC operating systems.  The CIFS protocol is fully supported by 
-	  file servers such as Windows 2000 (including Windows 2003, NT 4  
-	  and Windows XP) as well by Samba (which provides excellent CIFS
-	  server support for Linux and many other operating systems). Limited
-	  support for OS/2 and Windows ME and similar servers is provided as
-	  well.
-
-	  The cifs module provides an advanced network file system
-	  client for mounting to CIFS compliant servers.  It includes
-	  support for DFS (hierarchical name space), secure per-user
-	  session establishment via Kerberos or NTLM or NTLMv2,
-	  safe distributed caching (oplock), optional packet
-	  signing, Unicode and other internationalization improvements.
-	  If you need to mount to Samba or Windows from this machine, say Y.
-
-config CIFS_STATS
-        bool "CIFS statistics"
-        depends on CIFS
-        help
-          Enabling this option will cause statistics for each server share
-	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
-
-config CIFS_STATS2
-	bool "Extended statistics"
-	depends on CIFS_STATS
-	help
-	  Enabling this option will allow more detailed statistics on SMB
-	  request timing to be displayed in /proc/fs/cifs/DebugData and also
-	  allow optional logging of slow responses to dmesg (depending on the
-	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
-	  These additional statistics may have a minor effect on performance
-	  and memory utilization.
-
-	  Unless you are a developer or are doing network performance analysis
-	  or tuning, say N.
-
-config CIFS_WEAK_PW_HASH
-	bool "Support legacy servers which use weaker LANMAN security"
-	depends on CIFS
-	help
-	  Modern CIFS servers including Samba and most Windows versions
-	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
-	  security mechanisms. These hash the password more securely
-	  than the mechanisms used in the older LANMAN version of the
-	  SMB protocol but LANMAN based authentication is needed to
-	  establish sessions with some old SMB servers.
-
-	  Enabling this option allows the cifs module to mount to older
-	  LANMAN based servers such as OS/2 and Windows 95, but such
-	  mounts may be less secure than mounts using NTLM or more recent
-	  security mechanisms if you are on a public network.  Unless you
-	  have a need to access old SMB servers (and are on a private
-	  network) you probably want to say N.  Even if this support
-	  is enabled in the kernel build, LANMAN authentication will not be
-	  used automatically. At runtime LANMAN mounts are disabled but
-	  can be set to required (or optional) either in
-	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
-	  option on the mount command. This support is disabled by
-	  default in order to reduce the possibility of a downgrade
-	  attack.
-
-	  If unsure, say N.
-
-config CIFS_XATTR
-        bool "CIFS extended attributes"
-        depends on CIFS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).  CIFS maps the name of
-          extended attributes beginning with the user namespace prefix
-          to SMB/CIFS EAs. EAs are stored on Windows servers without the
-          user namespace prefix, but their names are seen by Linux cifs clients
-          prefaced by the user namespace prefix. The system namespace
-          (used by some filesystems to store ACLs) is not supported at
-          this time.
-
-          If unsure, say N.
-
-config CIFS_POSIX
-        bool "CIFS POSIX Extensions"
-        depends on CIFS_XATTR
-        help
-          Enabling this option will cause the cifs client to attempt to
-	  negotiate a newer dialect with servers, such as Samba 3.0.5
-	  or later, that optionally can handle more POSIX like (rather
-	  than Windows like) file behavior.  It also enables
-	  support for POSIX ACLs (getfacl and setfacl) to servers
-	  (such as Samba 3.10 and later) which can negotiate
-	  CIFS POSIX ACL support.  If unsure, say N.
-
-config CIFS_DEBUG2
-	bool "Enable additional CIFS debugging routines"
-	depends on CIFS
-	help
-	   Enabling this option adds a few more debugging routines
-	   to the cifs code which slightly increases the size of
-	   the cifs module and can cause additional logging of debug
-	   messages in some error paths, slowing performance. This
-	   option can be turned off unless you are debugging
-	   cifs problems.  If unsure, say N.
-
-config CIFS_EXPERIMENTAL
-	  bool "CIFS Experimental Features (EXPERIMENTAL)"
-	  depends on CIFS && EXPERIMENTAL
-	  help
-	    Enables cifs features under testing. These features are
-	    experimental and currently include DFS support and directory 
-	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
-	    mechanism which will be used for Kerberos session negotiation
-	    and uid remapping.  Some of these features also may depend on 
-	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
-	    (which is disabled by default). See the file fs/cifs/README 
-	    for more details.  If unsure, say N.
-
-config CIFS_UPCALL
-	  bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which accesses
-	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	    Kerberos tickets which are needed to mount to certain secure servers
-	    (for which more secure Kerberos authentication is required). If
-	    unsure, say N.
-
-config CIFS_DFS_UPCALL
-	  bool "DFS feature support (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which contacts userspace
-	    helper utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say N.
+source "fs/cifs/Kconfig"
 
 config NCP_FS
 	tristate "NCP file system support (to mount NetWare volumes)"
@@ -2093,20 +1511,6 @@ config CODA_FS
 	  To compile the coda client support as a module, choose M here: the
 	  module will be called coda.
 
-config CODA_FS_OLD_API
-	bool "Use 96-bit Coda file identifiers"
-	depends on CODA_FS
-	help
-	  A new kernel-userspace API had to be introduced for Coda v6.0
-	  to support larger 128-bit file identifiers as needed by the
-	  new realms implementation.
-
-	  However this new API is not backward compatible with older
-	  clients. If you really need to run the old Coda userspace
-	  cache manager then say Y.
-
-	  For most cases you probably want to say N.
-
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 3263084eef9e..ce9fb3fbfae4 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -25,12 +25,12 @@ config BINFMT_ELF
 
 config COMPAT_BINFMT_ELF
 	bool
-	depends on COMPAT && MMU
+	depends on COMPAT && BINFMT_ELF
 
 config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
-	depends on (FRV || BLACKFIN)
+	depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
@@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+	bool "Write ELF core dumps with partial segments"
+	default n
+	depends on BINFMT_ELF
+	help
+	  ELF core dump files describe each memory mapping of the crashed
+	  process, and can contain or omit the memory contents of each one.
+	  The contents of an unmodified text mapping are omitted by default.
+
+	  For an unmodified text mapping of an ELF object, including just
+	  the first page of the file in a core dump makes it possible to
+	  identify the build ID bits in the file, without paying the i/o
+	  cost and disk space to dump all the text.  However, versions of
+	  GDB before 6.7 are confused by ELF core dump files in this format.
+
+	  The core dump behavior can be controlled per process using
+	  the /proc/PID/coredump_filter pseudo-file; this setting is
+	  inherited.  See Documentation/filesystems/proc.txt for details.
+
+	  This config option changes the default setting of coredump_filter
+	  seen at boot time.  If unsure, say N.
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
 	depends on !MMU && (!FRV || BROKEN)
@@ -59,10 +81,12 @@ config BINFMT_SHARED_FLAT
 	help
 	  Support FLAT shared libraries
 
+config HAVE_AOUT
+       def_bool n
+
 config BINFMT_AOUT
 	tristate "Kernel support for a.out and ECOFF binaries"
-	depends on ARCH_SUPPORTS_AOUT && \
-		(X86_32 || ALPHA || ARM || M68K)
+	depends on HAVE_AOUT
 	---help---
 	  A.out (Assembler.OUTput) is a set of formats for libraries and
 	  executables used in the earliest versions of UNIX.  Linux used
diff --git a/fs/Makefile b/fs/Makefile
index 3b2178b4bb66..2168c902d5ca 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,8 +7,8 @@
 
 obj-y :=	open.o read_write.o file_table.o super.o \
 		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
-		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
-		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
+		ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
+		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
 		stack.o
@@ -27,6 +27,8 @@ obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
+obj-$(CONFIG_AIO)               += aio.o
+obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
@@ -69,7 +71,7 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
@@ -111,6 +113,7 @@ obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
+obj-$(CONFIG_OMFS_FS)		+= omfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d78..85a30e929800 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
 
 const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
+	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
 	.fsync		= file_fsync,
 };
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9e421eeb672b..7f83a46f2b7e 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
 
 enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_ownmask, "ownmask=%o"},
@@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
 
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 223b1917093e..e9ec915f7553 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -2,6 +2,7 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/amigaffs.h>
+#include <linux/mutex.h>
 
 /* AmigaOS allows file names with up to 30 characters length.
  * Names longer than that will be silently truncated. If you
@@ -98,7 +99,7 @@ struct affs_sb_info {
 	gid_t s_gid;			/* gid to override */
 	umode_t s_mode;			/* mode to override */
 	struct buffer_head *s_root_bh;	/* Cached root block. */
-	struct semaphore s_bmlock;	/* Protects bitmap access. */
+	struct mutex s_bmlock;		/* Protects bitmap access. */
 	struct affs_bm_info *s_bitmap;	/* Bitmap infos. */
 	u32 s_bmap_count;		/* # of bitmap blocks. */
 	u32 s_bmap_bits;		/* # of bits in one bitmap blocks */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c4a5ad09ddf2..dc5ef14bdc1c 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -45,14 +45,14 @@ affs_count_free_blocks(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	down(&AFFS_SB(sb)->s_bmlock);
+	mutex_lock(&AFFS_SB(sb)->s_bmlock);
 
 	bm = AFFS_SB(sb)->s_bitmap;
 	free = 0;
 	for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--)
 		free += bm->bm_free;
 
-	up(&AFFS_SB(sb)->s_bmlock);
+	mutex_unlock(&AFFS_SB(sb)->s_bmlock);
 
 	return free;
 }
@@ -76,7 +76,7 @@ affs_free_block(struct super_block *sb, u32 block)
 	bit     = blk % sbi->s_bmap_bits;
 	bm      = &sbi->s_bitmap[bmap];
 
-	down(&sbi->s_bmlock);
+	mutex_lock(&sbi->s_bmlock);
 
 	bh = sbi->s_bmap_bh;
 	if (sbi->s_last_bmap != bmap) {
@@ -105,19 +105,19 @@ affs_free_block(struct super_block *sb, u32 block)
 	sb->s_dirt = 1;
 	bm->bm_free++;
 
-	up(&sbi->s_bmlock);
+	mutex_unlock(&sbi->s_bmlock);
 	return;
 
 err_free:
 	affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block);
-	up(&sbi->s_bmlock);
+	mutex_unlock(&sbi->s_bmlock);
 	return;
 
 err_bh_read:
 	affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key);
 	sbi->s_bmap_bh = NULL;
 	sbi->s_last_bmap = ~0;
-	up(&sbi->s_bmlock);
+	mutex_unlock(&sbi->s_bmlock);
 	return;
 
 err_range:
@@ -168,7 +168,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
 	bmap = blk / sbi->s_bmap_bits;
 	bm = &sbi->s_bitmap[bmap];
 
-	down(&sbi->s_bmlock);
+	mutex_lock(&sbi->s_bmlock);
 
 	if (bm->bm_free)
 		goto find_bmap_bit;
@@ -249,7 +249,7 @@ find_bit:
 	mark_buffer_dirty(bh);
 	sb->s_dirt = 1;
 
-	up(&sbi->s_bmlock);
+	mutex_unlock(&sbi->s_bmlock);
 
 	pr_debug("%d\n", blk);
 	return blk;
@@ -259,7 +259,7 @@ err_bh_read:
 	sbi->s_bmap_bh = NULL;
 	sbi->s_last_bmap = ~0;
 err_full:
-	up(&sbi->s_bmlock);
+	mutex_unlock(&sbi->s_bmlock);
 	pr_debug("failed\n");
 	return 0;
 }
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b0..7b36904dbeac 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
 
 const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
+	.llseek		= generic_file_llseek,
 	.readdir	= affs_readdir,
 	.fsync		= file_fsync,
 };
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 6eac7bdeec94..1377b1240b6e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -46,8 +46,6 @@ const struct inode_operations affs_file_inode_operations = {
 static int
 affs_file_open(struct inode *inode, struct file *filp)
 {
-	if (atomic_read(&filp->f_count) != 1)
-		return 0;
 	pr_debug("AFFS: open(%lu,%d)\n",
 		 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 	atomic_inc(&AFFS_I(inode)->i_opencnt);
@@ -57,8 +55,6 @@ affs_file_open(struct inode *inode, struct file *filp)
 static int
 affs_file_release(struct inode *inode, struct file *filp)
 {
-	if (atomic_read(&filp->f_count) != 0)
-		return 0;
 	pr_debug("AFFS: release(%lu, %d)\n",
 		 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d214837d5e42..8989c93193ed 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode)
 	kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct affs_inode_info *ei = (struct affs_inode_info *) foo;
 
@@ -135,7 +135,7 @@ enum {
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
@@ -290,7 +290,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	init_MUTEX(&sbi->s_bmlock);
+	mutex_init(&sbi->s_bmlock);
 
 	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
 				&blocksize,&sbi->s_prefix,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index dfda03d4397d..99cf390641f7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -45,6 +45,7 @@ const struct file_operations afs_dir_file_operations = {
 	.release	= afs_release,
 	.readdir	= afs_readdir,
 	.lock		= afs_lock,
+	.llseek		= generic_file_llseek,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 525f7c56e068..a3901769a96c 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = {
 	.launder_page	= afs_launder_page,
 	.releasepage	= afs_releasepage,
 	.invalidatepage	= afs_invalidatepage,
-	.prepare_write	= afs_prepare_write,
-	.commit_write	= afs_commit_write,
+	.write_begin	= afs_write_begin,
+	.write_end	= afs_write_end,
 	.writepage	= afs_writepage,
 	.writepages	= afs_writepages,
 };
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7102824ba847..67f259d99cd6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -469,8 +469,6 @@ extern bool afs_cm_incoming_call(struct afs_call *);
 extern const struct inode_operations afs_dir_inode_operations;
 extern const struct file_operations afs_dir_file_operations;
 
-extern int afs_permission(struct inode *, int, struct nameidata *);
-
 /*
  * file.c
  */
@@ -605,7 +603,7 @@ extern void afs_clear_permits(struct afs_vnode *);
 extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
 extern void afs_zap_permits(struct rcu_head *);
 extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int, struct nameidata *);
+extern int afs_permission(struct inode *, int);
 
 /*
  * server.c
@@ -730,8 +728,12 @@ extern int afs_volume_release_fileserver(struct afs_vnode *,
  */
 extern int afs_set_page_dirty(struct page *);
 extern void afs_put_writeback(struct afs_writeback *);
-extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned);
-extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned);
+extern int afs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata);
+extern int afs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata);
 extern int afs_writepage(struct page *, struct writeback_control *);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
 extern int afs_write_inode(struct inode *, int);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2f5503902c37..78db4953a800 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -232,7 +232,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 	}
 
 	mntget(newmnt);
-	err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+	err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
 	switch (err) {
 	case 0:
 		path_put(&nd->path);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3bcbeceba1bb..3ef504370034 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -284,7 +284,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
  * - AFS ACLs are attached to directories only, and a file is controlled by its
  *   parent directory's ACL
  */
-int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int afs_permission(struct inode *inode, int mask)
 {
 	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_access_t uninitialized_var(access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7e3faeef6818..aee239a048cb 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -27,7 +27,7 @@
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
-static void afs_i_init_once(struct kmem_cache *cachep, void *foo);
+static void afs_i_init_once(void *foo);
 static int afs_get_sb(struct file_system_type *fs_type,
 		      int flags, const char *dev_name,
 		      void *data, struct vfsmount *mnt);
@@ -64,7 +64,7 @@ enum {
 	afs_opt_vol,
 };
 
-static match_table_t afs_options_list = {
+static const match_table_t afs_options_list = {
 	{ afs_opt_cell,		"cell=%s"	},
 	{ afs_opt_rwpath,	"rwpath"	},
 	{ afs_opt_vol,		"vol=%s"	},
@@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb)
 /*
  * initialise an inode cache slab element prior to any use
  */
-static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode)
+static void afs_i_init_once(void *_vnode)
 {
 	struct afs_vnode *vnode = _vnode;
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9a849ad3c489..d6b85dab35fc 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb)
  * partly or wholly fill a page that's under preparation for writing
  */
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
-			 unsigned start, unsigned len, struct page *page)
+			 loff_t pos, unsigned len, struct page *page)
 {
+	loff_t i_size;
+	unsigned eof;
 	int ret;
 
-	_enter(",,%u,%u", start, len);
+	_enter(",,%llu,%u", (unsigned long long)pos, len);
 
-	ASSERTCMP(start + len, <=, PAGE_SIZE);
+	ASSERTCMP(len, <=, PAGE_CACHE_SIZE);
 
-	ret = afs_vnode_fetch_data(vnode, key, start, len, page);
+	i_size = i_size_read(&vnode->vfs_inode);
+	if (pos + len > i_size)
+		eof = i_size;
+	else
+		eof = PAGE_CACHE_SIZE;
+
+	ret = afs_vnode_fetch_data(vnode, key, 0, eof, page);
 	if (ret < 0) {
 		if (ret == -ENOENT) {
 			_debug("got NOENT from server"
@@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 }
 
 /*
- * prepare a page for being written to
- */
-static int afs_prepare_page(struct afs_vnode *vnode, struct page *page,
-			    struct key *key, unsigned offset, unsigned to)
-{
-	unsigned eof, tail, start, stop, len;
-	loff_t i_size, pos;
-	void *p;
-	int ret;
-
-	_enter("");
-
-	if (offset == 0 && to == PAGE_SIZE)
-		return 0;
-
-	p = kmap_atomic(page, KM_USER0);
-
-	i_size = i_size_read(&vnode->vfs_inode);
-	pos = (loff_t) page->index << PAGE_SHIFT;
-	if (pos >= i_size) {
-		/* partial write, page beyond EOF */
-		_debug("beyond");
-		if (offset > 0)
-			memset(p, 0, offset);
-		if (to < PAGE_SIZE)
-			memset(p + to, 0, PAGE_SIZE - to);
-		kunmap_atomic(p, KM_USER0);
-		return 0;
-	}
-
-	if (i_size - pos >= PAGE_SIZE) {
-		/* partial write, page entirely before EOF */
-		_debug("before");
-		tail = eof = PAGE_SIZE;
-	} else {
-		/* partial write, page overlaps EOF */
-		eof = i_size - pos;
-		_debug("overlap %u", eof);
-		tail = max(eof, to);
-		if (tail < PAGE_SIZE)
-			memset(p + tail, 0, PAGE_SIZE - tail);
-		if (offset > eof)
-			memset(p + eof, 0, PAGE_SIZE - eof);
-	}
-
-	kunmap_atomic(p, KM_USER0);
-
-	ret = 0;
-	if (offset > 0 || eof > to) {
-		/* need to fill one or two bits that aren't going to be written
-		 * (cover both fillers in one read if there are two) */
-		start = (offset > 0) ? 0 : to;
-		stop = (eof > to) ? eof : offset;
-		len = stop - start;
-		_debug("wr=%u-%u av=0-%u rd=%u@%u",
-		       offset, to, eof, start, len);
-		ret = afs_fill_page(vnode, key, start, len, page);
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-}
-
-/*
  * prepare to perform part of a write to a page
- * - the caller holds the page locked, preventing it from being written out or
- *   modified by anyone else
  */
-int afs_prepare_write(struct file *file, struct page *page,
-		      unsigned offset, unsigned to)
+int afs_write_begin(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned flags,
+		    struct page **pagep, void **fsdata)
 {
 	struct afs_writeback *candidate, *wb;
 	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
+	struct page *page;
 	struct key *key = file->private_data;
-	pgoff_t index;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + len;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int ret;
 
 	_enter("{%x:%u},{%lx},%u,%u",
-	       vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+	       vnode->fid.vid, vnode->fid.vnode, index, from, to);
 
 	candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
 	if (!candidate)
 		return -ENOMEM;
 	candidate->vnode = vnode;
-	candidate->first = candidate->last = page->index;
-	candidate->offset_first = offset;
+	candidate->first = candidate->last = index;
+	candidate->offset_first = from;
 	candidate->to_last = to;
 	candidate->usage = 1;
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
 
+	page = __grab_cache_page(mapping, index);
+	if (!page) {
+		kfree(candidate);
+		return -ENOMEM;
+	}
+	*pagep = page;
+	/* page won't leak in error case: it eventually gets cleaned off LRU */
+
 	if (!PageUptodate(page)) {
 		_debug("not up to date");
-		ret = afs_prepare_page(vnode, page, key, offset, to);
+		ret = afs_fill_page(vnode, key, pos, len, page);
 		if (ret < 0) {
 			kfree(candidate);
 			_leave(" = %d [prep]", ret);
 			return ret;
 		}
+		SetPageUptodate(page);
 	}
 
 try_again:
-	index = page->index;
 	spin_lock(&vnode->writeback_lock);
 
 	/* see if this page is already pending a writeback under a suitable key
@@ -242,8 +196,8 @@ try_again:
 subsume_in_current_wb:
 	_debug("subsume");
 	ASSERTRANGE(wb->first, <=, index, <=, wb->last);
-	if (index == wb->first && offset < wb->offset_first)
-		wb->offset_first = offset;
+	if (index == wb->first && from < wb->offset_first)
+		wb->offset_first = from;
 	if (index == wb->last && to > wb->to_last)
 		wb->to_last = to;
 	spin_unlock(&vnode->writeback_lock);
@@ -289,17 +243,17 @@ flush_conflicting_wb:
 /*
  * finalise part of a write to a page
  */
-int afs_commit_write(struct file *file, struct page *page,
-		     unsigned offset, unsigned to)
+int afs_write_end(struct file *file, struct address_space *mapping,
+		  loff_t pos, unsigned len, unsigned copied,
+		  struct page *page, void *fsdata)
 {
 	struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode);
 	loff_t i_size, maybe_i_size;
 
-	_enter("{%x:%u},{%lx},%u,%u",
-	       vnode->fid.vid, vnode->fid.vnode, page->index, offset, to);
+	_enter("{%x:%u},{%lx}",
+	       vnode->fid.vid, vnode->fid.vnode, page->index);
 
-	maybe_i_size = (loff_t) page->index << PAGE_SHIFT;
-	maybe_i_size += to;
+	maybe_i_size = pos + copied;
 
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (maybe_i_size > i_size) {
@@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page,
 		spin_unlock(&vnode->writeback_lock);
 	}
 
-	SetPageUptodate(page);
 	set_page_dirty(page);
 	if (PageDirty(page))
 		_debug("dirtied");
+	unlock_page(page);
+	page_cache_release(page);
 
-	return 0;
+	return copied;
 }
 
 /*
@@ -404,7 +359,7 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
 			page = pages[loop];
 			if (page->index > wb->last)
 				break;
-			if (TestSetPageLocked(page))
+			if (!trylock_page(page))
 				break;
 			if (!PageDirty(page) ||
 			    page_private(page) != (unsigned long) wb) {
diff --git a/fs/aio.c b/fs/aio.c
index 0fb3117ddd93..f658441d5666 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -512,8 +512,8 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
-	dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
-		req, atomic_read(&req->ki_filp->f_count));
+	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
+		req, atomic_long_read(&req->ki_filp->f_count));
 
 	assert_spin_locked(&ctx->ctx_lock);
 
@@ -528,7 +528,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	/* Must be done under the lock to serialise against cancellation.
 	 * Call this aio_fput as it duplicates fput via the fput_work.
 	 */
-	if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
-	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
 	tsk->mm = mm;
@@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
-	tsk->flags &= ~PF_BORROWED_MM;
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 977ef208c051..3662dd44896b 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
  *                    of the file
  *
  * @name:    [in]    name of the "class" of the new file
- * @fops     [in]    file operations for the new file
- * @priv     [in]    private data for the new file (will be file's private_data)
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
  *
  * Creates a new file by hooking it on a single inode. This is useful for files
  * that do not need to have a full-fledged inode in order to operate correctly.
@@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
  * setup.  Returns new descriptor or -error.
  */
 int anon_inode_getfd(const char *name, const struct file_operations *fops,
-		     void *priv)
+		     void *priv, int flags)
 {
 	struct qstr this;
 	struct dentry *dentry;
@@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	if (IS_ERR(anon_inode_inode))
 		return -ENODEV;
 
-	error = get_unused_fd();
+	error = get_unused_fd_flags(flags);
 	if (error < 0)
 		return error;
 	fd = error;
@@ -115,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	file->f_mapping = anon_inode_inode->i_mapping;
 
 	file->f_pos = 0;
-	file->f_flags = O_RDWR;
+	file->f_flags = O_RDWR | (flags & O_NONBLOCK);
 	file->f_version = 0;
 	file->private_data = priv;
 
diff --git a/fs/attr.c b/fs/attr.c
index 966b73e25f82..7a83819f6ba2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -51,7 +51,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
 	}
 
 	/* Check for setting the inode time. */
-	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
 		if (!is_owner_or_cap(inode))
 			goto error;
 	}
@@ -108,6 +108,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	struct timespec now;
 	unsigned int ia_valid = attr->ia_valid;
 
+	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
+		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+			return -EPERM;
+	}
+
 	now = current_fs_time(inode->i_sb);
 
 	attr->ia_ctime = now;
@@ -154,17 +159,17 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
 		return 0;
 
+	error = security_inode_setattr(dentry, attr);
+	if (error)
+		return error;
+
 	if (ia_valid & ATTR_SIZE)
 		down_write(&dentry->d_inode->i_alloc_sem);
 
 	if (inode->i_op && inode->i_op->setattr) {
-		error = security_inode_setattr(dentry, attr);
-		if (!error)
-			error = inode->i_op->setattr(dentry, attr);
+		error = inode->i_op->setattr(dentry, attr);
 	} else {
 		error = inode_change_ok(inode, attr);
-		if (!error)
-			error = security_inode_setattr(dentry, attr);
 		if (!error) {
 			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index dda510d31f84..b70eea1e8c59 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = {
 
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
 
-static match_table_t autofs_tokens = {
+static const match_table_t autofs_tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile
index f2c3b79e94d2..a811c1f7d9ab 100644
--- a/fs/autofs4/Makefile
+++ b/fs/autofs4/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_AUTOFS4_FS) += autofs4.o
 
-autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o
+autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c3d352d7fa93..e0f16da00e54 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -14,6 +14,7 @@
 /* Internal header file for autofs */
 
 #include <linux/auto_fs4.h>
+#include <linux/auto_dev-ioctl.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 
@@ -21,6 +22,11 @@
 #define AUTOFS_IOC_FIRST     AUTOFS_IOC_READY
 #define AUTOFS_IOC_COUNT     32
 
+#define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
+#define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
+
+#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
+
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -35,11 +41,27 @@
 /* #define DEBUG */
 
 #ifdef DEBUG
-#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0)
+#define DPRINTK(fmt, args...)				\
+do {							\
+	printk(KERN_DEBUG "pid %d: %s: " fmt "\n",	\
+		current->pid, __func__, ##args);	\
+} while (0)
 #else
-#define DPRINTK(fmt,args...) do {} while(0)
+#define DPRINTK(fmt, args...) do {} while (0)
 #endif
 
+#define AUTOFS_WARN(fmt, args...)			\
+do {							\
+	printk(KERN_WARNING "pid %d: %s: " fmt "\n",	\
+		current->pid, __func__, ##args);	\
+} while (0)
+
+#define AUTOFS_ERROR(fmt, args...)			\
+do {							\
+	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
+		current->pid, __func__, ##args);	\
+} while (0)
+
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
@@ -52,12 +74,18 @@ struct autofs_info {
 
 	int		flags;
 
-	struct list_head rehash;
+	struct completion expire_complete;
+
+	struct list_head active;
+	struct list_head expiring;
 
 	struct autofs_sb_info *sbi;
 	unsigned long last_used;
 	atomic_t count;
 
+	uid_t uid;
+	gid_t gid;
+
 	mode_t	mode;
 	size_t	size;
 
@@ -68,15 +96,14 @@ struct autofs_info {
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_MOUNTPOINT	(1<<1) /* mountpoint status for direct expire */
 
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
 	struct autofs_wait_queue *next;
 	autofs_wqt_t wait_queue_token;
 	/* We use the following to see what we are waiting for */
-	unsigned int hash;
-	unsigned int len;
-	char *name;
+	struct qstr name;
 	u32 dev;
 	u64 ino;
 	uid_t uid;
@@ -85,15 +112,11 @@ struct autofs_wait_queue {
 	pid_t tgid;
 	/* This is for status reporting upon return */
 	int status;
-	atomic_t wait_ctr;
+	unsigned int wait_ctr;
 };
 
 #define AUTOFS_SBI_MAGIC 0x6d4a556d
 
-#define AUTOFS_TYPE_INDIRECT     0x0001
-#define AUTOFS_TYPE_DIRECT       0x0002
-#define AUTOFS_TYPE_OFFSET       0x0004
-
 struct autofs_sb_info {
 	u32 magic;
 	int pipefd;
@@ -112,8 +135,9 @@ struct autofs_sb_info {
 	struct mutex wq_mutex;
 	spinlock_t fs_lock;
 	struct autofs_wait_queue *queues; /* Wait queue pointer */
-	spinlock_t rehash_lock;
-	struct list_head rehash_list;
+	spinlock_t lookup_lock;
+	struct list_head active_list;
+	struct list_head expiring_list;
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -138,18 +162,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
 static inline int autofs4_ispending(struct dentry *dentry)
 {
 	struct autofs_info *inf = autofs4_dentry_ino(dentry);
-	int pending = 0;
 
 	if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
 		return 1;
 
-	if (inf) {
-		spin_lock(&inf->sbi->fs_lock);
-		pending = inf->flags & AUTOFS_INF_EXPIRING;
-		spin_unlock(&inf->sbi->fs_lock);
-	}
+	if (inf->flags & AUTOFS_INF_EXPIRING)
+		return 1;
 
-	return pending;
+	return 0;
 }
 
 static inline void autofs4_copy_atime(struct file *src, struct file *dst)
@@ -164,11 +184,23 @@ void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
 int is_autofs4_dentry(struct dentry *);
+int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *,
 			struct autofs_packet_expire __user *);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
 			struct autofs_sb_info *, int __user *);
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+				     struct vfsmount *mnt,
+				     struct autofs_sb_info *sbi, int how);
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+				       struct vfsmount *mnt,
+				       struct autofs_sb_info *sbi, int how);
+
+/* Device node initialization */
+
+int autofs_dev_ioctl_init(void);
+void autofs_dev_ioctl_exit(void);
 
 /* Operations structures */
 
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
new file mode 100644
index 000000000000..625abf5422e2
--- /dev/null
+++ b/fs/autofs4/dev-ioctl.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/namei.h>
+#include <linux/fcntl.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/smp_lock.h>
+#include <linux/magic.h>
+#include <linux/dcache.h>
+#include <linux/uaccess.h>
+
+#include "autofs_i.h"
+
+/*
+ * This module implements an interface for routing autofs ioctl control
+ * commands via a miscellaneous device file.
+ *
+ * The alternate interface is needed because we need to be able open
+ * an ioctl file descriptor on an autofs mount that may be covered by
+ * another mount. This situation arises when starting automount(8)
+ * or other user space daemon which uses direct mounts or offset
+ * mounts (used for autofs lazy mount/umount of nested mount trees),
+ * which have been left busy at at service shutdown.
+ */
+
+#define AUTOFS_DEV_IOCTL_SIZE	sizeof(struct autofs_dev_ioctl)
+
+typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *,
+			struct autofs_dev_ioctl *);
+
+static int check_name(const char *name)
+{
+	if (!strchr(name, '/'))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from user land.
+ */
+static int invalid_str(char *str, void *end)
+{
+	while ((void *) str <= end)
+		if (!*str++)
+			return 0;
+	return -EINVAL;
+}
+
+/*
+ * Check that the user compiled against correct version of autofs
+ * misc device code.
+ *
+ * As well as checking the version compatibility this always copies
+ * the kernel interface version out.
+ */
+static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
+{
+	int err = 0;
+
+	if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
+	    (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
+		AUTOFS_WARN("ioctl control interface version mismatch: "
+		     "kernel(%u.%u), user(%u.%u), cmd(%d)",
+		     AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+		     AUTOFS_DEV_IOCTL_VERSION_MINOR,
+		     param->ver_major, param->ver_minor, cmd);
+		err = -EINVAL;
+	}
+
+	/* Fill in the kernel version. */
+	param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+	param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+
+	return err;
+}
+
+/*
+ * Copy parameter control struct, including a possible path allocated
+ * at the end of the struct.
+ */
+static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+{
+	struct autofs_dev_ioctl tmp, *ads;
+
+	if (copy_from_user(&tmp, in, sizeof(tmp)))
+		return ERR_PTR(-EFAULT);
+
+	if (tmp.size < sizeof(tmp))
+		return ERR_PTR(-EINVAL);
+
+	ads = kmalloc(tmp.size, GFP_KERNEL);
+	if (!ads)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(ads, in, tmp.size)) {
+		kfree(ads);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return ads;
+}
+
+static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
+{
+	kfree(param);
+	return;
+}
+
+/*
+ * Check sanity of parameter control fields and if a path is present
+ * check that it has a "/" and is terminated.
+ */
+static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
+{
+	int err = -EINVAL;
+
+	if (check_dev_ioctl_version(cmd, param)) {
+		AUTOFS_WARN("invalid device control module version "
+		     "supplied for cmd(0x%08x)", cmd);
+		goto out;
+	}
+
+	if (param->size > sizeof(*param)) {
+		err = check_name(param->path);
+		if (err) {
+			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+				    cmd);
+			goto out;
+		}
+
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
+		if (err) {
+			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
+				    cmd);
+			goto out;
+		}
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * Get the autofs super block info struct from the file opened on
+ * the autofs mount point.
+ */
+static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
+{
+	struct autofs_sb_info *sbi = NULL;
+	struct inode *inode;
+
+	if (f) {
+		inode = f->f_path.dentry->d_inode;
+		sbi = autofs4_sbi(inode->i_sb);
+	}
+	return sbi;
+}
+
+/* Return autofs module protocol version */
+static int autofs_dev_ioctl_protover(struct file *fp,
+				     struct autofs_sb_info *sbi,
+				     struct autofs_dev_ioctl *param)
+{
+	param->arg1 = sbi->version;
+	return 0;
+}
+
+/* Return autofs module protocol sub version */
+static int autofs_dev_ioctl_protosubver(struct file *fp,
+					struct autofs_sb_info *sbi,
+					struct autofs_dev_ioctl *param)
+{
+	param->arg1 = sbi->sub_version;
+	return 0;
+}
+
+/*
+ * Walk down the mount stack looking for an autofs mount that
+ * has the requested device number (aka. new_encode_dev(sb->s_dev).
+ */
+static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	struct super_block *sb;
+	dev_t s_dev;
+	unsigned int err;
+
+	err = -ENOENT;
+
+	/* Lookup the dentry name at the base of our mount point */
+	dentry = d_lookup(nd->path.dentry, &nd->last);
+	if (!dentry)
+		goto out;
+
+	dput(nd->path.dentry);
+	nd->path.dentry = dentry;
+
+	/* And follow the mount stack looking for our autofs mount */
+	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
+		inode = nd->path.dentry->d_inode;
+		if (!inode)
+			break;
+
+		sb = inode->i_sb;
+		s_dev = new_encode_dev(sb->s_dev);
+		if (devno == s_dev) {
+			if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
+				err = 0;
+				break;
+			}
+		}
+	}
+out:
+	return err;
+}
+
+/*
+ * Walk down the mount stack looking for an autofs mount that
+ * has the requested mount type (ie. indirect, direct or offset).
+ */
+static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
+{
+	struct dentry *dentry;
+	struct autofs_info *ino;
+	unsigned int err;
+
+	err = -ENOENT;
+
+	/* Lookup the dentry name at the base of our mount point */
+	dentry = d_lookup(nd->path.dentry, &nd->last);
+	if (!dentry)
+		goto out;
+
+	dput(nd->path.dentry);
+	nd->path.dentry = dentry;
+
+	/* And follow the mount stack looking for our autofs mount */
+	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
+		ino = autofs4_dentry_ino(nd->path.dentry);
+		if (ino && ino->sbi->type & type) {
+			err = 0;
+			break;
+		}
+	}
+out:
+	return err;
+}
+
+static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	BUG_ON(fdt->fd[fd] != NULL);
+	rcu_assign_pointer(fdt->fd[fd], file);
+	FD_SET(fd, fdt->close_on_exec);
+	spin_unlock(&files->file_lock);
+}
+
+
+/*
+ * Open a file descriptor on the autofs mount point corresponding
+ * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
+ */
+static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
+{
+	struct file *filp;
+	struct nameidata nd;
+	int err, fd;
+
+	fd = get_unused_fd();
+	if (likely(fd >= 0)) {
+		/* Get nameidata of the parent directory */
+		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		if (err)
+			goto out;
+
+		/*
+		 * Search down, within the parent, looking for an
+		 * autofs super block that has the device number
+		 * corresponding to the autofs fs we want to open.
+		 */
+		err = autofs_dev_ioctl_find_super(&nd, devid);
+		if (err) {
+			path_put(&nd.path);
+			goto out;
+		}
+
+		filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY);
+		if (IS_ERR(filp)) {
+			err = PTR_ERR(filp);
+			goto out;
+		}
+
+		autofs_dev_ioctl_fd_install(fd, filp);
+	}
+
+	return fd;
+
+out:
+	put_unused_fd(fd);
+	return err;
+}
+
+/* Open a file descriptor on an autofs mount point */
+static int autofs_dev_ioctl_openmount(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	const char *path;
+	dev_t devid;
+	int err, fd;
+
+	/* param->path has already been checked */
+	if (!param->arg1)
+		return -EINVAL;
+
+	param->ioctlfd = -1;
+
+	path = param->path;
+	devid = param->arg1;
+
+	err = 0;
+	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
+	if (unlikely(fd < 0)) {
+		err = fd;
+		goto out;
+	}
+
+	param->ioctlfd = fd;
+out:
+	return err;
+}
+
+/* Close file descriptor allocated above (user can also use close(2)). */
+static int autofs_dev_ioctl_closemount(struct file *fp,
+				       struct autofs_sb_info *sbi,
+				       struct autofs_dev_ioctl *param)
+{
+	return sys_close(param->ioctlfd);
+}
+
+/*
+ * Send "ready" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_ready(struct file *fp,
+				  struct autofs_sb_info *sbi,
+				  struct autofs_dev_ioctl *param)
+{
+	autofs_wqt_t token;
+
+	token = (autofs_wqt_t) param->arg1;
+	return autofs4_wait_release(sbi, token, 0);
+}
+
+/*
+ * Send "fail" status for an existing wait (either a mount or an expire
+ * request).
+ */
+static int autofs_dev_ioctl_fail(struct file *fp,
+				 struct autofs_sb_info *sbi,
+				 struct autofs_dev_ioctl *param)
+{
+	autofs_wqt_t token;
+	int status;
+
+	token = (autofs_wqt_t) param->arg1;
+	status = param->arg2 ? param->arg2 : -ENOENT;
+	return autofs4_wait_release(sbi, token, status);
+}
+
+/*
+ * Set the pipe fd for kernel communication to the daemon.
+ *
+ * Normally this is set at mount using an option but if we
+ * are reconnecting to a busy mount then we need to use this
+ * to tell the autofs mount about the new kernel pipe fd. In
+ * order to protect mounts against incorrectly setting the
+ * pipefd we also require that the autofs mount be catatonic.
+ *
+ * This also sets the process group id used to identify the
+ * controlling process (eg. the owning automount(8) daemon).
+ */
+static int autofs_dev_ioctl_setpipefd(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	int pipefd;
+	int err = 0;
+
+	if (param->arg1 == -1)
+		return -EINVAL;
+
+	pipefd = param->arg1;
+
+	mutex_lock(&sbi->wq_mutex);
+	if (!sbi->catatonic) {
+		mutex_unlock(&sbi->wq_mutex);
+		return -EBUSY;
+	} else {
+		struct file *pipe = fget(pipefd);
+		if (!pipe->f_op || !pipe->f_op->write) {
+			err = -EPIPE;
+			fput(pipe);
+			goto out;
+		}
+		sbi->oz_pgrp = task_pgrp_nr(current);
+		sbi->pipefd = pipefd;
+		sbi->pipe = pipe;
+		sbi->catatonic = 0;
+	}
+out:
+	mutex_unlock(&sbi->wq_mutex);
+	return err;
+}
+
+/*
+ * Make the autofs mount point catatonic, no longer responsive to
+ * mount requests. Also closes the kernel pipe file descriptor.
+ */
+static int autofs_dev_ioctl_catatonic(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	autofs4_catatonic_mode(sbi);
+	return 0;
+}
+
+/* Set the autofs mount timeout */
+static int autofs_dev_ioctl_timeout(struct file *fp,
+				    struct autofs_sb_info *sbi,
+				    struct autofs_dev_ioctl *param)
+{
+	unsigned long timeout;
+
+	timeout = param->arg1;
+	param->arg1 = sbi->exp_timeout / HZ;
+	sbi->exp_timeout = timeout * HZ;
+	return 0;
+}
+
+/*
+ * Return the uid and gid of the last request for the mount
+ *
+ * When reconstructing an autofs mount tree with active mounts
+ * we need to re-connect to mounts that may have used the original
+ * process uid and gid (or string variations of them) for mount
+ * lookups within the map entry.
+ */
+static int autofs_dev_ioctl_requester(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	struct autofs_info *ino;
+	struct nameidata nd;
+	const char *path;
+	dev_t devid;
+	int err = -ENOENT;
+
+	if (param->size <= sizeof(*param)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	path = param->path;
+	devid = sbi->sb->s_dev;
+
+	param->arg1 = param->arg2 = -1;
+
+	/* Get nameidata of the parent directory */
+	err = path_lookup(path, LOOKUP_PARENT, &nd);
+	if (err)
+		goto out;
+
+	err = autofs_dev_ioctl_find_super(&nd, devid);
+	if (err)
+		goto out_release;
+
+	ino = autofs4_dentry_ino(nd.path.dentry);
+	if (ino) {
+		err = 0;
+		autofs4_expire_wait(nd.path.dentry);
+		spin_lock(&sbi->fs_lock);
+		param->arg1 = ino->uid;
+		param->arg2 = ino->gid;
+		spin_unlock(&sbi->fs_lock);
+	}
+
+out_release:
+	path_put(&nd.path);
+out:
+	return err;
+}
+
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more that can be done.
+ */
+static int autofs_dev_ioctl_expire(struct file *fp,
+				   struct autofs_sb_info *sbi,
+				   struct autofs_dev_ioctl *param)
+{
+	struct dentry *dentry;
+	struct vfsmount *mnt;
+	int err = -EAGAIN;
+	int how;
+
+	how = param->arg1;
+	mnt = fp->f_path.mnt;
+
+	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
+	else
+		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
+
+	if (dentry) {
+		struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
+		/*
+		 * This is synchronous because it makes the daemon a
+		 * little easier
+		*/
+		err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+
+		spin_lock(&sbi->fs_lock);
+		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
+			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
+			sbi->sb->s_root->d_mounted++;
+		}
+		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		complete_all(&ino->expire_complete);
+		spin_unlock(&sbi->fs_lock);
+		dput(dentry);
+	}
+
+	return err;
+}
+
+/* Check if autofs mount point is in use */
+static int autofs_dev_ioctl_askumount(struct file *fp,
+				      struct autofs_sb_info *sbi,
+				      struct autofs_dev_ioctl *param)
+{
+	param->arg1 = 0;
+	if (may_umount(fp->f_path.mnt))
+		param->arg1 = 1;
+	return 0;
+}
+
+/*
+ * Check if the given path is a mountpoint.
+ *
+ * If we are supplied with the file descriptor of an autofs
+ * mount we're looking for a specific mount. In this case
+ * the path is considered a mountpoint if it is itself a
+ * mountpoint or contains a mount, such as a multi-mount
+ * without a root mount. In this case we return 1 if the
+ * path is a mount point and the super magic of the covering
+ * mount if there is one or 0 if it isn't a mountpoint.
+ *
+ * If we aren't supplied with a file descriptor then we
+ * lookup the nameidata of the path and check if it is the
+ * root of a mount. If a type is given we are looking for
+ * a particular autofs mount and if we don't find a match
+ * we return fail. If the located nameidata path is the
+ * root of a mount we return 1 along with the super magic
+ * of the mount or 0 otherwise.
+ *
+ * In both cases the the device number (as returned by
+ * new_encode_dev()) is also returned.
+ */
+static int autofs_dev_ioctl_ismountpoint(struct file *fp,
+					 struct autofs_sb_info *sbi,
+					 struct autofs_dev_ioctl *param)
+{
+	struct nameidata nd;
+	const char *path;
+	unsigned int type;
+	int err = -ENOENT;
+
+	if (param->size <= sizeof(*param)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	path = param->path;
+	type = param->arg1;
+
+	param->arg1 = 0;
+	param->arg2 = 0;
+
+	if (!fp || param->ioctlfd == -1) {
+		if (type == AUTOFS_TYPE_ANY) {
+			struct super_block *sb;
+
+			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+			if (err)
+				goto out;
+
+			sb = nd.path.dentry->d_sb;
+			param->arg1 = new_encode_dev(sb->s_dev);
+		} else {
+			struct autofs_info *ino;
+
+			err = path_lookup(path, LOOKUP_PARENT, &nd);
+			if (err)
+				goto out;
+
+			err = autofs_dev_ioctl_find_sbi_type(&nd, type);
+			if (err)
+				goto out_release;
+
+			ino = autofs4_dentry_ino(nd.path.dentry);
+			param->arg1 = autofs4_get_dev(ino->sbi);
+		}
+
+		err = 0;
+		if (nd.path.dentry->d_inode &&
+		    nd.path.mnt->mnt_root == nd.path.dentry) {
+			err = 1;
+			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+		}
+	} else {
+		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+
+		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		if (err)
+			goto out;
+
+		err = autofs_dev_ioctl_find_super(&nd, devid);
+		if (err)
+			goto out_release;
+
+		param->arg1 = autofs4_get_dev(sbi);
+
+		err = have_submounts(nd.path.dentry);
+
+		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
+			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
+				struct inode *inode = nd.path.dentry->d_inode;
+				param->arg2 = inode->i_sb->s_magic;
+			}
+		}
+	}
+
+out_release:
+	path_put(&nd.path);
+out:
+	return err;
+}
+
+/*
+ * Our range of ioctl numbers isn't 0 based so we need to shift
+ * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table
+ * lookup.
+ */
+#define cmd_idx(cmd)	(cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST))
+
+static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
+{
+	static struct {
+		int cmd;
+		ioctl_fn fn;
+	} _ioctls[] = {
+		{cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL},
+		{cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD),
+			 autofs_dev_ioctl_protover},
+		{cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD),
+			 autofs_dev_ioctl_protosubver},
+		{cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD),
+			 autofs_dev_ioctl_openmount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD),
+			 autofs_dev_ioctl_closemount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD),
+			 autofs_dev_ioctl_ready},
+		{cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD),
+			 autofs_dev_ioctl_fail},
+		{cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD),
+			 autofs_dev_ioctl_setpipefd},
+		{cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD),
+			 autofs_dev_ioctl_catatonic},
+		{cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD),
+			 autofs_dev_ioctl_timeout},
+		{cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD),
+			 autofs_dev_ioctl_requester},
+		{cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD),
+			 autofs_dev_ioctl_expire},
+		{cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD),
+			 autofs_dev_ioctl_askumount},
+		{cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD),
+			 autofs_dev_ioctl_ismountpoint}
+	};
+	unsigned int idx = cmd_idx(cmd);
+
+	return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn;
+}
+
+/* ioctl dispatcher */
+static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+{
+	struct autofs_dev_ioctl *param;
+	struct file *fp;
+	struct autofs_sb_info *sbi;
+	unsigned int cmd_first, cmd;
+	ioctl_fn fn = NULL;
+	int err = 0;
+
+	/* only root can play with this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
+	cmd = _IOC_NR(command);
+
+	if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) ||
+	    cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) {
+		return -ENOTTY;
+	}
+
+	/* Copy the parameters into kernel space. */
+	param = copy_dev_ioctl(user);
+	if (IS_ERR(param))
+		return PTR_ERR(param);
+
+	err = validate_dev_ioctl(command, param);
+	if (err)
+		goto out;
+
+	/* The validate routine above always sets the version */
+	if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD)
+		goto done;
+
+	fn = lookup_dev_ioctl(cmd);
+	if (!fn) {
+		AUTOFS_WARN("unknown command 0x%08x", command);
+		return -ENOTTY;
+	}
+
+	fp = NULL;
+	sbi = NULL;
+
+	/*
+	 * For obvious reasons the openmount can't have a file
+	 * descriptor yet. We don't take a reference to the
+	 * file during close to allow for immediate release.
+	 */
+	if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD &&
+	    cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) {
+		fp = fget(param->ioctlfd);
+		if (!fp) {
+			if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD)
+				goto cont;
+			err = -EBADF;
+			goto out;
+		}
+
+		if (!fp->f_op) {
+			err = -ENOTTY;
+			fput(fp);
+			goto out;
+		}
+
+		sbi = autofs_dev_ioctl_sbi(fp);
+		if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
+			err = -EINVAL;
+			fput(fp);
+			goto out;
+		}
+
+		/*
+		 * Admin needs to be able to set the mount catatonic in
+		 * order to be able to perform the re-open.
+		 */
+		if (!autofs4_oz_mode(sbi) &&
+		    cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) {
+			err = -EACCES;
+			fput(fp);
+			goto out;
+		}
+	}
+cont:
+	err = fn(fp, sbi, param);
+
+	if (fp)
+		fput(fp);
+done:
+	if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE))
+		err = -EFAULT;
+out:
+	free_dev_ioctl(param);
+	return err;
+}
+
+static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
+{
+	int err;
+	err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
+	return (long) err;
+}
+
+#ifdef CONFIG_COMPAT
+static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u)
+{
+	return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u));
+}
+#else
+#define autofs_dev_ioctl_compat NULL
+#endif
+
+static const struct file_operations _dev_ioctl_fops = {
+	.unlocked_ioctl	 = autofs_dev_ioctl,
+	.compat_ioctl = autofs_dev_ioctl_compat,
+	.owner	 = THIS_MODULE,
+};
+
+static struct miscdevice _autofs_dev_ioctl_misc = {
+	.minor 		= MISC_DYNAMIC_MINOR,
+	.name  		= AUTOFS_DEVICE_NAME,
+	.fops  		= &_dev_ioctl_fops
+};
+
+/* Register/deregister misc character device */
+int autofs_dev_ioctl_init(void)
+{
+	int r;
+
+	r = misc_register(&_autofs_dev_ioctl_misc);
+	if (r) {
+		AUTOFS_ERROR("misc_register failed for control device");
+		return r;
+	}
+
+	return 0;
+}
+
+void autofs_dev_ioctl_exit(void)
+{
+	misc_deregister(&_autofs_dev_ioctl_misc);
+	return;
+}
+
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 894fee54d4d8..cde2f8e8935a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -244,10 +244,10 @@ cont:
 }
 
 /* Check if we can expire a direct mount (possibly a tree) */
-static struct dentry *autofs4_expire_direct(struct super_block *sb,
-					    struct vfsmount *mnt,
-					    struct autofs_sb_info *sbi,
-					    int how)
+struct dentry *autofs4_expire_direct(struct super_block *sb,
+				     struct vfsmount *mnt,
+				     struct autofs_sb_info *sbi,
+				     int how)
 {
 	unsigned long timeout;
 	struct dentry *root = dget(sb->s_root);
@@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	/* Lock the tree as we must expire as a whole */
 	spin_lock(&sbi->fs_lock);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
-
-		/* Set this flag early to catch sys_chdir and the like */
+		if (d_mountpoint(root)) {
+			ino->flags |= AUTOFS_INF_MOUNTPOINT;
+			root->d_mounted--;
+		}
 		ino->flags |= AUTOFS_INF_EXPIRING;
+		init_completion(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
@@ -281,10 +283,10 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
  *  - it is unused by any user process
  *  - it has been unused for exp_timeout time
  */
-static struct dentry *autofs4_expire_indirect(struct super_block *sb,
-					      struct vfsmount *mnt,
-					      struct autofs_sb_info *sbi,
-					      int how)
+struct dentry *autofs4_expire_indirect(struct super_block *sb,
+				       struct vfsmount *mnt,
+				       struct autofs_sb_info *sbi,
+				       int how)
 {
 	unsigned long timeout;
 	struct dentry *root = sb->s_root;
@@ -292,6 +294,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	struct list_head *next;
 	int do_now = how & AUTOFS_EXP_IMMEDIATE;
 	int exp_leaves = how & AUTOFS_EXP_LEAVES;
+	struct autofs_info *ino;
+	unsigned int ino_count;
 
 	if (!root)
 		return NULL;
@@ -316,6 +320,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		dentry = dget(dentry);
 		spin_unlock(&dcache_lock);
 
+		spin_lock(&sbi->fs_lock);
+		ino = autofs4_dentry_ino(dentry);
+
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
 		 *	   (autofs-4.1).
@@ -326,6 +333,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			DPRINTK("checking mountpoint %p %.*s",
 				dentry, (int)dentry->d_name.len, dentry->d_name.name);
 
+			/* Path walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 2;
+			if (atomic_read(&dentry->d_count) > ino_count)
+				goto next;
+
 			/* Can we umount this guy */
 			if (autofs4_mount_busy(mnt, dentry))
 				goto next;
@@ -343,23 +355,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 
 		/* Case 2: tree mount, expire iff entire tree is not busy */
 		if (!exp_leaves) {
-			/* Lock the tree as we must expire as a whole */
-			spin_lock(&sbi->fs_lock);
-			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
-				struct autofs_info *inf = autofs4_dentry_ino(dentry);
+			/* Path walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (atomic_read(&dentry->d_count) > ino_count)
+				goto next;
 
-				/* Set this flag early to catch sys_chdir and the like */
-				inf->flags |= AUTOFS_INF_EXPIRING;
-				spin_unlock(&sbi->fs_lock);
+			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
 				expired = dentry;
 				goto found;
 			}
-			spin_unlock(&sbi->fs_lock);
 		/*
 		 * Case 3: pseudo direct mount, expire individual leaves
 		 *	   (autofs-4.1).
 		 */
 		} else {
+			/* Path walk currently on this dentry? */
+			ino_count = atomic_read(&ino->count) + 1;
+			if (atomic_read(&dentry->d_count) > ino_count)
+				goto next;
+
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
 			if (expired) {
 				dput(dentry);
@@ -367,6 +381,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
+		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
 		spin_lock(&dcache_lock);
 		next = next->next;
@@ -377,12 +392,45 @@ next:
 found:
 	DPRINTK("returning %p %.*s",
 		expired, (int)expired->d_name.len, expired->d_name.name);
+	ino = autofs4_dentry_ino(expired);
+	ino->flags |= AUTOFS_INF_EXPIRING;
+	init_completion(&ino->expire_complete);
+	spin_unlock(&sbi->fs_lock);
 	spin_lock(&dcache_lock);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&dcache_lock);
 	return expired;
 }
 
+int autofs4_expire_wait(struct dentry *dentry)
+{
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
+
+	/* Block on any pending expire */
+	spin_lock(&sbi->fs_lock);
+	if (ino->flags & AUTOFS_INF_EXPIRING) {
+		spin_unlock(&sbi->fs_lock);
+
+		DPRINTK("waiting for expire %p name=%.*s",
+			 dentry, dentry->d_name.len, dentry->d_name.name);
+
+		status = autofs4_wait(sbi, dentry, NFY_NONE);
+		wait_for_completion(&ino->expire_complete);
+
+		DPRINTK("expire done status=%d", status);
+
+		if (d_unhashed(dentry))
+			return -EAGAIN;
+
+		return status;
+	}
+	spin_unlock(&sbi->fs_lock);
+
+	return 0;
+}
+
 /* Perform an expiry operation */
 int autofs4_expire_run(struct super_block *sb,
 		      struct vfsmount *mnt,
@@ -390,7 +438,9 @@ int autofs4_expire_run(struct super_block *sb,
 		      struct autofs_packet_expire __user *pkt_p)
 {
 	struct autofs_packet_expire pkt;
+	struct autofs_info *ino;
 	struct dentry *dentry;
+	int ret = 0;
 
 	memset(&pkt,0,sizeof pkt);
 
@@ -406,9 +456,15 @@ int autofs4_expire_run(struct super_block *sb,
 	dput(dentry);
 
 	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
-		return -EFAULT;
+		ret = -EFAULT;
 
-	return 0;
+	spin_lock(&sbi->fs_lock);
+	ino = autofs4_dentry_ino(dentry);
+	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	complete_all(&ino->expire_complete);
+	spin_unlock(&sbi->fs_lock);
+
+	return ret;
 }
 
 /* Call repeatedly until it returns -EAGAIN, meaning there's nothing
@@ -423,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_DIRECT)
+	if (sbi->type & AUTOFS_TYPE_TRIGGER)
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
@@ -433,9 +489,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 		/* This is synchronous because it makes the daemon a
                    little easier */
-		ino->flags |= AUTOFS_INF_EXPIRING;
 		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+
+		spin_lock(&sbi->fs_lock);
+		if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
+			sb->s_root->d_mounted++;
+			ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
+		}
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		complete_all(&ino->expire_complete);
+		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
 	}
 
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 723a1c5e361b..9722e4bd8957 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = {
 
 static int __init init_autofs4_fs(void)
 {
-	return register_filesystem(&autofs_fs_type);
+	int err;
+
+	err = register_filesystem(&autofs_fs_type);
+	if (err)
+		return err;
+
+	autofs_dev_ioctl_init();
+
+	return err;
 }
 
 static void __exit exit_autofs4_fs(void)
 {
+	autofs_dev_ioctl_exit();
 	unregister_filesystem(&autofs_fs_type);
 }
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2fdcf5e1d236..c7e65bb30ba0 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -24,8 +24,10 @@
 
 static void ino_lnkfree(struct autofs_info *ino)
 {
-	kfree(ino->u.symlink);
-	ino->u.symlink = NULL;
+	if (ino->u.symlink) {
+		kfree(ino->u.symlink);
+		ino->u.symlink = NULL;
+	}
 }
 
 struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
@@ -41,16 +43,20 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
 	if (ino == NULL)
 		return NULL;
 
-	ino->flags = 0;
-	ino->mode = mode;
-	ino->inode = NULL;
-	ino->dentry = NULL;
-	ino->size = 0;
-
-	INIT_LIST_HEAD(&ino->rehash);
+	if (!reinit) {
+		ino->flags = 0;
+		ino->inode = NULL;
+		ino->dentry = NULL;
+		ino->size = 0;
+		INIT_LIST_HEAD(&ino->active);
+		INIT_LIST_HEAD(&ino->expiring);
+		atomic_set(&ino->count, 0);
+	}
 
+	ino->uid = 0;
+	ino->gid = 0;
+	ino->mode = mode;
 	ino->last_used = jiffies;
-	atomic_set(&ino->count, 0);
 
 	ino->sbi = sbi;
 
@@ -159,8 +165,8 @@ void autofs4_kill_sb(struct super_block *sb)
 	if (!sbi)
 		goto out_kill_sb;
 
-	if (!sbi->catatonic)
-		autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */
+	/* Free wait queues, close pipe */
+	autofs4_catatonic_mode(sbi);
 
 	/* Clean up and release dangling references */
 	autofs4_force_release(sbi);
@@ -209,7 +215,7 @@ static const struct super_operations autofs4_sops = {
 enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
 	Opt_indirect, Opt_direct, Opt_offset};
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_fd, "fd=%u"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
@@ -284,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*type = AUTOFS_TYPE_DIRECT;
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET;
+			*type = AUTOFS_TYPE_OFFSET;
 			break;
 		default:
 			return 1;
@@ -332,14 +338,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = 0;
+	sbi->type = AUTOFS_TYPE_INDIRECT;
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
 	spin_lock_init(&sbi->fs_lock);
 	sbi->queues = NULL;
-	spin_lock_init(&sbi->rehash_lock);
-	INIT_LIST_HEAD(&sbi->rehash_list);
+	spin_lock_init(&sbi->lookup_lock);
+	INIT_LIST_HEAD(&sbi->active_list);
+	INIT_LIST_HEAD(&sbi->expiring_list);
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
@@ -373,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ?
+	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index edf5b6bddb52..2a41c2a7fc52 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -25,25 +25,27 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
 static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
-static int autofs4_dir_close(struct inode *inode, struct file *file);
-static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir);
-static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
 
+#define TRIGGER_FLAGS   (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
+#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
+
 const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= autofs4_root_readdir,
+	.readdir	= dcache_readdir,
+	.llseek		= dcache_dir_lseek,
 	.ioctl		= autofs4_root_ioctl,
 };
 
 const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
-	.release	= autofs4_dir_close,
+	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= autofs4_dir_readdir,
+	.readdir	= dcache_readdir,
+	.llseek		= dcache_dir_lseek,
 };
 
 const struct inode_operations autofs4_indirect_root_inode_operations = {
@@ -70,42 +72,10 @@ const struct inode_operations autofs4_dir_inode_operations = {
 	.rmdir		= autofs4_dir_rmdir,
 };
 
-static int autofs4_root_readdir(struct file *file, void *dirent,
-				filldir_t filldir)
-{
-	struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb);
-	int oz_mode = autofs4_oz_mode(sbi);
-
-	DPRINTK("called, filp->f_pos = %lld", file->f_pos);
-
-	/*
-	 * Don't set reghost flag if:
-	 * 1) f_pos is larger than zero -- we've already been here.
-	 * 2) we haven't even enabled reghosting in the 1st place.
-	 * 3) this is the daemon doing a readdir
-	 */
-	if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled)
-		sbi->needs_reghost = 1;
-
-	DPRINTK("needs_reghost = %d", sbi->needs_reghost);
-
-	return dcache_readdir(file, dirent, filldir);
-}
-
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
-	struct vfsmount *mnt = file->f_path.mnt;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct dentry *cursor;
-	int status;
-
-	status = dcache_dir_open(inode, file);
-	if (status)
-		goto out;
-
-	cursor = file->private_data;
-	cursor->d_fsdata = NULL;
 
 	DPRINTK("file=%p dentry=%p %.*s",
 		file, dentry, dentry->d_name.len, dentry->d_name.name);
@@ -113,159 +83,32 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	if (autofs4_oz_mode(sbi))
 		goto out;
 
-	if (autofs4_ispending(dentry)) {
-		DPRINTK("dentry busy");
-		dcache_dir_close(inode, file);
-		status = -EBUSY;
-		goto out;
-	}
-
-	status = -ENOENT;
-	if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) {
-		struct nameidata nd;
-		int empty, ret;
-
-		/* In case there are stale directory dentrys from a failed mount */
-		spin_lock(&dcache_lock);
-		empty = list_empty(&dentry->d_subdirs);
+	/*
+	 * An empty directory in an autofs file system is always a
+	 * mount point. The daemon must have failed to mount this
+	 * during lookup so it doesn't exist. This can happen, for
+	 * example, if user space returns an incorrect status for a
+	 * mount request. Otherwise we're doing a readdir on the
+	 * autofs file system so just let the libfs routines handle
+	 * it.
+	 */
+	spin_lock(&dcache_lock);
+	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
 		spin_unlock(&dcache_lock);
-
-		if (!empty)
-			d_invalidate(dentry);
-
-		nd.flags = LOOKUP_DIRECTORY;
-		ret = (dentry->d_op->d_revalidate)(dentry, &nd);
-
-		if (ret <= 0) {
-			if (ret < 0)
-				status = ret;
-			dcache_dir_close(inode, file);
-			goto out;
-		}
+		return -ENOENT;
 	}
+	spin_unlock(&dcache_lock);
 
-	if (d_mountpoint(dentry)) {
-		struct file *fp = NULL;
-		struct path fp_path = { .dentry = dentry, .mnt = mnt };
-
-		path_get(&fp_path);
-
-		if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) {
-			path_put(&fp_path);
-			dcache_dir_close(inode, file);
-			goto out;
-		}
-
-		fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags);
-		status = PTR_ERR(fp);
-		if (IS_ERR(fp)) {
-			dcache_dir_close(inode, file);
-			goto out;
-		}
-		cursor->d_fsdata = fp;
-	}
-	return 0;
-out:
-	return status;
-}
-
-static int autofs4_dir_close(struct inode *inode, struct file *file)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct dentry *cursor = file->private_data;
-	int status = 0;
-
-	DPRINTK("file=%p dentry=%p %.*s",
-		file, dentry, dentry->d_name.len, dentry->d_name.name);
-
-	if (autofs4_oz_mode(sbi))
-		goto out;
-
-	if (autofs4_ispending(dentry)) {
-		DPRINTK("dentry busy");
-		status = -EBUSY;
-		goto out;
-	}
-
-	if (d_mountpoint(dentry)) {
-		struct file *fp = cursor->d_fsdata;
-		if (!fp) {
-			status = -ENOENT;
-			goto out;
-		}
-		filp_close(fp, current->files);
-	}
-out:
-	dcache_dir_close(inode, file);
-	return status;
-}
-
-static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct dentry *cursor = file->private_data;
-	int status;
-
-	DPRINTK("file=%p dentry=%p %.*s",
-		file, dentry, dentry->d_name.len, dentry->d_name.name);
-
-	if (autofs4_oz_mode(sbi))
-		goto out;
-
-	if (autofs4_ispending(dentry)) {
-		DPRINTK("dentry busy");
-		return -EBUSY;
-	}
-
-	if (d_mountpoint(dentry)) {
-		struct file *fp = cursor->d_fsdata;
-
-		if (!fp)
-			return -ENOENT;
-
-		if (!fp->f_op || !fp->f_op->readdir)
-			goto out;
-
-		status = vfs_readdir(fp, filldir, dirent);
-		file->f_pos = fp->f_pos;
-		if (status)
-			autofs4_copy_atime(file, fp);
-		return status;
-	}
 out:
-	return dcache_readdir(file, dirent, filldir);
+	return dcache_dir_open(inode, file);
 }
 
 static int try_to_fill_dentry(struct dentry *dentry, int flags)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	struct dentry *new;
 	int status;
 
-	/* Block on any pending expiry here; invalidate the dentry
-           when expiration is done to trigger mount request with a new
-           dentry */
-	if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
-		DPRINTK("waiting for expire %p name=%.*s",
-			 dentry, dentry->d_name.len, dentry->d_name.name);
-
-		status = autofs4_wait(sbi, dentry, NFY_NONE);
-
-		DPRINTK("expire done status=%d", status);
-
-		/*
-		 * If the directory still exists the mount request must
-		 * continue otherwise it can't be followed at the right
-		 * time during the walk.
-		 */
-		status = d_invalidate(dentry);
-		if (status != -EBUSY)
-			return -EAGAIN;
-	}
-
 	DPRINTK("dentry=%p %.*s ino=%p",
 		 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
 
@@ -292,7 +135,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 			return status;
 		}
 	/* Trigger mount for path component or follow link */
-	} else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) ||
+	} else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
+			flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
 			current->link_count) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
@@ -320,26 +164,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 	dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
 	spin_unlock(&dentry->d_lock);
 
-	/*
-	 * The dentry that is passed in from lookup may not be the one
-	 * we end up using, as mkdir can create a new one.  If this
-	 * happens, and another process tries the lookup at the same time,
-	 * it will set the PENDING flag on this new dentry, but add itself
-	 * to our waitq.  Then, if after the lookup succeeds, the first
-	 * process that requested the mount performs another lookup of the
-	 * same directory, it will show up as still pending!  So, we need
-	 * to redo the lookup here and clear pending on that dentry.
-	 */
-	if (d_unhashed(dentry)) {
-		new = d_lookup(dentry->d_parent, &dentry->d_name);
-		if (new) {
-			spin_lock(&new->d_lock);
-			new->d_flags &= ~DCACHE_AUTOFS_PENDING;
-			spin_unlock(&new->d_lock);
-			dput(new);
-		}
-	}
-
 	return 0;
 }
 
@@ -355,51 +179,63 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
 		dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
 		nd->flags);
-
-	/* If it's our master or we shouldn't trigger a mount we're done */
-	lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY);
-	if (oz_mode || !lookup_type)
+	/*
+	 * For an expire of a covered direct or offset mount we need
+	 * to beeak out of follow_down() at the autofs mount trigger
+	 * (d_mounted--), so we can see the expiring flag, and manage
+	 * the blocking and following here until the expire is completed.
+	 */
+	if (oz_mode) {
+		spin_lock(&sbi->fs_lock);
+		if (ino->flags & AUTOFS_INF_EXPIRING) {
+			spin_unlock(&sbi->fs_lock);
+			/* Follow down to our covering mount. */
+			if (!follow_down(&nd->path.mnt, &nd->path.dentry))
+				goto done;
+			goto follow;
+		}
+		spin_unlock(&sbi->fs_lock);
 		goto done;
+	}
 
-	/* If an expire request is pending wait for it. */
-	if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
-		DPRINTK("waiting for active request %p name=%.*s",
-			dentry, dentry->d_name.len, dentry->d_name.name);
-
-		status = autofs4_wait(sbi, dentry, NFY_NONE);
+	/* If an expire request is pending everyone must wait. */
+	autofs4_expire_wait(dentry);
 
-		DPRINTK("request done status=%d", status);
-	}
+	/* We trigger a mount for almost all flags */
+	lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
+	if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+		goto follow;
 
 	/*
-	 * If the dentry contains directories then it is an
-	 * autofs multi-mount with no root mount offset. So
-	 * don't try to mount it again.
+	 * If the dentry contains directories then it is an autofs
+	 * multi-mount with no root mount offset. So don't try to
+	 * mount it again.
 	 */
 	spin_lock(&dcache_lock);
-	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+	if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
+	    (!d_mountpoint(dentry) && __simple_empty(dentry))) {
 		spin_unlock(&dcache_lock);
 
 		status = try_to_fill_dentry(dentry, 0);
 		if (status)
 			goto out_error;
 
-		/*
-		 * The mount succeeded but if there is no root mount
-		 * it must be an autofs multi-mount with no root offset
-		 * so we don't need to follow the mount.
-		 */
-		if (d_mountpoint(dentry)) {
-			if (!autofs4_follow_mount(&nd->path.mnt,
-						  &nd->path.dentry)) {
-				status = -ENOENT;
-				goto out_error;
-			}
-		}
-
-		goto done;
+		goto follow;
 	}
 	spin_unlock(&dcache_lock);
+follow:
+	/*
+	 * If there is no root mount it must be an autofs
+	 * multi-mount with no root offset so we don't need
+	 * to follow it.
+	 */
+	if (d_mountpoint(dentry)) {
+		if (!autofs4_follow_mount(&nd->path.mnt,
+					  &nd->path.dentry)) {
+			status = -ENOENT;
+			goto out_error;
+		}
+	}
 
 done:
 	return NULL;
@@ -424,12 +260,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 	int status = 1;
 
 	/* Pending dentry */
+	spin_lock(&sbi->fs_lock);
 	if (autofs4_ispending(dentry)) {
 		/* The daemon never causes a mount to trigger */
+		spin_unlock(&sbi->fs_lock);
+
 		if (oz_mode)
 			return 1;
 
 		/*
+		 * If the directory has gone away due to an expire
+		 * we have been called as ->d_revalidate() and so
+		 * we need to return false and proceed to ->lookup().
+		 */
+		if (autofs4_expire_wait(dentry) == -EAGAIN)
+			return 0;
+
+		/*
 		 * A zero status is success otherwise we have a
 		 * negative error code.
 		 */
@@ -437,17 +284,9 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		if (status == 0)
 			return 1;
 
-		/*
-		 * A status of EAGAIN here means that the dentry has gone
-		 * away while waiting for an expire to complete. If we are
-		 * racing with expire lookup will wait for it so this must
-		 * be a revalidate and we need to send it to lookup.
-		 */
-		if (status == -EAGAIN)
-			return 0;
-
 		return status;
 	}
+	spin_unlock(&sbi->fs_lock);
 
 	/* Negative dentry.. invalidate if "old" */
 	if (dentry->d_inode == NULL)
@@ -461,6 +300,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
 		spin_unlock(&dcache_lock);
+
 		/* The daemon never causes a mount to trigger */
 		if (oz_mode)
 			return 1;
@@ -493,10 +333,12 @@ void autofs4_dentry_release(struct dentry *de)
 		struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
 
 		if (sbi) {
-			spin_lock(&sbi->rehash_lock);
-			if (!list_empty(&inf->rehash))
-				list_del(&inf->rehash);
-			spin_unlock(&sbi->rehash_lock);
+			spin_lock(&sbi->lookup_lock);
+			if (!list_empty(&inf->active))
+				list_del(&inf->active);
+			if (!list_empty(&inf->expiring))
+				list_del(&inf->expiring);
+			spin_unlock(&sbi->lookup_lock);
 		}
 
 		inf->dentry = NULL;
@@ -518,7 +360,7 @@ static struct dentry_operations autofs4_dentry_operations = {
 	.d_release	= autofs4_dentry_release,
 };
 
-static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
 {
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
@@ -526,14 +368,66 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
 	struct list_head *p, *head;
 
 	spin_lock(&dcache_lock);
-	spin_lock(&sbi->rehash_lock);
-	head = &sbi->rehash_list;
+	spin_lock(&sbi->lookup_lock);
+	head = &sbi->active_list;
 	list_for_each(p, head) {
 		struct autofs_info *ino;
 		struct dentry *dentry;
 		struct qstr *qstr;
 
-		ino = list_entry(p, struct autofs_info, rehash);
+		ino = list_entry(p, struct autofs_info, active);
+		dentry = ino->dentry;
+
+		spin_lock(&dentry->d_lock);
+
+		/* Already gone? */
+		if (atomic_read(&dentry->d_count) == 0)
+			goto next;
+
+		qstr = &dentry->d_name;
+
+		if (dentry->d_name.hash != hash)
+			goto next;
+		if (dentry->d_parent != parent)
+			goto next;
+
+		if (qstr->len != len)
+			goto next;
+		if (memcmp(qstr->name, str, len))
+			goto next;
+
+		if (d_unhashed(dentry)) {
+			dget(dentry);
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&sbi->lookup_lock);
+			spin_unlock(&dcache_lock);
+			return dentry;
+		}
+next:
+		spin_unlock(&dentry->d_lock);
+	}
+	spin_unlock(&sbi->lookup_lock);
+	spin_unlock(&dcache_lock);
+
+	return NULL;
+}
+
+static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+{
+	unsigned int len = name->len;
+	unsigned int hash = name->hash;
+	const unsigned char *str = name->name;
+	struct list_head *p, *head;
+
+	spin_lock(&dcache_lock);
+	spin_lock(&sbi->lookup_lock);
+	head = &sbi->expiring_list;
+	list_for_each(p, head) {
+		struct autofs_info *ino;
+		struct dentry *dentry;
+		struct qstr *qstr;
+
+		ino = list_entry(p, struct autofs_info, expiring);
 		dentry = ino->dentry;
 
 		spin_lock(&dentry->d_lock);
@@ -555,33 +449,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
 			goto next;
 
 		if (d_unhashed(dentry)) {
-			struct inode *inode = dentry->d_inode;
-
-			ino = autofs4_dentry_ino(dentry);
-			list_del_init(&ino->rehash);
 			dget(dentry);
-			/*
-			 * Make the rehashed dentry negative so the VFS
-			 * behaves as it should.
-			 */
-			if (inode) {
-				dentry->d_inode = NULL;
-				list_del_init(&dentry->d_alias);
-				spin_unlock(&dentry->d_lock);
-				spin_unlock(&sbi->rehash_lock);
-				spin_unlock(&dcache_lock);
-				iput(inode);
-				return dentry;
-			}
 			spin_unlock(&dentry->d_lock);
-			spin_unlock(&sbi->rehash_lock);
+			spin_unlock(&sbi->lookup_lock);
 			spin_unlock(&dcache_lock);
 			return dentry;
 		}
 next:
 		spin_unlock(&dentry->d_lock);
 	}
-	spin_unlock(&sbi->rehash_lock);
+	spin_unlock(&sbi->lookup_lock);
 	spin_unlock(&dcache_lock);
 
 	return NULL;
@@ -591,7 +468,8 @@ next:
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
 	struct autofs_sb_info *sbi;
-	struct dentry *unhashed;
+	struct autofs_info *ino;
+	struct dentry *expiring, *unhashed;
 	int oz_mode;
 
 	DPRINTK("name = %.*s",
@@ -607,8 +485,26 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
 		 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
 
-	unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name);
-	if (!unhashed) {
+	expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
+	if (expiring) {
+		/*
+		 * If we are racing with expire the request might not
+		 * be quite complete but the directory has been removed
+		 * so it must have been successful, so just wait for it.
+		 */
+		ino = autofs4_dentry_ino(expiring);
+		autofs4_expire_wait(expiring);
+		spin_lock(&sbi->lookup_lock);
+		if (!list_empty(&ino->expiring))
+			list_del_init(&ino->expiring);
+		spin_unlock(&sbi->lookup_lock);
+		dput(expiring);
+	}
+
+	unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
+	if (unhashed)
+		dentry = unhashed;
+	else {
 		/*
 		 * Mark the dentry incomplete but don't hash it. We do this
 		 * to serialize our inode creation operations (symlink and
@@ -622,39 +518,34 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 		 */
 		dentry->d_op = &autofs4_root_dentry_operations;
 
-		dentry->d_fsdata = NULL;
-		d_instantiate(dentry, NULL);
-	} else {
-		struct autofs_info *ino = autofs4_dentry_ino(unhashed);
-		DPRINTK("rehash %p with %p", dentry, unhashed);
 		/*
-		 * If we are racing with expire the request might not
-		 * be quite complete but the directory has been removed
-		 * so it must have been successful, so just wait for it.
-		 * We need to ensure the AUTOFS_INF_EXPIRING flag is clear
-		 * before continuing as revalidate may fail when calling
-		 * try_to_fill_dentry (returning EAGAIN) if we don't.
+		 * And we need to ensure that the same dentry is used for
+		 * all following lookup calls until it is hashed so that
+		 * the dentry flags are persistent throughout the request.
 		 */
-		while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
-			DPRINTK("wait for incomplete expire %p name=%.*s",
-				unhashed, unhashed->d_name.len,
-				unhashed->d_name.name);
-			autofs4_wait(sbi, unhashed, NFY_NONE);
-			DPRINTK("request completed");
-		}
-		dentry = unhashed;
+		ino = autofs4_init_ino(NULL, sbi, 0555);
+		if (!ino)
+			return ERR_PTR(-ENOMEM);
+
+		dentry->d_fsdata = ino;
+		ino->dentry = dentry;
+
+		spin_lock(&sbi->lookup_lock);
+		list_add(&ino->active, &sbi->active_list);
+		spin_unlock(&sbi->lookup_lock);
+
+		d_instantiate(dentry, NULL);
 	}
 
 	if (!oz_mode) {
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags |= DCACHE_AUTOFS_PENDING;
 		spin_unlock(&dentry->d_lock);
-	}
-
-	if (dentry->d_op && dentry->d_op->d_revalidate) {
-		mutex_unlock(&dir->i_mutex);
-		(dentry->d_op->d_revalidate)(dentry, nd);
-		mutex_lock(&dir->i_mutex);
+		if (dentry->d_op && dentry->d_op->d_revalidate) {
+			mutex_unlock(&dir->i_mutex);
+			(dentry->d_op->d_revalidate)(dentry, nd);
+			mutex_lock(&dir->i_mutex);
+		}
 	}
 
 	/*
@@ -673,9 +564,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			    return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-		spin_unlock(&dentry->d_lock);
+		if (!oz_mode) {
+			spin_lock(&dentry->d_lock);
+			dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+			spin_unlock(&dentry->d_lock);
+		}
 	}
 
 	/*
@@ -706,7 +599,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 	}
 
 	if (unhashed)
-		return dentry;
+		return unhashed;
 
 	return NULL;
 }
@@ -728,20 +621,31 @@ static int autofs4_dir_symlink(struct inode *dir,
 		return -EACCES;
 
 	ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
-	if (ino == NULL)
-		return -ENOSPC;
+	if (!ino)
+		return -ENOMEM;
 
-	ino->size = strlen(symname);
-	ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL);
+	spin_lock(&sbi->lookup_lock);
+	if (!list_empty(&ino->active))
+		list_del_init(&ino->active);
+	spin_unlock(&sbi->lookup_lock);
 
-	if (cp == NULL) {
-		kfree(ino);
-		return -ENOSPC;
+	ino->size = strlen(symname);
+	cp = kmalloc(ino->size + 1, GFP_KERNEL);
+	if (!cp) {
+		if (!dentry->d_fsdata)
+			kfree(ino);
+		return -ENOMEM;
 	}
 
 	strcpy(cp, symname);
 
 	inode = autofs4_get_inode(dir->i_sb, ino);
+	if (!inode) {
+		kfree(cp);
+		if (!dentry->d_fsdata)
+			kfree(ino);
+		return -ENOMEM;
+	}
 	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
@@ -757,6 +661,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 		atomic_inc(&p_ino->count);
 	ino->inode = inode;
 
+	ino->u.symlink = cp;
 	dir->i_mtime = CURRENT_TIME;
 
 	return 0;
@@ -769,9 +674,8 @@ static int autofs4_dir_symlink(struct inode *dir,
  * that the file no longer exists. However, doing that means that the
  * VFS layer can turn the dentry into a negative dentry.  We don't want
  * this, because the unlink is probably the result of an expire.
- * We simply d_drop it and add it to a rehash candidates list in the
- * super block, which allows the dentry lookup to reuse it retaining
- * the flags, such as expire in progress, in case we're racing with expire.
+ * We simply d_drop it and add it to a expiring list in the super block,
+ * which allows the dentry lookup to check for an incomplete expire.
  *
  * If a process is blocked on the dentry waiting for the expire to finish,
  * it will invalidate the dentry and try to mount with a new one.
@@ -801,9 +705,10 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dir->i_mtime = CURRENT_TIME;
 
 	spin_lock(&dcache_lock);
-	spin_lock(&sbi->rehash_lock);
-	list_add(&ino->rehash, &sbi->rehash_list);
-	spin_unlock(&sbi->rehash_lock);
+	spin_lock(&sbi->lookup_lock);
+	if (list_empty(&ino->expiring))
+		list_add(&ino->expiring, &sbi->expiring_list);
+	spin_unlock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -829,9 +734,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_unlock(&dcache_lock);
 		return -ENOTEMPTY;
 	}
-	spin_lock(&sbi->rehash_lock);
-	list_add(&ino->rehash, &sbi->rehash_list);
-	spin_unlock(&sbi->rehash_lock);
+	spin_lock(&sbi->lookup_lock);
+	if (list_empty(&ino->expiring))
+		list_add(&ino->expiring, &sbi->expiring_list);
+	spin_unlock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
@@ -866,10 +772,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
 	ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
-	if (ino == NULL)
-		return -ENOSPC;
+	if (!ino)
+		return -ENOMEM;
+
+	spin_lock(&sbi->lookup_lock);
+	if (!list_empty(&ino->active))
+		list_del_init(&ino->active);
+	spin_unlock(&sbi->lookup_lock);
 
 	inode = autofs4_get_inode(dir->i_sb, ino);
+	if (!inode) {
+		if (!dentry->d_fsdata)
+			kfree(ino);
+		return -ENOMEM;
+	}
 	d_add(dentry, inode);
 
 	if (dir == dir->i_sb->s_root->d_inode)
@@ -922,44 +838,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user
 }
 
 /*
- * Tells the daemon whether we need to reghost or not. Also, clears
- * the reghost_needed flag.
- */
-static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p)
-{
-	int status;
-
-	DPRINTK("returning %d", sbi->needs_reghost);
-
-	status = put_user(sbi->needs_reghost, p);
-	if (status)
-		return status;
-
-	sbi->needs_reghost = 0;
-	return 0;
-}
-
-/*
- * Enable / Disable reghosting ioctl() operation
- */
-static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p)
-{
-	int status;
-	int val;
-
-	status = get_user(val, p);
-
-	DPRINTK("reghost = %d", val);
-
-	if (status)
-		return status;
-
-	/* turn on/off reghosting, with the val */
-	sbi->reghost_enabled = val;
-	return 0;
-}
-
-/*
 * Tells the daemon whether it can umount the autofs mount.
 */
 static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
@@ -1023,11 +901,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
 	case AUTOFS_IOC_SETTIMEOUT:
 		return autofs4_get_set_timeout(sbi, p);
 
-	case AUTOFS_IOC_TOGGLEREGHOST:
-		return autofs4_toggle_reghost(sbi, p);
-	case AUTOFS_IOC_ASKREGHOST:
-		return autofs4_ask_reghost(sbi, p);
-
 	case AUTOFS_IOC_ASKUMOUNT:
 		return autofs4_ask_umount(filp->f_path.mnt, p);
 
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 75e5955c3f6d..4b67c2a2d77c 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 {
 	struct autofs_wait_queue *wq, *nwq;
 
+	mutex_lock(&sbi->wq_mutex);
+	if (sbi->catatonic) {
+		mutex_unlock(&sbi->wq_mutex);
+		return;
+	}
+
 	DPRINTK("entering catatonic mode");
 
 	sbi->catatonic = 1;
@@ -36,13 +42,18 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 	while (wq) {
 		nwq = wq->next;
 		wq->status = -ENOENT; /* Magic is gone - report failure */
-		kfree(wq->name);
-		wq->name = NULL;
+		if (wq->name.name) {
+			kfree(wq->name.name);
+			wq->name.name = NULL;
+		}
+		wq->wait_ctr--;
 		wake_up_interruptible(&wq->queue);
 		wq = nwq;
 	}
 	fput(sbi->pipe);	/* Close the pipe */
 	sbi->pipe = NULL;
+	sbi->pipefd = -1;
+	mutex_unlock(&sbi->wq_mutex);
 }
 
 static int autofs4_write(struct file *file, const void *addr, int bytes)
@@ -89,10 +100,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		union autofs_packet_union v4_pkt;
 		union autofs_v5_packet_union v5_pkt;
 	} pkt;
+	struct file *pipe = NULL;
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		wq->wait_queue_token, wq->len, wq->name, type);
+		wq->wait_queue_token, wq->name.len, wq->name.name, type);
 
 	memset(&pkt,0,sizeof pkt); /* For security reasons */
 
@@ -107,9 +119,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		pktsz = sizeof(*mp);
 
 		mp->wait_queue_token = wq->wait_queue_token;
-		mp->len = wq->len;
-		memcpy(mp->name, wq->name, wq->len);
-		mp->name[wq->len] = '\0';
+		mp->len = wq->name.len;
+		memcpy(mp->name, wq->name.name, wq->name.len);
+		mp->name[wq->name.len] = '\0';
 		break;
 	}
 	case autofs_ptype_expire_multi:
@@ -119,9 +131,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		pktsz = sizeof(*ep);
 
 		ep->wait_queue_token = wq->wait_queue_token;
-		ep->len = wq->len;
-		memcpy(ep->name, wq->name, wq->len);
-		ep->name[wq->len] = '\0';
+		ep->len = wq->name.len;
+		memcpy(ep->name, wq->name.name, wq->name.len);
+		ep->name[wq->name.len] = '\0';
 		break;
 	}
 	/*
@@ -138,9 +150,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		pktsz = sizeof(*packet);
 
 		packet->wait_queue_token = wq->wait_queue_token;
-		packet->len = wq->len;
-		memcpy(packet->name, wq->name, wq->len);
-		packet->name[wq->len] = '\0';
+		packet->len = wq->name.len;
+		memcpy(packet->name, wq->name.name, wq->name.len);
+		packet->name[wq->name.len] = '\0';
 		packet->dev = wq->dev;
 		packet->ino = wq->ino;
 		packet->uid = wq->uid;
@@ -154,8 +166,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		return;
 	}
 
-	if (autofs4_write(sbi->pipe, &pkt, pktsz))
-		autofs4_catatonic_mode(sbi);
+	/* Check if we have become catatonic */
+	mutex_lock(&sbi->wq_mutex);
+	if (!sbi->catatonic) {
+		pipe = sbi->pipe;
+		get_file(pipe);
+	}
+	mutex_unlock(&sbi->wq_mutex);
+
+	if (pipe) {
+		if (autofs4_write(pipe, &pkt, pktsz))
+			autofs4_catatonic_mode(sbi);
+		fput(pipe);
+	}
 }
 
 static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -191,58 +214,55 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
 }
 
 static struct autofs_wait_queue *
-autofs4_find_wait(struct autofs_sb_info *sbi,
-		  char *name, unsigned int hash, unsigned int len)
+autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
 {
 	struct autofs_wait_queue *wq;
 
 	for (wq = sbi->queues; wq; wq = wq->next) {
-		if (wq->hash == hash &&
-		    wq->len == len &&
-		    wq->name && !memcmp(wq->name, name, len))
+		if (wq->name.hash == qstr->hash &&
+		    wq->name.len == qstr->len &&
+		    wq->name.name &&
+			 !memcmp(wq->name.name, qstr->name, qstr->len))
 			break;
 	}
 	return wq;
 }
 
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
-		enum autofs_notify notify)
+/*
+ * Check if we have a valid request.
+ * Returns
+ * 1 if the request should continue.
+ *   In this case we can return an autofs_wait_queue entry if one is
+ *   found or NULL to idicate a new wait needs to be created.
+ * 0 or a negative errno if the request shouldn't continue.
+ */
+static int validate_request(struct autofs_wait_queue **wait,
+			    struct autofs_sb_info *sbi,
+			    struct qstr *qstr,
+			    struct dentry*dentry, enum autofs_notify notify)
 {
-	struct autofs_info *ino;
 	struct autofs_wait_queue *wq;
-	char *name;
-	unsigned int len = 0;
-	unsigned int hash = 0;
-	int status, type;
-
-	/* In catatonic mode, we don't wait for nobody */
-	if (sbi->catatonic)
-		return -ENOENT;
-	
-	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name)
-		return -ENOMEM;
+	struct autofs_info *ino;
 
-	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT))
-		len = sprintf(name, "%p", dentry);
-	else {
-		len = autofs4_getpath(sbi, dentry, &name);
-		if (!len) {
-			kfree(name);
-			return -ENOENT;
-		}
+	/* Wait in progress, continue; */
+	wq = autofs4_find_wait(sbi, qstr);
+	if (wq) {
+		*wait = wq;
+		return 1;
 	}
-	hash = full_name_hash(name, len);
 
-	if (mutex_lock_interruptible(&sbi->wq_mutex)) {
-		kfree(name);
-		return -EINTR;
-	}
+	*wait = NULL;
 
-	wq = autofs4_find_wait(sbi, name, hash, len);
+	/* If we don't yet have any info this is a new request */
 	ino = autofs4_dentry_ino(dentry);
-	if (!wq && ino && notify == NFY_NONE) {
+	if (!ino)
+		return 1;
+
+	/*
+	 * If we've been asked to wait on an existing expire (NFY_NONE)
+	 * but there is no wait in the queue ...
+	 */
+	if (notify == NFY_NONE) {
 		/*
 		 * Either we've betean the pending expire to post it's
 		 * wait or it finished while we waited on the mutex.
@@ -253,13 +273,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		while (ino->flags & AUTOFS_INF_EXPIRING) {
 			mutex_unlock(&sbi->wq_mutex);
 			schedule_timeout_interruptible(HZ/10);
-			if (mutex_lock_interruptible(&sbi->wq_mutex)) {
-				kfree(name);
+			if (mutex_lock_interruptible(&sbi->wq_mutex))
 				return -EINTR;
+
+			wq = autofs4_find_wait(sbi, qstr);
+			if (wq) {
+				*wait = wq;
+				return 1;
 			}
-			wq = autofs4_find_wait(sbi, name, hash, len);
-			if (wq)
-				break;
 		}
 
 		/*
@@ -267,18 +288,96 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * cases where we wait on NFY_NONE neither depend on the
 		 * return status of the wait.
 		 */
-		if (!wq) {
+		return 0;
+	}
+
+	/*
+	 * If we've been asked to trigger a mount and the request
+	 * completed while we waited on the mutex ...
+	 */
+	if (notify == NFY_MOUNT) {
+		/*
+		 * If the dentry isn't hashed just go ahead and try the
+		 * mount again with a new wait (not much else we can do).
+		*/
+		if (!d_unhashed(dentry)) {
+			/*
+			 * But if the dentry is hashed, that means that we
+			 * got here through the revalidate path.  Thus, we
+			 * need to check if the dentry has been mounted
+			 * while we waited on the wq_mutex. If it has,
+			 * simply return success.
+			 */
+			if (d_mountpoint(dentry))
+				return 0;
+		}
+	}
+
+	return 1;
+}
+
+int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
+		enum autofs_notify notify)
+{
+	struct autofs_wait_queue *wq;
+	struct qstr qstr;
+	char *name;
+	int status, ret, type;
+
+	/* In catatonic mode, we don't wait for nobody */
+	if (sbi->catatonic)
+		return -ENOENT;
+
+	if (!dentry->d_inode) {
+		/*
+		 * A wait for a negative dentry is invalid for certain
+		 * cases. A direct or offset mount "always" has its mount
+		 * point directory created and so the request dentry must
+		 * be positive or the map key doesn't exist. The situation
+		 * is very similar for indirect mounts except only dentrys
+		 * in the root of the autofs file system may be negative.
+		 */
+		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+			return -ENOENT;
+		else if (!IS_ROOT(dentry->d_parent))
+			return -ENOENT;
+	}
+
+	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	/* If this is a direct mount request create a dummy name */
+	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+		qstr.len = sprintf(name, "%p", dentry);
+	else {
+		qstr.len = autofs4_getpath(sbi, dentry, &name);
+		if (!qstr.len) {
 			kfree(name);
-			mutex_unlock(&sbi->wq_mutex);
-			return 0;
+			return -ENOENT;
 		}
 	}
+	qstr.name = name;
+	qstr.hash = full_name_hash(name, qstr.len);
+
+	if (mutex_lock_interruptible(&sbi->wq_mutex)) {
+		kfree(qstr.name);
+		return -EINTR;
+	}
+
+	ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+	if (ret <= 0) {
+		if (ret == 0)
+			mutex_unlock(&sbi->wq_mutex);
+		kfree(qstr.name);
+		return ret;
+	}
 
 	if (!wq) {
 		/* Create a new wait queue */
 		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
 		if (!wq) {
-			kfree(name);
+			kfree(qstr.name);
 			mutex_unlock(&sbi->wq_mutex);
 			return -ENOMEM;
 		}
@@ -289,9 +388,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->next = sbi->queues;
 		sbi->queues = wq;
 		init_waitqueue_head(&wq->queue);
-		wq->hash = hash;
-		wq->name = name;
-		wq->len = len;
+		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
 		wq->uid = current->uid;
@@ -299,7 +396,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->pid = current->pid;
 		wq->tgid = current->tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
-		atomic_set(&wq->wait_ctr, 2);
+		wq->wait_ctr = 2;
 		mutex_unlock(&sbi->wq_mutex);
 
 		if (sbi->version < 5) {
@@ -309,38 +406,35 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 				type = autofs_ptype_expire_multi;
 		} else {
 			if (notify == NFY_MOUNT)
-				type = (sbi->type & AUTOFS_TYPE_DIRECT) ?
+				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
 					autofs_ptype_missing_direct :
 					 autofs_ptype_missing_indirect;
 			else
-				type = (sbi->type & AUTOFS_TYPE_DIRECT) ?
+				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
 					autofs_ptype_expire_direct :
 					autofs_ptype_expire_indirect;
 		}
 
 		DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
-			(unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
+			(unsigned long) wq->wait_queue_token, wq->name.len,
+			wq->name.name, notify);
 
 		/* autofs4_notify_daemon() may block */
 		autofs4_notify_daemon(sbi, wq, type);
 	} else {
-		atomic_inc(&wq->wait_ctr);
+		wq->wait_ctr++;
 		mutex_unlock(&sbi->wq_mutex);
-		kfree(name);
+		kfree(qstr.name);
 		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
-			(unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
+			(unsigned long) wq->wait_queue_token, wq->name.len,
+			wq->name.name, notify);
 	}
 
-	/* wq->name is NULL if and only if the lock is already released */
-
-	if (sbi->catatonic) {
-		/* We might have slept, so check again for catatonic mode */
-		wq->status = -ENOENT;
-		kfree(wq->name);
-		wq->name = NULL;
-	}
-
-	if (wq->name) {
+	/*
+	 * wq->name.name is NULL iff the lock is already released
+	 * or the mount has been made catatonic.
+	 */
+	if (wq->name.name) {
 		/* Block all but "shutdown" signals while waiting */
 		sigset_t oldset;
 		unsigned long irqflags;
@@ -351,7 +445,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		recalc_sigpending();
 		spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
 
-		wait_event_interruptible(wq->queue, wq->name == NULL);
+		wait_event_interruptible(wq->queue, wq->name.name == NULL);
 
 		spin_lock_irqsave(&current->sighand->siglock, irqflags);
 		current->blocked = oldset;
@@ -363,9 +457,45 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 
 	status = wq->status;
 
+	/*
+	 * For direct and offset mounts we need to track the requester's
+	 * uid and gid in the dentry info struct. This is so it can be
+	 * supplied, on request, by the misc device ioctl interface.
+	 * This is needed during daemon resatart when reconnecting
+	 * to existing, active, autofs mounts. The uid and gid (and
+	 * related string values) may be used for macro substitution
+	 * in autofs mount maps.
+	 */
+	if (!status) {
+		struct autofs_info *ino;
+		struct dentry *de = NULL;
+
+		/* direct mount or browsable map */
+		ino = autofs4_dentry_ino(dentry);
+		if (!ino) {
+			/* If not lookup actual dentry used */
+			de = d_lookup(dentry->d_parent, &dentry->d_name);
+			if (de)
+				ino = autofs4_dentry_ino(de);
+		}
+
+		/* Set mount requester */
+		if (ino) {
+			spin_lock(&sbi->fs_lock);
+			ino->uid = wq->uid;
+			ino->gid = wq->gid;
+			spin_unlock(&sbi->fs_lock);
+		}
+
+		if (de)
+			dput(de);
+	}
+
 	/* Are we the last process to need status? */
-	if (atomic_dec_and_test(&wq->wait_ctr))
+	mutex_lock(&sbi->wq_mutex);
+	if (!--wq->wait_ctr)
 		kfree(wq);
+	mutex_unlock(&sbi->wq_mutex);
 
 	return status;
 }
@@ -387,16 +517,13 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
 	}
 
 	*wql = wq->next;	/* Unlink from chain */
-	mutex_unlock(&sbi->wq_mutex);
-	kfree(wq->name);
-	wq->name = NULL;	/* Do not wait on this queue */
-
+	kfree(wq->name.name);
+	wq->name.name = NULL;	/* Do not wait on this queue */
 	wq->status = status;
-
-	if (atomic_dec_and_test(&wq->wait_ctr))	/* Is anyone still waiting for this guy? */
+	wake_up_interruptible(&wq->queue);
+	if (!--wq->wait_ctr)
 		kfree(wq);
-	else
-		wake_up_interruptible(&wq->queue);
+	mutex_unlock(&sbi->wq_mutex);
 
 	return 0;
 }
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f1c2ea8342f5..5f1538c03b1b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -243,8 +243,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
 	return -EIO;
 }
 
-static int bad_inode_permission(struct inode *inode, int mask,
-			struct nameidata *nd)
+static int bad_inode_permission(struct inode *inode, int mask)
 {
 	return -EIO;
 }
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index e2595c2c403a..7893eaa1e58c 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -55,8 +55,12 @@ enum super_flags {
 };
 
 #define BEFS_BYTEORDER_NATIVE 0x42494745
+#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
+#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
 
 #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
+#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
+#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
 
 /*
  * Flags of inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e8717de3bab3..b6dfee37c7b7 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= befs_readdir,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct inode_operations befs_dir_inode_operations = {
@@ -289,7 +290,7 @@ befs_destroy_inode(struct inode *inode)
         kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
         struct befs_inode_info *bi = (struct befs_inode_info *) foo;
 
@@ -649,7 +650,7 @@ enum {
 	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
 };
 
-static match_table_t befs_tokens = {
+static const match_table_t befs_tokens = {
 	{Opt_uid, "uid=%d"},
 	{Opt_gid, "gid=%d"},
 	{Opt_charset, "iocharset=%s"},
@@ -808,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* account for offset of super block on x86 */
 	disk_sb = (befs_super_block *) bh->b_data;
-	if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) ||
-	    (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) {
+	if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) ||
+	    (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) {
 		befs_debug(sb, "Using PPC superblock location");
 	} else {
 		befs_debug(sb, "Using x86 superblock location");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 8c3401ff6d6a..41f2b4d0093e 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb)
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
 	/* Check the byte order of the filesystem */
-	if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE)
+	if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE)
 	    befs_sb->byte_order = BEFS_BYTESEX_LE;
-	else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE)
-	    befs_sb->byte_order = BEFS_BYTESEX_BE;	
+	else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE)
+	    befs_sb->byte_order = BEFS_BYTESEX_BE;
 
 	befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1);
 	befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2);
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 70f5d3a8eede..7109e451abf7 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -16,8 +16,9 @@ struct bfs_sb_info {
 	unsigned long si_freei;
 	unsigned long si_lf_eblk;
 	unsigned long si_lasti;
-	unsigned long * si_imap;
-	struct buffer_head * si_sbh;		/* buffer header w/superblock */
+	unsigned long *si_imap;
+	struct buffer_head *si_sbh;		/* buffer header w/superblock */
+	struct mutex bfs_lock;
 };
 
 /*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 034950cb3cbe..daae463068e4 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -32,16 +32,17 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 	struct inode *dir = f->f_path.dentry->d_inode;
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
+	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 
 	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
 					(unsigned long)f->f_pos,
 					dir->i_sb->s_id, dir->i_ino);
-		unlock_kernel();
+		mutex_unlock(&info->bfs_lock);
 		return -EBADF;
 	}
 
@@ -61,7 +62,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 						le16_to_cpu(de->ino),
 						DT_UNKNOWN) < 0) {
 					brelse(bh);
-					unlock_kernel();
+					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
@@ -71,7 +72,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 		brelse(bh);
 	}
 
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	return 0;	
 }
 
@@ -79,6 +80,7 @@ const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
 
 extern void dump_imap(const char *, struct super_block *);
@@ -95,10 +97,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	inode = new_inode(s);
 	if (!inode)
 		return -ENOSPC;
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	ino = find_first_zero_bit(info->si_imap, info->si_lasti);
 	if (ino > info->si_lasti) {
-		unlock_kernel();
+		mutex_unlock(&info->bfs_lock);
 		iput(inode);
 		return -ENOSPC;
 	}
@@ -124,11 +126,11 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 							inode->i_ino);
 	if (err) {
 		inode_dec_link_count(inode);
+		mutex_unlock(&info->bfs_lock);
 		iput(inode);
-		unlock_kernel();
 		return err;
 	}
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -139,22 +141,23 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
+	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 
 	if (dentry->d_name.len > BFS_NAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 	if (bh) {
 		unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
 		brelse(bh);
 		inode = bfs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode)) {
-			unlock_kernel();
+			mutex_unlock(&info->bfs_lock);
 			return ERR_CAST(inode);
 		}
 	}
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	d_add(dentry, inode);
 	return NULL;
 }
@@ -163,13 +166,14 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 						struct dentry *new)
 {
 	struct inode *inode = old->d_inode;
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	int err;
 
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
 							inode->i_ino);
 	if (err) {
-		unlock_kernel();
+		mutex_unlock(&info->bfs_lock);
 		return err;
 	}
 	inc_nlink(inode);
@@ -177,19 +181,19 @@ static int bfs_link(struct dentry *old, struct inode *dir,
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
 	d_instantiate(new, inode);
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	return 0;
 }
 
 static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error = -ENOENT;
-	struct inode *inode;
+	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 
-	inode = dentry->d_inode;
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
 	if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
 		goto out_brelse;
@@ -210,7 +214,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 
 out_brelse:
 	brelse(bh);
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	return error;
 }
 
@@ -220,6 +224,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode, *new_inode;
 	struct buffer_head *old_bh, *new_bh;
 	struct bfs_dirent *old_de, *new_de;
+	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
 	old_bh = new_bh = NULL;
@@ -227,7 +232,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (S_ISDIR(old_inode->i_mode))
 		return -EINVAL;
 
-	lock_kernel();
+	info = BFS_SB(old_inode->i_sb);
+
+	mutex_lock(&info->bfs_lock);
 	old_bh = bfs_find_entry(old_dir, 
 				old_dentry->d_name.name, 
 				old_dentry->d_name.len, &old_de);
@@ -264,7 +271,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	error = 0;
 
 end_rename:
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	brelse(old_bh);
 	brelse(new_bh);
 	return error;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index b11e63e8fbcd..6a021265f018 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -99,7 +99,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 		return -ENOSPC;
 
 	/* The rest has to be protected against itself. */
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 
 	/*
 	 * If the last data block for this file is the last allocated
@@ -151,7 +151,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
 	mark_buffer_dirty(sbh);
 	map_bh(bh_result, sb, phys);
 out:
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	return err;
 }
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8db623838b50..0ed57b5ee012 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -104,6 +104,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
 	struct bfs_inode *di;
 	struct buffer_head *bh;
 	int block, off;
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 
         dprintf("ino=%08x\n", ino);
 
@@ -112,13 +113,13 @@ static int bfs_write_inode(struct inode *inode, int unused)
 		return -EIO;
 	}
 
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh) {
 		printf("Unable to read inode %s:%08x\n",
 				inode->i_sb->s_id, ino);
-		unlock_kernel();
+		mutex_unlock(&info->bfs_lock);
 		return -EIO;
 	}
 
@@ -145,7 +146,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
 
 	mark_buffer_dirty(bh);
 	brelse(bh);
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	return 0;
 }
 
@@ -170,7 +171,7 @@ static void bfs_delete_inode(struct inode *inode)
 	
 	inode->i_size = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	lock_kernel();
+	mutex_lock(&info->bfs_lock);
 	mark_inode_dirty(inode);
 
 	block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
@@ -178,7 +179,7 @@ static void bfs_delete_inode(struct inode *inode)
 	if (!bh) {
 		printf("Unable to read inode %s:%08lx\n",
 					inode->i_sb->s_id, ino);
-		unlock_kernel();
+		mutex_unlock(&info->bfs_lock);
 		return;
 	}
 	off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
@@ -204,14 +205,16 @@ static void bfs_delete_inode(struct inode *inode)
 		info->si_lf_eblk = bi->i_sblock - 1;
 		mark_buffer_dirty(info->si_sbh);
 	}
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 	clear_inode(inode);
 }
 
 static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
+
 	brelse(info->si_sbh);
+	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
@@ -236,11 +239,13 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static void bfs_write_super(struct super_block *s)
 {
-	lock_kernel();
+	struct bfs_sb_info *info = BFS_SB(s);
+
+	mutex_lock(&info->bfs_lock);
 	if (!(s->s_flags & MS_RDONLY))
-		mark_buffer_dirty(BFS_SB(s)->si_sbh);
+		mark_buffer_dirty(info->si_sbh);
 	s->s_dirt = 0;
-	unlock_kernel();
+	mutex_unlock(&info->bfs_lock);
 }
 
 static struct kmem_cache *bfs_inode_cachep;
@@ -259,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bfs_inode_info *bi = foo;
 
@@ -380,7 +385,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		struct bfs_inode *di;
 		int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
 		int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
-		unsigned long sblock, eblock;
+		unsigned long eblock;
 
 		if (!off) {
 			brelse(bh);
@@ -399,7 +404,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		set_bit(i, info->si_imap);
 		info->si_freeb -= BFS_FILEBLOCKS(di);
 
-		sblock =  le32_to_cpu(di->i_sblock);
 		eblock =  le32_to_cpu(di->i_eblock);
 		if (eblock > info->si_lf_eblk)
 			info->si_lf_eblk = eblock;
@@ -410,6 +414,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 		s->s_dirt = 1;
 	} 
 	dump_imap("read_super", s);
+	mutex_init(&info->bfs_lock);
 	return 0;
 
 out:
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ba4cddb92f1d..204cfd1d7676 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -444,12 +444,6 @@ beyond_if:
 	regs->gp = ex.a_gpvalue;
 #endif
 	start_thread(regs, ex.a_entry, current->mm->start_stack);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	return 0;
 }
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d48ff5f370f4..8fcfa398d350 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
 #endif
 
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
 static int
 create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 		unsigned long load_addr, unsigned long interp_load_addr)
@@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	elf_addr_t __user *envp;
 	elf_addr_t __user *sp;
 	elf_addr_t __user *u_platform;
+	elf_addr_t __user *u_base_platform;
 	const char *k_platform = ELF_PLATFORM;
+	const char *k_base_platform = ELF_BASE_PLATFORM;
 	int items;
 	elf_addr_t *elf_info;
 	int ei_index = 0;
@@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 			return -EFAULT;
 	}
 
+	/*
+	 * If this architecture has a "base" platform capability
+	 * string, copy it to userspace.
+	 */
+	u_base_platform = NULL;
+	if (k_base_platform) {
+		size_t len = strlen(k_base_platform) + 1;
+
+		u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+		if (__copy_to_user(u_base_platform, k_base_platform, len))
+			return -EFAULT;
+	}
+
 	/* Create the ELF interpreter info */
 	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -204,10 +228,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_GID, tsk->gid);
 	NEW_AUX_ENT(AT_EGID, tsk->egid);
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
 			    (elf_addr_t)(unsigned long)u_platform);
 	}
+	if (k_base_platform) {
+		NEW_AUX_ENT(AT_BASE_PLATFORM,
+			    (elf_addr_t)(unsigned long)u_base_platform);
+	}
 	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
 		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
 	}
@@ -654,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			 * switch really is going to happen - do this in
 			 * flush_thread().	- akpm
 			 */
-			SET_PERSONALITY(loc->elf_ex, 0);
+			SET_PERSONALITY(loc->elf_ex);
 
 			interpreter = open_exec(elf_interpreter);
 			retval = PTR_ERR(interpreter);
@@ -705,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			goto out_free_dentry;
 	} else {
 		/* Executables without an interpreter also need a personality  */
-		SET_PERSONALITY(loc->elf_ex, 0);
+		SET_PERSONALITY(loc->elf_ex);
 	}
 
 	/* Flush all traces of the currently running executable */
@@ -719,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
 	   may depend on the personality.  */
-	SET_PERSONALITY(loc->elf_ex, 0);
+	SET_PERSONALITY(loc->elf_ex);
 	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
@@ -974,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 #endif
 
 	start_thread(regs, elf_entry, bprm->p);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	retval = 0;
 out:
 	kfree(loc);
@@ -1133,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off)
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
 				   unsigned long mm_flags)
 {
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
 	/* The vma can be set up to tell us the answer directly.  */
 	if (vma->vm_flags & VM_ALWAYSDUMP)
 		goto whole;
 
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
 	/* Do not dump I/O mapped devices or special mappings */
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
-
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
@@ -1310,20 +1341,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
@@ -1477,7 +1503,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	const struct user_regset_view *view = task_user_regset_view(dump_task);
 	struct elf_thread_core_info *t;
 	struct elf_prpsinfo *psinfo;
-	struct task_struct *g, *p;
+	struct core_thread *ct;
 	unsigned int i;
 
 	info->size = 0;
@@ -1516,31 +1542,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	/*
 	 * Allocate a structure for each thread.
 	 */
-	rcu_read_lock();
-	do_each_thread(g, p)
-		if (p->mm == dump_task->mm) {
-			t = kzalloc(offsetof(struct elf_thread_core_info,
-					     notes[info->thread_notes]),
-				    GFP_ATOMIC);
-			if (unlikely(!t)) {
-				rcu_read_unlock();
-				return 0;
-			}
-			t->task = p;
-			if (p == dump_task || !info->thread) {
-				t->next = info->thread;
-				info->thread = t;
-			} else {
-				/*
-				 * Make sure to keep the original task at
-				 * the head of the list.
-				 */
-				t->next = info->thread->next;
-				info->thread->next = t;
-			}
+	for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+		t = kzalloc(offsetof(struct elf_thread_core_info,
+				     notes[info->thread_notes]),
+			    GFP_KERNEL);
+		if (unlikely(!t))
+			return 0;
+
+		t->task = ct->task;
+		if (ct->task == dump_task || !info->thread) {
+			t->next = info->thread;
+			info->thread = t;
+		} else {
+			/*
+			 * Make sure to keep the original task at
+			 * the head of the list.
+			 */
+			t->next = info->thread->next;
+			info->thread->next = t;
 		}
-	while_each_thread(g, p);
-	rcu_read_unlock();
+	}
 
 	/*
 	 * Now fill in each thread's information.
@@ -1687,7 +1708,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 {
 #define	NUM_NOTES	6
 	struct list_head *t;
-	struct task_struct *g, *p;
 
 	info->notes = NULL;
 	info->prstatus = NULL;
@@ -1719,20 +1739,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 
 	info->thread_status_size = 0;
 	if (signr) {
+		struct core_thread *ct;
 		struct elf_thread_status *ets;
-		rcu_read_lock();
-		do_each_thread(g, p)
-			if (current->mm == p->mm && current != p) {
-				ets = kzalloc(sizeof(*ets), GFP_ATOMIC);
-				if (!ets) {
-					rcu_read_unlock();
-					return 0;
-				}
-				ets->thread = p;
-				list_add(&ets->list, &info->thread_list);
-			}
-		while_each_thread(g, p);
-		rcu_read_unlock();
+
+		for (ct = current->mm->core_state->dumper.next;
+						ct; ct = ct->next) {
+			ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+			if (!ets)
+				return 0;
+
+			ets->thread = ct->task;
+			list_add(&ets->list, &info->thread_list);
+		}
+
 		list_for_each(t, &info->thread_list) {
 			int sz;
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d051a32e6270..5b5424cb3391 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -25,6 +25,7 @@
 #include <linux/fcntl.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/security.h>
 #include <linux/highmem.h>
 #include <linux/highuid.h>
 #include <linux/personality.h>
@@ -433,13 +434,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
 	start_thread(regs, entryaddr, current->mm->start_stack);
 
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
-
 	retval = 0;
 
 error:
@@ -462,8 +456,19 @@ error_kill:
 }
 
 /*****************************************************************************/
+
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
 /*
- * present useful information to the program
+ * present useful information to the program by shovelling it onto the new
+ * process's stack
  */
 static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 				   struct mm_struct *mm,
@@ -473,14 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	unsigned long sp, csp, nitems;
 	elf_caddr_t __user *argv, *envp;
 	size_t platform_len = 0, len;
-	char *k_platform;
-	char __user *u_platform, *p;
+	char *k_platform, *k_base_platform;
+	char __user *u_platform, *u_base_platform, *p;
 	long hwcap;
 	int loop;
+	int nr;	/* reset for each csp adjustment */
 
-	/* we're going to shovel a whole load of stuff onto the stack */
 #ifdef CONFIG_MMU
-	sp = bprm->p;
+	/* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
+	 * by the processes running on the same package. One thing we can do is
+	 * to shuffle the initial stack for them, so we give the architecture
+	 * an opportunity to do so here.
+	 */
+	sp = arch_align_stack(bprm->p);
 #else
 	sp = mm->start_stack;
 
@@ -489,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 		return -EFAULT;
 #endif
 
-	/* get hold of platform and hardware capabilities masks for the machine
-	 * we are running on.  In some cases (Sparc), this info is impossible
-	 * to get, in others (i386) it is merely difficult.
-	 */
 	hwcap = ELF_HWCAP;
+
+	/*
+	 * If this architecture has a platform capability string, copy it
+	 * to userspace.  In some cases (Sparc), this info is impossible
+	 * for userspace to get any other way, in others (i386) it is
+	 * merely difficult.
+	 */
 	k_platform = ELF_PLATFORM;
 	u_platform = NULL;
 
@@ -505,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 			return -EFAULT;
 	}
 
-#if defined(__i386__) && defined(CONFIG_SMP)
-	/* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
-	 * by the processes running on the same package. One thing we can do is
-	 * to shuffle the initial stack for them.
-	 *
-	 * the conditionals here are unneeded, but kept in to make the code
-	 * behaviour the same as pre change unless we have hyperthreaded
-	 * processors. This keeps Mr Marcelo Person happier but should be
-	 * removed for 2.5
+	/*
+	 * If this architecture has a "base" platform capability
+	 * string, copy it to userspace.
 	 */
-	if (smp_num_siblings > 1)
-		sp = sp - ((current->pid % 64) << 7);
-#endif
+	k_base_platform = ELF_BASE_PLATFORM;
+	u_base_platform = NULL;
+
+	if (k_base_platform) {
+		platform_len = strlen(k_base_platform) + 1;
+		sp -= platform_len;
+		u_base_platform = (char __user *) sp;
+		if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0)
+			return -EFAULT;
+	}
 
 	sp &= ~7UL;
 
@@ -547,12 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	}
 
 	/* force 16 byte _final_ alignment here for generality */
-#define DLINFO_ITEMS 13
+#define DLINFO_ITEMS 15
 
-	nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0);
-#ifdef DLINFO_ARCH_ITEMS
-	nitems += DLINFO_ARCH_ITEMS;
-#endif
+	nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) +
+		(k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
+
+	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD)
+		nitems++;
 
 	csp = sp;
 	sp -= nitems * 2 * sizeof(unsigned long);
@@ -564,39 +579,61 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	sp -= sp & 15UL;
 
 	/* put the ELF interpreter info on the stack */
-#define NEW_AUX_ENT(nr, id, val)					\
+#define NEW_AUX_ENT(id, val)						\
 	do {								\
 		struct { unsigned long _id, _val; } __user *ent;	\
 									\
 		ent = (void __user *) csp;				\
 		__put_user((id), &ent[nr]._id);				\
 		__put_user((val), &ent[nr]._val);			\
+		nr++;							\
 	} while (0)
 
+	nr = 0;
 	csp -= 2 * sizeof(unsigned long);
-	NEW_AUX_ENT(0, AT_NULL, 0);
+	NEW_AUX_ENT(AT_NULL, 0);
 	if (k_platform) {
+		nr = 0;
 		csp -= 2 * sizeof(unsigned long);
-		NEW_AUX_ENT(0, AT_PLATFORM,
+		NEW_AUX_ENT(AT_PLATFORM,
 			    (elf_addr_t) (unsigned long) u_platform);
 	}
 
+	if (k_base_platform) {
+		nr = 0;
+		csp -= 2 * sizeof(unsigned long);
+		NEW_AUX_ENT(AT_BASE_PLATFORM,
+			    (elf_addr_t) (unsigned long) u_base_platform);
+	}
+
+	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
+		nr = 0;
+		csp -= 2 * sizeof(unsigned long);
+		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
+	}
+
+	nr = 0;
 	csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
-	NEW_AUX_ENT( 0, AT_HWCAP,	hwcap);
-	NEW_AUX_ENT( 1, AT_PAGESZ,	PAGE_SIZE);
-	NEW_AUX_ENT( 2, AT_CLKTCK,	CLOCKS_PER_SEC);
-	NEW_AUX_ENT( 3, AT_PHDR,	exec_params->ph_addr);
-	NEW_AUX_ENT( 4, AT_PHENT,	sizeof(struct elf_phdr));
-	NEW_AUX_ENT( 5, AT_PHNUM,	exec_params->hdr.e_phnum);
-	NEW_AUX_ENT( 6,	AT_BASE,	interp_params->elfhdr_addr);
-	NEW_AUX_ENT( 7, AT_FLAGS,	0);
-	NEW_AUX_ENT( 8, AT_ENTRY,	exec_params->entry_addr);
-	NEW_AUX_ENT( 9, AT_UID,		(elf_addr_t) current->uid);
-	NEW_AUX_ENT(10, AT_EUID,	(elf_addr_t) current->euid);
-	NEW_AUX_ENT(11, AT_GID,		(elf_addr_t) current->gid);
-	NEW_AUX_ENT(12, AT_EGID,	(elf_addr_t) current->egid);
+	NEW_AUX_ENT(AT_HWCAP,	hwcap);
+	NEW_AUX_ENT(AT_PAGESZ,	PAGE_SIZE);
+	NEW_AUX_ENT(AT_CLKTCK,	CLOCKS_PER_SEC);
+	NEW_AUX_ENT(AT_PHDR,	exec_params->ph_addr);
+	NEW_AUX_ENT(AT_PHENT,	sizeof(struct elf_phdr));
+	NEW_AUX_ENT(AT_PHNUM,	exec_params->hdr.e_phnum);
+	NEW_AUX_ENT(AT_BASE,	interp_params->elfhdr_addr);
+	NEW_AUX_ENT(AT_FLAGS,	0);
+	NEW_AUX_ENT(AT_ENTRY,	exec_params->entry_addr);
+	NEW_AUX_ENT(AT_UID,	(elf_addr_t) current->uid);
+	NEW_AUX_ENT(AT_EUID,	(elf_addr_t) current->euid);
+	NEW_AUX_ENT(AT_GID,	(elf_addr_t) current->gid);
+	NEW_AUX_ENT(AT_EGID,	(elf_addr_t) current->egid);
+	NEW_AUX_ENT(AT_SECURE,	security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_EXECFN,	bprm->exec);
 
 #ifdef ARCH_DLINFO
+	nr = 0;
+	csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long);
+
 	/* ARCH_DLINFO must come last so platform specific code can enforce
 	 * special alignment requirements on the AUXV if necessary (eg. PPC).
 	 */
@@ -1353,20 +1390,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pgrp = task_pgrp_vnr(p);
 	prstatus->pr_sid = task_session_vnr(p);
 	if (thread_group_leader(p)) {
+		struct task_cputime cputime;
+
 		/*
-		 * This is the record for the group leader.  Add in the
-		 * cumulative times of previous dead threads.  This total
-		 * won't include the time of each live thread whose state
-		 * is included in the core dump.  The final total reported
-		 * to our parent process when it calls wait4 will include
-		 * those sums as well as the little bit more time it takes
-		 * this and each other thread to finish dying after the
-		 * core dump synchronization phase.
+		 * This is the record for the group leader.  It shows the
+		 * group-wide total, not its individual thread total.
 		 */
-		cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
-				   &prstatus->pr_utime);
-		cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
-				   &prstatus->pr_stime);
+		thread_group_cputime(p, &cputime);
+		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
 		cputime_to_timeval(p->utime, &prstatus->pr_utime);
 		cputime_to_timeval(p->stime, &prstatus->pr_stime);
@@ -1573,7 +1605,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	struct memelfnote *notes = NULL;
 	struct elf_prstatus *prstatus = NULL;	/* NT_PRSTATUS */
 	struct elf_prpsinfo *psinfo = NULL;	/* NT_PRPSINFO */
- 	struct task_struct *g, *p;
  	LIST_HEAD(thread_list);
  	struct list_head *t;
 	elf_fpregset_t *fpu = NULL;
@@ -1622,20 +1653,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 #endif
 
 	if (signr) {
+		struct core_thread *ct;
 		struct elf_thread_status *tmp;
-		rcu_read_lock();
-		do_each_thread(g,p)
-			if (current->mm == p->mm && current != p) {
-				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
-				if (!tmp) {
-					rcu_read_unlock();
-					goto cleanup;
-				}
-				tmp->thread = p;
-				list_add(&tmp->list, &thread_list);
-			}
-		while_each_thread(g,p);
-		rcu_read_unlock();
+
+		for (ct = current->mm->core_state->dumper.next;
+						ct; ct = ct->next) {
+			tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+			if (!tmp)
+				goto cleanup;
+
+			tmp->thread = ct->task;
+			list_add(&tmp->list, &thread_list);
+		}
+
 		list_for_each(t, &thread_list) {
 			struct elf_thread_status *tmp;
 			int sz;
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f9c88d0c8ced..32fb00b52cd0 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
 			return -ENOEXEC;
 	}
 
-	bprm->sh_bang = 1;	/* Well, the bang-shell is implicit... */
+	bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 2cb1acda3a82..ccb781a6a804 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -229,13 +229,13 @@ static int decompress_exec(
 	ret = 10;
 	if (buf[3] & EXTRA_FIELD) {
 		ret += 2 + buf[10] + (buf[11] << 8);
-		if (unlikely(LBUFSIZE == ret)) {
+		if (unlikely(LBUFSIZE <= ret)) {
 			DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
 			goto out_free_buf;
 		}
 	}
 	if (buf[3] & ORIG_NAME) {
-		for (; ret < LBUFSIZE && (buf[ret] != 0); ret++)
+		while (ret < LBUFSIZE && buf[ret++] != 0)
 			;
 		if (unlikely(LBUFSIZE == ret)) {
 			DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
@@ -243,7 +243,7 @@ static int decompress_exec(
 		}
 	}
 	if (buf[3] & COMMENT) {
-		for (;  ret < LBUFSIZE && (buf[ret] != 0); ret++)
+		while (ret < LBUFSIZE && buf[ret++] != 0)
 			;
 		if (unlikely(LBUFSIZE == ret)) {
 			DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
@@ -914,15 +914,14 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Stash our initial stack pointer into the mm structure */
 	current->mm->start_stack = (unsigned long )sp;
 
-	
+#ifdef FLAT_PLAT_INIT
+	FLAT_PLAT_INIT(regs);
+#endif
 	DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
 		(int)regs, (int)start_addr, (int)current->mm->start_stack);
 	
 	start_thread(regs, start_addr, current->mm->start_stack);
 
-	if (current->ptrace & PT_PTRACED)
-		send_sig(SIGTRAP, current, 0);
-
 	return 0;
 }
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 7191306367c5..f2744ab4e5b3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/syscalls.h>
+#include <linux/fs.h>
 
 #include <asm/uaccess.h>
 
@@ -116,11 +117,9 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _ret;
 
 	retval = -ENOEXEC;
-	if (bprm->misc_bang)
+	if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
 		goto _ret;
 
-	bprm->misc_bang = 1;
-
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
@@ -198,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval < 0)
 		goto _error;
 
+	bprm->recursion_depth++;
+
 	retval = search_binary_handler (bprm, regs);
 	if (retval < 0)
 		goto _error;
@@ -535,31 +536,16 @@ static ssize_t
 bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
 {
 	Node *e = file->f_path.dentry->d_inode->i_private;
-	loff_t pos = *ppos;
 	ssize_t res;
 	char *page;
-	int len;
 
 	if (!(page = (char*) __get_free_page(GFP_KERNEL)))
 		return -ENOMEM;
 
 	entry_status(e, page);
-	len = strlen(page);
 
-	res = -EINVAL;
-	if (pos < 0)
-		goto out;
-	res = 0;
-	if (pos >= len)
-		goto out;
-	if (len < pos + nbytes)
-		nbytes = len - pos;
-	res = -EFAULT;
-	if (copy_to_user(buf, page + pos, nbytes))
-		goto out;
-	*ppos = pos + nbytes;
-	res = nbytes;
-out:
+	res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
+
 	free_page((unsigned long) page);
 	return res;
 }
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 9e3963f7ebf1..08343505e184 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	char interp[BINPRM_BUF_SIZE];
 	int retval;
 
-	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang)) 
+	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
+	    (bprm->recursion_depth > BINPRM_MAX_RECURSION))
 		return -ENOEXEC;
 	/*
 	 * This section does the #! interpretation.
 	 * Sorta complicated, but hopefully it will work.  -TYT
 	 */
 
-	bprm->sh_bang = 1;
+	bprm->recursion_depth++;
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index fdc36bfd6a7b..74e587a52796 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	map_hpux_gateway_page(current,current->mm);
 
 	start_thread_som(regs, som_entry, bprm->p);
-	if (current->ptrace & PT_PTRACED)
-		send_sig(SIGTRAP, current, 0);
 	return 0;
 
 	/* error cleanup */
@@ -308,3 +306,5 @@ static void __exit exit_som_binfmt(void)
 
 core_initcall(init_som_binfmt);
 module_exit(exit_som_binfmt);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 63e2ee63058d..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	BUG_ON(bip == NULL);
 
 	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
 	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
@@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+	if (bi == NULL)
+		return 0;
+
+	if (rw == READ && bi->verify_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_READ))
+		return 1;
+
+	if (rw == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_WRITE))
+		return 1;
+
+	return 0;
+}
+
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio:	bio to check
@@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio)
 	}
 }
 
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+	if (bi)
+		return bi->tuple_size;
+
+	return 0;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio:	bio to prepare
@@ -705,7 +732,6 @@ void __init bio_integrity_init_slab(void)
 	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
 					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 }
-EXPORT_SYMBOL(bio_integrity_init_slab);
 
 static int __init integrity_init(void)
 {
diff --git a/fs/bio.c b/fs/bio.c
index 88322b066acb..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 static struct kmem_cache *bio_slab __read_mostly;
 
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 
 /*
  * if you change this list, also change bvec_alloc or things will
@@ -60,28 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	struct bio_vec *bvl;
 
 	/*
-	 * see comment near bvec_array define!
+	 * If 'bs' is given, lookup the pool and do the mempool alloc.
+	 * If not, this is a bio_kmalloc() allocation and just do a
+	 * kzalloc() for the exact number of vecs right away.
 	 */
-	switch (nr) {
-		case   1        : *idx = 0; break;
-		case   2 ...   4: *idx = 1; break;
-		case   5 ...  16: *idx = 2; break;
-		case  17 ...  64: *idx = 3; break;
-		case  65 ... 128: *idx = 4; break;
-		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+	if (bs) {
+		/*
+		 * see comment near bvec_array define!
+		 */
+		switch (nr) {
+		case 1:
+			*idx = 0;
+			break;
+		case 2 ... 4:
+			*idx = 1;
+			break;
+		case 5 ... 16:
+			*idx = 2;
+			break;
+		case 17 ... 64:
+			*idx = 3;
+			break;
+		case 65 ... 128:
+			*idx = 4;
+			break;
+		case 129 ... BIO_MAX_PAGES:
+			*idx = 5;
+			break;
 		default:
 			return NULL;
-	}
-	/*
-	 * idx now points to the pool we want to allocate from
-	 */
-
-	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl) {
-		struct biovec_slab *bp = bvec_slabs + *idx;
+		}
 
-		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
-	}
+		/*
+		 * idx now points to the pool we want to allocate from
+		 */
+		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+		if (bvl)
+			memset(bvl, 0,
+				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+	} else
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	return bvl;
 }
@@ -110,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
 	bio_free(bio, fs_bio_set);
 }
 
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	kfree(bio->bi_io_vec);
+	kfree(bio);
+}
+
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 
@@ -121,19 +146,25 @@ void bio_init(struct bio *bio)
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from
+ * @bs:		the bio_set to allocate from. If %NULL, just use kmalloc
  *
  * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
  *
  *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	struct bio *bio;
+
+	if (bs)
+		bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	else
+		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
 		struct bio_vec *bvl = NULL;
@@ -144,12 +175,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 			if (unlikely(!bvl)) {
-				mempool_free(bio, bs->bio_pool);
+				if (bs)
+					mempool_free(bio, bs->bio_pool);
+				else
+					kfree(bio);
 				bio = NULL;
 				goto out;
 			}
 			bio->bi_flags |= idx << BIO_POOL_OFFSET;
-			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
+			bio->bi_max_vecs = bvec_nr_vecs(idx);
 		}
 		bio->bi_io_vec = bvl;
 	}
@@ -167,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+	if (bio)
+		bio->bi_destructor = bio_kmalloc_destructor;
+
+	return bio;
+}
+
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
@@ -211,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 	return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_hw_segments;
-}
-
 /**
  * 	__bio_clone	-	clone a bio
  * 	@bio: destination bio
@@ -353,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 */
 
 	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_hw_segments >= q->max_hw_segments
-	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+	       || bio->bi_phys_segments >= q->max_hw_segments) {
 
 		if (retried_segments)
 			return 0;
@@ -398,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
-	bio->bi_hw_segments++;
  done:
 	bio->bi_size += len;
 	return len;
@@ -452,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	int nr_sgvecs;
 	struct sg_iovec *sgvecs;
+	int nr_sgvecs;
+	int is_our_pages;
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count)
+			     struct sg_iovec *iov, int iov_count,
+			     int is_our_pages)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
 	bmd->nr_sgvecs = iov_count;
+	bmd->is_our_pages = is_our_pages;
 	bio->bi_private = bmd;
 }
 
@@ -472,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+					       gfp_t gfp_mask)
 {
-	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
+	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
 
 	if (!bmd)
 		return NULL;
 
-	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
+	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
 	if (!bmd->iovecs) {
 		kfree(bmd);
 		return NULL;
 	}
 
-	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
 	if (bmd->sgvecs)
 		return bmd;
 
@@ -494,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
 	return NULL;
 }
 
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
-			  int uncopy)
+static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
+			  struct sg_iovec *iov, int iov_count, int uncopy,
+			  int do_free_page)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
@@ -505,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		char *bv_addr = page_address(bvec->bv_page);
-		unsigned int bv_len = bvec->bv_len;
+		unsigned int bv_len = iovecs[i].bv_len;
 
 		while (bv_len && iov_idx < iov_count) {
 			unsigned int bytes;
@@ -538,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 			}
 		}
 
-		if (uncopy)
+		if (do_free_page)
 			__free_page(bvec->bv_page);
 	}
 
@@ -555,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	int ret;
-
-	ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
+	int ret = 0;
 
+	if (!bio_flagged(bio, BIO_NULL_MAPPED))
+		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+				     bmd->nr_sgvecs, 1, bmd->is_our_pages);
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
@@ -567,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
 /**
  *	bio_copy_user_iov	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+			      struct rq_map_data *map_data,
+			      struct sg_iovec *iov, int iov_count,
+			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -599,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		len += iov[i].iov_len;
 	}
 
-	bmd = bio_alloc_map_data(nr_pages, iov_count);
+	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
+	i = 0;
 	while (len) {
-		unsigned int bytes = PAGE_SIZE;
+		unsigned int bytes;
+
+		if (map_data)
+			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+		else
+			bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+		if (map_data) {
+			if (i == map_data->nr_entries) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = map_data->pages[i++];
+		} else
+			page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -636,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	 * success
 	 */
 	if (!write_to_vm) {
-		ret = __bio_copy_iov(bio, iov, iov_count, 0);
+		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count);
+	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
 	return bio;
 cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
+	if (!map_data)
+		bio_for_each_segment(bvec, bio, i)
+			__free_page(bvec->bv_page);
 
 	bio_put(bio);
 out_bmd:
@@ -656,29 +720,32 @@ out_bmd:
 /**
  *	bio_copy_user	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+			  unsigned long uaddr, unsigned int len,
+			  int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
-				      int write_to_vm)
+				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
 	int nr_pages = 0;
@@ -704,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
 	if (!pages)
 		goto out;
 
@@ -721,12 +788,8 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 		const int local_nr_pages = end - start;
 		const int page_limit = cur_page + local_nr_pages;
 		
-		down_read(&current->mm->mmap_sem);
-		ret = get_user_pages(current, current->mm, uaddr,
-				     local_nr_pages,
-				     write_to_vm, 0, &pages[cur_page], NULL);
-		up_read(&current->mm->mmap_sem);
-
+		ret = get_user_pages_fast(uaddr, local_nr_pages,
+				write_to_vm, &pages[cur_page]);
 		if (ret < local_nr_pages) {
 			ret = -EFAULT;
 			goto out_unmap;
@@ -792,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm)
+			 unsigned long uaddr, unsigned int len, int write_to_vm,
+			 gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 
 /**
@@ -814,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
 			     struct sg_iovec *iov, int iov_count,
-			     int write_to_vm)
+			     int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
-
+	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
+				 gfp_mask);
 	if (IS_ERR(bio))
 		return bio;
 
@@ -949,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
 	const int read = bio_data_dir(bio) == READ;
-	char *p = bio->bi_private;
+	struct bio_map_data *bmd = bio->bi_private;
 	int i;
+	char *p = bmd->sgvecs[0].iov_base;
 
 	__bio_for_each_segment(bvec, bio, i, 0) {
 		char *addr = page_address(bvec->bv_page);
+		int len = bmd->iovecs[i].bv_len;
 
 		if (read && !err)
-			memcpy(p, addr, bvec->bv_len);
+			memcpy(p, addr, len);
 
 		__free_page(bvec->bv_page);
-		p += bvec->bv_len;
+		p += len;
 	}
 
+	bio_free_map_data(bmd);
 	bio_put(bio);
 }
 
@@ -979,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 			  gfp_t gfp_mask, int reading)
 {
-	unsigned long kaddr = (unsigned long)data;
-	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = kaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
 	struct bio *bio;
 	struct bio_vec *bvec;
-	int i, ret;
-
-	bio = bio_alloc(gfp_mask, nr_pages);
-	if (!bio)
-		return ERR_PTR(-ENOMEM);
-
-	while (len) {
-		struct page *page;
-		unsigned int bytes = PAGE_SIZE;
-
-		if (bytes > len)
-			bytes = len;
-
-		page = alloc_page(q->bounce_gfp | gfp_mask);
-		if (!page) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
-			ret = -EINVAL;
-			goto cleanup;
-		}
+	int i;
 
-		len -= bytes;
-	}
+	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
+	if (IS_ERR(bio))
+		return bio;
 
 	if (!reading) {
 		void *p = data;
@@ -1023,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 		}
 	}
 
-	bio->bi_private = data;
 	bio->bi_end_io = bio_copy_kern_endio;
-	return bio;
-cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
 
-	bio_put(bio);
-
-	return ERR_PTR(ret);
+	return bio;
 }
 
 /*
@@ -1219,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
  * split a bio - only worry about a bio with a single page
  * in it's iovec
  */
-struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
+struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
-	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
+	struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
 
 	if (!bp)
 		return bp;
@@ -1255,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	bp->bio2.bi_end_io = bio_pair_end_2;
 
 	bp->bio1.bi_private = bi;
-	bp->bio2.bi_private = pool;
+	bp->bio2.bi_private = bio_split_pool;
 
 	if (bio_integrity(bi))
 		bio_integrity_split(bi, bp, first_sectors);
@@ -1263,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	return bp;
 }
 
+/**
+ *      bio_sector_offset - Find hardware sector offset in bio
+ *      @bio:           bio to inspect
+ *      @index:         bio_vec index
+ *      @offset:        offset in bv_page
+ *
+ *      Return the number of hardware sectors between beginning of bio
+ *      and an end point indicated by a bio_vec index and an offset
+ *      within that vector's page.
+ */
+sector_t bio_sector_offset(struct bio *bio, unsigned short index,
+			   unsigned int offset)
+{
+	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	struct bio_vec *bv;
+	sector_t sectors;
+	int i;
+
+	sectors = 0;
+
+	if (index >= bio->bi_idx)
+		index = bio->bi_vcnt - 1;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		if (i == index) {
+			if (offset > bv->bv_offset)
+				sectors += (offset - bv->bv_offset) / sector_sz;
+			break;
+		}
+
+		sectors += bv->bv_len / sector_sz;
+	}
+
+	return sectors;
+}
+EXPORT_SYMBOL(bio_sector_offset);
 
 /*
  * create memory pools for biovec's in a bio_set.
@@ -1365,6 +1438,7 @@ static int __init init_bio(void)
 subsys_initcall(init_bio);
 
 EXPORT_SYMBOL(bio_alloc);
+EXPORT_SYMBOL(bio_kmalloc);
 EXPORT_SYMBOL(bio_put);
 EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
@@ -1372,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
 EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1382,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_copy_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_split_pool);
 EXPORT_SYMBOL(bio_copy_user);
 EXPORT_SYMBOL(bio_uncopy_user);
 EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 10d8a0aa871a..88a776fa0ef6 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode)
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
  *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
  */
 
-static struct kobject *bdev_get_kobj(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(&bdev->bd_part->dev.kobj);
-	else
-		return kobject_get(&bdev->bd_disk->dev.kobj);
-}
-
-static struct kobject *bdev_get_holder(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(bdev->bd_part->holder_dir);
-	else
-		return kobject_get(bdev->bd_disk->holder_dir);
-}
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
 	if (!bo->hdev)
 		goto fail_put_sdir;
 
-	bo->sdev = bdev_get_kobj(bdev);
+	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 	if (!bo->sdev)
 		goto fail_put_hdev;
 
-	bo->hdir = bdev_get_holder(bdev);
+	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
 	if (!bo->hdir)
 		goto fail_put_sdev;
 
@@ -856,18 +840,98 @@ EXPORT_SYMBOL_GPL(bd_release_from_disk);
  * to be used for internal purposes.  If you ever need it - reconsider
  * your API.
  */
-struct block_device *open_by_devnum(dev_t dev, unsigned mode)
+struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
-	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
 	if (bdev)
-		err = blkdev_get(bdev, mode, flags);
+		err = blkdev_get(bdev, mode);
 	return err ? ERR_PTR(err) : bdev;
 }
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev:      struct block device to be flushed
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev)
+{
+	if (__invalidate_device(bdev)) {
+		char name[BDEVNAME_SIZE] = "";
+
+		if (bdev->bd_disk)
+			disk_name(bdev->bd_disk, 0, name);
+		printk(KERN_WARNING "VFS: busy inodes on changed media or "
+		       "resized disk %s\n", name);
+	}
+
+	if (!bdev->bd_disk)
+		return;
+	if (disk_partitionable(bdev->bd_disk))
+		bdev->bd_invalidated = 1;
+}
+
+/**
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+	loff_t disk_size, bdev_size;
+
+	disk_size = (loff_t)get_capacity(disk) << 9;
+	bdev_size = i_size_read(bdev->bd_inode);
+	if (disk_size != bdev_size) {
+		char name[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, name);
+		printk(KERN_INFO
+		       "%s: detected capacity change from %lld to %lld\n",
+		       name, bdev_size, disk_size);
+		i_size_write(bdev->bd_inode, disk_size);
+		flush_disk(bdev);
+	}
+}
+EXPORT_SYMBOL(check_disk_size_change);
+
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs.  It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+	struct block_device *bdev;
+	int ret = 0;
+
+	if (disk->fops->revalidate_disk)
+		ret = disk->fops->revalidate_disk(disk);
+
+	bdev = bdget_disk(disk, 0);
+	if (!bdev)
+		return ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	check_disk_size_change(disk, bdev);
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+	return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
+
 /*
  * This routine checks whether a removable media has been changed,
  * and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +951,9 @@ int check_disk_change(struct block_device *bdev)
 	if (!bdops->media_changed(bdev->bd_disk))
 		return 0;
 
-	if (__invalidate_device(bdev))
-		printk("VFS: busy inodes on changed media.\n");
-
+	flush_disk(bdev);
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (bdev->bd_disk->minors > 1)
-		bdev->bd_invalidated = 1;
 	return 1;
 }
 
@@ -914,9 +974,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
-			int for_part);
-static int __blkdev_put(struct block_device *bdev, int for_part);
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
  * bd_mutex locking:
@@ -925,46 +983,49 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
  *    mutex_lock_nested(whole->bd_mutex, 1)
  */
 
-static int do_open(struct block_device *bdev, struct file *file, int for_part)
+static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
-	struct module *owner = NULL;
 	struct gendisk *disk;
+	struct hd_struct *part = NULL;
 	int ret;
-	int part;
+	int partno;
 	int perm = 0;
 
-	if (file->f_mode & FMODE_READ)
+	if (mode & FMODE_READ)
 		perm |= MAY_READ;
-	if (file->f_mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
 	ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-	if (ret != 0)
+	if (ret != 0) {
+		bdput(bdev);
 		return ret;
+	}
 
 	ret = -ENXIO;
-	file->f_mapping = bdev->bd_inode->i_mapping;
+
 	lock_kernel();
-	disk = get_gendisk(bdev->bd_dev, &part);
-	if (!disk) {
-		unlock_kernel();
-		bdput(bdev);
-		return ret;
-	}
-	owner = disk->fops->owner;
+
+	disk = get_gendisk(bdev->bd_dev, &partno);
+	if (!disk)
+		goto out_unlock_kernel;
+	part = disk_get_part(disk, partno);
+	if (!part)
+		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
-		if (!part) {
+		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
-				ret = disk->fops->open(bdev->bd_inode, file);
+				ret = disk->fops->open(bdev, mode);
 				if (ret)
-					goto out_first;
+					goto out_clear;
 			}
 			if (!bdev->bd_openers) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -976,36 +1037,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(disk, bdev);
 		} else {
-			struct hd_struct *p;
 			struct block_device *whole;
 			whole = bdget_disk(disk, 0);
 			ret = -ENOMEM;
 			if (!whole)
-				goto out_first;
+				goto out_clear;
 			BUG_ON(for_part);
-			ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
+			ret = __blkdev_get(whole, mode, 1);
 			if (ret)
-				goto out_first;
+				goto out_clear;
 			bdev->bd_contains = whole;
-			p = disk->part[part - 1];
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
-			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+			if (!(disk->flags & GENHD_FL_UP) ||
+			    !part || !part->nr_sects) {
 				ret = -ENXIO;
-				goto out_first;
+				goto out_clear;
 			}
-			kobject_get(&p->dev.kobj);
-			bdev->bd_part = p;
-			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
 		}
 	} else {
+		disk_put_part(part);
 		put_disk(disk);
-		module_put(owner);
+		module_put(disk->fops->owner);
+		part = NULL;
+		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
-				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
+				ret = bdev->bd_disk->fops->open(bdev, mode);
 				if (ret)
-					goto out;
+					goto out_unlock_bdev;
 			}
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
@@ -1018,44 +1079,30 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	unlock_kernel();
 	return 0;
 
-out_first:
+ out_clear:
 	bdev->bd_disk = NULL;
+	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		__blkdev_put(bdev->bd_contains, 1);
+		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
-	put_disk(disk);
-	module_put(owner);
-out:
+ out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
+ out_unlock_kernel:
 	unlock_kernel();
-	if (ret)
-		bdput(bdev);
-	return ret;
-}
 
-static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,
-			int for_part)
-{
-	/*
-	 * This crockload is due to bad choice of ->open() type.
-	 * It will go away.
-	 * For now, block device ->open() routine must _not_
-	 * examine anything in 'inode' argument except ->i_rdev.
-	 */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_flags = flags;
-	fake_file.f_path.dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	return do_open(bdev, &fake_file, for_part);
+	disk_put_part(part);
+	if (disk)
+		module_put(disk->fops->owner);
+	put_disk(disk);
+	bdput(bdev);
+
+	return ret;
 }
 
-int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
+int blkdev_get(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_get(bdev, mode, flags, 0);
+	return __blkdev_get(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_get);
 
@@ -1072,28 +1119,36 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	 */
 	filp->f_flags |= O_LARGEFILE;
 
+	if (filp->f_flags & O_NDELAY)
+		filp->f_mode |= FMODE_NDELAY;
+	if (filp->f_flags & O_EXCL)
+		filp->f_mode |= FMODE_EXCL;
+	if ((filp->f_flags & O_ACCMODE) == 3)
+		filp->f_mode |= FMODE_WRITE_IOCTL;
+
 	bdev = bd_acquire(inode);
 	if (bdev == NULL)
 		return -ENOMEM;
 
-	res = do_open(bdev, filp, 0);
+	filp->f_mapping = bdev->bd_inode->i_mapping;
+
+	res = blkdev_get(bdev, filp->f_mode);
 	if (res)
 		return res;
 
-	if (!(filp->f_flags & O_EXCL) )
+	if (!(filp->f_mode & FMODE_EXCL))
 		return 0;
 
 	if (!(res = bd_claim(bdev, filp)))
 		return 0;
 
-	blkdev_put(bdev);
+	blkdev_put(bdev, filp->f_mode);
 	return res;
 }
 
-static int __blkdev_put(struct block_device *bdev, int for_part)
+static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	int ret = 0;
-	struct inode *bd_inode = bdev->bd_inode;
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
@@ -1108,18 +1163,15 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
-			ret = disk->fops->release(bd_inode, NULL);
+			ret = disk->fops->release(disk, mode);
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
 		put_disk(disk);
 		module_put(owner);
-
-		if (bdev->bd_contains != bdev) {
-			kobject_put(&bdev->bd_part->dev.kobj);
-			bdev->bd_part = NULL;
-		}
+		disk_put_part(bdev->bd_part);
+		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 		if (bdev != bdev->bd_contains)
@@ -1130,13 +1182,13 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	if (victim)
-		__blkdev_put(victim, 1);
+		__blkdev_put(victim, mode, 1);
 	return ret;
 }
 
-int blkdev_put(struct block_device *bdev)
+int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_put(bdev, 0);
+	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
 
@@ -1145,12 +1197,16 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	if (bdev->bd_holder == filp)
 		bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, filp->f_mode);
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	fmode_t mode = file->f_mode;
+	if (file->f_flags & O_NDELAY)
+		mode |= FMODE_NDELAY_NOW;
+	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
 static const struct address_space_operations def_blk_aops = {
@@ -1186,7 +1242,7 @@ int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
 	int res;
 	mm_segment_t old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
+	res = blkdev_ioctl(bdev, 0, cmd, arg);
 	set_fs(old_fs);
 	return res;
 }
@@ -1195,73 +1251,70 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
- *
  * @path:	special file representing the block device
  *
- * Get a reference to the blockdevice at @path in the current
+ * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
  * otherwise.
  */
-struct block_device *lookup_bdev(const char *path)
+struct block_device *lookup_bdev(const char *pathname)
 {
 	struct block_device *bdev;
 	struct inode *inode;
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	if (!path || !*path)
+	if (!pathname || !*pathname)
 		return ERR_PTR(-EINVAL);
 
-	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	error = kern_path(pathname, LOOKUP_FOLLOW, &path);
 	if (error)
 		return ERR_PTR(error);
 
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 	error = -ENOTBLK;
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
 	error = -EACCES;
-	if (nd.path.mnt->mnt_flags & MNT_NODEV)
+	if (path.mnt->mnt_flags & MNT_NODEV)
 		goto fail;
 	error = -ENOMEM;
 	bdev = bd_acquire(inode);
 	if (!bdev)
 		goto fail;
 out:
-	path_put(&nd.path);
+	path_put(&path);
 	return bdev;
 fail:
 	bdev = ERR_PTR(error);
 	goto out;
 }
+EXPORT_SYMBOL(lookup_bdev);
 
 /**
- * open_bdev_excl  -  open a block device by name and set it up for use
+ * open_bdev_exclusive  -  open a block device by name and set it up for use
  *
  * @path:	special file representing the block device
- * @flags:	%MS_RDONLY for opening read-only
+ * @mode:	FMODE_... combination to pass be used
  * @holder:	owner for exclusion
  *
  * Open the blockdevice described by the special file at @path, claim it
  * for the @holder.
  */
-struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
+struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
 	struct block_device *bdev;
-	mode_t mode = FMODE_READ;
 	int error = 0;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
-	if (!(flags & MS_RDONLY))
-		mode |= FMODE_WRITE;
-	error = blkdev_get(bdev, mode, 0);
+	error = blkdev_get(bdev, mode);
 	if (error)
 		return ERR_PTR(error);
 	error = -EACCES;
-	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto blkdev_put;
 	error = bd_claim(bdev, holder);
 	if (error)
@@ -1270,26 +1323,27 @@ struct block_device *open_bdev_excl(const char *path, int flags, void *holder)
 	return bdev;
 	
 blkdev_put:
-	blkdev_put(bdev);
+	blkdev_put(bdev, mode);
 	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL(open_bdev_excl);
+EXPORT_SYMBOL(open_bdev_exclusive);
 
 /**
- * close_bdev_excl  -  release a blockdevice openen by open_bdev_excl()
+ * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
  *
  * @bdev:	blockdevice to close
+ * @mode:	mode, must match that used to open.
  *
- * This is the counterpart to open_bdev_excl().
+ * This is the counterpart to open_bdev_exclusive().
  */
-void close_bdev_excl(struct block_device *bdev)
+void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
 {
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, mode);
 }
 
-EXPORT_SYMBOL(close_bdev_excl);
+EXPORT_SYMBOL(close_bdev_exclusive);
 
 int __invalidate_device(struct block_device *bdev)
 {
diff --git a/fs/buffer.c b/fs/buffer.c
index d48caee12e2a..6569fda5cfed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer);
 
 void unlock_buffer(struct buffer_head *bh)
 {
-	smp_mb__before_clear_bit();
-	clear_buffer_locked(bh);
+	clear_bit_unlock(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
@@ -580,7 +579,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
 /*
  * The buffer's backing address_space's private_lock must be held
  */
-static inline void __remove_assoc_queue(struct buffer_head *bh)
+static void __remove_assoc_queue(struct buffer_head *bh)
 {
 	list_del_init(&bh->b_assoc_buffers);
 	WARN_ON(!bh->b_assoc_map);
@@ -706,7 +705,7 @@ static int __set_page_dirty(struct page *page,
 	if (TestSetPageDirty(page))
 		return 0;
 
-	write_lock_irq(&mapping->tree_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 
@@ -719,7 +718,7 @@ static int __set_page_dirty(struct page *page,
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	write_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	return 1;
@@ -1214,8 +1213,7 @@ void __brelse(struct buffer_head * buf)
 		put_bh(buf);
 		return;
 	}
-	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
-	WARN_ON(1);
+	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
 }
 
 /*
@@ -1721,7 +1719,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		 */
 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
-		} else if (test_set_buffer_locked(bh)) {
+		} else if (!trylock_buffer(bh)) {
 			redirty_page_for_writepage(wbc, page);
 			continue;
 		}
@@ -2097,6 +2095,52 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(generic_write_end);
 
 /*
+ * block_is_partially_uptodate checks whether buffers within a page are
+ * uptodate or not.
+ *
+ * Returns true if all buffers which correspond to a file portion
+ * we want to read are uptodate.
+ */
+int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
+					unsigned long from)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned block_start, block_end, blocksize;
+	unsigned to;
+	struct buffer_head *bh, *head;
+	int ret = 1;
+
+	if (!page_has_buffers(page))
+		return 0;
+
+	blocksize = 1 << inode->i_blkbits;
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+	to = from + to;
+	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+		return 0;
+
+	head = page_buffers(page);
+	bh = head;
+	block_start = 0;
+	do {
+		block_end = block_start + blocksize;
+		if (block_end > from && block_start < to) {
+			if (!buffer_uptodate(bh)) {
+				ret = 0;
+				break;
+			}
+			if (block_end >= to)
+				break;
+		}
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
+EXPORT_SYMBOL(block_is_partially_uptodate);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
@@ -2881,14 +2925,17 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
 
-	if (buffer_ordered(bh) && (rw == WRITE))
-		rw = WRITE_BARRIER;
+	/*
+	 * Mask in barrier bit for a write (could be either a WRITE or a
+	 * WRITE_SYNC
+	 */
+	if (buffer_ordered(bh) && (rw & WRITE))
+		rw |= WRITE_BARRIER;
 
 	/*
-	 * Only clear out a write error when rewriting, should this
-	 * include WRITE_SYNC as well?
+	 * Only clear out a write error when rewriting
 	 */
-	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
+	if (test_set_buffer_req(bh) && (rw & WRITE))
 		clear_buffer_write_io_error(bh);
 
 	/*
@@ -2955,7 +3002,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 
 		if (rw == SWRITE || rw == SWRITE_SYNC)
 			lock_buffer(bh);
-		else if (test_set_buffer_locked(bh))
+		else if (!trylock_buffer(bh))
 			continue;
 
 		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
@@ -3272,7 +3319,7 @@ int bh_submit_read(struct buffer_head *bh)
 EXPORT_SYMBOL(bh_submit_read);
 
 static void
-init_buffer_head(struct kmem_cache *cachep, void *data)
+init_buffer_head(void *data)
 {
 	struct buffer_head *bh = data;
 
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cb7cda3d780..700697a72618 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -22,9 +22,6 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
 #include "internal.h"
 
 /*
@@ -389,15 +386,22 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 	cdev_put(new);
 	if (ret)
 		return ret;
+
+	ret = -ENXIO;
 	filp->f_op = fops_get(p->ops);
-	if (!filp->f_op) {
-		cdev_put(p);
-		return -ENXIO;
-	}
-	if (filp->f_op->open)
+	if (!filp->f_op)
+		goto out_cdev_put;
+
+	if (filp->f_op->open) {
 		ret = filp->f_op->open(inode,filp);
-	if (ret)
-		cdev_put(p);
+		if (ret)
+			goto out_cdev_put;
+	}
+
+	return 0;
+
+ out_cdev_put:
+	cdev_put(p);
 	return ret;
 }
 
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 1f3465201fdf..8f528ea24c48 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,28 @@
+Version 1.55
+------------
+Various fixes to make delete of open files behavior more predictable
+(when delete of an open file fails we mark the file as "delete-on-close"
+in a way that more servers accept, but only if we can first rename the
+file to a temporary name).  Add experimental support for more safely
+handling fcntl(F_SETLEASE).
+
+Version 1.54
+------------
+Fix premature write failure on congested networks (we would give up
+on EAGAIN from the socket too quickly on large writes).
+Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
+Fix endian problems in acl (mode from/to cifs acl) on bigendian
+architectures.  Fix problems with preserving timestamps on copying open
+files (e.g. "cp -a") to Windows servers.  For mkdir and create honor setgid bit
+on parent directory when server supports Unix Extensions but not POSIX
+create. Update cifs.upcall version to handle new Kerberos sec flags
+(this requires update of cifs.upcall program from Samba).  Fix memory leak
+on dns_upcall (resolving DFS referralls).  Fix plain text password
+authentication (requires setting SecurityFlags to 0x30030 to enable
+lanman and plain text though).  Fix writes to be at correct offset when
+file is open with O_APPEND and file is on a directio (forcediretio) mount.
+Fix bug in rewinding readdir directory searches.  Add nodfs mount option.
+
 Version 1.53
 ------------
 DFS support added (Microsoft Distributed File System client support needed
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
new file mode 100644
index 000000000000..341a98965bd0
--- /dev/null
+++ b/fs/cifs/Kconfig
@@ -0,0 +1,142 @@
+config CIFS
+	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+	depends on INET
+	select NLS
+	help
+	  This is the client VFS module for the Common Internet File System
+	  (CIFS) protocol which is the successor to the Server Message Block
+	  (SMB) protocol, the native file sharing mechanism for most early
+	  PC operating systems.  The CIFS protocol is fully supported by
+	  file servers such as Windows 2000 (including Windows 2003, NT 4
+	  and Windows XP) as well by Samba (which provides excellent CIFS
+	  server support for Linux and many other operating systems). Limited
+	  support for OS/2 and Windows ME and similar servers is provided as
+	  well.
+
+	  The cifs module provides an advanced network file system
+	  client for mounting to CIFS compliant servers.  It includes
+	  support for DFS (hierarchical name space), secure per-user
+	  session establishment via Kerberos or NTLM or NTLMv2,
+	  safe distributed caching (oplock), optional packet
+	  signing, Unicode and other internationalization improvements.
+	  If you need to mount to Samba or Windows from this machine, say Y.
+
+config CIFS_STATS
+        bool "CIFS statistics"
+        depends on CIFS
+        help
+          Enabling this option will cause statistics for each server share
+	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
+
+config CIFS_STATS2
+	bool "Extended statistics"
+	depends on CIFS_STATS
+	help
+	  Enabling this option will allow more detailed statistics on SMB
+	  request timing to be displayed in /proc/fs/cifs/DebugData and also
+	  allow optional logging of slow responses to dmesg (depending on the
+	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
+	  These additional statistics may have a minor effect on performance
+	  and memory utilization.
+
+	  Unless you are a developer or are doing network performance analysis
+	  or tuning, say N.
+
+config CIFS_WEAK_PW_HASH
+	bool "Support legacy servers which use weaker LANMAN security"
+	depends on CIFS
+	help
+	  Modern CIFS servers including Samba and most Windows versions
+	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+	  security mechanisms. These hash the password more securely
+	  than the mechanisms used in the older LANMAN version of the
+	  SMB protocol but LANMAN based authentication is needed to
+	  establish sessions with some old SMB servers.
+
+	  Enabling this option allows the cifs module to mount to older
+	  LANMAN based servers such as OS/2 and Windows 95, but such
+	  mounts may be less secure than mounts using NTLM or more recent
+	  security mechanisms if you are on a public network.  Unless you
+	  have a need to access old SMB servers (and are on a private
+	  network) you probably want to say N.  Even if this support
+	  is enabled in the kernel build, LANMAN authentication will not be
+	  used automatically. At runtime LANMAN mounts are disabled but
+	  can be set to required (or optional) either in
+	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+	  option on the mount command. This support is disabled by
+	  default in order to reduce the possibility of a downgrade
+	  attack.
+
+	  If unsure, say N.
+
+config CIFS_UPCALL
+	  bool "Kerberos/SPNEGO advanced session setup"
+	  depends on CIFS && KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which accesses
+	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	    Kerberos tickets which are needed to mount to certain secure servers
+	    (for which more secure Kerberos authentication is required). If
+	    unsure, say N.
+
+config CIFS_XATTR
+        bool "CIFS extended attributes"
+        depends on CIFS
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).  CIFS maps the name of
+          extended attributes beginning with the user namespace prefix
+          to SMB/CIFS EAs. EAs are stored on Windows servers without the
+          user namespace prefix, but their names are seen by Linux cifs clients
+          prefaced by the user namespace prefix. The system namespace
+          (used by some filesystems to store ACLs) is not supported at
+          this time.
+
+          If unsure, say N.
+
+config CIFS_POSIX
+        bool "CIFS POSIX Extensions"
+        depends on CIFS_XATTR
+        help
+          Enabling this option will cause the cifs client to attempt to
+	  negotiate a newer dialect with servers, such as Samba 3.0.5
+	  or later, that optionally can handle more POSIX like (rather
+	  than Windows like) file behavior.  It also enables
+	  support for POSIX ACLs (getfacl and setfacl) to servers
+	  (such as Samba 3.10 and later) which can negotiate
+	  CIFS POSIX ACL support.  If unsure, say N.
+
+config CIFS_DEBUG2
+	bool "Enable additional CIFS debugging routines"
+	depends on CIFS
+	help
+	   Enabling this option adds a few more debugging routines
+	   to the cifs code which slightly increases the size of
+	   the cifs module and can cause additional logging of debug
+	   messages in some error paths, slowing performance. This
+	   option can be turned off unless you are debugging
+	   cifs problems.  If unsure, say N.
+
+config CIFS_EXPERIMENTAL
+	  bool "CIFS Experimental Features (EXPERIMENTAL)"
+	  depends on CIFS && EXPERIMENTAL
+	  help
+	    Enables cifs features under testing. These features are
+	    experimental and currently include DFS support and directory
+	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
+	    mechanism which will be used for Kerberos session negotiation
+	    and uid remapping.  Some of these features also may depend on
+	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
+	    (which is disabled by default). See the file fs/cifs/README
+	    for more details.  If unsure, say N.
+
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support (EXPERIMENTAL)"
+	  depends on CIFS_EXPERIMENTAL
+	  depends on KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which contacts userspace
+	    helper utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
diff --git a/fs/cifs/README b/fs/cifs/README
index 2bd6fe556f88..a439dc1739b3 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -463,6 +463,9 @@ A partial list of the supported mount options follows:
 		with cifs style mandatory byte range locks (and most
 		cifs servers do not yet support requesting advisory
 		byte range locks).
+ nodfs          Disable DFS (global name space support) even if the
+		server claims to support it.  This can help work around
+		a problem with parsing of DFS paths with Samba 3.0.24 server.
  remount        remount the share (often used to change from ro to rw mounts
 	        or vice versa)
  cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
@@ -488,6 +491,19 @@ A partial list of the supported mount options follows:
 		Note that this differs from the sign mount option in that it
 		causes encryption of data sent over this mounted share but other
 		shares mounted to the same server are unaffected.
+ locallease     This option is rarely needed. Fcntl F_SETLEASE is
+		used by some applications such as Samba and NFSv4 server to
+		check to see whether a file is cacheable.  CIFS has no way
+		to explicitly request a lease, but can check whether a file
+		is cacheable (oplocked).  Unfortunately, even if a file
+		is not oplocked, it could still be cacheable (ie cifs client
+		could grant fcntl leases if no other local processes are using
+		the file) for cases for example such as when the server does not
+		support oplocks and the user is sure that the only updates to
+		the file will be from this client. Specifying this mount option
+		will allow the cifs client to check for leases (only) locally
+		for files which are not oplocked instead of denying leases
+		in that case. (EXPERIMENTAL)
  sec            Security mode.  Allowed values are:
 			none	attempt to connection as a null user (no name)
 			krb5    Use Kerberos version 5 authentication
@@ -542,10 +558,20 @@ SecurityFlags		Flags which control security negotiation and
 			hashing mechanisms (as "must use") on the other hand 
 			does not make much sense. Default flags are 
 				0x07007 
-			(NTLM, NTLMv2 and packet signing allowed).  Maximum 
+			(NTLM, NTLMv2 and packet signing allowed).  The maximum 
 			allowable flags if you want to allow mounts to servers
 			using weaker password hashes is 0x37037 (lanman,
-			plaintext, ntlm, ntlmv2, signing allowed):
+			plaintext, ntlm, ntlmv2, signing allowed).  Some
+			SecurityFlags require the corresponding menuconfig
+			options to be enabled (lanman and plaintext require
+			CONFIG_CIFS_WEAK_PW_HASH for example).  Enabling
+			plaintext authentication currently requires also
+			enabling lanman authentication in the security flags
+			because the cifs module only supports sending
+			laintext passwords using the older lanman dialect
+			form of the session setup SMB.  (e.g. for authentication
+			using plain text passwords, set the SecurityFlags
+			to 0x30030):
  
 			may use packet signing 				0x00001
 			must use packet signing				0x01001
@@ -628,6 +654,9 @@ requires enabling CONFIG_CIFS_EXPERIMENTAL
 	cifsacl support needed to retrieve approximated mode bits based on
 		the contents on the CIFS ACL.
 
+	lease support: cifs will check the oplock state before calling into
+	the vfs to see if we can grant a lease on a file.
+
 	DNOTIFY fcntl: needed for support of directory change 
 			    notification and perhaps later for file leases)
 
@@ -642,8 +671,30 @@ The statistics for the number of total SMBs and oplock breaks are different in
 that they represent all for that share, not just those for which the server
 returned success.
 	
-Also note that "cat /proc/fs/cifs/DebugData" will display information about 
+Also note that "cat /proc/fs/cifs/DebugData" will display information about
 the active sessions and the shares that are mounted.
-Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is
-on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
-LANMAN support do not require this helper.
+
+Enabling Kerberos (extended security) works but requires version 1.2 or later
+of the helper program cifs.upcall to be present and to be configured in the
+/etc/request-key.conf file.  The cifs.upcall helper program is from the Samba
+project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
+require this helper. Note that NTLMv2 security (which does not require the
+cifs.upcall helper program), instead of using Kerberos, is sufficient for
+some use cases.
+
+Enabling DFS support (used to access shares transparently in an MS-DFS
+global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled.  In
+addition, DFS support for target shares which are specified as UNC
+names which begin with host names (rather than IP addresses) requires
+a user space helper (such as cifs.upcall) to be present in order to
+translate host names to ip address, and the user space helper must also
+be configured in the file /etc/request-key.conf
+
+To use cifs Kerberos and DFS support, the Linux keyutils package should be
+installed and something like the following lines should be added to the
+/etc/request-key.conf file:
+
+create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
+create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+
+
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index f58e41d3ba48..1b09f1670061 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -400,7 +400,7 @@ asn1_oid_decode(struct asn1_ctx *ctx,
 	size = eoc - ctx->pointer + 1;
 
 	/* first subid actually encodes first two subids */
-	if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+	if (size < 2 || size > UINT_MAX/sizeof(unsigned long))
 		return 0;
 
 	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
@@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	unsigned int cls, con, tag, oidlen, rc;
 	bool use_ntlmssp = false;
 	bool use_kerberos = false;
+	bool use_mskerberos = false;
 
 	*secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
 
@@ -483,6 +484,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	asn1_open(&ctx, security_blob, length);
 
+	/* GSSAPI header */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
 		cFYI(1, ("Error decoding negTokenInit header"));
 		return 0;
@@ -490,156 +492,149 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		   || (tag != ASN1_EOC)) {
 		cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag));
 		return 0;
-	} else {
-		/*      remember to free obj->oid */
-		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
-		if (rc) {
-			if ((tag == ASN1_OJI) && (cls == ASN1_PRI)) {
-				rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
-				if (rc) {
-					rc = compare_oid(oid, oidlen,
-							 SPNEGO_OID,
-							 SPNEGO_OID_LEN);
-					kfree(oid);
-				}
-			} else
-				rc = 0;
-		}
+	}
 
-		if (!rc) {
-			cFYI(1, ("Error decoding negTokenInit header"));
-			return 0;
-		}
+	/* Check for SPNEGO OID -- remember to free obj->oid */
+	rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+	if (rc) {
+		if ((tag == ASN1_OJI) && (con == ASN1_PRI) &&
+		    (cls == ASN1_UNI)) {
+			rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
+			if (rc) {
+				rc = compare_oid(oid, oidlen, SPNEGO_OID,
+						 SPNEGO_OID_LEN);
+				kfree(oid);
+			}
+		} else
+			rc = 0;
+	}
 
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding negTokenInit"));
-			return 0;
-		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
-			   || (tag != ASN1_EOC)) {
-			cFYI(1,
-			     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
+	/* SPNEGO OID not present or garbled -- bail out */
+	if (!rc) {
+		cFYI(1, ("Error decoding negTokenInit header"));
+		return 0;
+	}
 
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding negTokenInit"));
-			return 0;
-		} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-			   || (tag != ASN1_SEQ)) {
-			cFYI(1,
-			     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding negTokenInit"));
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+		   || (tag != ASN1_EOC)) {
+		cFYI(1,
+		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		      cls, con, tag, end, *end));
+		return 0;
+	}
 
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding 2nd part of negTokenInit"));
-			return 0;
-		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
-			   || (tag != ASN1_EOC)) {
-			cFYI(1,
-			     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding negTokenInit"));
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+		   || (tag != ASN1_SEQ)) {
+		cFYI(1,
+		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		      cls, con, tag, end, *end));
+		return 0;
+	}
 
-		if (asn1_header_decode
-		    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding 2nd part of negTokenInit"));
-			return 0;
-		} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-			   || (tag != ASN1_SEQ)) {
-			cFYI(1,
-			     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+		   || (tag != ASN1_EOC)) {
+		cFYI(1,
+		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		      cls, con, tag, end, *end));
+		return 0;
+	}
 
-		while (!asn1_eoc_decode(&ctx, sequence_end)) {
-			rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
-			if (!rc) {
-				cFYI(1,
-				     ("Error decoding negTokenInit hdr exit2"));
-				return 0;
-			}
-			if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
-				if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
-
-					cFYI(1,
-					  ("OID len = %d oid = 0x%lx 0x%lx "
-					   "0x%lx 0x%lx",
-					   oidlen, *oid, *(oid + 1),
-					   *(oid + 2), *(oid + 3)));
-
-					if (compare_oid(oid, oidlen,
-							MSKRB5_OID,
-							MSKRB5_OID_LEN))
-						use_kerberos = true;
-					else if (compare_oid(oid, oidlen,
-							     KRB5_OID,
-							     KRB5_OID_LEN))
-						use_kerberos = true;
-					else if (compare_oid(oid, oidlen,
-							     NTLMSSP_OID,
-							     NTLMSSP_OID_LEN))
-						use_ntlmssp = true;
-
-					kfree(oid);
-				}
-			} else {
-				cFYI(1, ("Should be an oid what is going on?"));
-			}
-		}
+	if (asn1_header_decode
+	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+		   || (tag != ASN1_SEQ)) {
+		cFYI(1,
+		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		      cls, con, tag, end, *end));
+		return 0;
+	}
 
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1,
-			     ("Error decoding last part negTokenInit exit3"));
-			return 0;
-		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-			/* tag = 3 indicating mechListMIC */
+	while (!asn1_eoc_decode(&ctx, sequence_end)) {
+		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+		if (!rc) {
 			cFYI(1,
-			     ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
-			      cls, con, tag, end, *end));
+			     ("Error decoding negTokenInit hdr exit2"));
 			return 0;
 		}
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1,
-			     ("Error decoding last part negTokenInit exit5"));
-			return 0;
-		} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
-			   || (tag != ASN1_SEQ)) {
-			cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
-				cls, con, tag, end, *end));
+		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
+			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
+
+				cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx "
+					 "0x%lx 0x%lx", oidlen, *oid,
+					 *(oid + 1), *(oid + 2), *(oid + 3)));
+
+				if (compare_oid(oid, oidlen, MSKRB5_OID,
+						MSKRB5_OID_LEN) &&
+						!use_kerberos)
+					use_mskerberos = true;
+				else if (compare_oid(oid, oidlen, KRB5_OID,
+						     KRB5_OID_LEN) &&
+						     !use_mskerberos)
+					use_kerberos = true;
+				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+						     NTLMSSP_OID_LEN))
+					use_ntlmssp = true;
+
+				kfree(oid);
+			}
+		} else {
+			cFYI(1, ("Should be an oid what is going on?"));
 		}
+	}
 
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1,
-			     ("Error decoding last part negTokenInit exit 7"));
-			return 0;
-		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-			cFYI(1,
-			     ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
-		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1,
-			     ("Error decoding last part negTokenInit exit9"));
-			return 0;
-		} else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
-			   || (tag != ASN1_GENSTR)) {
-			cFYI(1,
-			     ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
-			      cls, con, tag, end, *end));
-			return 0;
-		}
-		cFYI(1, ("Need to call asn1_octets_decode() function for %s",
-			 ctx.pointer));	/* is this UTF-8 or ASCII? */
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding last part negTokenInit exit3"));
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
+		/* tag = 3 indicating mechListMIC */
+		cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
+			 cls, con, tag, end, *end));
+		return 0;
+	}
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding last part negTokenInit exit5"));
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+		   || (tag != ASN1_SEQ)) {
+		cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end));
+	}
+
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding last part negTokenInit exit 7"));
+		return 0;
+	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
+		cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
+			 cls, con, tag, end, *end));
+		return 0;
+	}
+	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+		cFYI(1, ("Error decoding last part negTokenInit exit9"));
+		return 0;
+	} else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
+		   || (tag != ASN1_GENSTR)) {
+		cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
+			 cls, con, tag, end, *end));
+		return 0;
 	}
+	cFYI(1, ("Need to call asn1_octets_decode() function for %s",
+		 ctx.pointer));	/* is this UTF-8 or ASCII? */
 
 	if (use_kerberos)
 		*secType = Kerberos;
+	else if (use_mskerberos)
+		*secType = MSKerberos;
 	else if (use_ntlmssp)
 		*secType = NTLMSSP;
 
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index cc950f69e51e..69a12aae91d3 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,27 +79,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-		if (mid_entry) {
-			cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
-				mid_entry->midState,
-				(int)mid_entry->command,
-				mid_entry->pid,
-				mid_entry->tsk,
-				mid_entry->mid));
+		cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+			mid_entry->midState,
+			(int)mid_entry->command,
+			mid_entry->pid,
+			mid_entry->tsk,
+			mid_entry->mid));
 #ifdef CONFIG_CIFS_STATS2
-			cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
-				mid_entry->largeBuf,
-				mid_entry->resp_buf,
-				mid_entry->when_received,
-				jiffies));
+		cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+			mid_entry->largeBuf,
+			mid_entry->resp_buf,
+			mid_entry->when_received,
+			jiffies));
 #endif /* STATS2 */
-			cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
-				  mid_entry->multiEnd));
-			if (mid_entry->resp_buf) {
-				cifs_dump_detail(mid_entry->resp_buf);
-				cifs_dump_mem("existing buf: ",
-					mid_entry->resp_buf, 62);
-			}
+		cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+			  mid_entry->multiEnd));
+		if (mid_entry->resp_buf) {
+			cifs_dump_detail(mid_entry->resp_buf);
+			cifs_dump_mem("existing buf: ",
+				mid_entry->resp_buf, 62);
 		}
 	}
 	spin_unlock(&GlobalMid_Lock);
@@ -107,9 +105,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 #endif /* CONFIG_CIFS_DEBUG2 */
 
 #ifdef CONFIG_PROC_FS
-static int
-cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
-		     int count, int *eof, void *data)
+static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
 	struct list_head *tmp;
 	struct list_head *tmp1;
@@ -117,23 +113,13 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 	struct cifsSesInfo *ses;
 	struct cifsTconInfo *tcon;
 	int i;
-	int length = 0;
-	char *original_buf = buf;
-
-	*beginBuffer = buf + offset;
 
-	length =
-	    sprintf(buf,
+	seq_puts(m,
 		    "Display Internal CIFS Data Structures for Debugging\n"
 		    "---------------------------------------------------\n");
-	buf += length;
-	length = sprintf(buf, "CIFS Version %s\n", CIFS_VERSION);
-	buf += length;
-	length = sprintf(buf,
-		"Active VFS Requests: %d\n", GlobalTotalActiveXid);
-	buf += length;
-	length = sprintf(buf, "Servers:");
-	buf += length;
+	seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
+	seq_printf(m, "Servers:");
 
 	i = 0;
 	read_lock(&GlobalSMBSeslock);
@@ -142,11 +128,10 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 		ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
 		if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
 		   (ses->serverNOS == NULL)) {
-			buf += sprintf(buf, "\nentry for %s not fully "
+			seq_printf(m, "\nentry for %s not fully "
 					"displayed\n\t", ses->serverName);
 		} else {
-			length =
-			    sprintf(buf,
+			seq_printf(m,
 				    "\n%d) Name: %s  Domain: %s Mounts: %d OS:"
 				    " %s  \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
 				    " session status: %d\t",
@@ -154,10 +139,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 				atomic_read(&ses->inUse),
 				ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->status);
-			buf += length;
 		}
 		if (ses->server) {
-			buf += sprintf(buf, "TCP status: %d\n\tLocal Users To "
+			seq_printf(m, "TCP status: %d\n\tLocal Users To "
 				    "Server: %d SecMode: 0x%x Req On Wire: %d",
 				ses->server->tcpStatus,
 				atomic_read(&ses->server->socketUseCount),
@@ -165,41 +149,34 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 				atomic_read(&ses->server->inFlight));
 
 #ifdef CONFIG_CIFS_STATS2
-			buf += sprintf(buf, " In Send: %d In MaxReq Wait: %d",
+			seq_printf(m, " In Send: %d In MaxReq Wait: %d",
 				atomic_read(&ses->server->inSend),
 				atomic_read(&ses->server->num_waiters));
 #endif
 
-			length = sprintf(buf, "\nMIDs:\n");
-			buf += length;
+			seq_puts(m, "\nMIDs:\n");
 
 			spin_lock(&GlobalMid_Lock);
 			list_for_each(tmp1, &ses->server->pending_mid_q) {
 				mid_entry = list_entry(tmp1, struct
 					mid_q_entry,
 					qhead);
-				if (mid_entry) {
-					length = sprintf(buf,
-							"State: %d com: %d pid:"
-							" %d tsk: %p mid %d\n",
-							mid_entry->midState,
-							(int)mid_entry->command,
-							mid_entry->pid,
-							mid_entry->tsk,
-							mid_entry->mid);
-					buf += length;
-				}
+				seq_printf(m, "State: %d com: %d pid:"
+						" %d tsk: %p mid %d\n",
+						mid_entry->midState,
+						(int)mid_entry->command,
+						mid_entry->pid,
+						mid_entry->tsk,
+						mid_entry->mid);
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
 
 	}
 	read_unlock(&GlobalSMBSeslock);
-	sprintf(buf, "\n");
-	buf++;
+	seq_putc(m, '\n');
 
-	length = sprintf(buf, "Shares:");
-	buf += length;
+	seq_puts(m, "Shares:");
 
 	i = 0;
 	read_lock(&GlobalSMBSeslock);
@@ -208,62 +185,52 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 		i++;
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
 		dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
-		length = sprintf(buf, "\n%d) %s Uses: %d ", i,
+		seq_printf(m, "\n%d) %s Uses: %d ", i,
 				 tcon->treeName, atomic_read(&tcon->useCount));
-		buf += length;
 		if (tcon->nativeFileSystem) {
-			length = sprintf(buf, "Type: %s ",
+			seq_printf(m, "Type: %s ",
 					 tcon->nativeFileSystem);
-			buf += length;
 		}
-		length = sprintf(buf, "DevInfo: 0x%x Attributes: 0x%x"
+		seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
 				 "\nPathComponentMax: %d Status: %d",
 			    le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
 			    le32_to_cpu(tcon->fsAttrInfo.Attributes),
 			    le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
 			    tcon->tidStatus);
-		buf += length;
 		if (dev_type == FILE_DEVICE_DISK)
-			length = sprintf(buf, " type: DISK ");
+			seq_puts(m, " type: DISK ");
 		else if (dev_type == FILE_DEVICE_CD_ROM)
-			length = sprintf(buf, " type: CDROM ");
+			seq_puts(m, " type: CDROM ");
 		else
-			length =
-			    sprintf(buf, " type: %d ", dev_type);
-		buf += length;
-		if (tcon->tidStatus == CifsNeedReconnect) {
-			buf += sprintf(buf, "\tDISCONNECTED ");
-			length += 14;
-		}
+			seq_printf(m, " type: %d ", dev_type);
+
+		if (tcon->tidStatus == CifsNeedReconnect)
+			seq_puts(m, "\tDISCONNECTED ");
 	}
 	read_unlock(&GlobalSMBSeslock);
 
-	length = sprintf(buf, "\n");
-	buf += length;
+	seq_putc(m, '\n');
 
 	/* BB add code to dump additional info such as TCP session info now */
-	/* Now calculate total size of returned data */
-	length = buf - original_buf;
-
-	if (offset + count >= length)
-		*eof = 1;
-	if (length < offset) {
-		*eof = 1;
-		return 0;
-	} else {
-		length = length - offset;
-	}
-	if (length > count)
-		length = count;
+	return 0;
+}
 
-	return length;
+static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_debug_data_proc_show, NULL);
 }
 
-#ifdef CONFIG_CIFS_STATS
+static const struct file_operations cifs_debug_data_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_debug_data_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
-static int
-cifs_stats_write(struct file *file, const char __user *buffer,
-		 unsigned long count, void *data)
+#ifdef CONFIG_CIFS_STATS
+static ssize_t cifs_stats_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -307,236 +274,132 @@ cifs_stats_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-cifs_stats_read(char *buf, char **beginBuffer, off_t offset,
-		  int count, int *eof, void *data)
+static int cifs_stats_proc_show(struct seq_file *m, void *v)
 {
-	int item_length, i, length;
+	int i;
 	struct list_head *tmp;
 	struct cifsTconInfo *tcon;
 
-	*beginBuffer = buf + offset;
-
-	length = sprintf(buf,
+	seq_printf(m,
 			"Resources in use\nCIFS Session: %d\n",
 			sesInfoAllocCount.counter);
-	buf += length;
-	item_length =
-		sprintf(buf, "Share (unique mount targets): %d\n",
+	seq_printf(m, "Share (unique mount targets): %d\n",
 			tconInfoAllocCount.counter);
-	length += item_length;
-	buf += item_length;
-	item_length =
-		sprintf(buf, "SMB Request/Response Buffer: %d Pool size: %d\n",
+	seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n",
 			bufAllocCount.counter,
 			cifs_min_rcv + tcpSesAllocCount.counter);
-	length += item_length;
-	buf += item_length;
-	item_length =
-		sprintf(buf, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
+	seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
 			smBufAllocCount.counter, cifs_min_small);
-	length += item_length;
-	buf += item_length;
 #ifdef CONFIG_CIFS_STATS2
-	item_length = sprintf(buf, "Total Large %d Small %d Allocations\n",
+	seq_printf(m, "Total Large %d Small %d Allocations\n",
 				atomic_read(&totBufAllocCount),
 				atomic_read(&totSmBufAllocCount));
-	length += item_length;
-	buf += item_length;
 #endif /* CONFIG_CIFS_STATS2 */
 
-	item_length =
-		sprintf(buf, "Operations (MIDs): %d\n",
-			midCount.counter);
-	length += item_length;
-	buf += item_length;
-	item_length = sprintf(buf,
+	seq_printf(m, "Operations (MIDs): %d\n", midCount.counter);
+	seq_printf(m,
 		"\n%d session %d share reconnects\n",
 		tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
-	length += item_length;
-	buf += item_length;
 
-	item_length = sprintf(buf,
+	seq_printf(m,
 		"Total vfs operations: %d maximum at one time: %d\n",
 		GlobalCurrentXid, GlobalMaxActiveXid);
-	length += item_length;
-	buf += item_length;
 
 	i = 0;
 	read_lock(&GlobalSMBSeslock);
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		i++;
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		item_length = sprintf(buf, "\n%d) %s", i, tcon->treeName);
-		buf += item_length;
-		length += item_length;
-		if (tcon->tidStatus == CifsNeedReconnect) {
-			buf += sprintf(buf, "\tDISCONNECTED ");
-			length += 14;
-		}
-		item_length = sprintf(buf, "\nSMBs: %d Oplock Breaks: %d",
+		seq_printf(m, "\n%d) %s", i, tcon->treeName);
+		if (tcon->tidStatus == CifsNeedReconnect)
+			seq_puts(m, "\tDISCONNECTED ");
+		seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
 			atomic_read(&tcon->num_smbs_sent),
 			atomic_read(&tcon->num_oplock_brks));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf, "\nReads:  %d Bytes: %lld",
+		seq_printf(m, "\nReads:  %d Bytes: %lld",
 			atomic_read(&tcon->num_reads),
 			(long long)(tcon->bytes_read));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf, "\nWrites: %d Bytes: %lld",
+		seq_printf(m, "\nWrites: %d Bytes: %lld",
 			atomic_read(&tcon->num_writes),
 			(long long)(tcon->bytes_written));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf,
+		seq_printf(m,
 			"\nLocks: %d HardLinks: %d Symlinks: %d",
 			atomic_read(&tcon->num_locks),
 			atomic_read(&tcon->num_hardlinks),
 			atomic_read(&tcon->num_symlinks));
-		buf += item_length;
-		length += item_length;
 
-		item_length = sprintf(buf, "\nOpens: %d Closes: %d Deletes: %d",
+		seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
 			atomic_read(&tcon->num_opens),
 			atomic_read(&tcon->num_closes),
 			atomic_read(&tcon->num_deletes));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf, "\nMkdirs: %d Rmdirs: %d",
+		seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
 			atomic_read(&tcon->num_mkdirs),
 			atomic_read(&tcon->num_rmdirs));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf, "\nRenames: %d T2 Renames %d",
+		seq_printf(m, "\nRenames: %d T2 Renames %d",
 			atomic_read(&tcon->num_renames),
 			atomic_read(&tcon->num_t2renames));
-		buf += item_length;
-		length += item_length;
-		item_length = sprintf(buf, "\nFindFirst: %d FNext %d FClose %d",
+		seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
 			atomic_read(&tcon->num_ffirst),
 			atomic_read(&tcon->num_fnext),
 			atomic_read(&tcon->num_fclose));
-		buf += item_length;
-		length += item_length;
 	}
 	read_unlock(&GlobalSMBSeslock);
 
-	buf += sprintf(buf, "\n");
-	length++;
-
-	if (offset + count >= length)
-		*eof = 1;
-	if (length < offset) {
-		*eof = 1;
-		return 0;
-	} else {
-		length = length - offset;
-	}
-	if (length > count)
-		length = count;
+	seq_putc(m, '\n');
+	return 0;
+}
 
-	return length;
+static int cifs_stats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_stats_proc_show, NULL);
 }
+
+static const struct file_operations cifs_stats_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_stats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_stats_proc_write,
+};
 #endif /* STATS */
 
 static struct proc_dir_entry *proc_fs_cifs;
-read_proc_t cifs_txanchor_read;
-static read_proc_t cifsFYI_read;
-static write_proc_t cifsFYI_write;
-static read_proc_t oplockEnabled_read;
-static write_proc_t oplockEnabled_write;
-static read_proc_t lookupFlag_read;
-static write_proc_t lookupFlag_write;
-static read_proc_t traceSMB_read;
-static write_proc_t traceSMB_write;
-static read_proc_t multiuser_mount_read;
-static write_proc_t multiuser_mount_write;
-static read_proc_t security_flags_read;
-static write_proc_t security_flags_write;
-/* static read_proc_t ntlmv2_enabled_read;
-static write_proc_t ntlmv2_enabled_write;
-static read_proc_t packet_signing_enabled_read;
-static write_proc_t packet_signing_enabled_write;*/
-static read_proc_t experimEnabled_read;
-static write_proc_t experimEnabled_write;
-static read_proc_t linuxExtensionsEnabled_read;
-static write_proc_t linuxExtensionsEnabled_write;
+static const struct file_operations cifsFYI_proc_fops;
+static const struct file_operations cifs_oplock_proc_fops;
+static const struct file_operations cifs_lookup_cache_proc_fops;
+static const struct file_operations traceSMB_proc_fops;
+static const struct file_operations cifs_multiuser_mount_proc_fops;
+static const struct file_operations cifs_security_flags_proc_fops;
+static const struct file_operations cifs_experimental_proc_fops;
+static const struct file_operations cifs_linux_ext_proc_fops;
 
 void
 cifs_proc_init(void)
 {
-	struct proc_dir_entry *pde;
-
 	proc_fs_cifs = proc_mkdir("fs/cifs", NULL);
 	if (proc_fs_cifs == NULL)
 		return;
 
 	proc_fs_cifs->owner = THIS_MODULE;
-	create_proc_read_entry("DebugData", 0, proc_fs_cifs,
-				cifs_debug_data_read, NULL);
+	proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
 
 #ifdef CONFIG_CIFS_STATS
-	pde = create_proc_read_entry("Stats", 0, proc_fs_cifs,
-				cifs_stats_read, NULL);
-	if (pde)
-		pde->write_proc = cifs_stats_write;
+	proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
 #endif /* STATS */
-	pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs,
-				cifsFYI_read, NULL);
-	if (pde)
-		pde->write_proc = cifsFYI_write;
-
-	pde =
-	    create_proc_read_entry("traceSMB", 0, proc_fs_cifs,
-				traceSMB_read, NULL);
-	if (pde)
-		pde->write_proc = traceSMB_write;
-
-	pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs,
-				oplockEnabled_read, NULL);
-	if (pde)
-		pde->write_proc = oplockEnabled_write;
-
-	pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs,
-				experimEnabled_read, NULL);
-	if (pde)
-		pde->write_proc = experimEnabled_write;
-
-	pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs,
-				linuxExtensionsEnabled_read, NULL);
-	if (pde)
-		pde->write_proc = linuxExtensionsEnabled_write;
-
-	pde =
-	    create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs,
-				multiuser_mount_read, NULL);
-	if (pde)
-		pde->write_proc = multiuser_mount_write;
-
-	pde =
-	    create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs,
-				security_flags_read, NULL);
-	if (pde)
-		pde->write_proc = security_flags_write;
-
-	pde =
-	create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs,
-				lookupFlag_read, NULL);
-	if (pde)
-		pde->write_proc = lookupFlag_write;
-
-/*	pde =
-	    create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs,
-				ntlmv2_enabled_read, NULL);
-	if (pde)
-		pde->write_proc = ntlmv2_enabled_write;
-
-	pde =
-	    create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs,
-				packet_signing_enabled_read, NULL);
-	if (pde)
-		pde->write_proc = packet_signing_enabled_write;*/
+	proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
+	proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
+	proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
+	proc_create("Experimental", 0, proc_fs_cifs,
+		    &cifs_experimental_proc_fops);
+	proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+		    &cifs_linux_ext_proc_fops);
+	proc_create("MultiuserMount", 0, proc_fs_cifs,
+		    &cifs_multiuser_mount_proc_fops);
+	proc_create("SecurityFlags", 0, proc_fs_cifs,
+		    &cifs_security_flags_proc_fops);
+	proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+		    &cifs_lookup_cache_proc_fops);
 }
 
 void
@@ -553,39 +416,26 @@ cifs_proc_clean(void)
 #endif
 	remove_proc_entry("MultiuserMount", proc_fs_cifs);
 	remove_proc_entry("OplockEnabled", proc_fs_cifs);
-/*	remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */
 	remove_proc_entry("SecurityFlags", proc_fs_cifs);
-/*	remove_proc_entry("PacketSigningEnabled", proc_fs_cifs); */
 	remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
 	remove_proc_entry("Experimental", proc_fs_cifs);
 	remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
 	remove_proc_entry("fs/cifs", NULL);
 }
 
-static int
-cifsFYI_read(char *page, char **start, off_t off, int count,
-	     int *eof, void *data)
+static int cifsFYI_proc_show(struct seq_file *m, void *v)
 {
-	int len;
-
-	len = sprintf(page, "%d\n", cifsFYI);
-
-	len -= off;
-	*start = page + off;
-
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+	seq_printf(m, "%d\n", cifsFYI);
+	return 0;
+}
 
-	return len;
+static int cifsFYI_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifsFYI_proc_show, NULL);
 }
-static int
-cifsFYI_write(struct file *file, const char __user *buffer,
-	      unsigned long count, void *data)
+
+static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -603,30 +453,28 @@ cifsFYI_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-oplockEnabled_read(char *page, char **start, off_t off,
-		   int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%d\n", oplockEnabled);
-
-	len -= off;
-	*start = page + off;
+static const struct file_operations cifsFYI_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifsFYI_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifsFYI_proc_write,
+};
 
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+static int cifs_oplock_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", oplockEnabled);
+	return 0;
+}
 
-	return len;
+static int cifs_oplock_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_oplock_proc_show, NULL);
 }
-static int
-oplockEnabled_write(struct file *file, const char __user *buffer,
-		    unsigned long count, void *data)
+
+static ssize_t cifs_oplock_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -642,30 +490,28 @@ oplockEnabled_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-experimEnabled_read(char *page, char **start, off_t off,
-		    int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%d\n", experimEnabled);
+static const struct file_operations cifs_oplock_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_oplock_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_oplock_proc_write,
+};
 
-	len -= off;
-	*start = page + off;
-
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+static int cifs_experimental_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", experimEnabled);
+	return 0;
+}
 
-	return len;
+static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_experimental_proc_show, NULL);
 }
-static int
-experimEnabled_write(struct file *file, const char __user *buffer,
-		     unsigned long count, void *data)
+
+static ssize_t cifs_experimental_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -683,29 +529,28 @@ experimEnabled_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-linuxExtensionsEnabled_read(char *page, char **start, off_t off,
-			    int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%d\n", linuxExtEnabled);
-	len -= off;
-	*start = page + off;
+static const struct file_operations cifs_experimental_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_experimental_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_experimental_proc_write,
+};
 
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", linuxExtEnabled);
+	return 0;
+}
 
-	return len;
+static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_linux_ext_proc_show, NULL);
 }
-static int
-linuxExtensionsEnabled_write(struct file *file, const char __user *buffer,
-			     unsigned long count, void *data)
+
+static ssize_t cifs_linux_ext_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -721,31 +566,28 @@ linuxExtensionsEnabled_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
+static const struct file_operations cifs_linux_ext_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_linux_ext_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_linux_ext_proc_write,
+};
 
-static int
-lookupFlag_read(char *page, char **start, off_t off,
-		int count, int *eof, void *data)
+static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v)
 {
-	int len;
-
-	len = sprintf(page, "%d\n", lookupCacheEnabled);
-
-	len -= off;
-	*start = page + off;
-
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+	seq_printf(m, "%d\n", lookupCacheEnabled);
+	return 0;
+}
 
-	return len;
+static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_lookup_cache_proc_show, NULL);
 }
-static int
-lookupFlag_write(struct file *file, const char __user *buffer,
-		    unsigned long count, void *data)
+
+static ssize_t cifs_lookup_cache_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -760,30 +602,29 @@ lookupFlag_write(struct file *file, const char __user *buffer,
 
 	return count;
 }
-static int
-traceSMB_read(char *page, char **start, off_t off, int count,
-	      int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%d\n", traceSMB);
-
-	len -= off;
-	*start = page + off;
 
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
+static const struct file_operations cifs_lookup_cache_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_lookup_cache_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_lookup_cache_proc_write,
+};
 
-	if (len < 0)
-		len = 0;
+static int traceSMB_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", traceSMB);
+	return 0;
+}
 
-	return len;
+static int traceSMB_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, traceSMB_proc_show, NULL);
 }
-static int
-traceSMB_write(struct file *file, const char __user *buffer,
-	       unsigned long count, void *data)
+
+static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -799,30 +640,28 @@ traceSMB_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-multiuser_mount_read(char *page, char **start, off_t off,
-		     int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%d\n", multiuser_mount);
-
-	len -= off;
-	*start = page + off;
+static const struct file_operations traceSMB_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= traceSMB_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= traceSMB_proc_write,
+};
 
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%d\n", multiuser_mount);
+	return 0;
+}
 
-	return len;
+static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh)
+{
+	return single_open(fh, cifs_multiuser_mount_proc_show, NULL);
 }
-static int
-multiuser_mount_write(struct file *file, const char __user *buffer,
-		      unsigned long count, void *data)
+
+static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	char c;
 	int rc;
@@ -838,30 +677,28 @@ multiuser_mount_write(struct file *file, const char __user *buffer,
 	return count;
 }
 
-static int
-security_flags_read(char *page, char **start, off_t off,
-		       int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "0x%x\n", extended_security);
-
-	len -= off;
-	*start = page + off;
+static const struct file_operations cifs_multiuser_mount_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_multiuser_mount_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_multiuser_mount_proc_write,
+};
 
-	if (len > count)
-		len = count;
-	else
-		*eof = 1;
-
-	if (len < 0)
-		len = 0;
+static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "0x%x\n", extended_security);
+	return 0;
+}
 
-	return len;
+static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cifs_security_flags_proc_show, NULL);
 }
-static int
-security_flags_write(struct file *file, const char __user *buffer,
-			unsigned long count, void *data)
+
+static ssize_t cifs_security_flags_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
 	unsigned int flags;
 	char flags_string[12];
@@ -917,6 +754,15 @@ security_flags_write(struct file *file, const char __user *buffer,
 	/* BB should we turn on MAY flags for other MUST options? */
 	return count;
 }
+
+static const struct file_operations cifs_security_flags_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= cifs_security_flags_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= cifs_security_flags_proc_write,
+};
 #else
 inline void cifs_proc_init(void)
 {
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d82374c9e329..d2c8eef84f3c 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,7 +226,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 	int err;
 
 	mntget(newmnt);
-	err = do_add_mount(newmnt, nd, nd->path.mnt->mnt_flags, mntlist);
+	err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist);
 	switch (err) {
 	case 0:
 		path_put(&nd->path);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7013aaff6aed..fcee9298b620 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
 	.describe	= user_describe,
 };
 
-#define MAX_VER_STR_LEN   9 /* length of longest version string e.g.
-				strlen(";ver=0xFF") */
-#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
-			       in future could have strlen(";sec=ntlmsspi") */
-#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+/* length of longest version string e.g.  strlen("ver=0xFF") */
+#define MAX_VER_STR_LEN		8
+
+/* length of longest security mechanism name, eg in future could have
+ * strlen(";sec=ntlmsspi") */
+#define MAX_MECH_STR_LEN	13
+
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+#define MAX_IPV6_ADDR_LEN	42
+
+/* strlen of "host=" */
+#define HOST_KEY_LEN		5
+
+/* strlen of ";ip4=" or ";ip6=" */
+#define IP_KEY_LEN		5
+
+/* strlen of ";uid=0x" */
+#define UID_KEY_LEN		7
+
+/* strlen of ";user=" */
+#define USER_KEY_LEN		6
+
 /* get a key struct with a SPNEGO security blob, suitable for session setup */
 struct key *
 cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -81,11 +98,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	struct key *spnego_key;
 	const char *hostname = server->hostname;
 
-	/* BB: come up with better scheme for determining length */
-	/* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host=
-	   hostname sec=mechanism uid=0x uid */
-	desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 +
-		  strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2);
+	/* length of fields (with semicolons): ver=0xyz ip4=ipaddress
+	   host=hostname sec=mechanism uid=0xFF user=username */
+	desc_len = MAX_VER_STR_LEN +
+		   HOST_KEY_LEN + strlen(hostname) +
+		   IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+		   MAX_MECH_STR_LEN +
+		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
+
 	spnego_key = ERR_PTR(-ENOMEM);
 	description = kzalloc(desc_len, GFP_KERNEL);
 	if (description == NULL)
@@ -110,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 
 	dp = description + strlen(description);
 
-	/* for now, only sec=krb5 is valid */
+	/* for now, only sec=krb5 and sec=mskrb5 are valid */
 	if (server->secType == Kerberos)
 		sprintf(dp, ";sec=krb5");
+	else if (server->secType == MSKerberos)
+		sprintf(dp, ";sec=mskrb5");
 	else
 		goto out;
 
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 05a34b17a1ab..e4041ec4d712 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -23,7 +23,7 @@
 #ifndef _CIFS_SPNEGO_H
 #define _CIFS_SPNEGO_H
 
-#define CIFS_SPNEGO_UPCALL_VERSION 1
+#define CIFS_SPNEGO_UPCALL_VERSION 2
 
 /*
  * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 0e9fc2ba90ee..57ecdc83c26f 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -56,7 +56,7 @@ int match_sid(struct cifs_sid *ctsid)
 	struct cifs_sid *cwsid;
 
 	if (!ctsid)
-		return (-1);
+		return -1;
 
 	for (i = 0; i < NUM_WK_SIDS; ++i) {
 		cwsid = &(wksidarr[i].cifssid);
@@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
 		}
 
 		cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname));
-		return (0); /* sids compare/match */
+		return 0; /* sids compare/match */
 	}
 
 	cFYI(1, ("No matching sid"));
-	return (-1);
+	return -1;
 }
 
 /* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -102,16 +102,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 	int num_subauth, num_sat, num_saw;
 
 	if ((!ctsid) || (!cwsid))
-		return (0);
+		return 0;
 
 	/* compare the revision */
 	if (ctsid->revision != cwsid->revision)
-		return (0);
+		return 0;
 
 	/* compare all of the six auth values */
 	for (i = 0; i < 6; ++i) {
 		if (ctsid->authority[i] != cwsid->authority[i])
-			return (0);
+			return 0;
 	}
 
 	/* compare all of the subauth values if any */
@@ -121,11 +121,11 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
 	if (num_subauth) {
 		for (i = 0; i < num_subauth; ++i) {
 			if (ctsid->sub_auth[i] != cwsid->sub_auth[i])
-				return (0);
+				return 0;
 		}
 	}
 
-	return (1); /* sids compare/match */
+	return 1; /* sids compare/match */
 }
 
 
@@ -169,8 +169,7 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
 	for (i = 0; i < 6; i++)
 		ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
 	for (i = 0; i < 5; i++)
-		ngroup_sid_ptr->sub_auth[i] =
-				cpu_to_le32(group_sid_ptr->sub_auth[i]);
+		ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
 
 	return;
 }
@@ -285,7 +284,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
 	size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
 	pntace->size = cpu_to_le16(size);
 
-	return (size);
+	return size;
 }
 
 
@@ -426,7 +425,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
 	pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
 	pndacl->num_aces = cpu_to_le32(3);
 
-	return (0);
+	return 0;
 }
 
 
@@ -510,7 +509,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 			sizeof(struct cifs_sid)); */
 
 
-	return (0);
+	return 0;
 }
 
 
@@ -527,7 +526,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
 
 	if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
-		return (-EIO);
+		return -EIO;
 
 	owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
 				le32_to_cpu(pntsd->osidoffset));
@@ -550,7 +549,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	/* copy security descriptor control portion and owner and group sid */
 	copy_sec_desc(pntsd, pnntsd, sidsoffset);
 
-	return (rc);
+	return rc;
 }
 
 
@@ -629,11 +628,11 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
 
 	if (!inode)
-		return (rc);
+		return rc;
 
 	sb = inode->i_sb;
 	if (sb == NULL)
-		return (rc);
+		return rc;
 
 	cifs_sb = CIFS_SB(sb);
 	xid = GetXid();
@@ -652,7 +651,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 		if (rc != 0) {
 			cERROR(1, ("Unable to open file to set ACL"));
 			FreeXid(xid);
-			return (rc);
+			return rc;
 		}
 	}
 
@@ -665,7 +664,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 
 	FreeXid(xid);
 
-	return (rc);
+	return rc;
 }
 
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
@@ -715,7 +714,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 		if (!pnntsd) {
 			cERROR(1, ("Unable to allocate security descriptor"));
 			kfree(pntsd);
-			return (-ENOMEM);
+			return -ENOMEM;
 		}
 
 		rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
@@ -732,6 +731,6 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 		kfree(pntsd);
 	}
 
-	return (rc);
+	return rc;
 }
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4ff8939c6cc7..bd5f13d38450 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
 
 	if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
 		if (extended_security & CIFSSEC_MAY_PLNTXT) {
+			memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
 			memcpy(lnm_session_key, password_with_pad,
 				CIFS_ENCPWD_SIZE);
 			return;
@@ -310,9 +311,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
 	utf8 and other multibyte codepages each need their own strupper
 	function since a byte at a time will ont work. */
 
-	for (i = 0; i < CIFS_ENCPWD_SIZE; i++) {
+	for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
 		password_with_pad[i] = toupper(password_with_pad[i]);
-	}
 
 	SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key);
 	/* clear password before we return/free memory */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 22857c639df5..ac5915d61dca 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -175,6 +175,8 @@ out_no_root:
 	if (inode)
 		iput(inode);
 
+	cifs_umount(sb, cifs_sb);
+
 out_mount_failed:
 	if (cifs_sb) {
 #ifdef CONFIG_CIFS_DFS_UPCALL
@@ -267,15 +269,18 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int cifs_permission(struct inode *inode, int mask)
 {
 	struct cifs_sb_info *cifs_sb;
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
-		return 0;
-	else /* file mode might have been restricted at mount time
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) {
+		if ((mask & MAY_EXEC) && !execute_ok(inode))
+			return -EACCES;
+		else
+			return 0;
+	} else /* file mode might have been restricted at mount time
 		on the client (above and beyond ACL on servers) for
 		servers which do not support setting and viewing mode bits,
 		so allowing client to check permissions is useful */
@@ -307,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
 	file data or metadata */
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
+	cifs_inode->delete_pending = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 
 	/* Can not set i_flags here - they get immediately overwritten
@@ -615,6 +621,37 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 	return generic_file_llseek_unlocked(file, offset, origin);
 }
 
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
+{
+	/* note that this is called by vfs setlease with the BKL held
+	   although I doubt that BKL is needed here in cifs */
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	if (!(S_ISREG(inode->i_mode)))
+		return -EINVAL;
+
+	/* check if file is oplocked */
+	if (((arg == F_RDLCK) &&
+		(CIFS_I(inode)->clientCanCacheRead)) ||
+	    ((arg == F_WRLCK) &&
+		(CIFS_I(inode)->clientCanCacheAll)))
+		return generic_setlease(file, arg, lease);
+	else if (CIFS_SB(inode->i_sb)->tcon->local_lease &&
+			!CIFS_I(inode)->clientCanCacheRead)
+		/* If the server claims to support oplock on this
+		   file, then we still need to check oplock even
+		   if the local_lease mount option is set, but there
+		   are servers which do not support oplock for which
+		   this mount option may be useful if the user
+		   knows that the file won't be changed on the server
+		   by anyone else */
+		return generic_setlease(file, arg, lease);
+	else
+		return -EAGAIN;
+}
+#endif
+
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "cifs",
@@ -693,6 +730,7 @@ const struct file_operations cifs_file_ops = {
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -713,6 +751,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 const struct file_operations cifs_file_nobrl_ops = {
@@ -733,6 +772,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -752,6 +792,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	.dir_notify = cifs_dir_notify,
+	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
@@ -763,10 +804,11 @@ const struct file_operations cifs_dir_ops = {
 	.dir_notify = cifs_dir_notify,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
+	.llseek = generic_file_llseek,
 };
 
 static void
-cifs_init_once(struct kmem_cache *cachep, void *inode)
+cifs_init_once(void *inode)
 {
 	struct cifsInodeInfo *cifsi = inode;
 
@@ -930,36 +972,40 @@ static int cifs_oplock_thread(void *dummyarg)
 			schedule_timeout(39*HZ);
 		} else {
 			oplock_item = list_entry(GlobalOplock_Q.next,
-				struct oplock_q_entry, qhead);
-			if (oplock_item) {
-				cFYI(1, ("found oplock item to write out"));
-				pTcon = oplock_item->tcon;
-				inode = oplock_item->pinode;
-				netfid = oplock_item->netfid;
-				spin_unlock(&GlobalMid_Lock);
-				DeleteOplockQEntry(oplock_item);
-				/* can not grab inode sem here since it would
+						struct oplock_q_entry, qhead);
+			cFYI(1, ("found oplock item to write out"));
+			pTcon = oplock_item->tcon;
+			inode = oplock_item->pinode;
+			netfid = oplock_item->netfid;
+			spin_unlock(&GlobalMid_Lock);
+			DeleteOplockQEntry(oplock_item);
+			/* can not grab inode sem here since it would
 				deadlock when oplock received on delete
 				since vfs_unlink holds the i_mutex across
 				the call */
-				/* mutex_lock(&inode->i_mutex);*/
-				if (S_ISREG(inode->i_mode)) {
-					rc =
-					   filemap_fdatawrite(inode->i_mapping);
-					if (CIFS_I(inode)->clientCanCacheRead
-									 == 0) {
-						waitrc = filemap_fdatawait(inode->i_mapping);
-						invalidate_remote_inode(inode);
-					}
-					if (rc == 0)
-						rc = waitrc;
-				} else
-					rc = 0;
-				/* mutex_unlock(&inode->i_mutex);*/
-				if (rc)
-					CIFS_I(inode)->write_behind_rc = rc;
-				cFYI(1, ("Oplock flush inode %p rc %d",
-					inode, rc));
+			/* mutex_lock(&inode->i_mutex);*/
+			if (S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+				if (CIFS_I(inode)->clientCanCacheAll == 0)
+					break_lease(inode, FMODE_READ);
+				else if (CIFS_I(inode)->clientCanCacheRead == 0)
+					break_lease(inode, FMODE_WRITE);
+#endif
+				rc = filemap_fdatawrite(inode->i_mapping);
+				if (CIFS_I(inode)->clientCanCacheRead == 0) {
+					waitrc = filemap_fdatawait(
+							      inode->i_mapping);
+					invalidate_remote_inode(inode);
+				}
+				if (rc == 0)
+					rc = waitrc;
+			} else
+				rc = 0;
+			/* mutex_unlock(&inode->i_mutex);*/
+			if (rc)
+				CIFS_I(inode)->write_behind_rc = rc;
+			cFYI(1, ("Oplock flush inode %p rc %d",
+				inode, rc));
 
 				/* releasing stale oplock after recent reconnect
 				of smb session using a now incorrect file
@@ -967,15 +1013,13 @@ static int cifs_oplock_thread(void *dummyarg)
 				not bother sending an oplock release if session
 				to server still is disconnected since oplock
 				already released by the server in that case */
-				if (pTcon->tidStatus != CifsNeedReconnect) {
-				    rc = CIFSSMBLock(0, pTcon, netfid,
-					    0 /* len */ , 0 /* offset */, 0,
-					    0, LOCKING_ANDX_OPLOCK_RELEASE,
-					    false /* wait flag */);
-					cFYI(1, ("Oplock release rc = %d", rc));
-				}
-			} else
-				spin_unlock(&GlobalMid_Lock);
+			if (pTcon->tidStatus != CifsNeedReconnect) {
+				rc = CIFSSMBLock(0, pTcon, netfid,
+						0 /* len */ , 0 /* offset */, 0,
+						0, LOCKING_ANDX_OPLOCK_RELEASE,
+						false /* wait flag */);
+				cFYI(1, ("Oplock release rc = %d", rc));
+			}
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(1);  /* yield in case q were corrupt */
 		}
@@ -1001,8 +1045,7 @@ static int cifs_dnotify_thread(void *dummyarg)
 		list_for_each(tmp, &GlobalSMBSessionList) {
 			ses = list_entry(tmp, struct cifsSesInfo,
 				cifsSessionList);
-			if (ses && ses->server &&
-			     atomic_read(&ses->server->inFlight))
+			if (ses->server && atomic_read(&ses->server->inFlight))
 				wake_up_all(&ses->server->response_q);
 		}
 		read_unlock(&GlobalSMBSeslock);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 25a6cbd15529..074de0b5064d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
 		       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
 				  struct nameidata *);
-extern int cifs_unlink(struct inode *, struct dentry *);
+extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
 extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
 extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
 extern int cifs_mkdir(struct inode *, struct dentry *, int);
@@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.53"
+#define CIFS_VERSION   "1.55"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9cfcf326ead3..c791e5b5a914 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -27,7 +27,7 @@
 #define MAX_SES_INFO 2
 #define MAX_TCON_INFO 4
 
-#define MAX_TREE_SIZE 2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1
+#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
 #define MAX_SERVER_SIZE 15
 #define MAX_SHARE_SIZE  64	/* used to be 20, this should still be enough */
 #define MAX_USERNAME_SIZE 32	/* 32 is to allow for 15 char names + null
@@ -80,7 +80,8 @@ enum securityEnum {
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO */
 	NTLMSSP,		/* NTLMSSP via SPNEGO */
-	Kerberos		/* Kerberos via SPNEGO */
+	Kerberos,		/* Kerberos via SPNEGO */
+	MSKerberos,		/* MS Kerberos via SPNEGO */
 };
 
 enum protocolEnum {
@@ -284,6 +285,7 @@ struct cifsTconInfo {
 	bool seal:1;      /* transport encryption for this mounted share */
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
+	bool local_lease:1; /* check leases (only) on local system not remote */
 	/* BB add field for back pointer to sb struct(s)? */
 };
 
@@ -308,6 +310,7 @@ struct cifs_search_info {
 	__u32 resume_key;
 	char *ntwrk_buf_start;
 	char *srch_entries_start;
+	char *last_entry;
 	char *presume_name;
 	unsigned int resume_name_len;
 	bool endOfSearch:1;
@@ -351,6 +354,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool oplockPending:1;
+	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
 	struct inode vfs_inode;
 };
 
@@ -537,8 +541,8 @@ require use of the stronger protocol */
 #endif /* WEAK_PW_HASH */
 #define   CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
 
-#define   CIFSSEC_DEF  CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2
-#define   CIFSSEC_MAX  CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2
+#define   CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
+#define   CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
 #define   CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5)
 /*
  *****************************************************************
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 0f327c224da3..d2a073edd1b8 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -31,7 +31,7 @@
 #else
 #define CIFS_PROT   0
 #endif
-#define POSIX_PROT  CIFS_PROT+1
+#define POSIX_PROT  (CIFS_PROT+1)
 #define BAD_PROT 0xFFFF
 
 /* SMB command codes */
@@ -262,7 +262,7 @@
  */
 #define CIFS_NO_HANDLE        0xFFFF
 
-#define NO_CHANGE_64          cpu_to_le64(0xFFFFFFFFFFFFFFFFULL)
+#define NO_CHANGE_64          0xFFFFFFFFFFFFFFFFULL
 #define NO_CHANGE_32          0xFFFFFFFFUL
 
 /* IPC$ in ASCII */
@@ -341,7 +341,7 @@
 #define CREATE_COMPLETE_IF_OPLK 0x00000100	/* should be zero */
 #define CREATE_NO_EA_KNOWLEDGE  0x00000200
 #define CREATE_EIGHT_DOT_THREE  0x00000400	/* doc says this is obsolete
-						 "open for recovery" flag - should
+						 "open for recovery" flag should
 						 be zero in any case */
 #define CREATE_OPEN_FOR_RECOVERY 0x00000400
 #define CREATE_RANDOM_ACCESS	0x00000800
@@ -414,8 +414,8 @@ struct smb_hdr {
 	__u8 WordCount;
 } __attribute__((packed));
 /* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) ( *(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) ( *(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
 /* given a pointer to an smb_hdr retrieve the pointer to the byte area */
 #define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b9f5e935f821..0cff7fe986e8 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -172,12 +172,15 @@ extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon);
 extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
 			struct kstatfs *FSData);
 
-extern int CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const char *fileName, const FILE_BASIC_INFO *data,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
-extern int CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
-			const FILE_BASIC_INFO *data, __u16 fid);
+extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+			const FILE_BASIC_INFO *data, __u16 fid,
+			__u32 pid_of_opener);
+extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+			bool delete_file, __u16 fid, __u32 pid_of_opener);
 #if 0
 extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
 			char *fileName, __u16 dos_attributes,
@@ -191,9 +194,20 @@ extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon,
 			 __u64 size, __u16 fileHandle, __u32 opener_pid,
 			bool AllocSizeFlag);
-extern int CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *pTcon,
-			char *full_path, __u64 mode, __u64 uid,
-			__u64 gid, dev_t dev,
+
+struct cifs_unix_set_info_args {
+	__u64	ctime;
+	__u64	atime;
+	__u64	mtime;
+	__u64	mode;
+	__u64	uid;
+	__u64	gid;
+	dev_t	device;
+};
+
+extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon,
+			char *fileName,
+			const struct cifs_unix_set_info_args *args,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 
@@ -217,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
-			int netfid, char *target_name,
+			int netfid, const char *target_name,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4511b708f0f3..843a85fb8b9a 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -128,8 +128,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	write_lock(&GlobalSMBSeslock);
 	list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
 		open_file = list_entry(tmp, struct cifsFileInfo, tlist);
-		if (open_file)
-			open_file->invalidHandle = true;
+		open_file->invalidHandle = true;
 	}
 	write_unlock(&GlobalSMBSeslock);
 	/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
@@ -686,11 +685,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 						 SecurityBlob,
 						 count - 16,
 						 &server->secType);
-			if (rc == 1) {
+			if (rc == 1)
 				rc = 0;
-			} else {
+			else
 				rc = -EINVAL;
-			}
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -1311,6 +1309,7 @@ OldOpenRetry:
 				cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile));
 			pfile_info->EndOfFile = pfile_info->AllocationSize;
 			pfile_info->NumberOfLinks = cpu_to_le32(1);
+			pfile_info->DeletePending = 0;
 		}
 	}
 
@@ -1412,6 +1411,7 @@ openRetry:
 		    pfile_info->AllocationSize = pSMBr->AllocationSize;
 		    pfile_info->EndOfFile = pSMBr->EndOfFile;
 		    pfile_info->NumberOfLinks = cpu_to_le32(1);
+		    pfile_info->DeletePending = 0;
 		}
 	}
 
@@ -2019,7 +2019,7 @@ renameRetry:
 }
 
 int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
-		int netfid, char *target_name,
+		int netfid, const char *target_name,
 		const struct nls_table *nls_codepage, int remap)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
@@ -2073,7 +2073,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 					remap);
 	}
 	rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
-	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2;
+	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
 	byte_count += count;
 	pSMB->DataCount = cpu_to_le16(count);
 	pSMB->TotalDataCount = pSMB->DataCount;
@@ -3616,6 +3616,8 @@ findFirstRetry:
 		/* BB remember to free buffer if error BB */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc == 0) {
+			unsigned int lnoff;
+
 			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
 				psrch_inf->unicode = true;
 			else
@@ -3638,6 +3640,17 @@ findFirstRetry:
 					le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
 				psrch_inf->entries_in_buffer;
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+			      lnoff) {
+				cERROR(1, ("ignoring corrupt resume name"));
+				psrch_inf->last_entry = NULL;
+				return rc;
+			}
+
+			psrch_inf->last_entry = psrch_inf->srch_entries_start +
+							lnoff;
+
 			*pnetfid = parms->SearchHandle;
 		} else {
 			cifs_buf_release(pSMB);
@@ -3727,6 +3740,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc == 0) {
+			unsigned int lnoff;
+
 			/* BB fixme add lock for file (srch_info) struct here */
 			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
 				psrch_inf->unicode = true;
@@ -3753,6 +3768,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 						le16_to_cpu(parms->SearchCount);
 			psrch_inf->index_of_last_entry +=
 				psrch_inf->entries_in_buffer;
+			lnoff = le16_to_cpu(parms->LastNameOffset);
+			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+			      lnoff) {
+				cERROR(1, ("ignoring corrupt resume name"));
+				psrch_inf->last_entry = NULL;
+				return rc;
+			} else
+				psrch_inf->last_entry =
+					psrch_inf->srch_entries_start + lnoff;
+
 /*  cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
 	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
 
@@ -3914,7 +3939,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	bool is_unicode;
 	struct dfs_referral_level_3 *ref;
 
-	is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE;
+	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+		is_unicode = true;
+	else
+		is_unicode = false;
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
@@ -4814,8 +4842,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
    time and resort to the original setpathinfo level which takes the ancient
    DOS time format with 2 second granularity */
 int
-CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
-		    const FILE_BASIC_INFO *data, __u16 fid)
+CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+		    const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
 	char *data_offset;
@@ -4828,11 +4856,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
 	if (rc)
 		return rc;
 
-	/* At this point there is no need to override the current pid
-	with the pid of the opener, but that could change if we someday
-	use an existing handle (rather than opening one on the fly) */
-	/* pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
-	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));*/
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
 
 	params = 6;
 	pSMB->MaxSetupCount = 0;
@@ -4878,11 +4903,66 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
 	return rc;
 }
 
+int
+CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+			  bool delete_file, __u16 fid, __u32 pid_of_opener)
+{
+	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
+	char *data_offset;
+	int rc = 0;
+	__u16 params, param_offset, offset, byte_count, count;
+
+	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+	if (rc)
+		return rc;
+
+	pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+	pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+	params = 6;
+	pSMB->MaxSetupCount = 0;
+	pSMB->Reserved = 0;
+	pSMB->Flags = 0;
+	pSMB->Timeout = 0;
+	pSMB->Reserved2 = 0;
+	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+	offset = param_offset + params;
+
+	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+
+	count = 1;
+	pSMB->MaxParameterCount = cpu_to_le16(2);
+	/* BB find max SMB PDU from sess */
+	pSMB->MaxDataCount = cpu_to_le16(1000);
+	pSMB->SetupCount = 1;
+	pSMB->Reserved3 = 0;
+	pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+	byte_count = 3 /* pad */  + params + count;
+	pSMB->DataCount = cpu_to_le16(count);
+	pSMB->ParameterCount = cpu_to_le16(params);
+	pSMB->TotalDataCount = pSMB->DataCount;
+	pSMB->TotalParameterCount = pSMB->ParameterCount;
+	pSMB->ParameterOffset = cpu_to_le16(param_offset);
+	pSMB->DataOffset = cpu_to_le16(offset);
+	pSMB->Fid = fid;
+	pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
+	pSMB->Reserved4 = 0;
+	pSMB->hdr.smb_buf_length += byte_count;
+	pSMB->ByteCount = cpu_to_le16(byte_count);
+	*data_offset = delete_file ? 1 : 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	if (rc)
+		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+
+	return rc;
+}
 
 int
-CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, const char *fileName,
-		const FILE_BASIC_INFO *data,
-		const struct nls_table *nls_codepage, int remap)
+CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+		   const char *fileName, const FILE_BASIC_INFO *data,
+		   const struct nls_table *nls_codepage, int remap)
 {
 	TRANSACTION2_SPI_REQ *pSMB = NULL;
 	TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5011,10 +5091,9 @@ SetAttrLgcyRetry:
 #endif /* temporarily unneeded SetAttr legacy function */
 
 int
-CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon,
-		    char *fileName, __u64 mode, __u64 uid, __u64 gid,
-		    dev_t device, const struct nls_table *nls_codepage,
-		    int remap)
+CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+		   const struct cifs_unix_set_info_args *args,
+		   const struct nls_table *nls_codepage, int remap)
 {
 	TRANSACTION2_SPI_REQ *pSMB = NULL;
 	TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5023,6 +5102,7 @@ CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
+	__u64 mode = args->mode;
 
 	cFYI(1, ("In SetUID/GID/Mode"));
 setPermsRetry:
@@ -5078,16 +5158,16 @@ setPermsRetry:
 	set file size and do not want to truncate file size to zero
 	accidently as happened on one Samba server beta by putting
 	zero instead of -1 here */
-	data_offset->EndOfFile = NO_CHANGE_64;
-	data_offset->NumOfBytes = NO_CHANGE_64;
-	data_offset->LastStatusChange = NO_CHANGE_64;
-	data_offset->LastAccessTime = NO_CHANGE_64;
-	data_offset->LastModificationTime = NO_CHANGE_64;
-	data_offset->Uid = cpu_to_le64(uid);
-	data_offset->Gid = cpu_to_le64(gid);
+	data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+	data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+	data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+	data_offset->LastAccessTime = cpu_to_le64(args->atime);
+	data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+	data_offset->Uid = cpu_to_le64(args->uid);
+	data_offset->Gid = cpu_to_le64(args->gid);
 	/* better to leave device as zero when it is  */
-	data_offset->DevMajor = cpu_to_le64(MAJOR(device));
-	data_offset->DevMinor = cpu_to_le64(MINOR(device));
+	data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+	data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
 	data_offset->Permissions = cpu_to_le64(mode);
 
 	if (S_ISREG(mode))
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e8fa46c7cff2..71b7661e2260 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -90,6 +90,8 @@ struct smb_vol {
 	bool nocase:1;     /* request case insensitive filenames */
 	bool nobrl:1;      /* disable sending byte range locks to srv */
 	bool seal:1;       /* request transport encryption on share */
+	bool nodfs:1;      /* Do not request DFS, even if available */
+	bool local_lease:1; /* check leases only on local system, not remote */
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -124,7 +126,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	struct mid_q_entry *mid_entry;
 
 	spin_lock(&GlobalMid_Lock);
-	if (kthread_should_stop()) {
+	if (server->tcpStatus == CifsExiting) {
 		/* the demux thread will exit normally
 		next time through the loop */
 		spin_unlock(&GlobalMid_Lock);
@@ -151,7 +153,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	}
 	list_for_each(tmp, &GlobalTreeConnectionList) {
 		tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
-		if ((tcon) && (tcon->ses) && (tcon->ses->server == server))
+		if ((tcon->ses) && (tcon->ses->server == server))
 			tcon->tidStatus = CifsNeedReconnect;
 	}
 	read_unlock(&GlobalSMBSeslock);
@@ -173,20 +175,19 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		mid_entry = list_entry(tmp, struct
 					mid_q_entry,
 					qhead);
-		if (mid_entry) {
-			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
+		if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
 				/* Mark other intransit requests as needing
 				   retry so we do not immediately mark the
 				   session bad again (ie after we reconnect
 				   below) as they timeout too */
-				mid_entry->midState = MID_RETRY_NEEDED;
-			}
+			mid_entry->midState = MID_RETRY_NEEDED;
 		}
 	}
 	spin_unlock(&GlobalMid_Lock);
 	up(&server->tcpSem);
 
-	while ((!kthread_should_stop()) && (server->tcpStatus != CifsGood)) {
+	while ((server->tcpStatus != CifsExiting) &&
+	       (server->tcpStatus != CifsGood)) {
 		try_to_freeze();
 		if (server->protocolType == IPV6) {
 			rc = ipv6_connect(&server->addr.sockAddr6,
@@ -203,7 +204,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
 			spin_lock(&GlobalMid_Lock);
-			if (!kthread_should_stop())
+			if (server->tcpStatus != CifsExiting)
 				server->tcpStatus = CifsGood;
 			server->sequence_number = 0;
 			spin_unlock(&GlobalMid_Lock);
@@ -351,16 +352,14 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 
 	current->flags |= PF_MEMALLOC;
 	cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
-	write_lock(&GlobalSMBSeslock);
-	atomic_inc(&tcpSesAllocCount);
-	length = tcpSesAllocCount.counter;
-	write_unlock(&GlobalSMBSeslock);
-	if (length  > 1)
+
+	length = atomic_inc_return(&tcpSesAllocCount);
+	if (length > 1)
 		mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
 				GFP_KERNEL);
 
 	set_freezable();
-	while (!kthread_should_stop()) {
+	while (server->tcpStatus != CifsExiting) {
 		if (try_to_freeze())
 			continue;
 		if (bigbuf == NULL) {
@@ -401,7 +400,7 @@ incomplete_rcv:
 		    kernel_recvmsg(csocket, &smb_msg,
 				&iov, 1, pdu_length, 0 /* BB other flags? */);
 
-		if (kthread_should_stop()) {
+		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
 			cFYI(1, ("Reconnect after server stopped responding"));
@@ -455,7 +454,7 @@ incomplete_rcv:
 		/* Note that FC 1001 length is big endian on the wire,
 		but we convert it here so it is always manipulated
 		as host byte order */
-		pdu_length = ntohl(smb_buffer->smb_buf_length);
+		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 		smb_buffer->smb_buf_length = pdu_length;
 
 		cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
@@ -526,7 +525,7 @@ incomplete_rcv:
 		     total_read += length) {
 			length = kernel_recvmsg(csocket, &smb_msg, &iov, 1,
 						pdu_length - total_read, 0);
-			if (kthread_should_stop() ||
+			if ((server->tcpStatus == CifsExiting) ||
 			    (length == -EINTR)) {
 				/* then will exit */
 				reconnect = 2;
@@ -655,14 +654,6 @@ multi_t2_fnd:
 	spin_unlock(&GlobalMid_Lock);
 	wake_up_all(&server->response_q);
 
-	/* don't exit until kthread_stop is called */
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_UNINTERRUPTIBLE);
-	}
-	set_current_state(TASK_RUNNING);
-
 	/* check if we have blocked requests that need to free */
 	/* Note that cifs_max_pending is normally 50, but
 	can be set at module install time to as little as two */
@@ -745,14 +736,11 @@ multi_t2_fnd:
 		coming home not much else we can do but free the memory */
 	}
 
-	write_lock(&GlobalSMBSeslock);
-	atomic_dec(&tcpSesAllocCount);
-	length = tcpSesAllocCount.counter;
-
 	/* last chance to mark ses pointers invalid
 	if there are any pointing to this (e.g
 	if a crazy root user tried to kill cifsd
 	kernel thread explicitly this might happen) */
+	write_lock(&GlobalSMBSeslock);
 	list_for_each(tmp, &GlobalSMBSessionList) {
 		ses = list_entry(tmp, struct cifsSesInfo,
 				cifsSessionList);
@@ -762,11 +750,24 @@ multi_t2_fnd:
 	write_unlock(&GlobalSMBSeslock);
 
 	kfree(server->hostname);
+	task_to_wake = xchg(&server->tsk, NULL);
 	kfree(server);
+
+	length = atomic_dec_return(&tcpSesAllocCount);
 	if (length  > 0)
 		mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
 				GFP_KERNEL);
 
+	/* if server->tsk was NULL then wait for a signal before exiting */
+	if (!task_to_wake) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!signal_pending(current)) {
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+		set_current_state(TASK_RUNNING);
+	}
+
 	return 0;
 }
 
@@ -1223,6 +1224,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->sfu_emul = 1;
 		} else if (strnicmp(data, "nosfu", 5) == 0) {
 			vol->sfu_emul = 0;
+		} else if (strnicmp(data, "nodfs", 5) == 0) {
+			vol->nodfs = 1;
 		} else if (strnicmp(data, "posixpaths", 10) == 0) {
 			vol->posix_paths = 1;
 		} else if (strnicmp(data, "noposixpaths", 12) == 0) {
@@ -1273,6 +1276,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->no_psx_acl = 0;
 		} else if (strnicmp(data, "noacl", 5) == 0) {
 			vol->no_psx_acl = 1;
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		} else if (strnicmp(data, "locallease", 6) == 0) {
+			vol->local_lease = 1;
+#endif
 		} else if (strnicmp(data, "sign", 4) == 0) {
 			vol->secFlg |= CIFSSEC_MUST_SIGN;
 		} else if (strnicmp(data, "seal", 4) == 0) {
@@ -1461,6 +1468,39 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	return rc;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key cifs_key[2];
+static struct lock_class_key cifs_slock_key[2];
+
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
+		&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	BUG_ON(sock_owned_by_user(sk));
+	sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
+		&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
+}
+#else
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+}
+#endif
+
 /* See RFC1001 section 14 on representation of Netbios names */
 static void rfc1002mangle(char *target, char *source, unsigned int length)
 {
@@ -1495,6 +1535,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
 			cFYI(1, ("Socket created"));
 			(*csocket)->sk->sk_allocation = GFP_NOFS;
+			cifs_reclassify_socket4(*csocket);
 		}
 	}
 
@@ -1627,6 +1668,7 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
 			 cFYI(1, ("ipv6 Socket created"));
 			(*csocket)->sk->sk_allocation = GFP_NOFS;
+			cifs_reclassify_socket6(*csocket);
 		}
 	}
 
@@ -1815,6 +1857,16 @@ convert_delimiter(char *path, char delim)
 	}
 }
 
+static void
+kill_cifsd(struct TCP_Server_Info *server)
+{
+	struct task_struct *task;
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -2136,6 +2188,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			   for the retry flag is used */
 			tcon->retry = volume_info.retry;
 			tcon->nocase = volume_info.nocase;
+			tcon->local_lease = volume_info.local_lease;
 			if (tcon->seal != volume_info.seal)
 				cERROR(1, ("transport encryption setting "
 					   "conflicts with existing tid"));
@@ -2167,6 +2220,12 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						volume_info.UNC,
 						tcon, cifs_sb->local_nls);
 					cFYI(1, ("CIFS Tcon rc = %d", rc));
+					if (volume_info.nodfs) {
+						tcon->Flags &=
+							~SMB_SHARE_IS_IN_DFS;
+						cFYI(1, ("DFS disabled (%d)",
+							tcon->Flags));
+					}
 				}
 				if (!rc) {
 					atomic_inc(&pSesInfo->inUse);
@@ -2195,14 +2254,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			spin_lock(&GlobalMid_Lock);
 			srvTcp->tcpStatus = CifsExiting;
 			spin_unlock(&GlobalMid_Lock);
-			if (srvTcp->tsk) {
-				/* If we could verify that kthread_stop would
-				   always wake up processes blocked in
-				   tcp in recv_mesg then we could remove the
-				   send_sig call */
-				force_sig(SIGKILL, srvTcp->tsk);
-				kthread_stop(srvTcp->tsk);
-			}
+			kill_cifsd(srvTcp);
 		}
 		 /* If find_unc succeeded then rc == 0 so we can not end */
 		if (tcon)  /* up accidently freeing someone elses tcon struct */
@@ -2215,19 +2267,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 					temp_rc = CIFSSMBLogoff(xid, pSesInfo);
 					/* if the socketUseCount is now zero */
 					if ((temp_rc == -ESHUTDOWN) &&
-					    (pSesInfo->server) &&
-					    (pSesInfo->server->tsk)) {
-						force_sig(SIGKILL,
-							pSesInfo->server->tsk);
-						kthread_stop(pSesInfo->server->tsk);
-					}
+					    (pSesInfo->server))
+						kill_cifsd(pSesInfo->server);
 				} else {
 					cFYI(1, ("No session or bad tcon"));
-					if ((pSesInfo->server) &&
-					    (pSesInfo->server->tsk)) {
-						force_sig(SIGKILL,
-							pSesInfo->server->tsk);
-						kthread_stop(pSesInfo->server->tsk);
+					if (pSesInfo->server) {
+						spin_lock(&GlobalMid_Lock);
+						srvTcp->tcpStatus = CifsExiting;
+						spin_unlock(&GlobalMid_Lock);
+						kill_cifsd(pSesInfo->server);
 					}
 				}
 				sesInfoFree(pSesInfo);
@@ -3514,7 +3562,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	int rc = 0;
 	int xid;
 	struct cifsSesInfo *ses = NULL;
-	struct task_struct *cifsd_task;
 	char *tmp;
 
 	xid = GetXid();
@@ -3530,7 +3577,6 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 		tconInfoFree(cifs_sb->tcon);
 		if ((ses) && (ses->server)) {
 			/* save off task so we do not refer to ses later */
-			cifsd_task = ses->server->tsk;
 			cFYI(1, ("About to do SMBLogoff "));
 			rc = CIFSSMBLogoff(xid, ses);
 			if (rc == -EBUSY) {
@@ -3538,10 +3584,8 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 				return 0;
 			} else if (rc == -ESHUTDOWN) {
 				cFYI(1, ("Waking up socket by sending signal"));
-				if (cifsd_task) {
-					force_sig(SIGKILL, cifsd_task);
-					kthread_stop(cifsd_task);
-				}
+				if (ses->server)
+					kill_cifsd(ses->server);
 				rc = 0;
 			} /* else - we have an smb session
 				left on this socket do not kill cifsd */
@@ -3568,19 +3612,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
 	bool ntlmv2_flag = false;
 	int first_time = 0;
+	struct TCP_Server_Info *server = pSesInfo->server;
 
 	/* what if server changes its buffer size after dropping the session? */
-	if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ {
+	if (server->maxBuf == 0) /* no need to send on reconnect */ {
 		rc = CIFSSMBNegotiate(xid, pSesInfo);
-		if (rc == -EAGAIN) /* retry only once on 1st time connection */ {
+		if (rc == -EAGAIN) {
+			/* retry only once on 1st time connection */
 			rc = CIFSSMBNegotiate(xid, pSesInfo);
 			if (rc == -EAGAIN)
 				rc = -EHOSTDOWN;
 		}
 		if (rc == 0) {
 			spin_lock(&GlobalMid_Lock);
-			if (pSesInfo->server->tcpStatus != CifsExiting)
-				pSesInfo->server->tcpStatus = CifsGood;
+			if (server->tcpStatus != CifsExiting)
+				server->tcpStatus = CifsGood;
 			else
 				rc = -EHOSTDOWN;
 			spin_unlock(&GlobalMid_Lock);
@@ -3588,97 +3634,92 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		}
 		first_time = 1;
 	}
-	if (!rc) {
-		pSesInfo->flags = 0;
-		pSesInfo->capabilities = pSesInfo->server->capabilities;
-		if (linuxExtEnabled == 0)
-			pSesInfo->capabilities &= (~CAP_UNIX);
+
+	if (rc)
+		goto ss_err_exit;
+
+	pSesInfo->flags = 0;
+	pSesInfo->capabilities = server->capabilities;
+	if (linuxExtEnabled == 0)
+		pSesInfo->capabilities &= (~CAP_UNIX);
 	/*	pSesInfo->sequence_number = 0;*/
-		cFYI(1,
-		      ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
-			pSesInfo->server->secMode,
-			pSesInfo->server->capabilities,
-			pSesInfo->server->timeAdj));
-		if (experimEnabled < 2)
-			rc = CIFS_SessSetup(xid, pSesInfo,
-					    first_time, nls_info);
-		else if (extended_security
-				&& (pSesInfo->capabilities
-					& CAP_EXTENDED_SECURITY)
-				&& (pSesInfo->server->secType == NTLMSSP)) {
-			rc = -EOPNOTSUPP;
-		} else if (extended_security
-			   && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
-			   && (pSesInfo->server->secType == RawNTLMSSP)) {
-			cFYI(1, ("NTLMSSP sesssetup"));
-			rc = CIFSNTLMSSPNegotiateSessSetup(xid,
-						pSesInfo,
-						&ntlmv2_flag,
-						nls_info);
-			if (!rc) {
-				if (ntlmv2_flag) {
-					char *v2_response;
-					cFYI(1, ("more secure NTLM ver2 hash"));
-					if (CalcNTLMv2_partial_mac_key(pSesInfo,
-						nls_info)) {
-						rc = -ENOMEM;
-						goto ss_err_exit;
-					} else
-						v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL);
-					if (v2_response) {
-						CalcNTLMv2_response(pSesInfo,
-								   v2_response);
-				/*		if (first_time)
-						  cifs_calculate_ntlmv2_mac_key(
-						   pSesInfo->server->mac_signing_key,
-						   response, ntlm_session_key,*/
-						kfree(v2_response);
+	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+		 server->secMode, server->capabilities, server->timeAdj));
+
+	if (experimEnabled < 2)
+		rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
+	else if (extended_security
+			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+			&& (server->secType == NTLMSSP)) {
+		rc = -EOPNOTSUPP;
+	} else if (extended_security
+			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+			&& (server->secType == RawNTLMSSP)) {
+		cFYI(1, ("NTLMSSP sesssetup"));
+		rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
+						   nls_info);
+		if (!rc) {
+			if (ntlmv2_flag) {
+				char *v2_response;
+				cFYI(1, ("more secure NTLM ver2 hash"));
+				if (CalcNTLMv2_partial_mac_key(pSesInfo,
+								nls_info)) {
+					rc = -ENOMEM;
+					goto ss_err_exit;
+				} else
+					v2_response = kmalloc(16 + 64 /* blob*/,
+								GFP_KERNEL);
+				if (v2_response) {
+					CalcNTLMv2_response(pSesInfo,
+								v2_response);
+				/*	if (first_time)
+						cifs_calculate_ntlmv2_mac_key */
+					kfree(v2_response);
 					/* BB Put dummy sig in SessSetup PDU? */
-					} else {
-						rc = -ENOMEM;
-						goto ss_err_exit;
-					}
-
 				} else {
-					SMBNTencrypt(pSesInfo->password,
-						pSesInfo->server->cryptKey,
-						ntlm_session_key);
-
-					if (first_time)
-						cifs_calculate_mac_key(
-							&pSesInfo->server->mac_signing_key,
-							ntlm_session_key,
-							pSesInfo->password);
+					rc = -ENOMEM;
+					goto ss_err_exit;
 				}
+
+			} else {
+				SMBNTencrypt(pSesInfo->password,
+					     server->cryptKey,
+					     ntlm_session_key);
+
+				if (first_time)
+					cifs_calculate_mac_key(
+					     &server->mac_signing_key,
+					     ntlm_session_key,
+					     pSesInfo->password);
+			}
 			/* for better security the weaker lanman hash not sent
 			   in AuthSessSetup so we no longer calculate it */
 
-				rc = CIFSNTLMSSPAuthSessSetup(xid,
-					pSesInfo,
-					ntlm_session_key,
-					ntlmv2_flag,
-					nls_info);
-			}
-		} else { /* old style NTLM 0.12 session setup */
-			SMBNTencrypt(pSesInfo->password,
-				pSesInfo->server->cryptKey,
-				ntlm_session_key);
+			rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo,
+						      ntlm_session_key,
+						      ntlmv2_flag,
+						      nls_info);
+		}
+	} else { /* old style NTLM 0.12 session setup */
+		SMBNTencrypt(pSesInfo->password, server->cryptKey,
+			     ntlm_session_key);
 
-			if (first_time)
-				cifs_calculate_mac_key(
-					&pSesInfo->server->mac_signing_key,
-					ntlm_session_key, pSesInfo->password);
+		if (first_time)
+			cifs_calculate_mac_key(&server->mac_signing_key,
+						ntlm_session_key,
+						pSesInfo->password);
 
-			rc = CIFSSessSetup(xid, pSesInfo,
-				ntlm_session_key, nls_info);
-		}
-		if (rc) {
-			cERROR(1, ("Send error in SessSetup = %d", rc));
-		} else {
-			cFYI(1, ("CIFS Session Established successfully"));
+		rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
+	}
+	if (rc) {
+		cERROR(1, ("Send error in SessSetup = %d", rc));
+	} else {
+		cFYI(1, ("CIFS Session Established successfully"));
+			spin_lock(&GlobalMid_Lock);
 			pSesInfo->status = CifsGood;
-		}
+			spin_unlock(&GlobalMid_Lock);
 	}
+
 ss_err_exit:
 	return rc;
 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index fb69c1fa85c9..e962e75e6f7b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -226,23 +226,28 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		/* If Open reported that we actually created a file
 		then we now have to set the mode if possible */
 		if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+			struct cifs_unix_set_info_args args = {
+				.mode	= mode,
+				.ctime	= NO_CHANGE_64,
+				.atime	= NO_CHANGE_64,
+				.mtime	= NO_CHANGE_64,
+				.device	= 0,
+			};
+
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-				CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode,
-					(__u64)current->fsuid,
-					(__u64)current->fsgid,
-					0 /* dev */,
-					cifs_sb->local_nls,
-					cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
+				args.uid = (__u64) current->fsuid;
+				if (inode->i_mode & S_ISGID)
+					args.gid = (__u64) inode->i_gid;
+				else
+					args.gid = (__u64) current->fsgid;
 			} else {
-				CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode,
-					(__u64)-1,
-					(__u64)-1,
-					0 /* dev */,
-					cifs_sb->local_nls,
-					cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
+				args.uid = NO_CHANGE_64;
+				args.gid = NO_CHANGE_64;
 			}
+			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+				cifs_sb->local_nls,
+				cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		} else {
 			/* BB implement mode setting via Windows security
 			   descriptors e.g. */
@@ -267,7 +272,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 				    (cifs_sb->mnt_cifs_flags &
 				     CIFS_MOUNT_SET_UID)) {
 					newinode->i_uid = current->fsuid;
-					newinode->i_gid = current->fsgid;
+					if (inode->i_mode & S_ISGID)
+						newinode->i_gid =
+							inode->i_gid;
+					else
+						newinode->i_gid =
+							current->fsgid;
 				}
 			}
 		}
@@ -357,21 +367,24 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	if (full_path == NULL)
 		rc = -ENOMEM;
 	else if (pTcon->unix_ext) {
-		mode &= ~current->fs->umask;
+		struct cifs_unix_set_info_args args = {
+			.mode	= mode & ~current->fs->umask,
+			.ctime	= NO_CHANGE_64,
+			.atime	= NO_CHANGE_64,
+			.mtime	= NO_CHANGE_64,
+			.device	= device_number,
+		};
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-			rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path,
-				mode, (__u64)current->fsuid,
-				(__u64)current->fsgid,
-				device_number, cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+			args.uid = (__u64) current->fsuid;
+			args.gid = (__u64) current->fsgid;
 		} else {
-			rc = CIFSSMBUnixSetPerms(xid, pTcon,
-				full_path, mode, (__u64)-1, (__u64)-1,
-				device_number, cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+			args.uid = NO_CHANGE_64;
+			args.gid = NO_CHANGE_64;
 		}
+		rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path,
+			&args, cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 		if (!rc) {
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index f730ef35499e..1e0c1bd8f2e4 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,38 +29,13 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static int dns_resolver_instantiate(struct key *key, const void *data,
-		size_t datalen)
-{
-	int rc = 0;
-	char *ip;
-
-	ip = kmalloc(datalen+1, GFP_KERNEL);
-	if (!ip)
-		return -ENOMEM;
-
-	memcpy(ip, data, datalen);
-	ip[datalen] = '\0';
-
-	rcu_assign_pointer(key->payload.data, ip);
-
-	return rc;
-}
-
-struct key_type key_type_dns_resolver = {
-	.name        = "dns_resolver",
-	.def_datalen = sizeof(struct in_addr),
-	.describe    = user_describe,
-	.instantiate = dns_resolver_instantiate,
-	.match       = user_match,
-};
-
 /* Checks if supplied name is IP address
  * returns:
  * 		1 - name is IP
  * 		0 - name is not IP
  */
-static int is_ip(const char *name)
+static int
+is_ip(const char *name)
 {
 	int rc;
 	struct sockaddr_in sin_server;
@@ -82,6 +57,47 @@ static int is_ip(const char *name)
 	return 0;
 }
 
+static int
+dns_resolver_instantiate(struct key *key, const void *data,
+		size_t datalen)
+{
+	int rc = 0;
+	char *ip;
+
+	ip = kmalloc(datalen + 1, GFP_KERNEL);
+	if (!ip)
+		return -ENOMEM;
+
+	memcpy(ip, data, datalen);
+	ip[datalen] = '\0';
+
+	/* make sure this looks like an address */
+	if (!is_ip((const char *) ip)) {
+		kfree(ip);
+		return -EINVAL;
+	}
+
+	key->type_data.x[0] = datalen;
+	rcu_assign_pointer(key->payload.data, ip);
+
+	return rc;
+}
+
+static void
+dns_resolver_destroy(struct key *key)
+{
+	kfree(key->payload.data);
+}
+
+struct key_type key_type_dns_resolver = {
+	.name        = "dns_resolver",
+	.def_datalen = sizeof(struct in_addr),
+	.describe    = user_describe,
+	.instantiate = dns_resolver_instantiate,
+	.destroy     = dns_resolver_destroy,
+	.match       = user_match,
+};
+
 /* Resolves server name to ip address.
  * input:
  * 	unc - server UNC
@@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 
 	rkey = request_key(&key_type_dns_resolver, name, "");
 	if (!IS_ERR(rkey)) {
+		len = rkey->type_data.x[0];
 		data = rkey->payload.data;
 	} else {
 		cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 
 skip_upcall:
 	if (data) {
-		len = strlen(data);
-		*ip_addr = kmalloc(len+1, GFP_KERNEL);
+		*ip_addr = kmalloc(len + 1, GFP_KERNEL);
 		if (*ip_addr) {
-			memcpy(*ip_addr, data, len);
-			(*ip_addr)[len] = '\0';
+			memcpy(*ip_addr, data, len + 1);
 			if (!IS_ERR(rkey))
 				cFYI(1, ("%s: resolved: %s to %s", __func__,
 							name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0aac824371a5..62d8bd8f14c0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 
 	/* want handles we can use to read with first
 	   in the list so we do not have to walk the
-	   list to search for one in prepare_write */
+	   list to search for one in write_begin */
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
 		list_add_tail(&pCifsFile->flist,
 			      &pCifsInode->openFileList);
@@ -310,18 +310,19 @@ int cifs_open(struct inode *inode, struct file *file)
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
 		if (pTcon->unix_ext) {
-			CIFSSMBUnixSetPerms(xid, pTcon, full_path,
-					    inode->i_mode,
-					    (__u64)-1, (__u64)-1, 0 /* dev */,
+			struct cifs_unix_set_info_args args = {
+				.mode	= inode->i_mode,
+				.uid	= NO_CHANGE_64,
+				.gid	= NO_CHANGE_64,
+				.ctime	= NO_CHANGE_64,
+				.atime	= NO_CHANGE_64,
+				.mtime	= NO_CHANGE_64,
+				.device	= 0,
+			};
+			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		} else {
-			/* BB implement via Windows security descriptors eg
-			   CIFSSMBWinSetPerms(xid, pTcon, full_path, mode,
-					      -1, -1, local_nls);
-			   in the meantime could set r/o dos attribute when
-			   perms are eg: mode & 0222 == 0 */
 		}
 	}
 
@@ -832,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 		return -EBADF;
 	open_file = (struct cifsFileInfo *) file->private_data;
 
+	rc = generic_write_checks(file, poffset, &write_size, 0);
+	if (rc)
+		return rc;
+
 	xid = GetXid();
 
 	if (*poffset > file->f_path.dentry->d_inode->i_size)
@@ -910,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 }
 
 static ssize_t cifs_write(struct file *file, const char *write_data,
-	size_t write_size, loff_t *poffset)
+			  size_t write_size, loff_t *poffset)
 {
 	int rc = 0;
 	unsigned int bytes_written = 0;
@@ -1060,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
 struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 {
 	struct cifsFileInfo *open_file;
+	bool any_available = false;
 	int rc;
 
 	/* Having a null inode here (because mapping->host was set to zero by
@@ -1075,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	read_lock(&GlobalSMBSeslock);
 refind_writable:
 	list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
-		if (open_file->closePend)
+		if (open_file->closePend ||
+		    (!any_available && open_file->pid != current->tgid))
 			continue;
+
 		if (open_file->pfile &&
 		    ((open_file->pfile->f_flags & O_RDWR) ||
 		     (open_file->pfile->f_flags & O_WRONLY))) {
@@ -1126,6 +1134,11 @@ refind_writable:
 			   of the loop here. */
 		}
 	}
+	/* couldn't find useable FH with same pid, try any available */
+	if (!any_available) {
+		any_available = true;
+		goto refind_writable;
+	}
 	read_unlock(&GlobalSMBSeslock);
 	return NULL;
 }
@@ -1280,7 +1293,7 @@ retry:
 
 			if (first < 0)
 				lock_page(page);
-			else if (TestSetPageLocked(page))
+			else if (!trylock_page(page))
 				break;
 
 			if (unlikely(page->mapping != mapping)) {
@@ -1442,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 	return rc;
 }
 
-static int cifs_commit_write(struct file *file, struct page *page,
-	unsigned offset, unsigned to)
+static int cifs_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
 {
-	int xid;
-	int rc = 0;
-	struct inode *inode = page->mapping->host;
-	loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-	char *page_data;
+	int rc;
+	struct inode *inode = mapping->host;
 
-	xid = GetXid();
-	cFYI(1, ("commit write for page %p up to position %lld for %d",
-		 page, position, to));
-	spin_lock(&inode->i_lock);
-	if (position > inode->i_size)
-		i_size_write(inode, position);
+	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied));
+
+	if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+		SetPageUptodate(page);
 
-	spin_unlock(&inode->i_lock);
 	if (!PageUptodate(page)) {
-		position =  ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
-		/* can not rely on (or let) writepage write this data */
-		if (to < offset) {
-			cFYI(1, ("Illegal offsets, can not copy from %d to %d",
-				offset, to));
-			FreeXid(xid);
-			return rc;
-		}
+		char *page_data;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		int xid;
+
+		xid = GetXid();
 		/* this is probably better than directly calling
 		   partialpage_write since in this function the file handle is
 		   known which we might as well	leverage */
 		/* BB check if anything else missing out of ppw
 		   such as updating last write time */
 		page_data = kmap(page);
-		rc = cifs_write(file, page_data + offset, to-offset,
-				&position);
-		if (rc > 0)
-			rc = 0;
-		/* else if (rc < 0) should we set writebehind rc? */
+		rc = cifs_write(file, page_data + offset, copied, &pos);
+		/* if (rc < 0) should we set writebehind rc? */
 		kunmap(page);
+
+		FreeXid(xid);
 	} else {
+		rc = copied;
+		pos += copied;
 		set_page_dirty(page);
 	}
 
-	FreeXid(xid);
+	if (rc > 0) {
+		spin_lock(&inode->i_lock);
+		if (pos > inode->i_size)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
 	return rc;
 }
 
@@ -1775,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add(plru_pvec);
+			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1909,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
@@ -2030,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
 		return true;
 }
 
-static int cifs_prepare_write(struct file *file, struct page *page,
-	unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
 {
-	int rc = 0;
-	loff_t i_size;
-	loff_t offset;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
-	if (PageUptodate(page))
+	*pagep = __grab_cache_page(mapping, index);
+	if (!*pagep)
+		return -ENOMEM;
+
+	if (PageUptodate(*pagep))
 		return 0;
 
 	/* If we are writing a full page it will be up to date,
 	   no need to read from the server */
-	if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
-		SetPageUptodate(page);
+	if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
 		return 0;
-	}
 
-	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	i_size = i_size_read(page->mapping->host);
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+		int rc;
 
-	if ((offset >= i_size) ||
-	    ((from == 0) && (offset + to) >= i_size)) {
-		/*
-		 * We don't need to read data beyond the end of the file.
-		 * zero it, and set the page uptodate
-		 */
-		simple_prepare_write(file, page, from, to);
-		SetPageUptodate(page);
-	} else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
 		/* might as well read a page, it is fast enough */
-		rc = cifs_readpage_worker(file, page, &offset);
+		rc = cifs_readpage_worker(file, *pagep, &offset);
+
+		/* we do not need to pass errors back
+		   e.g. if we do not have read access to the file
+		   because cifs_write_end will attempt synchronous writes
+		   -- shaggy */
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
 		   racing with this read? In any case
-		   this will be written out by commit_write so is fine */
+		   this will be written out by write_end so is fine */
 	}
 
-	/* we do not need to pass errors back
-	   e.g. if we do not have read access to the file
-	   because cifs_commit_write will do the right thing.  -- shaggy */
-
 	return 0;
 }
 
@@ -2081,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
@@ -2097,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
-	.prepare_write = cifs_prepare_write,
-	.commit_write = cifs_commit_write,
+	.write_begin = cifs_write_begin,
+	.write_end = cifs_write_end,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2e904bd111c8..d54fa8aeaea9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -506,6 +506,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	inode = *pinode;
 	cifsInfo = CIFS_I(inode);
 	cifsInfo->cifsAttrs = attr;
+	cifsInfo->delete_pending = pfindData->DeletePending ? true : false;
 	cFYI(1, ("Old time %ld", cifsInfo->time));
 	cifsInfo->time = jiffies;
 	cFYI(1, ("New time %ld", cifsInfo->time));
@@ -546,7 +547,8 @@ int cifs_get_inode_info(struct inode **pinode,
 		if ((inode->i_mode & S_IWUGO) == 0 &&
 		    (attr & ATTR_READONLY) == 0)
 			inode->i_mode |= (S_IWUGO & default_mode);
-			inode->i_mode &= ~S_IFMT;
+
+		inode->i_mode &= ~S_IFMT;
 	}
 	/* clear write bits if ATTR_READONLY is set */
 	if (attr & ATTR_READONLY)
@@ -649,6 +651,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &simple_dir_operations;
 		inode->i_uid = cifs_sb->mnt_uid;
 		inode->i_gid = cifs_sb->mnt_gid;
+	} else if (rc) {
 		_FreeXid(xid);
 		iget_failed(inode);
 		return ERR_PTR(rc);
@@ -663,40 +666,244 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 	return inode;
 }
 
-int cifs_unlink(struct inode *inode, struct dentry *direntry)
+static int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
+		    char *full_path, __u32 dosattr)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid;
+	__u32 netpid;
+	bool set_time = false;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	FILE_BASIC_INFO	info_buf;
+
+	if (attrs->ia_valid & ATTR_ATIME) {
+		set_time = true;
+		info_buf.LastAccessTime =
+			cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+	} else
+		info_buf.LastAccessTime = 0;
+
+	if (attrs->ia_valid & ATTR_MTIME) {
+		set_time = true;
+		info_buf.LastWriteTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+	} else
+		info_buf.LastWriteTime = 0;
+
+	/*
+	 * Samba throws this field away, but windows may actually use it.
+	 * Do not set ctime unless other time stamps are changed explicitly
+	 * (i.e. by utimes()) since we would then have a mix of client and
+	 * server times.
+	 */
+	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+		cFYI(1, ("CIFS - CTIME changed"));
+		info_buf.ChangeTime =
+		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+	} else
+		info_buf.ChangeTime = 0;
+
+	info_buf.CreationTime = 0;	/* don't change */
+	info_buf.Attributes = cpu_to_le32(dosattr);
+
+	/*
+	 * If the file is already open for write, just use that fileid
+	 */
+	open_file = find_writable_file(cifsInode);
+	if (open_file) {
+		netfid = open_file->netfid;
+		netpid = open_file->pid;
+		goto set_via_filehandle;
+	}
+
+	/*
+	 * NT4 apparently returns success on this call, but it doesn't
+	 * really work.
+	 */
+	if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
+		rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+				     &info_buf, cifs_sb->local_nls,
+				     cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc == 0) {
+			cifsInode->cifsAttrs = dosattr;
+			goto out;
+		} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+			goto out;
+	}
+
+	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server"));
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
+			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+			 CREATE_NOT_DIR, &netfid, &oplock,
+			 NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc != 0) {
+		if (rc == -EIO)
+			rc = -EINVAL;
+		goto out;
+	}
+
+	netpid = current->tgid;
+
+set_via_filehandle:
+	rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+	if (!rc)
+		cifsInode->cifsAttrs = dosattr;
+
+	if (open_file == NULL)
+		CIFSSMBClose(xid, pTcon, netfid);
+	else
+		atomic_dec(&open_file->wrtPending);
+out:
+	return rc;
+}
+
+/*
+ * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+static int
+cifs_rename_pending_delete(char *full_path, struct dentry *dentry, int xid)
+{
+	int oplock = 0;
+	int rc;
+	__u16 netfid;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	__u32 dosattr, origattr;
+	FILE_BASIC_INFO *info_buf = NULL;
+
+	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+			 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
+			 &netfid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0)
+		goto out;
+
+	origattr = cifsInode->cifsAttrs;
+	if (origattr == 0)
+		origattr |= ATTR_NORMAL;
+
+	dosattr = origattr & ~ATTR_READONLY;
+	if (dosattr == 0)
+		dosattr |= ATTR_NORMAL;
+	dosattr |= ATTR_HIDDEN;
+
+	/* set ATTR_HIDDEN and clear ATTR_READONLY, but only if needed */
+	if (dosattr != origattr) {
+		info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+		if (info_buf == NULL) {
+			rc = -ENOMEM;
+			goto out_close;
+		}
+		info_buf->Attributes = cpu_to_le32(dosattr);
+		rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+					current->tgid);
+		/* although we would like to mark the file hidden
+ 		   if that fails we will still try to rename it */
+		if (rc != 0)
+			cifsInode->cifsAttrs = dosattr;
+		else
+			dosattr = origattr; /* since not able to change them */
+	}
+
+	/* rename the file */
+	rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != 0) {
+		rc = -ETXTBSY;
+		goto undo_setattr;
+	}
+
+	/* try to set DELETE_ON_CLOSE */
+	if (!cifsInode->delete_pending) {
+		rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
+					       current->tgid);
+		/*
+		 * some samba versions return -ENOENT when we try to set the
+		 * file disposition here. Likely a samba bug, but work around
+		 * it for now. This means that some cifsXXX files may hang
+		 * around after they shouldn't.
+		 *
+		 * BB: remove this hack after more servers have the fix
+		 */
+		if (rc == -ENOENT)
+			rc = 0;
+		else if (rc != 0) {
+			rc = -ETXTBSY;
+			goto undo_rename;
+		}
+		cifsInode->delete_pending = true;
+	}
+
+out_close:
+	CIFSSMBClose(xid, tcon, netfid);
+out:
+	kfree(info_buf);
+	return rc;
+
+	/*
+	 * reset everything back to the original state. Don't bother
+	 * dealing with errors here since we can't do anything about
+	 * them anyway.
+	 */
+undo_rename:
+	CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
+undo_setattr:
+	if (dosattr != origattr) {
+		info_buf->Attributes = cpu_to_le32(origattr);
+		if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+					current->tgid))
+			cifsInode->cifsAttrs = origattr;
+	}
+
+	goto out_close;
+}
+
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int rc = 0;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
-	struct cifsInodeInfo *cifsInode;
-	FILE_BASIC_INFO *pinfo_buf;
+	struct inode *inode = dentry->d_inode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct super_block *sb = dir->i_sb;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct iattr *attrs = NULL;
+	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, inode = 0x%p", inode));
+	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
 
 	xid = GetXid();
 
-	if (inode)
-		cifs_sb = CIFS_SB(inode->i_sb);
-	else
-		cifs_sb = CIFS_SB(direntry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	/* Unlink can be called from rename so we can not grab the sem here
-	   since we deadlock otherwise */
-/*	mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
-	full_path = build_path_from_dentry(direntry);
-/*	mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
+	/* Unlink can be called from rename so we can not take the
+	 * sb->s_vfs_rename_mutex here */
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		FreeXid(xid);
 		return -ENOMEM;
 	}
 
-	if ((pTcon->ses->capabilities & CAP_UNIX) &&
+	if ((tcon->ses->capabilities & CAP_UNIX) &&
 		(CIFS_UNIX_POSIX_PATH_OPS_CAP &
-			le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
-		rc = CIFSPOSIXDelFile(xid, pTcon, full_path,
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 		cFYI(1, ("posix del rc %d", rc));
@@ -704,124 +911,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 			goto psx_del_no_retry;
 	}
 
-	rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls,
+retry_std_delete:
+	rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
 psx_del_no_retry:
 	if (!rc) {
-		if (direntry->d_inode)
-			drop_nlink(direntry->d_inode);
+		if (inode)
+			drop_nlink(inode);
 	} else if (rc == -ENOENT) {
-		d_drop(direntry);
+		d_drop(dentry);
 	} else if (rc == -ETXTBSY) {
-		int oplock = 0;
-		__u16 netfid;
-
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE,
-				 CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
-				 &netfid, &oplock, NULL, cifs_sb->local_nls,
-				 cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
-					      cifs_sb->local_nls,
-					      cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-			if (direntry->d_inode)
-				drop_nlink(direntry->d_inode);
+		rc = cifs_rename_pending_delete(full_path, dentry, xid);
+		if (rc == 0)
+			drop_nlink(inode);
+	} else if (rc == -EACCES && dosattr == 0) {
+		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+		if (attrs == NULL) {
+			rc = -ENOMEM;
+			goto out_reval;
 		}
-	} else if (rc == -EACCES) {
-		/* try only if r/o attribute set in local lookup data? */
-		pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
-		if (pinfo_buf) {
-			/* ATTRS set to normal clears r/o bit */
-			pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
-			if (!(pTcon->ses->flags & CIFS_SES_NT4))
-				rc = CIFSSMBSetTimes(xid, pTcon, full_path,
-						     pinfo_buf,
-						     cifs_sb->local_nls,
-						     cifs_sb->mnt_cifs_flags &
-							CIFS_MOUNT_MAP_SPECIAL_CHR);
-			else
-				rc = -EOPNOTSUPP;
 
-			if (rc == -EOPNOTSUPP) {
-				int oplock = 0;
-				__u16 netfid;
-			/*	rc = CIFSSMBSetAttrLegacy(xid, pTcon,
-							  full_path,
-							  (__u16)ATTR_NORMAL,
-							  cifs_sb->local_nls);
-			   For some strange reason it seems that NT4 eats the
-			   old setattr call without actually setting the
-			   attributes so on to the third attempted workaround
-			   */
-
-			/* BB could scan to see if we already have it open
-			   and pass in pid of opener to function */
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, SYNCHRONIZE |
-						 FILE_WRITE_ATTRIBUTES, 0,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					rc = CIFSSMBSetFileTimes(xid, pTcon,
-								 pinfo_buf,
-								 netfid);
-					CIFSSMBClose(xid, pTcon, netfid);
-				}
-			}
-			kfree(pinfo_buf);
-		}
-		if (rc == 0) {
-			rc = CIFSSMBDelFile(xid, pTcon, full_path,
-					    cifs_sb->local_nls,
-					    cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (!rc) {
-				if (direntry->d_inode)
-					drop_nlink(direntry->d_inode);
-			} else if (rc == -ETXTBSY) {
-				int oplock = 0;
-				__u16 netfid;
-
-				rc = CIFSSMBOpen(xid, pTcon, full_path,
-						 FILE_OPEN, DELETE,
-						 CREATE_NOT_DIR |
-						 CREATE_DELETE_ON_CLOSE,
-						 &netfid, &oplock, NULL,
-						 cifs_sb->local_nls,
-						 cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					CIFSSMBRenameOpenFile(xid, pTcon,
-						netfid, NULL,
-						cifs_sb->local_nls,
-						cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-					CIFSSMBClose(xid, pTcon, netfid);
-					if (direntry->d_inode)
-						drop_nlink(direntry->d_inode);
-				}
-			/* BB if rc = -ETXTBUSY goto the rename logic BB */
-			}
-		}
-	}
-	if (direntry->d_inode) {
-		cifsInode = CIFS_I(direntry->d_inode);
-		cifsInode->time = 0;	/* will force revalidate to get info
-					   when needed */
-		direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
+		/* try to reset dos attributes */
+		origattr = cifsInode->cifsAttrs;
+		if (origattr == 0)
+			origattr |= ATTR_NORMAL;
+		dosattr = origattr & ~ATTR_READONLY;
+		if (dosattr == 0)
+			dosattr |= ATTR_NORMAL;
+		dosattr |= ATTR_HIDDEN;
+
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		if (rc != 0)
+			goto out_reval;
+
+		goto retry_std_delete;
 	}
+
+	/* undo the setattr if we errored out and it's needed */
+	if (rc != 0 && dosattr != 0)
+		cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
+out_reval:
 	if (inode) {
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 		cifsInode = CIFS_I(inode);
-		cifsInode->time = 0;	/* force revalidate of dir as well */
+		cifsInode->time = 0;	/* will force revalidate to get info
+					   when needed */
+		inode->i_ctime = current_fs_time(sb);
 	}
+	dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+	cifsInode = CIFS_I(dir);
+	CIFS_I(dir)->time = 0;	/* force revalidate of dir as well */
 
 	kfree(full_path);
+	kfree(attrs);
 	FreeXid(xid);
 	return rc;
 }
@@ -866,7 +1009,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
 
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 {
-	int rc = 0;
+	int rc = 0, tmprc;
 	int xid;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
@@ -928,6 +1071,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				kfree(pInfo);
 				goto mkdir_get_info;
 			}
+
 			/* Is an i_ino of zero legal? */
 			/* Are there sanity checks we can use to ensure that
 			   the server is really filling in that field? */
@@ -984,35 +1128,52 @@ mkdir_get_info:
 		  * failed to get it from the server or was set bogus */
 		if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
 				direntry->d_inode->i_nlink = 2;
+
 		mode &= ~current->fs->umask;
+		/* must turn on setgid bit if parent dir has it */
+		if (inode->i_mode & S_ISGID)
+			mode |= S_ISGID;
+
 		if (pTcon->unix_ext) {
+			struct cifs_unix_set_info_args args = {
+				.mode	= mode,
+				.ctime	= NO_CHANGE_64,
+				.atime	= NO_CHANGE_64,
+				.mtime	= NO_CHANGE_64,
+				.device	= 0,
+			};
 			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
-						    mode,
-						    (__u64)current->fsuid,
-						    (__u64)current->fsgid,
-						    0 /* dev_t */,
-						    cifs_sb->local_nls,
-						    cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
+				args.uid = (__u64)current->fsuid;
+				if (inode->i_mode & S_ISGID)
+					args.gid = (__u64)inode->i_gid;
+				else
+					args.gid = (__u64)current->fsgid;
 			} else {
-				CIFSSMBUnixSetPerms(xid, pTcon, full_path,
-						    mode, (__u64)-1,
-						    (__u64)-1, 0 /* dev_t */,
-						    cifs_sb->local_nls,
-						    cifs_sb->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
+				args.uid = NO_CHANGE_64;
+				args.gid = NO_CHANGE_64;
 			}
+			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+					    cifs_sb->local_nls,
+					    cifs_sb->mnt_cifs_flags &
+					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 		} else {
 			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
 			    (mode & S_IWUGO) == 0) {
 				FILE_BASIC_INFO pInfo;
+				struct cifsInodeInfo *cifsInode;
+				u32 dosattrs;
+
 				memset(&pInfo, 0, sizeof(pInfo));
-				pInfo.Attributes = cpu_to_le32(ATTR_READONLY);
-				CIFSSMBSetTimes(xid, pTcon, full_path,
-						&pInfo, cifs_sb->local_nls,
+				cifsInode = CIFS_I(newinode);
+				dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+				pInfo.Attributes = cpu_to_le32(dosattrs);
+				tmprc = CIFSSMBSetPathInfo(xid, pTcon,
+						full_path, &pInfo,
+						cifs_sb->local_nls,
 						cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+				if (tmprc == 0)
+					cifsInode->cifsAttrs = dosattrs;
 			}
 			if (direntry->d_inode) {
 				if (cifs_sb->mnt_cifs_flags &
@@ -1024,8 +1185,12 @@ mkdir_get_info:
 				     CIFS_MOUNT_SET_UID) {
 					direntry->d_inode->i_uid =
 						current->fsuid;
-					direntry->d_inode->i_gid =
-						current->fsgid;
+					if (inode->i_mode & S_ISGID)
+						direntry->d_inode->i_gid =
+							inode->i_gid;
+					else
+						direntry->d_inode->i_gid =
+							current->fsgid;
 				}
 			}
 		}
@@ -1080,117 +1245,140 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	return rc;
 }
 
-int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
-	struct inode *target_inode, struct dentry *target_direntry)
+static int
+cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
+		struct dentry *to_dentry, const char *toPath)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	__u16 srcfid;
+	int oplock, rc;
+
+	/* try path-based rename first */
+	rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
+			   cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	/*
+	 * don't bother with rename by filehandle unless file is busy and
+	 * source Note that cross directory moves do not work with
+	 * rename by filehandle to various Windows servers.
+	 */
+	if (rc == 0 || rc != -ETXTBSY)
+		return rc;
+
+	/* open the file to be renamed -- we need DELETE perms */
+	rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
+			 CREATE_NOT_DIR, &srcfid, &oplock, NULL,
+			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (rc == 0) {
+		rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+				(const char *) to_dentry->d_name.name,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+		CIFSSMBClose(xid, pTcon, srcfid);
+	}
+
+	return rc;
+}
+
+int cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
+	struct inode *target_dir, struct dentry *target_dentry)
 {
-	char *fromName;
-	char *toName;
+	char *fromName = NULL;
+	char *toName = NULL;
 	struct cifs_sb_info *cifs_sb_source;
 	struct cifs_sb_info *cifs_sb_target;
-	struct cifsTconInfo *pTcon;
-	int xid;
-	int rc = 0;
+	struct cifsTconInfo *tcon;
+	FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+	FILE_UNIX_BASIC_INFO *info_buf_target;
+	int xid, rc, tmprc;
+
+	cifs_sb_target = CIFS_SB(target_dir->i_sb);
+	cifs_sb_source = CIFS_SB(source_dir->i_sb);
+	tcon = cifs_sb_source->tcon;
 
 	xid = GetXid();
 
-	cifs_sb_target = CIFS_SB(target_inode->i_sb);
-	cifs_sb_source = CIFS_SB(source_inode->i_sb);
-	pTcon = cifs_sb_source->tcon;
+	/*
+	 * BB: this might be allowed if same server, but different share.
+	 * Consider adding support for this
+	 */
+	if (tcon != cifs_sb_target->tcon) {
+		rc = -EXDEV;
+		goto cifs_rename_exit;
+	}
 
-	if (pTcon != cifs_sb_target->tcon) {
-		FreeXid(xid);
-		return -EXDEV;	/* BB actually could be allowed if same server,
-				   but different share.
-				   Might eventually add support for this */
+	/*
+	 * we already have the rename sem so we do not need to
+	 * grab it again here to protect the path integrity
+	 */
+	fromName = build_path_from_dentry(source_dentry);
+	if (fromName == NULL) {
+		rc = -ENOMEM;
+		goto cifs_rename_exit;
 	}
 
-	/* we already  have the rename sem so we do not need to grab it again
-	   here to protect the path integrity */
-	fromName = build_path_from_dentry(source_direntry);
-	toName = build_path_from_dentry(target_direntry);
-	if ((fromName == NULL) || (toName == NULL)) {
+	toName = build_path_from_dentry(target_dentry);
+	if (toName == NULL) {
 		rc = -ENOMEM;
 		goto cifs_rename_exit;
 	}
 
-	rc = CIFSSMBRename(xid, pTcon, fromName, toName,
-			   cifs_sb_source->local_nls,
-			   cifs_sb_source->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc == -EEXIST) {
-		/* check if they are the same file because rename of hardlinked
-		   files is a noop */
-		FILE_UNIX_BASIC_INFO *info_buf_source;
-		FILE_UNIX_BASIC_INFO *info_buf_target;
+	rc = cifs_do_rename(xid, source_dentry, fromName,
+			    target_dentry, toName);
 
+	if (rc == -EEXIST && tcon->unix_ext) {
+		/*
+		 * Are src and dst hardlinks of same inode? We can
+		 * only tell with unix extensions enabled
+		 */
 		info_buf_source =
-			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
-		if (info_buf_source != NULL) {
-			info_buf_target = info_buf_source + 1;
-			if (pTcon->unix_ext)
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
+			kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+					GFP_KERNEL);
+		if (info_buf_source == NULL) {
+			rc = -ENOMEM;
+			goto cifs_rename_exit;
+		}
+
+		info_buf_target = info_buf_source + 1;
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon, fromName,
 					info_buf_source,
 					cifs_sb_source->local_nls,
 					cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			/* else rc is still EEXIST so will fall through to
-			   unlink the target and retry rename */
-			if (rc == 0) {
-				rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName,
-						info_buf_target,
-						cifs_sb_target->local_nls,
-						/* remap based on source sb */
-						cifs_sb_source->mnt_cifs_flags &
-						    CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			if ((rc == 0) &&
-			    (info_buf_source->UniqueId ==
-			     info_buf_target->UniqueId)) {
-			/* do not rename since the files are hardlinked which
-			   is a noop */
-			} else {
-			/* we either can not tell the files are hardlinked
-			   (as with Windows servers) or files are not
-			   hardlinked so delete the target manually before
-			   renaming to follow POSIX rather than Windows
-			   semantics */
-				cifs_unlink(target_inode, target_direntry);
-				rc = CIFSSMBRename(xid, pTcon, fromName,
-						   toName,
-						   cifs_sb_source->local_nls,
-						   cifs_sb_source->mnt_cifs_flags
-						   & CIFS_MOUNT_MAP_SPECIAL_CHR);
-			}
-			kfree(info_buf_source);
-		} /* if we can not get memory just leave rc as EEXIST */
-	}
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (tmprc != 0)
+			goto unlink_target;
 
-	if (rc)
-		cFYI(1, ("rename rc %d", rc));
-
-	if ((rc == -EIO) || (rc == -EEXIST)) {
-		int oplock = 0;
-		__u16 netfid;
-
-		/* BB FIXME Is Generic Read correct for rename? */
-		/* if renaming directory - we should not say CREATE_NOT_DIR,
-		   need to test renaming open directory, also GENERIC_READ
-		   might not right be right access to request */
-		rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
-				 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-				 cifs_sb_source->local_nls,
-				 cifs_sb_source->mnt_cifs_flags &
+		tmprc = CIFSSMBUnixQPathInfo(xid, tcon,
+					toName, info_buf_target,
+					cifs_sb_target->local_nls,
+					/* remap based on source sb */
+					cifs_sb_source->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc == 0) {
-			rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
-					      cifs_sb_source->local_nls,
-					      cifs_sb_source->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			CIFSSMBClose(xid, pTcon, netfid);
-		}
+
+		if (tmprc == 0 && (info_buf_source->UniqueId ==
+				   info_buf_target->UniqueId))
+			/* same file, POSIX says that this is a noop */
+			goto cifs_rename_exit;
+	} /* else ... BB we could add the same check for Windows by
+		     checking the UniqueId via FILE_INTERNAL_INFO */
+
+unlink_target:
+	if ((rc == -EACCES) || (rc == -EEXIST)) {
+		tmprc = cifs_unlink(target_dir, target_dentry);
+		if (tmprc)
+			goto cifs_rename_exit;
+
+		rc = cifs_do_rename(xid, source_dentry, fromName,
+				    target_dentry, toName);
 	}
 
 cifs_rename_exit:
+	kfree(info_buf_source);
 	kfree(fromName);
 	kfree(toName);
 	FreeXid(xid);
@@ -1310,10 +1498,11 @@ int cifs_revalidate(struct dentry *direntry)
 /*		if (S_ISDIR(direntry->d_inode->i_mode))
 			shrink_dcache_parent(direntry); */
 		if (S_ISREG(direntry->d_inode->i_mode)) {
-			if (direntry->d_inode->i_mapping)
+			if (direntry->d_inode->i_mapping) {
 				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
 				if (wbrc)
 					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
+			}
 			/* may eventually have to do this for open files too */
 			if (list_empty(&(cifsInode->openFileList))) {
 				/* changed on server - flush read ahead pages */
@@ -1413,31 +1602,209 @@ out_busy:
 	return -ETXTBSY;
 }
 
-int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+static int
+cifs_set_file_size(struct inode *inode, struct iattr *attrs,
+		   int xid, char *full_path)
+{
+	int rc;
+	struct cifsFileInfo *open_file;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+
+	/*
+	 * To avoid spurious oplock breaks from server, in the case of
+	 * inodes that we already have open, avoid doing path based
+	 * setting of file size if we can do it by handle.
+	 * This keeps our caching token (oplock) and avoids timeouts
+	 * when the local oplock break takes longer to flush
+	 * writebehind data than the SMB timeout for the SetPathInfo
+	 * request would allow
+	 */
+	open_file = find_writable_file(cifsInode);
+	if (open_file) {
+		__u16 nfid = open_file->netfid;
+		__u32 npid = open_file->pid;
+		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
+					npid, false);
+		atomic_dec(&open_file->wrtPending);
+		cFYI(1, ("SetFSize for attrs rc = %d", rc));
+		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			unsigned int bytes_written;
+			rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
+					  &bytes_written, NULL, NULL, 1);
+			cFYI(1, ("Wrt seteof rc %d", rc));
+		}
+	} else
+		rc = -EINVAL;
+
+	if (rc != 0) {
+		/* Set file size by pathname rather than by handle
+		   either because no valid, writeable file handle for
+		   it was found or because there was an error setting
+		   it by handle */
+		rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size,
+				   false, cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
+		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			__u16 netfid;
+			int oplock = 0;
+
+			rc = SMBLegacyOpen(xid, pTcon, full_path,
+				FILE_OPEN, GENERIC_WRITE,
+				CREATE_NOT_DIR, &netfid, &oplock, NULL,
+				cifs_sb->local_nls,
+				cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+			if (rc == 0) {
+				unsigned int bytes_written;
+				rc = CIFSSMBWrite(xid, pTcon, netfid, 0,
+						  attrs->ia_size,
+						  &bytes_written, NULL,
+						  NULL, 1);
+				cFYI(1, ("wrt seteof rc %d", rc));
+				CIFSSMBClose(xid, pTcon, netfid);
+			}
+		}
+	}
+
+	if (rc == 0) {
+		rc = cifs_vmtruncate(inode, attrs->ia_size);
+		cifs_truncate_page(inode->i_mapping, inode->i_size);
+	}
+
+	return rc;
+}
+
+static int
+cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 {
+	int rc;
 	int xid;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
-	int rc = -EACCES;
-	struct cifsFileInfo *open_file = NULL;
-	FILE_BASIC_INFO time_buf;
-	bool set_time = false;
-	bool set_dosattr = false;
-	__u64 mode = 0xFFFFFFFFFFFFFFFFULL;
-	__u64 uid = 0xFFFFFFFFFFFFFFFFULL;
-	__u64 gid = 0xFFFFFFFFFFFFFFFFULL;
-	struct cifsInodeInfo *cifsInode;
 	struct inode *inode = direntry->d_inode;
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+	struct cifs_unix_set_info_args *args = NULL;
+
+	cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
+		 direntry->d_name.name, attrs->ia_valid));
+
+	xid = GetXid();
+
+	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
+		/* check if we have permission to change attrs */
+		rc = inode_change_ok(inode, attrs);
+		if (rc < 0)
+			goto out;
+		else
+			rc = 0;
+	}
+
+	full_path = build_path_from_dentry(direntry);
+	if (full_path == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) {
+		/*
+		   Flush data before changing file size or changing the last
+		   write time of the file on the server. If the
+		   flush returns error, store it to report later and continue.
+		   BB: This should be smarter. Why bother flushing pages that
+		   will be truncated anyway? Also, should we error out here if
+		   the flush returns error?
+		 */
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc != 0) {
+			cifsInode->write_behind_rc = rc;
+			rc = 0;
+		}
+	}
+
+	if (attrs->ia_valid & ATTR_SIZE) {
+		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		if (rc != 0)
+			goto out;
+	}
+
+	/* skip mode change if it's just for clearing setuid/setgid */
+	if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+		attrs->ia_valid &= ~ATTR_MODE;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (args == NULL) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* set up the struct */
+	if (attrs->ia_valid & ATTR_MODE)
+		args->mode = attrs->ia_mode;
+	else
+		args->mode = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_UID)
+		args->uid = attrs->ia_uid;
+	else
+		args->uid = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_GID)
+		args->gid = attrs->ia_gid;
+	else
+		args->gid = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_ATIME)
+		args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
+	else
+		args->atime = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_MTIME)
+		args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime);
+	else
+		args->mtime = NO_CHANGE_64;
+
+	if (attrs->ia_valid & ATTR_CTIME)
+		args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime);
+	else
+		args->ctime = NO_CHANGE_64;
+
+	args->device = 0;
+	rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args,
+				cifs_sb->local_nls,
+				cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+	if (!rc)
+		rc = inode_setattr(inode, attrs);
+out:
+	kfree(args);
+	kfree(full_path);
+	FreeXid(xid);
+	return rc;
+}
+
+static int
+cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
+{
+	int xid;
+	struct inode *inode = direntry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+	char *full_path = NULL;
+	int rc = -EACCES;
+	__u32 dosattr = 0;
+	__u64 mode = NO_CHANGE_64;
 
 	xid = GetXid();
 
 	cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
 		 direntry->d_name.name, attrs->ia_valid));
 
-	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
-
 	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
 		/* check if we have permission to change attrs */
 		rc = inode_change_ok(inode, attrs);
@@ -1453,7 +1820,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 		FreeXid(xid);
 		return -ENOMEM;
 	}
-	cifsInode = CIFS_I(inode);
 
 	if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) {
 		/*
@@ -1472,78 +1838,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 	}
 
 	if (attrs->ia_valid & ATTR_SIZE) {
-		/* To avoid spurious oplock breaks from server, in the case of
-		   inodes that we already have open, avoid doing path based
-		   setting of file size if we can do it by handle.
-		   This keeps our caching token (oplock) and avoids timeouts
-		   when the local oplock break takes longer to flush
-		   writebehind data than the SMB timeout for the SetPathInfo
-		   request would allow */
-
-		open_file = find_writable_file(cifsInode);
-		if (open_file) {
-			__u16 nfid = open_file->netfid;
-			__u32 npid = open_file->pid;
-			rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size,
-						nfid, npid, false);
-			atomic_dec(&open_file->wrtPending);
-			cFYI(1, ("SetFSize for attrs rc = %d", rc));
-			if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
-				unsigned int bytes_written;
-				rc = CIFSSMBWrite(xid, pTcon,
-						  nfid, 0, attrs->ia_size,
-						  &bytes_written, NULL, NULL,
-						  1 /* 45 seconds */);
-				cFYI(1, ("Wrt seteof rc %d", rc));
-			}
-		} else
-			rc = -EINVAL;
-
-		if (rc != 0) {
-			/* Set file size by pathname rather than by handle
-			   either because no valid, writeable file handle for
-			   it was found or because there was an error setting
-			   it by handle */
-			rc = CIFSSMBSetEOF(xid, pTcon, full_path,
-					   attrs->ia_size, false,
-					   cifs_sb->local_nls,
-					   cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
-			if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
-				__u16 netfid;
-				int oplock = 0;
-
-				rc = SMBLegacyOpen(xid, pTcon, full_path,
-					FILE_OPEN, GENERIC_WRITE,
-					CREATE_NOT_DIR, &netfid, &oplock,
-					NULL, cifs_sb->local_nls,
-					cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-				if (rc == 0) {
-					unsigned int bytes_written;
-					rc = CIFSSMBWrite(xid, pTcon,
-							netfid, 0,
-							attrs->ia_size,
-							&bytes_written, NULL,
-							NULL, 1 /* 45 sec */);
-					cFYI(1, ("wrt seteof rc %d", rc));
-					CIFSSMBClose(xid, pTcon, netfid);
-				}
-
-			}
-		}
-
-		/* Server is ok setting allocation size implicitly - no need
-		   to call:
-		CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true,
-			 cifs_sb->local_nls);
-		   */
-
-		if (rc == 0) {
-			rc = cifs_vmtruncate(inode, attrs->ia_size);
-			cifs_truncate_page(inode->i_mapping, inode->i_size);
-		} else
+		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		if (rc != 0)
 			goto cifs_setattr_exit;
 	}
 
@@ -1554,21 +1850,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 	 * CIFSACL support + proper Windows to Unix idmapping, we may be
 	 * able to support this in the future.
 	 */
-	if (!pTcon->unix_ext &&
-	    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
 		attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
-	} else {
-		if (attrs->ia_valid & ATTR_UID) {
-			cFYI(1, ("UID changed to %d", attrs->ia_uid));
-			uid = attrs->ia_uid;
-		}
-		if (attrs->ia_valid & ATTR_GID) {
-			cFYI(1, ("GID changed to %d", attrs->ia_gid));
-			gid = attrs->ia_gid;
-		}
-	}
-
-	time_buf.Attributes = 0;
 
 	/* skip mode change if it's just for clearing setuid/setgid */
 	if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
@@ -1579,13 +1862,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 		mode = attrs->ia_mode;
 	}
 
-	if ((pTcon->unix_ext)
-	    && (attrs->ia_valid & (ATTR_MODE | ATTR_GID | ATTR_UID)))
-		rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, uid, gid,
-					 0 /* dev_t */, cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-	else if (attrs->ia_valid & ATTR_MODE) {
+	if (attrs->ia_valid & ATTR_MODE) {
 		rc = 0;
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
@@ -1594,24 +1871,19 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 #endif
 		if (((mode & S_IWUGO) == 0) &&
 		    (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
-			set_dosattr = true;
-			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs |
-							  ATTR_READONLY);
+
+			dosattr = cifsInode->cifsAttrs | ATTR_READONLY;
+
 			/* fix up mode if we're not using dynperm */
 			if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
 				attrs->ia_mode = inode->i_mode & ~S_IWUGO;
 		} else if ((mode & S_IWUGO) &&
 			   (cifsInode->cifsAttrs & ATTR_READONLY)) {
-			/* If file is readonly on server, we would
-			not be able to write to it - so if any write
-			bit is enabled for user or group or other we
-			need to at least try to remove r/o dos attr */
-			set_dosattr = true;
-			time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
-					    (~ATTR_READONLY));
-			/* Windows ignores set to zero */
-			if (time_buf.Attributes == 0)
-				time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
+
+			dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+			/* Attributes of 0 are ignored */
+			if (dosattr == 0)
+				dosattr |= ATTR_NORMAL;
 
 			/* reset local inode permissions to normal */
 			if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
@@ -1629,82 +1901,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 		}
 	}
 
-	if (attrs->ia_valid & ATTR_ATIME) {
-		set_time = true;
-		time_buf.LastAccessTime =
-		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
-	} else
-		time_buf.LastAccessTime = 0;
-
-	if (attrs->ia_valid & ATTR_MTIME) {
-		set_time = true;
-		time_buf.LastWriteTime =
-		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
-	} else
-		time_buf.LastWriteTime = 0;
-	/* Do not set ctime explicitly unless other time
-	   stamps are changed explicitly (i.e. by utime()
-	   since we would then have a mix of client and
-	   server times */
+	if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) ||
+	    ((attrs->ia_valid & ATTR_MODE) && dosattr)) {
+		rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+		/* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */
 
-	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		set_time = true;
-		/* Although Samba throws this field away
-		it may be useful to Windows - but we do
-		not want to set ctime unless some other
-		timestamp is changing */
-		cFYI(1, ("CIFS - CTIME changed"));
-		time_buf.ChangeTime =
-		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
-	} else
-		time_buf.ChangeTime = 0;
-
-	if (set_time || set_dosattr) {
-		time_buf.CreationTime = 0;	/* do not change */
-		/* In the future we should experiment - try setting timestamps
-		   via Handle (SetFileInfo) instead of by path */
-		if (!(pTcon->ses->flags & CIFS_SES_NT4))
-			rc = CIFSSMBSetTimes(xid, pTcon, full_path, &time_buf,
-					     cifs_sb->local_nls,
-					     cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		else
-			rc = -EOPNOTSUPP;
-
-		if (rc == -EOPNOTSUPP) {
-			int oplock = 0;
-			__u16 netfid;
-
-			cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-				 "times not supported by this server"));
-			/* BB we could scan to see if we already have it open
-			   and pass in pid of opener to function */
-			rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-					 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
-					 CREATE_NOT_DIR, &netfid, &oplock,
-					 NULL, cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (rc == 0) {
-				rc = CIFSSMBSetFileTimes(xid, pTcon, &time_buf,
-							 netfid);
-				CIFSSMBClose(xid, pTcon, netfid);
-			} else {
-			/* BB For even older servers we could convert time_buf
-			   into old DOS style which uses two second
-			   granularity */
-
-			/* rc = CIFSSMBSetTimesLegacy(xid, pTcon, full_path,
-					&time_buf, cifs_sb->local_nls); */
-			}
-		}
 		/* Even if error on time set, no sense failing the call if
 		the server would set the time to a reasonable value anyway,
 		and this check ensures that we are not being called from
 		sys_utimes in which case we ought to fail the call back to
 		the user when the server rejects the call */
 		if ((rc) && (attrs->ia_valid &
-			 (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
+				(ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
 			rc = 0;
 	}
 
@@ -1718,6 +1926,21 @@ cifs_setattr_exit:
 	return rc;
 }
 
+int
+cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+	struct inode *inode = direntry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *pTcon = cifs_sb->tcon;
+
+	if (pTcon->unix_ext)
+		return cifs_setattr_unix(direntry, attrs);
+
+	return cifs_setattr_nounix(direntry, attrs);
+
+	/* BB: add cifs_setattr_legacy for really old servers */
+}
+
 #if 0
 void cifs_delete_inode(struct inode *inode)
 {
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe3157..88786ba02d27 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
    but it may be more efficient to always alloc same size
    albeit slightly larger than necessary and maxbuffersize
    defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
    but it may be more efficient to always alloc same size
    albeit slightly larger than necessary and maxbuffersize
    defaults to this and can not be bigger */
-	ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp,
-						   GFP_KERNEL | GFP_NOFS);
+	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
 	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
 	buffer->Pid = cpu_to_le16((__u16)current->tgid);
 	buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
-	spin_lock(&GlobalMid_Lock);
-	spin_unlock(&GlobalMid_Lock);
 	if (treeCon) {
 		buffer->Tid = treeCon->tid;
 		if (treeCon->ses) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 83f306954883..58d57299f2a0 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
 
 }
 
+static int cifs_save_resume_key(const char *current_entry,
+	struct cifsFileInfo *cifsFile)
+{
+	int rc = 0;
+	unsigned int len = 0;
+	__u16 level;
+	char *filename;
+
+	if ((cifsFile == NULL) || (current_entry == NULL))
+		return -EINVAL;
+
+	level = cifsFile->srch_inf.info_level;
+
+	if (level == SMB_FIND_FILE_UNIX) {
+		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+
+		filename = &pFindData->FileName[0];
+		if (cifsFile->srch_inf.unicode) {
+			len = cifs_unicode_bytelen(filename);
+		} else {
+			/* BB should we make this strnlen of PATH_MAX? */
+			len = strnlen(filename, PATH_MAX);
+		}
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
+		FILE_DIRECTORY_INFO *pFindData =
+			(FILE_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
+		FILE_FULL_DIRECTORY_INFO *pFindData =
+			(FILE_FULL_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
+		SEARCH_ID_FULL_DIR_INFO *pFindData =
+			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
+		FILE_BOTH_DIRECTORY_INFO *pFindData =
+			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
+		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO *pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+	} else {
+		cFYI(1, ("Unknown findfirst level %d", level));
+		return -EINVAL;
+	}
+	cifsFile->srch_inf.resume_name_len = len;
+	cifsFile->srch_inf.presume_name = filename;
+	return rc;
+}
+
 /* find the corresponding entry in the search */
 /* Note that the SMB server returns search entries for . and .. which
    complicates logic here if we choose to parse for them and we do not
@@ -690,6 +754,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 			else
 				cifs_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
+			cifsFile->srch_inf.ntwrk_buf_start = NULL;
 		}
 		rc = initiate_cifs_search(xid, file);
 		if (rc) {
@@ -697,6 +762,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 				 rc));
 			return rc;
 		}
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 	}
 
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
@@ -704,6 +770,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		cFYI(1, ("calling findnext2"));
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
+		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
 		if (rc)
 			return -ENOENT;
 	}
@@ -918,69 +985,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	return rc;
 }
 
-static int cifs_save_resume_key(const char *current_entry,
-	struct cifsFileInfo *cifsFile)
-{
-	int rc = 0;
-	unsigned int len = 0;
-	__u16 level;
-	char *filename;
-
-	if ((cifsFile == NULL) || (current_entry == NULL))
-		return -EINVAL;
-
-	level = cifsFile->srch_inf.info_level;
-
-	if (level == SMB_FIND_FILE_UNIX) {
-		FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
-
-		filename = &pFindData->FileName[0];
-		if (cifsFile->srch_inf.unicode) {
-			len = cifs_unicode_bytelen(filename);
-		} else {
-			/* BB should we make this strnlen of PATH_MAX? */
-			len = strnlen(filename, PATH_MAX);
-		}
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
-		FILE_DIRECTORY_INFO *pFindData =
-			(FILE_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
-		FILE_FULL_DIRECTORY_INFO *pFindData =
-			(FILE_FULL_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
-		SEARCH_ID_FULL_DIR_INFO *pFindData =
-			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
-		FILE_BOTH_DIRECTORY_INFO *pFindData =
-			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		len = le32_to_cpu(pFindData->FileNameLength);
-		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
-	} else if (level == SMB_FIND_FILE_INFO_STANDARD) {
-		FIND_FILE_STANDARD_INFO *pFindData =
-			(FIND_FILE_STANDARD_INFO *)current_entry;
-		filename = &pFindData->FileName[0];
-		/* one byte length, no name conversion */
-		len = (unsigned int)pFindData->FileNameLength;
-		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
-	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
-		return -EINVAL;
-	}
-	cifsFile->srch_inf.resume_name_len = len;
-	cifsFile->srch_inf.presume_name = filename;
-	return rc;
-}
 
 int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed150efbe27c..2851d5da0c8c 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
 
+		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+
 		/* no capabilities flags in old lanman negotiation */
 
 		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
@@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos) {
+	} else if (type == Kerberos || type == MSKerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
@@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		}
 
 		msg = spnego_key->payload.data;
+		/* check version field to make sure that cifs.upcall is
+		   sending us a response in an expected form */
+		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+			cERROR(1, ("incorrect version of cifs.upcall (expected"
+				   " %d but got %d)",
+				   CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+			rc = -EKEYREJECTED;
+			goto ssetup_exit;
+		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
@@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 					 ses, nls_cp);
 
 ssetup_exit:
-	if (spnego_key)
+	if (spnego_key) {
+		key_revoke(spnego_key);
 		key_put(spnego_key);
+	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 000ac509c98a..bf0e6d8e382a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
 		return NULL;
 	}
 
-	temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
-						    GFP_KERNEL | GFP_NOFS);
+	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
 	if (temp == NULL)
 		return temp;
 	else {
@@ -265,6 +264,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
 	cFYI(1, ("Sending smb:  total_len %d", total_len));
 	dump_smb(smb_buffer, len);
 
+	i = 0;
 	while (total_len) {
 		rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
 				    n_vec - first_vec, total_len);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index e1c854890f94..bf4a3fd3c8e3 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -28,11 +28,9 @@ int coda_fake_statfs;
 char * coda_f2s(struct CodaFid *f)
 {
 	static char s[60];
-#ifdef CONFIG_CODA_FS_OLD_API
- 	sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]);
-#else
+
  	sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]);
-#endif
+
 	return s;
 }
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 3d2580e00a3e..75b1fa90b2cb 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -137,13 +137,18 @@ exit:
 }
 
 
-int coda_permission(struct inode *inode, int mask, struct nameidata *nd)
+int coda_permission(struct inode *inode, int mask)
 {
         int error = 0;
+
+	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
  
 	if (!mask)
 		return 0; 
 
+	if ((mask & MAY_EXEC) && !execute_ok(inode))
+		return -EACCES;
+
 	lock_kernel();
 
 	if (coda_cache_check(inode, mask))
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2f58dfc70083..830f51abb971 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode)
 	kmem_cache_free(coda_inode_cachep, ITOC(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct coda_inode_info *ei = (struct coda_inode_info *) foo;
 
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index c21a1f552a63..773f2ce9aa06 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,8 +24,7 @@
 #include <linux/coda_psdev.h>
 
 /* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask,
-				 struct nameidata *nd);
+static int coda_ioctl_permission(struct inode *inode, int mask);
 static int coda_pioctl(struct inode * inode, struct file * filp, 
                        unsigned int cmd, unsigned long user_data);
 
@@ -42,16 +41,15 @@ const struct file_operations coda_ioctl_operations = {
 };
 
 /* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask,
-				 struct nameidata *nd)
+static int coda_ioctl_permission(struct inode *inode, int mask)
 {
-        return 0;
+	return (mask & MAY_EXEC) ? -EACCES : 0;
 }
 
 static int coda_pioctl(struct inode * inode, struct file * filp, 
                        unsigned int cmd, unsigned long user_data)
 {
-	struct nameidata nd;
+	struct path path;
         int error;
 	struct PioctlData data;
         struct inode *target_inode = NULL;
@@ -66,21 +64,21 @@ static int coda_pioctl(struct inode * inode, struct file * filp,
          * Look up the pathname. Note that the pathname is in 
          * user memory, and namei takes care of this
          */
-        if ( data.follow ) {
-                error = user_path_walk(data.path, &nd);
+        if (data.follow) {
+                error = user_path(data.path, &path);
 	} else {
-	        error = user_path_walk_link(data.path, &nd);
+	        error = user_lpath(data.path, &path);
 	}
 		
 	if ( error ) {
 		return error;
         } else {
-		target_inode = nd.path.dentry->d_inode;
+		target_inode = path.dentry->d_inode;
 	}
 	
 	/* return if it is not a Coda inode */
 	if ( target_inode->i_sb != inode->i_sb ) {
-		path_put(&nd.path);
+		path_put(&path);
 	        return  -EINVAL;
 	}
 
@@ -89,7 +87,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp,
 
 	error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
 
-	path_put(&nd.path);
+	path_put(&path);
         return error;
 }
 
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index e3eb3556622b..cfd29da714d1 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -363,7 +363,7 @@ static int init_coda_psdev(void)
 	}		
 	for (i = 0; i < MAX_CODADEVS; i++)
 		device_create(coda_psdev_class, NULL,
-			      MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
+			      MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
 	coda_sysctl_init();
 	goto out;
 
@@ -377,11 +377,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
 MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
 MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
 MODULE_LICENSE("GPL");
-#ifdef CONFIG_CODA_FS_OLD_API
-MODULE_VERSION("5.3.21");
-#else
 MODULE_VERSION("6.6");
-#endif
 
 static int __init init_coda(void)
 {
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 359e531094dd..ce432bca95d1 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size)
         inp->ih.opcode = opcode;
 	inp->ih.pid = current->pid;
 	inp->ih.pgid = task_pgrp_nr(current);
-#ifdef CONFIG_CODA_FS_OLD_API
-	memset(&inp->ih.cred, 0, sizeof(struct coda_cred));
-	inp->ih.cred.cr_fsuid = current->fsuid;
-#else
 	inp->ih.uid = current->fsuid;
-#endif
+
 	return (void*)inp;
 }
 
@@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
 	union inputArgs *inp;
 	union outputArgs *outp;
 	int insize, outsize, error;
-#ifdef CONFIG_CODA_FS_OLD_API
-	struct coda_cred cred = { 0, };
-	cred.cr_fsuid = uid;
-#endif
 	
 	insize = SIZE(release);
 	UPARG(CODA_CLOSE);
 	
-#ifdef CONFIG_CODA_FS_OLD_API
-	memcpy(&(inp->ih.cred), &cred, sizeof(cred));
-#else
 	inp->ih.uid = uid;
-#endif
-	
         inp->coda_close.VFid = *fid;
         inp->coda_close.flags = flags;
 
diff --git a/fs/compat.c b/fs/compat.c
index ed43e17a5dc6..e5f49f538502 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
 
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+	compat_ino_t ino = stat->ino;
+	typeof(ubuf->st_uid) uid = 0;
+	typeof(ubuf->st_gid) gid = 0;
+	int err;
+
+	SET_UID(uid, stat->uid);
+	SET_GID(gid, stat->gid);
+
+	if ((u64) stat->size > MAX_NON_LFS ||
+	    !old_valid_dev(stat->dev) ||
+	    !old_valid_dev(stat->rdev))
+		return -EOVERFLOW;
+	if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+		return -EOVERFLOW;
+
+	if (clear_user(ubuf, sizeof(*ubuf)))
+		return -EFAULT;
+
+	err  = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
+	err |= __put_user(ino, &ubuf->st_ino);
+	err |= __put_user(stat->mode, &ubuf->st_mode);
+	err |= __put_user(stat->nlink, &ubuf->st_nlink);
+	err |= __put_user(uid, &ubuf->st_uid);
+	err |= __put_user(gid, &ubuf->st_gid);
+	err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
+	err |= __put_user(stat->size, &ubuf->st_size);
+	err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
+	err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
+	err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
+	err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
+	err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
+	err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
+	err |= __put_user(stat->blksize, &ubuf->st_blksize);
+	err |= __put_user(stat->blocks, &ubuf->st_blocks);
+	return err;
+}
+
 asmlinkage long compat_sys_newstat(char __user * filename,
 		struct compat_stat __user *statbuf)
 {
@@ -197,8 +236,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
 {
 	
 	if (sizeof ubuf->f_blocks == 4) {
-		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) &
-		    0xffffffff00000000ULL)
+		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
 			return -EOVERFLOW;
 		/* f_files and f_ffree may be -1; it's okay
 		 * to stuff that into 32 bits */
@@ -234,18 +273,18 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  * The following statfs calls are copies of code from fs/open.c and
  * should be checked against those from time to time
  */
-asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs __user *buf)
+asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.path.dentry, &tmp);
+		error = vfs_statfs(path.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
-		path_put(&nd.path);
+		path_put(&path);
 	}
 	return error;
 }
@@ -271,8 +310,8 @@ out:
 static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
 {
 	if (sizeof ubuf->f_blocks == 4) {
-		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) &
-		    0xffffffff00000000ULL)
+		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
 			return -EOVERFLOW;
 		/* f_files and f_ffree may be -1; it's okay
 		 * to stuff that into 32 bits */
@@ -299,21 +338,21 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	return 0;
 }
 
-asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, struct compat_statfs64 __user *buf)
+asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.path.dentry, &tmp);
+		error = vfs_statfs(path.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
-		path_put(&nd.path);
+		path_put(&path);
 	}
 	return error;
 }
@@ -792,8 +831,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
 	if (buf->result)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
@@ -828,7 +869,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 	buf.dirent = dirent;
 
 	error = vfs_readdir(file, compat_fillonedir, &buf);
-	if (error >= 0)
+	if (buf.result)
 		error = buf.result;
 
 	fput(file);
@@ -862,8 +903,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	if (reclen > buf->count)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
@@ -913,9 +956,8 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	buf.error = 0;
 
 	error = vfs_readdir(file, compat_filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		if (put_user(file->f_pos, &lastdirent->d_off))
@@ -923,8 +965,6 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 		else
 			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
@@ -1004,19 +1044,16 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	buf.error = 0;
 
 	error = vfs_readdir(file, compat_filldir64, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		typeof(lastdirent->d_off) d_off = file->f_pos;
-		error = -EFAULT;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
-			goto out_putf;
-		error = count - buf.count;
+			error = -EFAULT;
+		else
+			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
@@ -1235,7 +1272,7 @@ static int compat_count(compat_uptr_t __user *argv, int max)
 			if (!p)
 				break;
 			argv++;
-			if(++i > max)
+			if (i++ >= max)
 				return -E2BIG;
 		}
 	}
@@ -1432,6 +1469,57 @@ out_ret:
 
 #define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
 
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
 /*
  * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
  * 64-bit unsigned longs.
@@ -1513,7 +1601,8 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int compat_core_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp, s64 *timeout)
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -1560,7 +1649,7 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -1586,7 +1675,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
 	int ret;
 
@@ -1594,43 +1683,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, 1000000/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct compat_timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -1643,15 +1704,16 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
 	}
 
@@ -1666,51 +1728,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	do {
-		if (tsp) {
-			if ((unsigned long)ts.tv_sec < MAX_SELECT_SECONDS) {
-				timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-				timeout += ts.tv_sec * (unsigned long)HZ;
-				ts.tv_sec = 0;
-				ts.tv_nsec = 0;
-			} else {
-				ts.tv_sec -= MAX_SELECT_SECONDS;
-				timeout = MAX_SELECT_SECONDS * HZ;
-			}
-		}
-
-		ret = compat_core_sys_select(n, inp, outp, exp, &timeout);
-
-	} while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec));
-
-	if (tsp) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-
-		rts.tv_sec = timeout / HZ;
-		rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ);
-		if (rts.tv_nsec >= NSEC_PER_SEC) {
-			rts.tv_sec++;
-			rts.tv_nsec -= NSEC_PER_SEC;
-		}
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -1755,18 +1774,16 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct compat_timespec ts;
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* We assume that ts.tv_sec is always lower than
-		   the number of seconds that can be expressed in
-		   an s64. Otherwise the compiler bitches at us */
-		timeout = DIV_ROUND_UP(ts.tv_nsec, 1000000000/HZ);
-		timeout += ts.tv_sec * HZ;
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -1780,7 +1797,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -1798,31 +1815,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct compat_timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-					1000;
-		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
@@ -2131,9 +2124,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
 
 #ifdef CONFIG_SIGNALFD
 
-asmlinkage long compat_sys_signalfd(int ufd,
-				    const compat_sigset_t __user *sigmask,
-				    compat_size_t sigsetsize)
+asmlinkage long compat_sys_signalfd4(int ufd,
+				     const compat_sigset_t __user *sigmask,
+				     compat_size_t sigsetsize, int flags)
 {
 	compat_sigset_t ss32;
 	sigset_t tmp;
@@ -2148,9 +2141,15 @@ asmlinkage long compat_sys_signalfd(int ufd,
 	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
 		return -EFAULT;
 
-	return sys_signalfd(ufd, ksigmask, sizeof(sigset_t));
+	return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
 }
 
+asmlinkage long compat_sys_signalfd(int ufd,
+				    const compat_sigset_t __user *sigmask,
+				    compat_size_t sigsetsize)
+{
+	return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+}
 #endif /* CONFIG_SIGNALFD */
 
 #ifdef CONFIG_TIMERFD
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c54eaab71a19..5235c67e7594 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/raid/md.h>
 #include <linux/kd.h>
-#include <linux/dirent.h>
 #include <linux/route.h>
 #include <linux/in6.h>
 #include <linux/ipv6_route.h>
@@ -58,7 +57,6 @@
 #include <linux/syscalls.h>
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
-#include <linux/wireless.h>
 #include <linux/atalk.h>
 #include <linux/loop.h>
 
@@ -1759,64 +1757,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
 	return sys_ioctl(fd, cmd, (unsigned long)tdata);
 }
 
-struct compat_iw_point {
-	compat_caddr_t pointer;
-	__u16 length;
-	__u16 flags;
-};
-
-static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct iwreq __user *iwr;
-	struct iwreq __user *iwr_u;
-	struct iw_point __user *iwp;
-	struct compat_iw_point __user *iwp_u;
-	compat_caddr_t pointer_u;
-	void __user *pointer;
-	__u16 length, flags;
-	int ret;
-
-	iwr_u = compat_ptr(arg);
-	iwp_u = (struct compat_iw_point __user *) &iwr_u->u.data;
-	iwr = compat_alloc_user_space(sizeof(*iwr));
-	if (iwr == NULL)
-		return -ENOMEM;
-
-	iwp = &iwr->u.data;
-
-	if (!access_ok(VERIFY_WRITE, iwr, sizeof(*iwr)))
-		return -EFAULT;
-
-	if (__copy_in_user(&iwr->ifr_ifrn.ifrn_name[0],
-			   &iwr_u->ifr_ifrn.ifrn_name[0],
-			   sizeof(iwr->ifr_ifrn.ifrn_name)))
-		return -EFAULT;
-
-	if (__get_user(pointer_u, &iwp_u->pointer) ||
-	    __get_user(length, &iwp_u->length) ||
-	    __get_user(flags, &iwp_u->flags))
-		return -EFAULT;
-
-	if (__put_user(compat_ptr(pointer_u), &iwp->pointer) ||
-	    __put_user(length, &iwp->length) ||
-	    __put_user(flags, &iwp->flags))
-		return -EFAULT;
-
-	ret = sys_ioctl(fd, cmd, (unsigned long) iwr);
-
-	if (__get_user(pointer, &iwp->pointer) ||
-	    __get_user(length, &iwp->length) ||
-	    __get_user(flags, &iwp->flags))
-		return -EFAULT;
-
-	if (__put_user(ptr_to_compat(pointer), &iwp_u->pointer) ||
-	    __put_user(length, &iwp_u->length) ||
-	    __put_user(flags, &iwp_u->flags))
-		return -EFAULT;
-
-	return ret;
-}
-
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
  * for some operations; this forces use of the newer bridge-utils that
  * use compatiable ioctls
@@ -2356,8 +2296,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
 COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
 COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
 COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST)
-COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST)
 COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
 /* Raw devices */
 COMPATIBLE_IOCTL(RAW_SETBIND)
@@ -2405,6 +2343,7 @@ COMPATIBLE_IOCTL(HCIGETDEVLIST)
 COMPATIBLE_IOCTL(HCIGETDEVINFO)
 COMPATIBLE_IOCTL(HCIGETCONNLIST)
 COMPATIBLE_IOCTL(HCIGETCONNINFO)
+COMPATIBLE_IOCTL(HCIGETAUTHINFO)
 COMPATIBLE_IOCTL(HCISETRAW)
 COMPATIBLE_IOCTL(HCISETSCAN)
 COMPATIBLE_IOCTL(HCISETAUTH)
@@ -2501,36 +2440,6 @@ COMPATIBLE_IOCTL(I2C_TENBIT)
 COMPATIBLE_IOCTL(I2C_PEC)
 COMPATIBLE_IOCTL(I2C_RETRIES)
 COMPATIBLE_IOCTL(I2C_TIMEOUT)
-/* wireless */
-COMPATIBLE_IOCTL(SIOCSIWCOMMIT)
-COMPATIBLE_IOCTL(SIOCGIWNAME)
-COMPATIBLE_IOCTL(SIOCSIWNWID)
-COMPATIBLE_IOCTL(SIOCGIWNWID)
-COMPATIBLE_IOCTL(SIOCSIWFREQ)
-COMPATIBLE_IOCTL(SIOCGIWFREQ)
-COMPATIBLE_IOCTL(SIOCSIWMODE)
-COMPATIBLE_IOCTL(SIOCGIWMODE)
-COMPATIBLE_IOCTL(SIOCSIWSENS)
-COMPATIBLE_IOCTL(SIOCGIWSENS)
-COMPATIBLE_IOCTL(SIOCSIWRANGE)
-COMPATIBLE_IOCTL(SIOCSIWPRIV)
-COMPATIBLE_IOCTL(SIOCSIWSTATS)
-COMPATIBLE_IOCTL(SIOCSIWAP)
-COMPATIBLE_IOCTL(SIOCGIWAP)
-COMPATIBLE_IOCTL(SIOCSIWRATE)
-COMPATIBLE_IOCTL(SIOCGIWRATE)
-COMPATIBLE_IOCTL(SIOCSIWRTS)
-COMPATIBLE_IOCTL(SIOCGIWRTS)
-COMPATIBLE_IOCTL(SIOCSIWFRAG)
-COMPATIBLE_IOCTL(SIOCGIWFRAG)
-COMPATIBLE_IOCTL(SIOCSIWTXPOW)
-COMPATIBLE_IOCTL(SIOCGIWTXPOW)
-COMPATIBLE_IOCTL(SIOCSIWRETRY)
-COMPATIBLE_IOCTL(SIOCGIWRETRY)
-COMPATIBLE_IOCTL(SIOCSIWPOWER)
-COMPATIBLE_IOCTL(SIOCGIWPOWER)
-COMPATIBLE_IOCTL(SIOCSIWAUTH)
-COMPATIBLE_IOCTL(SIOCGIWAUTH)
 /* hiddev */
 COMPATIBLE_IOCTL(HIDIOCGVERSION)
 COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
@@ -2761,29 +2670,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
 HANDLE_IOCTL(I2C_FUNCS, w_long)
 HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
 HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
-/* wireless */
-HANDLE_IOCTL(SIOCGIWRANGE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSTATS, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWTHRSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWTHRSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWMLME, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWAPLIST, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWSCAN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSCAN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWESSID, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWESSID, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWNICKN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWNICKN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWGENIE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWGENIE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWENCODEEXT, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
+/* bridge */
 HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
 HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
 /* Not implemented in the native kernel */
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index da015c12e3ea..762d287123ca 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -49,8 +49,10 @@ struct configfs_dirent {
 #define CONFIGFS_USET_DEFAULT	0x0080
 #define CONFIGFS_USET_DROPPING	0x0100
 #define CONFIGFS_USET_IN_MKDIR	0x0200
+#define CONFIGFS_USET_CREATING	0x0400
 #define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
 
+extern struct mutex configfs_symlink_mutex;
 extern spinlock_t configfs_dirent_lock;
 
 extern struct vfsmount * configfs_mount;
@@ -66,6 +68,7 @@ extern void configfs_inode_exit(void);
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
 				struct dentry *, void *, umode_t, int);
+extern int configfs_dirent_is_ready(struct configfs_dirent *);
 
 extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
 extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 0e64312a084c..8e93341f3e82 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -185,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
 	error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
 	if (!error)
 		error = configfs_make_dirent(p->d_fsdata, d, k, mode,
-					     CONFIGFS_DIR);
+					     CONFIGFS_DIR | CONFIGFS_USET_CREATING);
 	if (!error) {
 		error = configfs_create(d, mode, init_dir);
 		if (!error) {
@@ -209,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
  *	configfs_create_dir - create a directory for an config_item.
  *	@item:		config_itemwe're creating directory for.
  *	@dentry:	config_item's dentry.
+ *
+ *	Note: user-created entries won't be allowed under this new directory
+ *	until it is validated by configfs_dir_set_ready()
  */
 
 static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
@@ -231,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
 	return error;
 }
 
+/*
+ * Allow userspace to create new entries under a new directory created with
+ * configfs_create_dir(), and under all of its chidlren directories recursively.
+ * @sd		configfs_dirent of the new directory to validate
+ *
+ * Caller must hold configfs_dirent_lock.
+ */
+static void configfs_dir_set_ready(struct configfs_dirent *sd)
+{
+	struct configfs_dirent *child_sd;
+
+	sd->s_type &= ~CONFIGFS_USET_CREATING;
+	list_for_each_entry(child_sd, &sd->s_children, s_sibling)
+		if (child_sd->s_type & CONFIGFS_USET_CREATING)
+			configfs_dir_set_ready(child_sd);
+}
+
+/*
+ * Check that a directory does not belong to a directory hierarchy being
+ * attached and not validated yet.
+ * @sd		configfs_dirent of the directory to check
+ *
+ * @return	non-zero iff the directory was validated
+ *
+ * Note: takes configfs_dirent_lock, so the result may change from false to true
+ * in two consecutive calls, but never from true to false.
+ */
+int configfs_dirent_is_ready(struct configfs_dirent *sd)
+{
+	int ret;
+
+	spin_lock(&configfs_dirent_lock);
+	ret = !(sd->s_type & CONFIGFS_USET_CREATING);
+	spin_unlock(&configfs_dirent_lock);
+
+	return ret;
+}
+
 int configfs_create_link(struct configfs_symlink *sl,
 			 struct dentry *parent,
 			 struct dentry *dentry)
@@ -283,6 +324,8 @@ static void remove_dir(struct dentry * d)
  * The only thing special about this is that we remove any files in
  * the directory before we remove the directory, and we've inlined
  * what used to be configfs_rmdir() below, instead of calling separately.
+ *
+ * Caller holds the mutex of the item's inode
  */
 
 static void configfs_remove_dir(struct config_item * item)
@@ -330,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir,
 	struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
 	struct configfs_dirent * sd;
 	int found = 0;
-	int err = 0;
+	int err;
+
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 *
+	 * This forbids userspace to read/write attributes of items which may
+	 * not complete their initialization, since the dentries of the
+	 * attributes won't be instantiated.
+	 */
+	err = -ENOENT;
+	if (!configfs_dirent_is_ready(parent_sd))
+		goto out;
 
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (sd->s_type & CONFIGFS_NOT_PINNED) {
@@ -353,6 +408,7 @@ static struct dentry * configfs_lookup(struct inode *dir,
 		return simple_lookup(dir, dentry, nd);
 	}
 
+out:
 	return ERR_PTR(err);
 }
 
@@ -370,13 +426,17 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
 	struct configfs_dirent *sd;
 	int ret;
 
+	/* Mark that we're trying to drop the group */
+	parent_sd->s_type |= CONFIGFS_USET_DROPPING;
+
 	ret = -EBUSY;
 	if (!list_empty(&parent_sd->s_links))
 		goto out;
 
 	ret = 0;
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
-		if (sd->s_type & CONFIGFS_NOT_PINNED)
+		if (!sd->s_element ||
+		    (sd->s_type & CONFIGFS_NOT_PINNED))
 			continue;
 		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
 			/* Abort if racing with mkdir() */
@@ -385,8 +445,6 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex
 					*wait_mutex = &sd->s_dentry->d_inode->i_mutex;
 				return -EAGAIN;
 			}
-			/* Mark that we're trying to drop the group */
-			sd->s_type |= CONFIGFS_USET_DROPPING;
 
 			/*
 			 * Yup, recursive.  If there's a problem, blame
@@ -414,12 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry)
 	struct configfs_dirent *parent_sd = dentry->d_fsdata;
 	struct configfs_dirent *sd;
 
-	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
-		if (sd->s_type & CONFIGFS_USET_DEFAULT) {
+	parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
+
+	list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
+		if (sd->s_type & CONFIGFS_USET_DEFAULT)
 			configfs_detach_rollback(sd->s_dentry);
-			sd->s_type &= ~CONFIGFS_USET_DROPPING;
-		}
-	}
 }
 
 static void detach_attrs(struct config_item * item)
@@ -558,36 +615,21 @@ static int create_default_group(struct config_group *parent_group,
 static int populate_groups(struct config_group *group)
 {
 	struct config_group *new_group;
-	struct dentry *dentry = group->cg_item.ci_dentry;
 	int ret = 0;
 	int i;
 
 	if (group->default_groups) {
-		/*
-		 * FYI, we're faking mkdir here
-		 * I'm not sure we need this semaphore, as we're called
-		 * from our parent's mkdir.  That holds our parent's
-		 * i_mutex, so afaik lookup cannot continue through our
-		 * parent to find us, let alone mess with our tree.
-		 * That said, taking our i_mutex is closer to mkdir
-		 * emulation, and shouldn't hurt.
-		 */
-		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
-
 		for (i = 0; group->default_groups[i]; i++) {
 			new_group = group->default_groups[i];
 
 			ret = create_default_group(group, new_group);
-			if (ret)
+			if (ret) {
+				detach_groups(group);
 				break;
+			}
 		}
-
-		mutex_unlock(&dentry->d_inode->i_mutex);
 	}
 
-	if (ret)
-		detach_groups(group);
-
 	return ret;
 }
 
@@ -702,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item,
 	if (!ret) {
 		ret = populate_attrs(item);
 		if (ret) {
+			/*
+			 * We are going to remove an inode and its dentry but
+			 * the VFS may already have hit and used them. Thus,
+			 * we must lock them as rmdir() would.
+			 */
+			mutex_lock(&dentry->d_inode->i_mutex);
 			configfs_remove_dir(item);
+			dentry->d_inode->i_flags |= S_DEAD;
+			mutex_unlock(&dentry->d_inode->i_mutex);
 			d_delete(dentry);
 		}
 	}
@@ -710,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item,
 	return ret;
 }
 
+/* Caller holds the mutex of the item's inode */
 static void configfs_detach_item(struct config_item *item)
 {
 	detach_attrs(item);
@@ -728,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item,
 		sd = dentry->d_fsdata;
 		sd->s_type |= CONFIGFS_USET_DIR;
 
+		/*
+		 * FYI, we're faking mkdir in populate_groups()
+		 * We must lock the group's inode to avoid races with the VFS
+		 * which can already hit the inode and try to add/remove entries
+		 * under it.
+		 *
+		 * We must also lock the inode to remove it safely in case of
+		 * error, as rmdir() would.
+		 */
+		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
 		ret = populate_groups(to_config_group(item));
 		if (ret) {
 			configfs_detach_item(item);
-			d_delete(dentry);
+			dentry->d_inode->i_flags |= S_DEAD;
 		}
+		mutex_unlock(&dentry->d_inode->i_mutex);
+		if (ret)
+			d_delete(dentry);
 	}
 
 	return ret;
 }
 
+/* Caller holds the mutex of the group's inode */
 static void configfs_detach_group(struct config_item *item)
 {
 	detach_groups(to_config_group(item));
@@ -1027,14 +1092,15 @@ EXPORT_SYMBOL(configfs_undepend_item);
 
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	int ret, module_got = 0;
-	struct config_group *group;
-	struct config_item *item;
+	int ret = 0;
+	int module_got = 0;
+	struct config_group *group = NULL;
+	struct config_item *item = NULL;
 	struct config_item *parent_item;
 	struct configfs_subsystem *subsys;
 	struct configfs_dirent *sd;
 	struct config_item_type *type;
-	struct module *owner = NULL;
+	struct module *subsys_owner = NULL, *new_item_owner = NULL;
 	char *name;
 
 	if (dentry->d_parent == configfs_sb->s_root) {
@@ -1043,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 
 	sd = dentry->d_parent->d_fsdata;
+
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	if (!configfs_dirent_is_ready(sd)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
 	if (!(sd->s_type & CONFIGFS_USET_DIR)) {
 		ret = -EPERM;
 		goto out;
@@ -1061,27 +1137,47 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_put;
 	}
 
+	/*
+	 * The subsystem may belong to a different module than the item
+	 * being created.  We don't want to safely pin the new item but
+	 * fail to pin the subsystem it sits under.
+	 */
+	if (!subsys->su_group.cg_item.ci_type) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+	subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+	if (!try_module_get(subsys_owner)) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
 	name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
 	if (!name) {
 		ret = -ENOMEM;
-		goto out_put;
+		goto out_subsys_put;
 	}
 
 	snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
 
 	mutex_lock(&subsys->su_mutex);
-	group = NULL;
-	item = NULL;
 	if (type->ct_group_ops->make_group) {
-		ret = type->ct_group_ops->make_group(to_config_group(parent_item), name, &group);
-		if (!ret) {
+		group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
+		if (!group)
+			group = ERR_PTR(-ENOMEM);
+		if (!IS_ERR(group)) {
 			link_group(to_config_group(parent_item), group);
 			item = &group->cg_item;
-		}
+		} else
+			ret = PTR_ERR(group);
 	} else {
-		ret = type->ct_group_ops->make_item(to_config_group(parent_item), name, &item);
-		if (!ret)
+		item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
+		if (!item)
+			item = ERR_PTR(-ENOMEM);
+		if (!IS_ERR(item))
 			link_obj(parent_item, item);
+		else
+			ret = PTR_ERR(item);
 	}
 	mutex_unlock(&subsys->su_mutex);
 
@@ -1091,7 +1187,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		 * If ret != 0, then link_obj() was never called.
 		 * There are no extra references to clean up.
 		 */
-		goto out_put;
+		goto out_subsys_put;
 	}
 
 	/*
@@ -1105,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlink;
 	}
 
-	owner = type->ct_owner;
-	if (!try_module_get(owner)) {
+	new_item_owner = type->ct_owner;
+	if (!try_module_get(new_item_owner)) {
 		ret = -EINVAL;
 		goto out_unlink;
 	}
@@ -1136,6 +1232,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	spin_lock(&configfs_dirent_lock);
 	sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+	if (!ret)
+		configfs_dir_set_ready(dentry->d_fsdata);
 	spin_unlock(&configfs_dirent_lock);
 
 out_unlink:
@@ -1153,9 +1251,13 @@ out_unlink:
 		mutex_unlock(&subsys->su_mutex);
 
 		if (module_got)
-			module_put(owner);
+			module_put(new_item_owner);
 	}
 
+out_subsys_put:
+	if (ret)
+		module_put(subsys_owner);
+
 out_put:
 	/*
 	 * link_obj()/link_group() took a reference from child->parent,
@@ -1174,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct config_item *item;
 	struct configfs_subsystem *subsys;
 	struct configfs_dirent *sd;
-	struct module *owner = NULL;
+	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
 	if (dentry->d_parent == configfs_sb->s_root)
@@ -1201,14 +1303,26 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EINVAL;
 	}
 
-	spin_lock(&configfs_dirent_lock);
+	/* configfs_mkdir() shouldn't have allowed this */
+	BUG_ON(!subsys->su_group.cg_item.ci_type);
+	subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+
+	/*
+	 * Ensure that no racing symlink() will make detach_prep() fail while
+	 * the new link is temporarily attached
+	 */
 	do {
 		struct mutex *wait_mutex;
 
+		mutex_lock(&configfs_symlink_mutex);
+		spin_lock(&configfs_dirent_lock);
 		ret = configfs_detach_prep(dentry, &wait_mutex);
-		if (ret) {
+		if (ret)
 			configfs_detach_rollback(dentry);
-			spin_unlock(&configfs_dirent_lock);
+		spin_unlock(&configfs_dirent_lock);
+		mutex_unlock(&configfs_symlink_mutex);
+
+		if (ret) {
 			if (ret != -EAGAIN) {
 				config_item_put(parent_item);
 				return ret;
@@ -1217,11 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			/* Wait until the racing operation terminates */
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
-
-			spin_lock(&configfs_dirent_lock);
 		}
 	} while (ret == -EAGAIN);
-	spin_unlock(&configfs_dirent_lock);
 
 	/* Get a working ref for the duration of this function */
 	item = configfs_get_config_item(dentry);
@@ -1230,7 +1341,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	config_item_put(parent_item);
 
 	if (item->ci_type)
-		owner = item->ci_type->ct_owner;
+		dead_item_owner = item->ci_type->ct_owner;
 
 	if (sd->s_type & CONFIGFS_USET_DIR) {
 		configfs_detach_group(item);
@@ -1252,7 +1363,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* Drop our reference from above */
 	config_item_put(item);
 
-	module_put(owner);
+	module_put(dead_item_owner);
+	module_put(subsys_owner);
 
 	return 0;
 }
@@ -1308,13 +1420,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 {
 	struct dentry * dentry = file->f_path.dentry;
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
+	int err;
 
 	mutex_lock(&dentry->d_inode->i_mutex);
-	file->private_data = configfs_new_dirent(parent_sd, NULL);
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	err = -ENOENT;
+	if (configfs_dirent_is_ready(parent_sd)) {
+		file->private_data = configfs_new_dirent(parent_sd, NULL);
+		if (IS_ERR(file->private_data))
+			err = PTR_ERR(file->private_data);
+		else
+			err = 0;
+	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
-	return IS_ERR(file->private_data) ? PTR_ERR(file->private_data) : 0;
-
+	return err;
 }
 
 static int configfs_dir_close(struct inode *inode, struct file *file)
@@ -1485,6 +1608,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 		if (err) {
 			d_delete(dentry);
 			dput(dentry);
+		} else {
+			spin_lock(&configfs_dirent_lock);
+			configfs_dir_set_ready(dentry->d_fsdata);
+			spin_unlock(&configfs_dirent_lock);
 		}
 	}
 
@@ -1511,11 +1638,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
 			  I_MUTEX_PARENT);
 	mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+	mutex_lock(&configfs_symlink_mutex);
 	spin_lock(&configfs_dirent_lock);
 	if (configfs_detach_prep(dentry, NULL)) {
 		printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
 	}
 	spin_unlock(&configfs_dirent_lock);
+	mutex_unlock(&configfs_symlink_mutex);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
 	mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 0004d18c40ac..932a92b31483 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -31,6 +31,9 @@
 #include <linux/configfs.h>
 #include "configfs_internal.h"
 
+/* Protects attachments of new symlinks */
+DEFINE_MUTEX(configfs_symlink_mutex);
+
 static int item_depth(struct config_item * item)
 {
 	struct config_item * p = item;
@@ -73,11 +76,20 @@ static int create_link(struct config_item *parent_item,
 	struct configfs_symlink *sl;
 	int ret;
 
+	ret = -ENOENT;
+	if (!configfs_dirent_is_ready(target_sd))
+		goto out;
 	ret = -ENOMEM;
 	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
 	if (sl) {
 		sl->sl_target = config_item_get(item);
 		spin_lock(&configfs_dirent_lock);
+		if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+			spin_unlock(&configfs_dirent_lock);
+			config_item_put(item);
+			kfree(sl);
+			return -ENOENT;
+		}
 		list_add(&sl->sl_list, &target_sd->s_links);
 		spin_unlock(&configfs_dirent_lock);
 		ret = configfs_create_link(sl, parent_item->ci_dentry,
@@ -91,22 +103,23 @@ static int create_link(struct config_item *parent_item,
 		}
 	}
 
+out:
 	return ret;
 }
 
 
-static int get_target(const char *symname, struct nameidata *nd,
+static int get_target(const char *symname, struct path *path,
 		      struct config_item **target)
 {
 	int ret;
 
-	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path);
 	if (!ret) {
-		if (nd->path.dentry->d_sb == configfs_sb) {
-			*target = configfs_get_config_item(nd->path.dentry);
+		if (path->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(path->dentry);
 			if (!*target) {
 				ret = -ENOENT;
-				path_put(&nd->path);
+				path_put(path);
 			}
 		} else
 			ret = -EPERM;
@@ -119,7 +132,8 @@ static int get_target(const char *symname, struct nameidata *nd,
 int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
 	int ret;
-	struct nameidata nd;
+	struct path path;
+	struct configfs_dirent *sd;
 	struct config_item *parent_item;
 	struct config_item *target_item;
 	struct config_item_type *type;
@@ -128,27 +142,39 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
 	if (dentry->d_parent == configfs_sb->s_root)
 		goto out;
 
+	sd = dentry->d_parent->d_fsdata;
+	/*
+	 * Fake invisibility if dir belongs to a group/default groups hierarchy
+	 * being attached
+	 */
+	ret = -ENOENT;
+	if (!configfs_dirent_is_ready(sd))
+		goto out;
+
 	parent_item = configfs_get_config_item(dentry->d_parent);
 	type = parent_item->ci_type;
 
+	ret = -EPERM;
 	if (!type || !type->ct_item_ops ||
 	    !type->ct_item_ops->allow_link)
 		goto out_put;
 
-	ret = get_target(symname, &nd, &target_item);
+	ret = get_target(symname, &path, &target_item);
 	if (ret)
 		goto out_put;
 
 	ret = type->ct_item_ops->allow_link(parent_item, target_item);
 	if (!ret) {
+		mutex_lock(&configfs_symlink_mutex);
 		ret = create_link(parent_item, target_item, dentry);
+		mutex_unlock(&configfs_symlink_mutex);
 		if (ret && type->ct_item_ops->drop_link)
 			type->ct_item_ops->drop_link(parent_item,
 						     target_item);
 	}
 
 	config_item_put(target_item);
-	path_put(&nd.path);
+	path_put(&path);
 
 out_put:
 	config_item_put(parent_item);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c3b618c15b3..f40423eb1a14 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);
 static int cramfs_iget5_test(struct inode *inode, void *opaque)
 {
 	struct cramfs_inode *cramfs_inode = opaque;
-
-	if (inode->i_ino != CRAMINO(cramfs_inode))
-		return 0; /* does not match */
-
-	if (inode->i_ino != 1)
-		return 1;
-
-	/* all empty directories, char, block, pipe, and sock, share inode #1 */
-
-	if ((inode->i_mode != cramfs_inode->mode) ||
-	    (inode->i_gid != cramfs_inode->gid) ||
-	    (inode->i_uid != cramfs_inode->uid))
-		return 0; /* does not match */
-
-	if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
-	    (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
-		return 0; /* does not match */
-
-	return 1; /* matches */
+	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
 }
 
 static int cramfs_iget5_set(struct inode *inode, void *opaque)
 {
-	static struct timespec zerotime;
 	struct cramfs_inode *cramfs_inode = opaque;
-	inode->i_mode = cramfs_inode->mode;
-	inode->i_uid = cramfs_inode->uid;
-	inode->i_size = cramfs_inode->size;
-	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-	inode->i_gid = cramfs_inode->gid;
-	/* Struct copy intentional */
-	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
 	inode->i_ino = CRAMINO(cramfs_inode);
-	/* inode->i_nlink is left 1 - arguably wrong for directories,
-	   but it's the best we can do without reading the directory
-           contents.  1 yields the right result in GNU find, even
-	   without -noleaf option. */
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_fop = &generic_ro_fops;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &cramfs_dir_inode_operations;
-		inode->i_fop = &cramfs_directory_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		inode->i_op = &page_symlink_inode_operations;
-		inode->i_data.a_ops = &cramfs_aops;
-	} else {
-		inode->i_size = 0;
-		inode->i_blocks = 0;
-		init_special_inode(inode, inode->i_mode,
-			old_decode_dev(cramfs_inode->size));
-	}
 	return 0;
 }
 
@@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
 					    cramfs_iget5_test, cramfs_iget5_set,
 					    cramfs_inode);
+	static struct timespec zerotime;
+
 	if (inode && (inode->i_state & I_NEW)) {
+		inode->i_mode = cramfs_inode->mode;
+		inode->i_uid = cramfs_inode->uid;
+		inode->i_size = cramfs_inode->size;
+		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+		inode->i_gid = cramfs_inode->gid;
+		/* Struct copy intentional */
+		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+		/* inode->i_nlink is left 1 - arguably wrong for directories,
+		   but it's the best we can do without reading the directory
+		   contents.  1 yields the right result in GNU find, even
+		   without -noleaf option. */
+		if (S_ISREG(inode->i_mode)) {
+			inode->i_fop = &generic_ro_fops;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else if (S_ISDIR(inode->i_mode)) {
+			inode->i_op = &cramfs_dir_inode_operations;
+			inode->i_fop = &cramfs_directory_operations;
+		} else if (S_ISLNK(inode->i_mode)) {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_data.a_ops = &cramfs_aops;
+		} else {
+			inode->i_size = 0;
+			inode->i_blocks = 0;
+			init_special_inode(inode, inode->i_mode,
+				old_decode_dev(cramfs_inode->size));
+		}
 		unlock_new_inode(inode);
 	}
 	return inode;
 }
 
+static void cramfs_drop_inode(struct inode *inode)
+{
+	if (inode->i_ino == 1)
+		generic_delete_inode(inode);
+	else
+		generic_drop_inode(inode);
+}
+
 /*
  * We have our own block cache: don't fill up the buffer cache
  * with the rom-image, because the way the filesystem is set
@@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {
 	.put_super	= cramfs_put_super,
 	.remount_fs	= cramfs_remount,
 	.statfs		= cramfs_statfs,
+	.drop_inode	= cramfs_drop_inode,
 };
 
 static int cramfs_get_sb(struct file_system_type *fs_type,
diff --git a/fs/dcache.c b/fs/dcache.c
index 6068c25b393c..a1d86c7f3e66 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -61,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly;
 static unsigned int d_hash_mask __read_mostly;
 static unsigned int d_hash_shift __read_mostly;
 static struct hlist_head *dentry_hashtable __read_mostly;
-static LIST_HEAD(dentry_unused);
 
 /* Statistics gathering. */
 struct dentry_stat_t dentry_stat = {
@@ -70,6 +69,7 @@ struct dentry_stat_t dentry_stat = {
 
 static void __d_free(struct dentry *dentry)
 {
+	WARN_ON(!list_empty(&dentry->d_alias));
 	if (dname_external(dentry))
 		kfree(dentry->d_name.name);
 	kmem_cache_free(dentry_cache, dentry); 
@@ -96,14 +96,6 @@ static void d_free(struct dentry *dentry)
 		call_rcu(&dentry->d_u.d_rcu, d_callback);
 }
 
-static void dentry_lru_remove(struct dentry *dentry)
-{
-	if (!list_empty(&dentry->d_lru)) {
-		list_del_init(&dentry->d_lru);
-		dentry_stat.nr_unused--;
-	}
-}
-
 /*
  * Release the dentry's inode, using the filesystem
  * d_iput() operation if defined.
@@ -130,6 +122,41 @@ static void dentry_iput(struct dentry * dentry)
 	}
 }
 
+/*
+ * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+	dentry->d_sb->s_nr_dentry_unused++;
+	dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_add_tail(struct dentry *dentry)
+{
+	list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+	dentry->d_sb->s_nr_dentry_unused++;
+	dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_del(struct dentry *dentry)
+{
+	if (!list_empty(&dentry->d_lru)) {
+		list_del(&dentry->d_lru);
+		dentry->d_sb->s_nr_dentry_unused--;
+		dentry_stat.nr_unused--;
+	}
+}
+
+static void dentry_lru_del_init(struct dentry *dentry)
+{
+	if (likely(!list_empty(&dentry->d_lru))) {
+		list_del_init(&dentry->d_lru);
+		dentry->d_sb->s_nr_dentry_unused--;
+		dentry_stat.nr_unused--;
+	}
+}
+
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
@@ -148,9 +175,12 @@ static struct dentry *d_kill(struct dentry *dentry)
 	dentry_stat.nr_dentry--;	/* For d_free, below */
 	/*drops the locks, at that point nobody can reach this dentry */
 	dentry_iput(dentry);
-	parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		parent = NULL;
+	else
+		parent = dentry->d_parent;
 	d_free(dentry);
-	return dentry == parent ? NULL : parent;
+	return parent;
 }
 
 /* 
@@ -212,8 +242,7 @@ repeat:
 		goto kill_it;
   	if (list_empty(&dentry->d_lru)) {
   		dentry->d_flags |= DCACHE_REFERENCED;
-  		list_add(&dentry->d_lru, &dentry_unused);
-  		dentry_stat.nr_unused++;
+		dentry_lru_add(dentry);
   	}
  	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
@@ -222,7 +251,8 @@ repeat:
 unhash_it:
 	__d_drop(dentry);
 kill_it:
-	dentry_lru_remove(dentry);
+	/* if dentry was on the d_lru list delete it from there */
+	dentry_lru_del(dentry);
 	dentry = d_kill(dentry);
 	if (dentry)
 		goto repeat;
@@ -290,7 +320,7 @@ int d_invalidate(struct dentry * dentry)
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
-	dentry_lru_remove(dentry);
+	dentry_lru_del_init(dentry);
 	return dentry;
 }
 
@@ -406,133 +436,168 @@ static void prune_one_dentry(struct dentry * dentry)
 
 		if (dentry->d_op && dentry->d_op->d_delete)
 			dentry->d_op->d_delete(dentry);
-		dentry_lru_remove(dentry);
+		dentry_lru_del_init(dentry);
 		__d_drop(dentry);
 		dentry = d_kill(dentry);
 		spin_lock(&dcache_lock);
 	}
 }
 
-/**
- * prune_dcache - shrink the dcache
- * @count: number of entries to try and free
- * @sb: if given, ignore dentries for other superblocks
- *         which are being unmounted.
- *
- * Shrink the dcache. This is done when we need
- * more memory, or simply when we need to unmount
- * something (at which point we need to unuse
- * all dentries).
- *
- * This function may fail to free any resources if
- * all the dentries are in use.
+/*
+ * Shrink the dentry LRU on a given superblock.
+ * @sb   : superblock to shrink dentry LRU.
+ * @count: If count is NULL, we prune all dentries on superblock.
+ * @flags: If flags is non-zero, we need to do special processing based on
+ * which flags are set. This means we don't need to maintain multiple
+ * similar copies of this loop.
  */
- 
-static void prune_dcache(int count, struct super_block *sb)
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
 {
-	spin_lock(&dcache_lock);
-	for (; count ; count--) {
-		struct dentry *dentry;
-		struct list_head *tmp;
-		struct rw_semaphore *s_umount;
-
-		cond_resched_lock(&dcache_lock);
+	LIST_HEAD(referenced);
+	LIST_HEAD(tmp);
+	struct dentry *dentry;
+	int cnt = 0;
 
-		tmp = dentry_unused.prev;
-		if (sb) {
-			/* Try to find a dentry for this sb, but don't try
-			 * too hard, if they aren't near the tail they will
-			 * be moved down again soon
+	BUG_ON(!sb);
+	BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+	spin_lock(&dcache_lock);
+	if (count != NULL)
+		/* called from prune_dcache() and shrink_dcache_parent() */
+		cnt = *count;
+restart:
+	if (count == NULL)
+		list_splice_init(&sb->s_dentry_lru, &tmp);
+	else {
+		while (!list_empty(&sb->s_dentry_lru)) {
+			dentry = list_entry(sb->s_dentry_lru.prev,
+					struct dentry, d_lru);
+			BUG_ON(dentry->d_sb != sb);
+
+			spin_lock(&dentry->d_lock);
+			/*
+			 * If we are honouring the DCACHE_REFERENCED flag and
+			 * the dentry has this flag set, don't free it. Clear
+			 * the flag and put it back on the LRU.
 			 */
-			int skip = count;
-			while (skip && tmp != &dentry_unused &&
-			    list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
-				skip--;
-				tmp = tmp->prev;
+			if ((flags & DCACHE_REFERENCED)
+				&& (dentry->d_flags & DCACHE_REFERENCED)) {
+				dentry->d_flags &= ~DCACHE_REFERENCED;
+				list_move_tail(&dentry->d_lru, &referenced);
+				spin_unlock(&dentry->d_lock);
+			} else {
+				list_move_tail(&dentry->d_lru, &tmp);
+				spin_unlock(&dentry->d_lock);
+				cnt--;
+				if (!cnt)
+					break;
 			}
+			cond_resched_lock(&dcache_lock);
 		}
-		if (tmp == &dentry_unused)
-			break;
-		list_del_init(tmp);
-		prefetch(dentry_unused.prev);
- 		dentry_stat.nr_unused--;
-		dentry = list_entry(tmp, struct dentry, d_lru);
-
- 		spin_lock(&dentry->d_lock);
+	}
+	while (!list_empty(&tmp)) {
+		dentry = list_entry(tmp.prev, struct dentry, d_lru);
+		dentry_lru_del_init(dentry);
+		spin_lock(&dentry->d_lock);
 		/*
 		 * We found an inuse dentry which was not removed from
-		 * dentry_unused because of laziness during lookup.  Do not free
-		 * it - just keep it off the dentry_unused list.
+		 * the LRU because of laziness during lookup.  Do not free
+		 * it - just keep it off the LRU list.
 		 */
- 		if (atomic_read(&dentry->d_count)) {
- 			spin_unlock(&dentry->d_lock);
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		/* If the dentry was recently referenced, don't free it. */
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
- 			list_add(&dentry->d_lru, &dentry_unused);
- 			dentry_stat.nr_unused++;
- 			spin_unlock(&dentry->d_lock);
+		prune_one_dentry(dentry);
+		/* dentry->d_lock was dropped in prune_one_dentry() */
+		cond_resched_lock(&dcache_lock);
+	}
+	if (count == NULL && !list_empty(&sb->s_dentry_lru))
+		goto restart;
+	if (count != NULL)
+		*count = cnt;
+	if (!list_empty(&referenced))
+		list_splice(&referenced, &sb->s_dentry_lru);
+	spin_unlock(&dcache_lock);
+}
+
+/**
+ * prune_dcache - shrink the dcache
+ * @count: number of entries to try to free
+ *
+ * Shrink the dcache. This is done when we need more memory, or simply when we
+ * need to unmount something (at which point we need to unuse all dentries).
+ *
+ * This function may fail to free any resources if all the dentries are in use.
+ */
+static void prune_dcache(int count)
+{
+	struct super_block *sb;
+	int w_count;
+	int unused = dentry_stat.nr_unused;
+	int prune_ratio;
+	int pruned;
+
+	if (unused == 0 || count == 0)
+		return;
+	spin_lock(&dcache_lock);
+restart:
+	if (count >= unused)
+		prune_ratio = 1;
+	else
+		prune_ratio = unused / count;
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_nr_dentry_unused == 0)
 			continue;
-		}
-		/*
-		 * If the dentry is not DCACHED_REFERENCED, it is time
-		 * to remove it from the dcache, provided the super block is
-		 * NULL (which means we are trying to reclaim memory)
-		 * or this dentry belongs to the same super block that
-		 * we want to shrink.
-		 */
-		/*
-		 * If this dentry is for "my" filesystem, then I can prune it
-		 * without taking the s_umount lock (I already hold it).
+		sb->s_count++;
+		/* Now, we reclaim unused dentrins with fairness.
+		 * We reclaim them same percentage from each superblock.
+		 * We calculate number of dentries to scan on this sb
+		 * as follows, but the implementation is arranged to avoid
+		 * overflows:
+		 * number of dentries to scan on this sb =
+		 * count * (number of dentries on this sb /
+		 * number of dentries in the machine)
 		 */
-		if (sb && dentry->d_sb == sb) {
-			prune_one_dentry(dentry);
-			continue;
-		}
+		spin_unlock(&sb_lock);
+		if (prune_ratio != 1)
+			w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
+		else
+			w_count = sb->s_nr_dentry_unused;
+		pruned = w_count;
 		/*
-		 * ...otherwise we need to be sure this filesystem isn't being
-		 * unmounted, otherwise we could race with
-		 * generic_shutdown_super(), and end up holding a reference to
-		 * an inode while the filesystem is unmounted.
-		 * So we try to get s_umount, and make sure s_root isn't NULL.
-		 * (Take a local copy of s_umount to avoid a use-after-free of
-		 * `dentry').
+		 * We need to be sure this filesystem isn't being unmounted,
+		 * otherwise we could race with generic_shutdown_super(), and
+		 * end up holding a reference to an inode while the filesystem
+		 * is unmounted.  So we try to get s_umount, and make sure
+		 * s_root isn't NULL.
 		 */
-		s_umount = &dentry->d_sb->s_umount;
-		if (down_read_trylock(s_umount)) {
-			if (dentry->d_sb->s_root != NULL) {
-				prune_one_dentry(dentry);
-				up_read(s_umount);
-				continue;
+		if (down_read_trylock(&sb->s_umount)) {
+			if ((sb->s_root != NULL) &&
+			    (!list_empty(&sb->s_dentry_lru))) {
+				spin_unlock(&dcache_lock);
+				__shrink_dcache_sb(sb, &w_count,
+						DCACHE_REFERENCED);
+				pruned -= w_count;
+				spin_lock(&dcache_lock);
 			}
-			up_read(s_umount);
+			up_read(&sb->s_umount);
 		}
-		spin_unlock(&dentry->d_lock);
+		spin_lock(&sb_lock);
+		count -= pruned;
 		/*
-		 * Insert dentry at the head of the list as inserting at the
-		 * tail leads to a cycle.
+		 * restart only when sb is no longer on the list and
+		 * we have more work to do.
 		 */
- 		list_add(&dentry->d_lru, &dentry_unused);
-		dentry_stat.nr_unused++;
+		if (__put_super_and_need_restart(sb) && count > 0) {
+			spin_unlock(&sb_lock);
+			goto restart;
+		}
 	}
+	spin_unlock(&sb_lock);
 	spin_unlock(&dcache_lock);
 }
 
-/*
- * Shrink the dcache for the specified super block.
- * This allows us to unmount a device without disturbing
- * the dcache for the other devices.
- *
- * This implementation makes just two traversals of the
- * unused list.  On the first pass we move the selected
- * dentries to the most recent end, and on the second
- * pass we free them.  The second pass must restart after
- * each dput(), but since the target dentries are all at
- * the end, it's really just a single traversal.
- */
-
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -541,44 +606,9 @@ static void prune_dcache(int count, struct super_block *sb)
  * is used to free the dcache before unmounting a file
  * system
  */
-
 void shrink_dcache_sb(struct super_block * sb)
 {
-	struct list_head *tmp, *next;
-	struct dentry *dentry;
-
-	/*
-	 * Pass one ... move the dentries for the specified
-	 * superblock to the most recent end of the unused list.
-	 */
-	spin_lock(&dcache_lock);
-	list_for_each_prev_safe(tmp, next, &dentry_unused) {
-		dentry = list_entry(tmp, struct dentry, d_lru);
-		if (dentry->d_sb != sb)
-			continue;
-		list_move_tail(tmp, &dentry_unused);
-	}
-
-	/*
-	 * Pass two ... free the dentries for this superblock.
-	 */
-repeat:
-	list_for_each_prev_safe(tmp, next, &dentry_unused) {
-		dentry = list_entry(tmp, struct dentry, d_lru);
-		if (dentry->d_sb != sb)
-			continue;
-		dentry_stat.nr_unused--;
-		list_del_init(tmp);
-		spin_lock(&dentry->d_lock);
-		if (atomic_read(&dentry->d_count)) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
-		prune_one_dentry(dentry);
-		cond_resched_lock(&dcache_lock);
-		goto repeat;
-	}
-	spin_unlock(&dcache_lock);
+	__shrink_dcache_sb(sb, NULL, 0);
 }
 
 /*
@@ -595,7 +625,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 
 	/* detach this root from the system */
 	spin_lock(&dcache_lock);
-	dentry_lru_remove(dentry);
+	dentry_lru_del_init(dentry);
 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
@@ -609,7 +639,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			spin_lock(&dcache_lock);
 			list_for_each_entry(loop, &dentry->d_subdirs,
 					    d_u.d_child) {
-				dentry_lru_remove(loop);
+				dentry_lru_del_init(loop);
 				__d_drop(loop);
 				cond_resched_lock(&dcache_lock);
 			}
@@ -640,11 +670,12 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 				BUG();
 			}
 
-			parent = dentry->d_parent;
-			if (parent == dentry)
+			if (IS_ROOT(dentry))
 				parent = NULL;
-			else
+			else {
+				parent = dentry->d_parent;
 				atomic_dec(&parent->d_count);
+			}
 
 			list_del(&dentry->d_u.d_child);
 			detached++;
@@ -791,14 +822,13 @@ resume:
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
 		next = tmp->next;
 
-		dentry_lru_remove(dentry);
+		dentry_lru_del_init(dentry);
 		/* 
 		 * move only zero ref count dentries to the end 
 		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
-			list_add_tail(&dentry->d_lru, &dentry_unused);
-			dentry_stat.nr_unused++;
+			dentry_lru_add_tail(dentry);
 			found++;
 		}
 
@@ -840,10 +870,11 @@ out:
  
 void shrink_dcache_parent(struct dentry * parent)
 {
+	struct super_block *sb = parent->d_sb;
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found, parent->d_sb);
+		__shrink_dcache_sb(sb, &found, 0);
 }
 
 /*
@@ -863,7 +894,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr, NULL);
+		prune_dcache(nr);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
@@ -951,6 +982,15 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 	return d_alloc(parent, &q);
 }
 
+/* the caller must hold dcache_lock */
+static void __d_instantiate(struct dentry *dentry, struct inode *inode)
+{
+	if (inode)
+		list_add(&dentry->d_alias, &inode->i_dentry);
+	dentry->d_inode = inode;
+	fsnotify_d_instantiate(dentry, inode);
+}
+
 /**
  * d_instantiate - fill in inode information for a dentry
  * @entry: dentry to complete
@@ -970,10 +1010,7 @@ void d_instantiate(struct dentry *entry, struct inode * inode)
 {
 	BUG_ON(!list_empty(&entry->d_alias));
 	spin_lock(&dcache_lock);
-	if (inode)
-		list_add(&entry->d_alias, &inode->i_dentry);
-	entry->d_inode = inode;
-	fsnotify_d_instantiate(entry, inode);
+	__d_instantiate(entry, inode);
 	spin_unlock(&dcache_lock);
 	security_d_instantiate(entry, inode);
 }
@@ -1003,7 +1040,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 	unsigned int hash = entry->d_name.hash;
 
 	if (!inode) {
-		entry->d_inode = NULL;
+		__d_instantiate(entry, NULL);
 		return NULL;
 	}
 
@@ -1022,9 +1059,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry,
 		return alias;
 	}
 
-	list_add(&entry->d_alias, &inode->i_dentry);
-	entry->d_inode = inode;
-	fsnotify_d_instantiate(entry, inode);
+	__d_instantiate(entry, inode);
 	return NULL;
 }
 
@@ -1085,69 +1120,71 @@ static inline struct hlist_head *d_hash(struct dentry *parent,
 }
 
 /**
- * d_alloc_anon - allocate an anonymous dentry
+ * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
  *
- * This is similar to d_alloc_root.  It is used by filesystems when
- * creating a dentry for a given inode, often in the process of 
- * mapping a filehandle to a dentry.  The returned dentry may be
- * anonymous, or may have a full name (if the inode was already
- * in the cache).  The file system may need to make further
- * efforts to connect this dentry into the dcache properly.
+ * Obtain a dentry for an inode resulting from NFS filehandle conversion or
+ * similar open by handle operations.  The returned dentry may be anonymous,
+ * or may have a full name (if the inode was already in the cache).
  *
- * When called on a directory inode, we must ensure that
- * the inode only ever has one dentry.  If a dentry is
- * found, that is returned instead of allocating a new one.
+ * When called on a directory inode, we must ensure that the inode only ever
+ * has one dentry.  If a dentry is found, that is returned instead of
+ * allocating a new one.
  *
  * On successful return, the reference to the inode has been transferred
- * to the dentry.  If %NULL is returned (indicating kmalloc failure),
- * the reference on the inode has not been released.
+ * to the dentry.  In case of an error the reference on the inode is released.
+ * To make it easier to use in export operations a %NULL or IS_ERR inode may
+ * be passed in and will be the error will be propagate to the return value,
+ * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
  */
-
-struct dentry * d_alloc_anon(struct inode *inode)
+struct dentry *d_obtain_alias(struct inode *inode)
 {
 	static const struct qstr anonstring = { .name = "" };
 	struct dentry *tmp;
 	struct dentry *res;
 
-	if ((res = d_find_alias(inode))) {
-		iput(inode);
-		return res;
-	}
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
-	tmp = d_alloc(NULL, &anonstring);
-	if (!tmp)
-		return NULL;
+	res = d_find_alias(inode);
+	if (res)
+		goto out_iput;
 
+	tmp = d_alloc(NULL, &anonstring);
+	if (!tmp) {
+		res = ERR_PTR(-ENOMEM);
+		goto out_iput;
+	}
 	tmp->d_parent = tmp; /* make sure dput doesn't croak */
-	
+
 	spin_lock(&dcache_lock);
 	res = __d_find_alias(inode, 0);
-	if (!res) {
-		/* attach a disconnected dentry */
-		res = tmp;
-		tmp = NULL;
-		spin_lock(&res->d_lock);
-		res->d_sb = inode->i_sb;
-		res->d_parent = res;
-		res->d_inode = inode;
-		res->d_flags |= DCACHE_DISCONNECTED;
-		res->d_flags &= ~DCACHE_UNHASHED;
-		list_add(&res->d_alias, &inode->i_dentry);
-		hlist_add_head(&res->d_hash, &inode->i_sb->s_anon);
-		spin_unlock(&res->d_lock);
-
-		inode = NULL; /* don't drop reference */
+	if (res) {
+		spin_unlock(&dcache_lock);
+		dput(tmp);
+		goto out_iput;
 	}
+
+	/* attach a disconnected dentry */
+	spin_lock(&tmp->d_lock);
+	tmp->d_sb = inode->i_sb;
+	tmp->d_inode = inode;
+	tmp->d_flags |= DCACHE_DISCONNECTED;
+	tmp->d_flags &= ~DCACHE_UNHASHED;
+	list_add(&tmp->d_alias, &inode->i_dentry);
+	hlist_add_head(&tmp->d_hash, &inode->i_sb->s_anon);
+	spin_unlock(&tmp->d_lock);
+
 	spin_unlock(&dcache_lock);
+	return tmp;
 
-	if (inode)
-		iput(inode);
-	if (tmp)
-		dput(tmp);
+ out_iput:
+	iput(inode);
 	return res;
 }
-
+EXPORT_SYMBOL_GPL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
@@ -1174,17 +1211,14 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		new = __d_find_alias(inode, 1);
 		if (new) {
 			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
-			fsnotify_d_instantiate(new, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(new, inode);
 			d_rehash(dentry);
 			d_move(new, dentry);
 			iput(inode);
 		} else {
-			/* d_instantiate takes dcache_lock, so we do it by hand */
-			list_add(&dentry->d_alias, &inode->i_dentry);
-			dentry->d_inode = inode;
-			fsnotify_d_instantiate(dentry, inode);
+			/* already taking dcache_lock, so d_add() by hand */
+			__d_instantiate(dentry, inode);
 			spin_unlock(&dcache_lock);
 			security_d_instantiate(dentry, inode);
 			d_rehash(dentry);
@@ -1194,6 +1228,106 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 	return new;
 }
 
+/**
+ * d_add_ci - lookup or allocate new dentry with case-exact name
+ * @inode:  the inode case-insensitive lookup has found
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name:   the case-exact name to be associated with the returned dentry
+ *
+ * This is to avoid filling the dcache with case-insensitive names to the
+ * same inode, only the actual correct case is stored in the dcache for
+ * case-insensitive filesystems.
+ *
+ * For a case-insensitive lookup match and if the the case-exact dentry
+ * already exists in in the dcache, use it and return it.
+ *
+ * If no entry exists with the exact case name, allocate new dentry with
+ * the exact case, and return the spliced entry.
+ */
+struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
+			struct qstr *name)
+{
+	int error;
+	struct dentry *found;
+	struct dentry *new;
+
+	/* Does a dentry matching the name exist already? */
+	found = d_hash_and_lookup(dentry->d_parent, name);
+	/* If not, create it now and return */
+	if (!found) {
+		new = d_alloc(dentry->d_parent, name);
+		if (!new) {
+			error = -ENOMEM;
+			goto err_out;
+		}
+		found = d_splice_alias(inode, new);
+		if (found) {
+			dput(new);
+			return found;
+		}
+		return new;
+	}
+	/* Matching dentry exists, check if it is negative. */
+	if (found->d_inode) {
+		if (unlikely(found->d_inode != inode)) {
+			/* This can't happen because bad inodes are unhashed. */
+			BUG_ON(!is_bad_inode(inode));
+			BUG_ON(!is_bad_inode(found->d_inode));
+		}
+		/*
+		 * Already have the inode and the dentry attached, decrement
+		 * the reference count to balance the iget() done
+		 * earlier on.  We found the dentry using d_lookup() so it
+		 * cannot be disconnected and thus we do not need to worry
+		 * about any NFS/disconnectedness issues here.
+		 */
+		iput(inode);
+		return found;
+	}
+	/*
+	 * Negative dentry: instantiate it unless the inode is a directory and
+	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
+	 * in which case d_move() that in place of the found dentry.
+	 */
+	if (!S_ISDIR(inode->i_mode)) {
+		/* Not a directory; everything is easy. */
+		d_instantiate(found, inode);
+		return found;
+	}
+	spin_lock(&dcache_lock);
+	if (list_empty(&inode->i_dentry)) {
+		/*
+		 * Directory without a 'disconnected' dentry; we need to do
+		 * d_instantiate() by hand because it takes dcache_lock which
+		 * we already hold.
+		 */
+		__d_instantiate(found, inode);
+		spin_unlock(&dcache_lock);
+		security_d_instantiate(found, inode);
+		return found;
+	}
+	/*
+	 * Directory with a 'disconnected' dentry; get a reference to the
+	 * 'disconnected' dentry.
+	 */
+	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+	dget_locked(new);
+	spin_unlock(&dcache_lock);
+	/* Do security vodoo. */
+	security_d_instantiate(found, inode);
+	/* Move new in place of found. */
+	d_move(new, found);
+	/* Balance the iget() we did above. */
+	iput(inode);
+	/* Throw away found. */
+	dput(found);
+	/* Use new as the actual dentry. */
+	return new;
+
+err_out:
+	iput(inode);
+	return ERR_PTR(error);
+}
 
 /**
  * d_lookup - search for a dentry
@@ -1215,7 +1349,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
  * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
  * lookup is going on.
  *
- * dentry_unused list is not updated even if lookup finds the required dentry
+ * The dentry unused LRU is not updated even if lookup finds the required dentry
  * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
  * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
  * acquisition.
@@ -1268,6 +1402,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 		if (dentry->d_parent != parent)
 			goto next;
 
+		/* non-existing due to RCU? */
+		if (d_unhashed(dentry))
+			goto next;
+
 		/*
 		 * It is safe to compare names since d_move() cannot
 		 * change the qstr (protected by d_lock).
@@ -1283,10 +1421,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 				goto next;
 		}
 
-		if (!d_unhashed(dentry)) {
-			atomic_inc(&dentry->d_count);
-			found = dentry;
-		}
+		atomic_inc(&dentry->d_count);
+		found = dentry;
 		spin_unlock(&dentry->d_lock);
 		break;
 next:
@@ -1327,8 +1463,6 @@ out:
  * d_validate - verify dentry provided from insecure source
  * @dentry: The dentry alleged to be valid child of @dparent
  * @dparent: The parent dentry (known to be valid)
- * @hash: Hash of the dentry
- * @len: Length of the name
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
@@ -1585,18 +1719,23 @@ void d_move(struct dentry * dentry, struct dentry * target)
 	spin_unlock(&dcache_lock);
 }
 
-/*
- * Helper that returns 1 if p1 is a parent of p2, else 0
+/**
+ * d_ancestor - search for an ancestor
+ * @p1: ancestor dentry
+ * @p2: child dentry
+ *
+ * Returns the ancestor dentry of p2 which is a child of p1, if p1 is
+ * an ancestor of p2, else NULL.
  */
-static int d_isparent(struct dentry *p1, struct dentry *p2)
+struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 {
 	struct dentry *p;
 
-	for (p = p2; p->d_parent != p; p = p->d_parent) {
+	for (p = p2; !IS_ROOT(p); p = p->d_parent) {
 		if (p->d_parent == p1)
-			return 1;
+			return p;
 	}
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1620,7 +1759,7 @@ static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
 
 	/* Check for loops */
 	ret = ERR_PTR(-ELOOP);
-	if (d_isparent(alias, dentry))
+	if (d_ancestor(alias, dentry))
 		goto out_err;
 
 	/* See lock_rename() */
@@ -1693,7 +1832,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 
 	if (!inode) {
 		actual = dentry;
-		dentry->d_inode = NULL;
+		__d_instantiate(dentry, NULL);
 		goto found_lock;
 	}
 
@@ -2020,32 +2159,27 @@ out:
  * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
  */
   
-int is_subdir(struct dentry * new_dentry, struct dentry * old_dentry)
+int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
 	int result;
-	struct dentry * saved = new_dentry;
 	unsigned long seq;
 
-	/* need rcu_readlock to protect against the d_parent trashing due to
-	 * d_move
+	/* FIXME: This is old behavior, needed? Please check callers. */
+	if (new_dentry == old_dentry)
+		return 1;
+
+	/*
+	 * Need rcu_readlock to protect against the d_parent trashing
+	 * due to d_move
 	 */
 	rcu_read_lock();
-        do {
+	do {
 		/* for restarting inner loop in case of seq retry */
-		new_dentry = saved;
-		result = 0;
 		seq = read_seqbegin(&rename_lock);
-		for (;;) {
-			if (new_dentry != old_dentry) {
-				struct dentry * parent = new_dentry->d_parent;
-				if (parent == new_dentry)
-					break;
-				new_dentry = parent;
-				continue;
-			}
+		if (d_ancestor(old_dentry, new_dentry))
 			result = 1;
-			break;
-		}
+		else
+			result = 0;
 	} while (read_seqretry(&rename_lock, seq));
 	rcu_read_unlock();
 
@@ -2215,7 +2349,6 @@ void __init vfs_caches_init(unsigned long mempages)
 }
 
 EXPORT_SYMBOL(d_alloc);
-EXPORT_SYMBOL(d_alloc_anon);
 EXPORT_SYMBOL(d_alloc_root);
 EXPORT_SYMBOL(d_delete);
 EXPORT_SYMBOL(d_find_alias);
@@ -2228,6 +2361,7 @@ EXPORT_SYMBOL(d_path);
 EXPORT_SYMBOL(d_prune_aliases);
 EXPORT_SYMBOL(d_rehash);
 EXPORT_SYMBOL(d_splice_alias);
+EXPORT_SYMBOL(d_add_ci);
 EXPORT_SYMBOL(d_validate);
 EXPORT_SYMBOL(dget_locked);
 EXPORT_SYMBOL(dput);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e9602d85c11d..3dbe2169cf36 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,8 +26,7 @@
 #include <linux/debugfs.h>
 #include <linux/fsnotify.h>
 #include <linux/string.h>
-
-#define DEBUGFS_MAGIC	0x64626720
+#include <linux/magic.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
@@ -309,6 +308,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
+static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
+{
+	int ret = 0;
+
+	if (debugfs_positive(dentry)) {
+		if (dentry->d_inode) {
+			dget(dentry);
+			switch (dentry->d_inode->i_mode & S_IFMT) {
+			case S_IFDIR:
+				ret = simple_rmdir(parent->d_inode, dentry);
+				break;
+			case S_IFLNK:
+				kfree(dentry->d_inode->i_private);
+				/* fall through */
+			default:
+				simple_unlink(parent->d_inode, dentry);
+				break;
+			}
+			if (!ret)
+				d_delete(dentry);
+			dput(dentry);
+		}
+	}
+}
+
 /**
  * debugfs_remove - removes a file or directory from the debugfs filesystem
  * @dentry: a pointer to a the dentry of the file or directory to be
@@ -325,7 +349,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 void debugfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
-	int ret = 0;
 	
 	if (!dentry)
 		return;
@@ -335,29 +358,83 @@ void debugfs_remove(struct dentry *dentry)
 		return;
 
 	mutex_lock(&parent->d_inode->i_mutex);
-	if (debugfs_positive(dentry)) {
-		if (dentry->d_inode) {
-			dget(dentry);
-			switch (dentry->d_inode->i_mode & S_IFMT) {
-			case S_IFDIR:
-				ret = simple_rmdir(parent->d_inode, dentry);
-				break;
-			case S_IFLNK:
-				kfree(dentry->d_inode->i_private);
-				/* fall through */
-			default:
-				simple_unlink(parent->d_inode, dentry);
+	__debugfs_remove(dentry, parent);
+	mutex_unlock(&parent->d_inode->i_mutex);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+}
+EXPORT_SYMBOL_GPL(debugfs_remove);
+
+/**
+ * debugfs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in debugfs that
+ * was previously created with a call to another debugfs function
+ * (like debugfs_create_file() or variants thereof.)
+ *
+ * This function is required to be called in order for the file to be
+ * removed, no automatic cleanup of files will happen when a module is
+ * removed, you are responsible here.
+ */
+void debugfs_remove_recursive(struct dentry *dentry)
+{
+	struct dentry *child;
+	struct dentry *parent;
+
+	if (!dentry)
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	parent = dentry;
+	mutex_lock(&parent->d_inode->i_mutex);
+
+	while (1) {
+		/*
+		 * When all dentries under "parent" has been removed,
+		 * walk up the tree until we reach our starting point.
+		 */
+		if (list_empty(&parent->d_subdirs)) {
+			mutex_unlock(&parent->d_inode->i_mutex);
+			if (parent == dentry)
 				break;
-			}
-			if (!ret)
-				d_delete(dentry);
-			dput(dentry);
+			parent = parent->d_parent;
+			mutex_lock(&parent->d_inode->i_mutex);
+		}
+		child = list_entry(parent->d_subdirs.next, struct dentry,
+				d_u.d_child);
+
+		/*
+		 * If "child" isn't empty, walk down the tree and
+		 * remove all its descendants first.
+		 */
+		if (!list_empty(&child->d_subdirs)) {
+			mutex_unlock(&parent->d_inode->i_mutex);
+			parent = child;
+			mutex_lock(&parent->d_inode->i_mutex);
+			continue;
 		}
+		__debugfs_remove(child, parent);
+		if (parent->d_subdirs.next == &child->d_u.d_child) {
+			/*
+			 * Avoid infinite loop if we fail to remove
+			 * one dentry.
+			 */
+			mutex_unlock(&parent->d_inode->i_mutex);
+			break;
+		}
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	}
+
+	parent = dentry->d_parent;
+	mutex_lock(&parent->d_inode->i_mutex);
+	__debugfs_remove(dentry, parent);
 	mutex_unlock(&parent->d_inode->i_mutex);
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
-EXPORT_SYMBOL_GPL(debugfs_remove);
+EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
 
 /**
  * debugfs_rename - rename a file/directory in the debugfs filesystem
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 285b64a8b06e..4a714f6c1bed 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,9 +27,10 @@
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 
 #define DEVPTS_DEFAULT_MODE 0600
+#define PTMX_MINOR	2
 
 extern int pty_limit;			/* Config limit on Unix98 ptys */
-static DEFINE_IDR(allocated_ptys);
+static DEFINE_IDA(allocated_ptys);
 static DEFINE_MUTEX(allocated_ptys_lock);
 
 static struct vfsmount *devpts_mnt;
@@ -48,7 +49,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%o"},
@@ -169,35 +170,27 @@ static struct file_system_type devpts_fs_type = {
  * to the System V naming convention
  */
 
-static struct dentry *get_node(int num)
-{
-	char s[12];
-	struct dentry *root = devpts_root;
-	mutex_lock(&root->d_inode->i_mutex);
-	return lookup_one_len(s, root, sprintf(s, "%d", num));
-}
-
-int devpts_new_index(void)
+int devpts_new_index(struct inode *ptmx_inode)
 {
 	int index;
-	int idr_ret;
+	int ida_ret;
 
 retry:
-	if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) {
+	if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) {
 		return -ENOMEM;
 	}
 
 	mutex_lock(&allocated_ptys_lock);
-	idr_ret = idr_get_new(&allocated_ptys, NULL, &index);
-	if (idr_ret < 0) {
+	ida_ret = ida_get_new(&allocated_ptys, &index);
+	if (ida_ret < 0) {
 		mutex_unlock(&allocated_ptys_lock);
-		if (idr_ret == -EAGAIN)
+		if (ida_ret == -EAGAIN)
 			goto retry;
 		return -EIO;
 	}
 
 	if (index >= pty_limit) {
-		idr_remove(&allocated_ptys, index);
+		ida_remove(&allocated_ptys, index);
 		mutex_unlock(&allocated_ptys_lock);
 		return -EIO;
 	}
@@ -205,20 +198,21 @@ retry:
 	return index;
 }
 
-void devpts_kill_index(int idx)
+void devpts_kill_index(struct inode *ptmx_inode, int idx)
 {
 	mutex_lock(&allocated_ptys_lock);
-	idr_remove(&allocated_ptys, idx);
+	ida_remove(&allocated_ptys, idx);
 	mutex_unlock(&allocated_ptys_lock);
 }
 
-int devpts_pty_new(struct tty_struct *tty)
+int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 {
 	int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
 	struct tty_driver *driver = tty->driver;
 	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
 	struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
 	BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
@@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty)
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
 	inode->i_private = tty;
+	tty->driver_data = inode;
 
-	dentry = get_node(number);
-	if (!IS_ERR(dentry) && !dentry->d_inode) {
-		d_instantiate(dentry, inode);
+	sprintf(s, "%d", number);
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_alloc_name(devpts_root, s);
+	if (!IS_ERR(dentry)) {
+		d_add(dentry, inode);
 		fsnotify_create(devpts_root->d_inode, dentry);
 	}
 
@@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty)
 	return 0;
 }
 
-struct tty_struct *devpts_get_tty(int number)
+struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 {
-	struct dentry *dentry = get_node(number);
-	struct tty_struct *tty;
-
-	tty = NULL;
-	if (!IS_ERR(dentry)) {
-		if (dentry->d_inode)
-			tty = dentry->d_inode->i_private;
-		dput(dentry);
-	}
+	BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-	mutex_unlock(&devpts_root->d_inode->i_mutex);
-
-	return tty;
+	if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return (struct tty_struct *)pts_inode->i_private;
+	return NULL;
 }
 
-void devpts_pty_kill(int number)
+void devpts_pty_kill(struct tty_struct *tty)
 {
-	struct dentry *dentry = get_node(number);
+	struct inode *inode = tty->driver_data;
+	struct dentry *dentry;
 
-	if (!IS_ERR(dentry)) {
-		struct inode *inode = dentry->d_inode;
-		if (inode) {
-			inode->i_nlink--;
-			d_delete(dentry);
-			dput(dentry);
-		}
+	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+	mutex_lock(&devpts_root->d_inode->i_mutex);
+
+	dentry = d_find_alias(inode);
+	if (dentry && !IS_ERR(dentry)) {
+		inode->i_nlink--;
+		d_delete(dentry);
 		dput(dentry);
 	}
+
 	mutex_unlock(&devpts_root->d_inode->i_mutex);
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9e81addbd6ea..af0558dbe8b7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -5,11 +5,11 @@
  *
  * O_DIRECT
  *
- * 04Jul2002	akpm@zip.com.au
+ * 04Jul2002	Andrew Morton
  *		Initial version
  * 11Sep2002	janetinc@us.ibm.com
  * 		added readv/writev support.
- * 29Oct2002	akpm@zip.com.au
+ * 29Oct2002	Andrew Morton
  *		rewrote bio_add_page() support.
  * 30Oct2002	pbadari@us.ibm.com
  *		added support for non-aligned IO.
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *dio)
 	int nr_pages;
 
 	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(
-		current,			/* Task for fault acounting */
-		current->mm,			/* whose pages? */
+	ret = get_user_pages_fast(
 		dio->curr_user_address,		/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
-		0,				/* force (?) */
-		&dio->pages[0],
-		NULL);				/* vmas */
-	up_read(&current->mm->mmap_sem);
+		&dio->pages[0]);		/* Put results here */
 
 	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 492d8caaaf25..fd9859f92fad 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,9 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
 #include <net/sock.h>
 
 #include "config.h"
@@ -30,31 +33,27 @@
 
 static struct config_group *space_list;
 static struct config_group *comm_list;
-static struct comm *local_comm;
-
-struct clusters;
-struct cluster;
-struct spaces;
-struct space;
-struct comms;
-struct comm;
-struct nodes;
-struct node;
-
-static int make_cluster(struct config_group *, const char *,
-			struct config_group **);
+static struct dlm_comm *local_comm;
+
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
 static void drop_cluster(struct config_group *, struct config_item *);
 static void release_cluster(struct config_item *);
-static int make_space(struct config_group *, const char *,
-		      struct config_group **);
+static struct config_group *make_space(struct config_group *, const char *);
 static void drop_space(struct config_group *, struct config_item *);
 static void release_space(struct config_item *);
-static int make_comm(struct config_group *, const char *,
-		     struct config_item **);
+static struct config_item *make_comm(struct config_group *, const char *);
 static void drop_comm(struct config_group *, struct config_item *);
 static void release_comm(struct config_item *);
-static int make_node(struct config_group *, const char *,
-		     struct config_item **);
+static struct config_item *make_node(struct config_group *, const char *);
 static void drop_node(struct config_group *, struct config_item *);
 static void release_node(struct config_item *);
 
@@ -72,17 +71,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
 			  const char *buf, size_t len);
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_local_read(struct comm *cm, char *buf);
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t node_nodeid_read(struct node *nd, char *buf);
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
-static ssize_t node_weight_read(struct node *nd, char *buf);
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
-
-struct cluster {
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
+				size_t len);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+				size_t len);
+
+struct dlm_cluster {
 	struct config_group group;
 	unsigned int cl_tcp_port;
 	unsigned int cl_buffer_size;
@@ -113,11 +117,11 @@ enum {
 
 struct cluster_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct cluster *, char *);
-	ssize_t (*store)(struct cluster *, const char *, size_t);
+	ssize_t (*show)(struct dlm_cluster *, char *);
+	ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
 };
 
-static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
 			   int *info_field, int check_zero,
 			   const char *buf, size_t len)
 {
@@ -138,12 +142,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
 }
 
 #define CLUSTER_ATTR(name, check_zero)                                        \
-static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
 {                                                                             \
 	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
 			   check_zero, buf, len);                             \
 }                                                                             \
-static ssize_t name##_read(struct cluster *cl, char *buf)                     \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf)                 \
 {                                                                             \
 	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
 }                                                                             \
@@ -185,8 +189,8 @@ enum {
 
 struct comm_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct comm *, char *);
-	ssize_t (*store)(struct comm *, const char *, size_t);
+	ssize_t (*show)(struct dlm_comm *, char *);
+	ssize_t (*store)(struct dlm_comm *, const char *, size_t);
 };
 
 static struct comm_attribute comm_attr_nodeid = {
@@ -226,8 +230,8 @@ enum {
 
 struct node_attribute {
 	struct configfs_attribute attr;
-	ssize_t (*show)(struct node *, char *);
-	ssize_t (*store)(struct node *, const char *, size_t);
+	ssize_t (*show)(struct dlm_node *, char *);
+	ssize_t (*store)(struct dlm_node *, const char *, size_t);
 };
 
 static struct node_attribute node_attr_nodeid = {
@@ -252,26 +256,26 @@ static struct configfs_attribute *node_attrs[] = {
 	NULL,
 };
 
-struct clusters {
+struct dlm_clusters {
 	struct configfs_subsystem subsys;
 };
 
-struct spaces {
+struct dlm_spaces {
 	struct config_group ss_group;
 };
 
-struct space {
+struct dlm_space {
 	struct config_group group;
 	struct list_head members;
 	struct mutex members_lock;
 	int members_count;
 };
 
-struct comms {
+struct dlm_comms {
 	struct config_group cs_group;
 };
 
-struct comm {
+struct dlm_comm {
 	struct config_item item;
 	int nodeid;
 	int local;
@@ -279,11 +283,11 @@ struct comm {
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
-struct nodes {
+struct dlm_nodes {
 	struct config_group ns_group;
 };
 
-struct node {
+struct dlm_node {
 	struct config_item item;
 	struct list_head list; /* space->members */
 	int nodeid;
@@ -376,38 +380,40 @@ static struct config_item_type node_type = {
 	.ct_owner = THIS_MODULE,
 };
 
-static struct cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+	return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+		   NULL;
 }
 
-static struct space *to_space(struct config_item *i)
+static struct dlm_space *config_item_to_space(struct config_item *i)
 {
-	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+	return i ? container_of(to_config_group(i), struct dlm_space, group) :
+		   NULL;
 }
 
-static struct comm *to_comm(struct config_item *i)
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
 {
-	return i ? container_of(i, struct comm, item) : NULL;
+	return i ? container_of(i, struct dlm_comm, item) : NULL;
 }
 
-static struct node *to_node(struct config_item *i)
+static struct dlm_node *config_item_to_node(struct config_item *i)
 {
-	return i ? container_of(i, struct node, item) : NULL;
+	return i ? container_of(i, struct dlm_node, item) : NULL;
 }
 
-static int make_cluster(struct config_group *g, const char *name,
-			struct config_group **new_g)
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
 {
-	struct cluster *cl = NULL;
-	struct spaces *sps = NULL;
-	struct comms *cms = NULL;
+	struct dlm_cluster *cl = NULL;
+	struct dlm_spaces *sps = NULL;
+	struct dlm_comms *cms = NULL;
 	void *gps = NULL;
 
-	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
 	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
-	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
 
 	if (!cl || !gps || !sps || !cms)
 		goto fail;
@@ -435,20 +441,19 @@ static int make_cluster(struct config_group *g, const char *name,
 
 	space_list = &sps->ss_group;
 	comm_list = &cms->cs_group;
-	*new_g = &cl->group;
-	return 0;
+	return &cl->group;
 
  fail:
 	kfree(cl);
 	kfree(gps);
 	kfree(sps);
 	kfree(cms);
-	return -ENOMEM;
+	return ERR_PTR(-ENOMEM);
 }
 
 static void drop_cluster(struct config_group *g, struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct config_item *tmp;
 	int j;
 
@@ -466,21 +471,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
 
 static void release_cluster(struct config_item *i)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	kfree(cl->group.default_groups);
 	kfree(cl);
 }
 
-static int make_space(struct config_group *g, const char *name,
-		      struct config_group **new_g)
+static struct config_group *make_space(struct config_group *g, const char *name)
 {
-	struct space *sp = NULL;
-	struct nodes *nds = NULL;
+	struct dlm_space *sp = NULL;
+	struct dlm_nodes *nds = NULL;
 	void *gps = NULL;
 
-	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
 	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
-	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
 
 	if (!sp || !gps || !nds)
 		goto fail;
@@ -495,19 +499,18 @@ static int make_space(struct config_group *g, const char *name,
 	INIT_LIST_HEAD(&sp->members);
 	mutex_init(&sp->members_lock);
 	sp->members_count = 0;
-	*new_g = &sp->group;
-	return 0;
+	return &sp->group;
 
  fail:
 	kfree(sp);
 	kfree(gps);
 	kfree(nds);
-	return -ENOMEM;
+	return ERR_PTR(-ENOMEM);
 }
 
 static void drop_space(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	struct config_item *tmp;
 	int j;
 
@@ -524,31 +527,29 @@ static void drop_space(struct config_group *g, struct config_item *i)
 
 static void release_space(struct config_item *i)
 {
-	struct space *sp = to_space(i);
+	struct dlm_space *sp = config_item_to_space(i);
 	kfree(sp->group.default_groups);
 	kfree(sp);
 }
 
-static int make_comm(struct config_group *g, const char *name,
-		     struct config_item **new_i)
+static struct config_item *make_comm(struct config_group *g, const char *name)
 {
-	struct comm *cm;
+	struct dlm_comm *cm;
 
-	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
 	if (!cm)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	config_item_init_type_name(&cm->item, name, &comm_type);
 	cm->nodeid = -1;
 	cm->local = 0;
 	cm->addr_count = 0;
-	*new_i = &cm->item;
-	return 0;
+	return &cm->item;
 }
 
 static void drop_comm(struct config_group *g, struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	if (local_comm == cm)
 		local_comm = NULL;
 	dlm_lowcomms_close(cm->nodeid);
@@ -559,19 +560,18 @@ static void drop_comm(struct config_group *g, struct config_item *i)
 
 static void release_comm(struct config_item *i)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	kfree(cm);
 }
 
-static int make_node(struct config_group *g, const char *name,
-		     struct config_item **new_i)
+static struct config_item *make_node(struct config_group *g, const char *name)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd;
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd;
 
-	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
 	if (!nd)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	config_item_init_type_name(&nd->item, name, &node_type);
 	nd->nodeid = -1;
@@ -583,14 +583,13 @@ static int make_node(struct config_group *g, const char *name,
 	sp->members_count++;
 	mutex_unlock(&sp->members_lock);
 
-	*new_i = &nd->item;
-	return 0;
+	return &nd->item;
 }
 
 static void drop_node(struct config_group *g, struct config_item *i)
 {
-	struct space *sp = to_space(g->cg_item.ci_parent);
-	struct node *nd = to_node(i);
+	struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+	struct dlm_node *nd = config_item_to_node(i);
 
 	mutex_lock(&sp->members_lock);
 	list_del(&nd->list);
@@ -602,11 +601,11 @@ static void drop_node(struct config_group *g, struct config_item *i)
 
 static void release_node(struct config_item *i)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	kfree(nd);
 }
 
-static struct clusters clusters_root = {
+static struct dlm_clusters clusters_root = {
 	.subsys = {
 		.su_group = {
 			.cg_item = {
@@ -636,7 +635,7 @@ void dlm_config_exit(void)
 static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
 			    char *buf)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct cluster_attribute *cla =
 			container_of(a, struct cluster_attribute, attr);
 	return cla->show ? cla->show(cl, buf) : 0;
@@ -646,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
 			     struct configfs_attribute *a,
 			     const char *buf, size_t len)
 {
-	struct cluster *cl = to_cluster(i);
+	struct dlm_cluster *cl = config_item_to_cluster(i);
 	struct cluster_attribute *cla =
 		container_of(a, struct cluster_attribute, attr);
 	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -655,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 			 char *buf)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	struct comm_attribute *cma =
 			container_of(a, struct comm_attribute, attr);
 	return cma->show ? cma->show(cm, buf) : 0;
@@ -664,29 +663,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
 			  const char *buf, size_t len)
 {
-	struct comm *cm = to_comm(i);
+	struct dlm_comm *cm = config_item_to_comm(i);
 	struct comm_attribute *cma =
 		container_of(a, struct comm_attribute, attr);
 	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
 }
 
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
 {
 	return sprintf(buf, "%d\n", cm->nodeid);
 }
 
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+				 size_t len)
 {
 	cm->nodeid = simple_strtol(buf, NULL, 0);
 	return len;
 }
 
-static ssize_t comm_local_read(struct comm *cm, char *buf)
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
 {
 	return sprintf(buf, "%d\n", cm->local);
 }
 
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+				size_t len)
 {
 	cm->local= simple_strtol(buf, NULL, 0);
 	if (cm->local && !local_comm)
@@ -694,7 +695,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
 	return len;
 }
 
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
 {
 	struct sockaddr_storage *addr;
 
@@ -716,7 +717,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
 static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 			 char *buf)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	struct node_attribute *nda =
 			container_of(a, struct node_attribute, attr);
 	return nda->show ? nda->show(nd, buf) : 0;
@@ -725,29 +726,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
 static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
 			  const char *buf, size_t len)
 {
-	struct node *nd = to_node(i);
+	struct dlm_node *nd = config_item_to_node(i);
 	struct node_attribute *nda =
 		container_of(a, struct node_attribute, attr);
 	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
 }
 
-static ssize_t node_nodeid_read(struct node *nd, char *buf)
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
 {
 	return sprintf(buf, "%d\n", nd->nodeid);
 }
 
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
 {
 	nd->nodeid = simple_strtol(buf, NULL, 0);
 	return len;
 }
 
-static ssize_t node_weight_read(struct node *nd, char *buf)
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
 {
 	return sprintf(buf, "%d\n", nd->weight);
 }
 
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+				 size_t len)
 {
 	nd->weight = simple_strtol(buf, NULL, 0);
 	return len;
@@ -757,7 +760,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
  * Functions for the dlm to get the info that's been configured
  */
 
-static struct space *get_space(char *name)
+static struct dlm_space *get_space(char *name)
 {
 	struct config_item *i;
 
@@ -768,18 +771,45 @@ static struct space *get_space(char *name)
 	i = config_group_find_item(space_list, name);
 	mutex_unlock(&space_list->cg_subsys->su_mutex);
 
-	return to_space(i);
+	return config_item_to_space(i);
 }
 
-static void put_space(struct space *sp)
+static void put_space(struct dlm_space *sp)
 {
 	config_item_put(&sp->group.cg_item);
 }
 
-static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+	switch (x->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+		struct sockaddr_in *siny = (struct sockaddr_in *)y;
+		if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+			return 0;
+		if (sinx->sin_port != siny->sin_port)
+			return 0;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+		struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+		if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+			return 0;
+		if (sinx->sin6_port != siny->sin6_port)
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+	return 1;
+}
+
+static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 {
 	struct config_item *i;
-	struct comm *cm = NULL;
+	struct dlm_comm *cm = NULL;
 	int found = 0;
 
 	if (!comm_list)
@@ -788,7 +818,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 	mutex_lock(&clusters_root.subsys.su_mutex);
 
 	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
-		cm = to_comm(i);
+		cm = config_item_to_comm(i);
 
 		if (nodeid) {
 			if (cm->nodeid != nodeid)
@@ -797,8 +827,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 			config_item_get(i);
 			break;
 		} else {
-			if (!cm->addr_count ||
-			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+			if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
 				continue;
 			found = 1;
 			config_item_get(i);
@@ -812,7 +841,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
 	return cm;
 }
 
-static void put_comm(struct comm *cm)
+static void put_comm(struct dlm_comm *cm)
 {
 	config_item_put(&cm->item);
 }
@@ -821,8 +850,8 @@ static void put_comm(struct comm *cm)
 int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 		    int **new_out, int *new_count_out)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
 	int i = 0, rv = 0, ids_count = 0, new_count = 0;
 	int *ids, *new;
 
@@ -885,8 +914,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
 
 int dlm_node_weight(char *lsname, int nodeid)
 {
-	struct space *sp;
-	struct node *nd;
+	struct dlm_space *sp;
+	struct dlm_node *nd;
 	int w = -EEXIST;
 
 	sp = get_space(lsname);
@@ -908,7 +937,7 @@ int dlm_node_weight(char *lsname, int nodeid)
 
 int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 {
-	struct comm *cm = get_comm(nodeid, NULL);
+	struct dlm_comm *cm = get_comm(nodeid, NULL);
 	if (!cm)
 		return -EEXIST;
 	if (!cm->addr_count)
@@ -920,7 +949,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
 
 int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
 {
-	struct comm *cm = get_comm(0, addr);
+	struct dlm_comm *cm = get_comm(0, addr);
 	if (!cm)
 		return -EEXIST;
 	*nodeid = cm->nodeid;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629c..868e4c9ef127 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
 	uint32_t		ls_global_id;	/* global unique lockspace ID */
 	uint32_t		ls_exflags;
 	int			ls_lvblen;
-	int			ls_count;	/* reference count */
+	int			ls_count;	/* refcount of processes in
+						   the dlm using this ls */
+	int			ls_create_count; /* create/release refcount */
 	unsigned long		ls_flags;	/* LSFL_ */
+	unsigned long		ls_scan_time;
 	struct kobject		ls_kobj;
 
 	struct dlm_rsbtable	*ls_rsbtbl;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2d3d1027ce2b..724ddac91538 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -363,6 +363,7 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
 		if (len == r->res_length && !memcmp(name, r->res_name, len))
 			goto found;
 	}
+	*r_ret = NULL;
 	return -EBADR;
 
  found:
@@ -1782,7 +1783,8 @@ static void grant_pending_locks(struct dlm_rsb *r)
 
 	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
 		if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
-			if (cw && high == DLM_LOCK_PR)
+			if (cw && high == DLM_LOCK_PR &&
+			    lkb->lkb_grmode == DLM_LOCK_PR)
 				queue_bast(r, lkb, DLM_LOCK_CW);
 			else
 				queue_bast(r, lkb, high);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e96..d910501de6d2 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
 #include "lock.h"
 #include "recover.h"
 #include "requestqueue.h"
+#include "user.h"
 
 static int			ls_count;
 static struct mutex		ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
 	kset_unregister(dlm_kset);
 }
 
+static struct dlm_ls *find_ls_to_scan(void)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (time_after_eq(jiffies, ls->ls_scan_time +
+					    dlm_config.ci_scan_secs * HZ)) {
+			spin_unlock(&lslist_lock);
+			return ls;
+		}
+	}
+	spin_unlock(&lslist_lock);
+	return NULL;
+}
+
 static int dlm_scand(void *data)
 {
 	struct dlm_ls *ls;
+	int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
 
 	while (!kthread_should_stop()) {
-		list_for_each_entry(ls, &lslist, ls_list) {
+		ls = find_ls_to_scan();
+		if (ls) {
 			if (dlm_lock_recovery_try(ls)) {
+				ls->ls_scan_time = jiffies;
 				dlm_scan_rsbs(ls);
 				dlm_scan_timeout(ls);
 				dlm_unlock_recovery(ls);
+			} else {
+				ls->ls_scan_time += HZ;
 			}
+		} else {
+			schedule_timeout_interruptible(timeout_jiffies);
 		}
-		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
 	}
 	return 0;
 }
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
 	kthread_stop(scand_task);
 }
 
-static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
-{
-	struct dlm_ls *ls;
-
-	spin_lock(&lslist_lock);
-
-	list_for_each_entry(ls, &lslist, ls_list) {
-		if (ls->ls_namelen == namelen &&
-		    memcmp(ls->ls_name, name, namelen) == 0)
-			goto out;
-	}
-	ls = NULL;
- out:
-	spin_unlock(&lslist_lock);
-	return ls;
-}
-
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
 {
 	struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
 	for (;;) {
 		spin_lock(&lslist_lock);
 		if (ls->ls_count == 0) {
+			WARN_ON(ls->ls_create_count != 0);
 			list_del(&ls->ls_list);
 			spin_unlock(&lslist_lock);
 			return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 			 uint32_t flags, int lvblen)
 {
 	struct dlm_ls *ls;
-	int i, size, error = -ENOMEM;
+	int i, size, error;
 	int do_unreg = 0;
 
 	if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	if (!try_module_get(THIS_MODULE))
 		return -EINVAL;
 
-	ls = dlm_find_lockspace_name(name, namelen);
-	if (ls) {
-		*lockspace = ls;
+	if (!dlm_user_daemon_available()) {
+		module_put(THIS_MODULE);
+		return -EUNATCH;
+	}
+
+	error = 0;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		WARN_ON(ls->ls_create_count <= 0);
+		if (ls->ls_namelen != namelen)
+			continue;
+		if (memcmp(ls->ls_name, name, namelen))
+			continue;
+		if (flags & DLM_LSFL_NEWEXCL) {
+			error = -EEXIST;
+			break;
+		}
+		ls->ls_create_count++;
 		module_put(THIS_MODULE);
-		return -EEXIST;
+		error = 1; /* not an error, return 0 */
+		break;
 	}
+	spin_unlock(&lslist_lock);
+
+	if (error < 0)
+		goto out;
+	if (error)
+		goto ret_zero;
+
+	error = -ENOMEM;
 
 	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
 	if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	ls->ls_lvblen = lvblen;
 	ls->ls_count = 0;
 	ls->ls_flags = 0;
+	ls->ls_scan_time = jiffies;
 
 	if (flags & DLM_LSFL_TIMEWARN)
 		set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 		ls->ls_allocation = GFP_KERNEL;
 
 	/* ls_exflags are forced to match among nodes, and we don't
-	   need to require all nodes to have TIMEWARN or FS set */
-	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
+	   need to require all nodes to have some flags set */
+	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+				    DLM_LSFL_NEWEXCL));
 
 	size = dlm_config.ci_rsbtbl_size;
 	ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	down_write(&ls->ls_in_recovery);
 
 	spin_lock(&lslist_lock);
+	ls->ls_create_count = 1;
 	list_add(&ls->ls_list, &lslist);
 	spin_unlock(&lslist_lock);
 
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	dlm_create_debug_file(ls);
 
 	log_debug(ls, "join complete");
-
+ ret_zero:
 	*lockspace = ls;
 	return 0;
 
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	struct dlm_lkb *lkb;
 	struct dlm_rsb *rsb;
 	struct list_head *head;
-	int i;
-	int busy = lockspace_busy(ls);
+	int i, busy, rv;
+
+	busy = lockspace_busy(ls);
+
+	spin_lock(&lslist_lock);
+	if (ls->ls_create_count == 1) {
+		if (busy > force)
+			rv = -EBUSY;
+		else {
+			/* remove_lockspace takes ls off lslist */
+			ls->ls_create_count = 0;
+			rv = 0;
+		}
+	} else if (ls->ls_create_count > 1) {
+		rv = --ls->ls_create_count;
+	} else {
+		rv = -EINVAL;
+	}
+	spin_unlock(&lslist_lock);
 
-	if (busy > force)
-		return -EBUSY;
+	if (rv) {
+		log_debug(ls, "release_lockspace no remove %d", rv);
+		return rv;
+	}
+
+	dlm_device_deregister(ls);
 
-	if (force < 3)
+	if (force < 3 && dlm_user_daemon_available())
 		do_uevent(ls, 0);
 
 	dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 	dlm_clear_members(ls);
 	dlm_clear_members_gone(ls);
 	kfree(ls->ls_node_array);
+	log_debug(ls, "release_lockspace final free");
 	kobject_put(&ls->ls_kobj);
 	/* The ls structure will be freed when the kobject is done with */
 
-	mutex_lock(&ls_lock);
-	ls_count--;
-	if (!ls_count)
-		threads_stop();
-	mutex_unlock(&ls_lock);
-
 	module_put(THIS_MODULE);
 	return 0;
 }
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 int dlm_release_lockspace(void *lockspace, int force)
 {
 	struct dlm_ls *ls;
+	int error;
 
 	ls = dlm_find_lockspace_local(lockspace);
 	if (!ls)
 		return -EINVAL;
 	dlm_put_lockspace(ls);
-	return release_lockspace(ls, force);
+
+	mutex_lock(&ls_lock);
+	error = release_lockspace(ls, force);
+	if (!error)
+		ls_count--;
+	else if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	return error;
+}
+
+void dlm_stop_lockspaces(void)
+{
+	struct dlm_ls *ls;
+
+ restart:
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+			continue;
+		spin_unlock(&lslist_lock);
+		log_error(ls, "no userland control daemon, stopping lockspace");
+		dlm_ls_stop(ls);
+		goto restart;
+	}
+	spin_unlock(&lslist_lock);
 }
 
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd021..f879f87901f8 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
 struct dlm_ls *dlm_find_lockspace_local(void *id);
 struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
 
 #endif				/* __LOCKSPACE_DOT_H__ */
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 637018c891ef..3962262f991a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -891,8 +891,10 @@ static void tcp_connect_to_sock(struct connection *con)
 		goto out_err;
 
 	memset(&saddr, 0, sizeof(saddr));
-	if (dlm_nodeid_to_addr(con->nodeid, &saddr))
+	if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
+		sock_release(sock);
 		goto out_err;
+	}
 
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 78878c5781ca..eba87ff3177b 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (xop->callback == NULL)
 		wait_event(recv_wq, (op->done != 0));
 	else {
-		rv = -EINPROGRESS;
+		rv = FILE_LOCK_DEFERRED;
 		goto out;
 	}
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index f976f303c196..b3832c67194a 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2007 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -15,7 +15,6 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
-#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
 
@@ -27,6 +26,8 @@
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
 
 #ifdef CONFIG_COMPAT
 
@@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
 	return error;
 }
 
-static int create_misc_device(struct dlm_ls *ls, char *name)
+static int dlm_device_register(struct dlm_ls *ls, char *name)
 {
 	int error, len;
 
+	/* The device is already registered.  This happens when the
+	   lockspace is created multiple times from userspace. */
+	if (ls->ls_device.name)
+		return 0;
+
 	error = -ENOMEM;
 	len = strlen(name) + strlen(name_prefix) + 2;
 	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -363,6 +369,22 @@ fail:
 	return error;
 }
 
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+	int error;
+
+	/* The device is not registered.  This happens when the lockspace
+	   was never used from userspace, or when device_create_lockspace()
+	   calls dlm_release_lockspace() after the register fails. */
+	if (!ls->ls_device.name)
+		return 0;
+
+	error = misc_deregister(&ls->ls_device);
+	if (!error)
+		kfree(ls->ls_device.name);
+	return error;
+}
+
 static int device_user_purge(struct dlm_user_proc *proc,
 			     struct dlm_purge_params *params)
 {
@@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
-	error = create_misc_device(ls, params->name);
+	error = dlm_device_register(ls, params->name);
 	dlm_put_lockspace(ls);
 
 	if (error)
@@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
 	if (!ls)
 		return -ENOENT;
 
-	/* Deregister the misc device first, so we don't have
-	 * a device that's not attached to a lockspace. If
-	 * dlm_release_lockspace fails then we can recreate it
-	 */
-	error = misc_deregister(&ls->ls_device);
-	if (error) {
-		dlm_put_lockspace(ls);
-		goto out;
-	}
-	kfree(ls->ls_device.name);
-
 	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
 		force = 2;
 
 	lockspace = ls->ls_local_handle;
+	dlm_put_lockspace(ls);
 
-	/* dlm_release_lockspace waits for references to go to zero,
-	   so all processes will need to close their device for the ls
-	   before the release will procede */
+	/* The final dlm_release_lockspace waits for references to go to
+	   zero, so all processes will need to close their device for the
+	   ls before the release will proceed.  release also calls the
+	   device_deregister above.  Converting a positive return value
+	   from release to zero means that userspace won't know when its
+	   release was the final one, but it shouldn't need to know. */
 
-	dlm_put_lockspace(ls);
 	error = dlm_release_lockspace(lockspace, force);
-	if (error)
-		create_misc_device(ls, ls->ls_name);
- out:
+	if (error > 0)
+		error = 0;
 	return error;
 }
 
@@ -527,8 +540,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 		k32buf = (struct dlm_write_request32 *)kbuf;
 		kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
 			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
-		if (!kbuf)
+		if (!kbuf) {
+			kfree(k32buf);
 			return -ENOMEM;
+		}
 
 		if (proc)
 			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
@@ -539,8 +554,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 	/* do we really need this? can a write happen after a close? */
 	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
-	    test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
-		return -EINVAL;
+	    (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+		error = -EINVAL;
+		goto out_free;
+	}
 
 	sigfillset(&allsigs);
 	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
@@ -619,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file)
 	struct dlm_user_proc *proc;
 	struct dlm_ls *ls;
 
-	lock_kernel();
 	ls = dlm_find_lockspace_device(iminor(inode));
-	if (!ls) {
-		unlock_kernel();
+	if (!ls)
 		return -ENOENT;
-	}
 
 	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
 	if (!proc) {
 		dlm_put_lockspace(ls);
-		unlock_kernel();
 		return -ENOMEM;
 	}
 
@@ -641,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file)
 	spin_lock_init(&proc->locks_spin);
 	init_waitqueue_head(&proc->wait);
 	file->private_data = proc;
-	unlock_kernel();
 
 	return 0;
 }
@@ -874,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
 	return 0;
 }
 
+int dlm_user_daemon_available(void)
+{
+	/* dlm_controld hasn't started (or, has started, but not
+	   properly populated configfs) */
+
+	if (!dlm_our_nodeid())
+		return 0;
+
+	/* This is to deal with versions of dlm_controld that don't
+	   know about the monitor device.  We assume that if the
+	   dlm_controld was started (above), but the monitor device
+	   was never opened, that it's an old version.  dlm_controld
+	   should open the monitor device before populating configfs. */
+
+	if (dlm_monitor_unused)
+		return 1;
+
+	return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
-	cycle_kernel_lock();
 	file->private_data = NULL;
 	return 0;
 }
@@ -886,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+	atomic_inc(&dlm_monitor_opened);
+	dlm_monitor_unused = 0;
+	return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+	if (atomic_dec_and_test(&dlm_monitor_opened))
+		dlm_stop_lockspaces();
+	return 0;
+}
+
 static const struct file_operations device_fops = {
 	.open    = device_open,
 	.release = device_close,
@@ -909,19 +954,42 @@ static struct miscdevice ctl_device = {
 	.minor = MISC_DYNAMIC_MINOR,
 };
 
+static const struct file_operations monitor_device_fops = {
+	.open    = monitor_device_open,
+	.release = monitor_device_close,
+	.owner   = THIS_MODULE,
+};
+
+static struct miscdevice monitor_device = {
+	.name  = "dlm-monitor",
+	.fops  = &monitor_device_fops,
+	.minor = MISC_DYNAMIC_MINOR,
+};
+
 int __init dlm_user_init(void)
 {
 	int error;
 
+	atomic_set(&dlm_monitor_opened, 0);
+
 	error = misc_register(&ctl_device);
-	if (error)
+	if (error) {
 		log_print("misc_register failed for control device");
+		goto out;
+	}
 
+	error = misc_register(&monitor_device);
+	if (error) {
+		log_print("misc_register failed for monitor device");
+		misc_deregister(&ctl_device);
+	}
+ out:
 	return error;
 }
 
 void dlm_user_exit(void)
 {
 	misc_deregister(&ctl_device);
+	misc_deregister(&monitor_device);
 }
 
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e4151..35eb6a13d616 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
 void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
 int dlm_user_init(void);
 void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
 
 #endif
diff --git a/fs/dquot.c b/fs/dquot.c
index 5ac77da19959..5e95261005b2 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -9,8 +9,6 @@
  * implementation is based on one of the several variants of the LINUX
  * inode-subsystem with added complexity of the diskquota system.
  * 
- * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $
- * 
  * Author:	Marco van Wieringen <mvw@planets.elm.net>
  *
  * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
@@ -562,6 +560,8 @@ static struct shrinker dqcache_shrinker = {
  */
 static void dqput(struct dquot *dquot)
 {
+	int ret;
+
 	if (!dquot)
 		return;
 #ifdef __DQUOT_PARANOIA
@@ -594,7 +594,19 @@ we_slept:
 	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) {
 		spin_unlock(&dq_list_lock);
 		/* Commit dquot before releasing */
-		dquot->dq_sb->dq_op->write_dquot(dquot);
+		ret = dquot->dq_sb->dq_op->write_dquot(dquot);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: cannot write quota structure on "
+				"device %s (error %d). Quota may get out of "
+				"sync!\n", dquot->dq_sb->s_id, ret);
+			/*
+			 * We clear dirty bit anyway, so that we avoid
+			 * infinite loop here
+			 */
+			spin_lock(&dq_list_lock);
+			clear_dquot_dirty(dquot);
+			spin_unlock(&dq_list_lock);
+		}
 		goto we_slept;
 	}
 	/* Clear flag in case dquot was inactive (something bad happened) */
@@ -875,13 +887,15 @@ static void print_warning(struct dquot *dquot, const int warntype)
 	char *msg = NULL;
 	struct tty_struct *tty;
 
-	if (!need_print_warning(dquot))
+	if (warntype == QUOTA_NL_IHARDBELOW ||
+	    warntype == QUOTA_NL_ISOFTBELOW ||
+	    warntype == QUOTA_NL_BHARDBELOW ||
+	    warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
 		return;
 
-	mutex_lock(&tty_mutex);
 	tty = get_current_tty();
 	if (!tty)
-		goto out_lock;
+		return;
 	tty_write_message(tty, dquot->dq_sb->s_id);
 	if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
 		tty_write_message(tty, ": warning, ");
@@ -909,8 +923,7 @@ static void print_warning(struct dquot *dquot, const int warntype)
 			break;
 	}
 	tty_write_message(tty, msg);
-out_lock:
-	mutex_unlock(&tty_mutex);
+	tty_kref_put(tty);
 }
 #endif
 
@@ -1083,6 +1096,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	return QUOTA_OK;
 }
 
+static int info_idq_free(struct dquot *dquot, ulong inodes)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+		return QUOTA_NL_NOWARN;
+
+	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
+		return QUOTA_NL_ISOFTBELOW;
+	if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
+	    dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit)
+		return QUOTA_NL_IHARDBELOW;
+	return QUOTA_NL_NOWARN;
+}
+
+static int info_bdq_free(struct dquot *dquot, qsize_t space)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+	    toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+		return QUOTA_NL_NOWARN;
+
+	if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
+	    dquot->dq_dqb.dqb_bsoftlimit)
+		return QUOTA_NL_BSOFTBELOW;
+	if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
+	    toqb(dquot->dq_dqb.dqb_curspace - space) <
+						dquot->dq_dqb.dqb_bhardlimit)
+		return QUOTA_NL_BHARDBELOW;
+	return QUOTA_NL_NOWARN;
+}
 /*
  *	Initialize quota pointers in inode
  *	Transaction must be started at entry
@@ -1139,6 +1181,28 @@ int dquot_drop(struct inode *inode)
 	return 0;
 }
 
+/* Wrapper to remove references to quota structures from inode */
+void vfs_dq_drop(struct inode *inode)
+{
+	/* Here we can get arbitrary inode from clear_inode() so we have
+	 * to be careful. OTOH we don't need locking as quota operations
+	 * are allowed to change only at mount time */
+	if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op
+	    && inode->i_sb->dq_op->drop) {
+		int cnt;
+		/* Test before calling to rule out calls from proc and such
+                 * where we are not allowed to block. Note that this is
+		 * actually reliable test even without the lock - the caller
+		 * must assure that nobody can come after the DQUOT_DROP and
+		 * add quota pointers back anyway */
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+			if (inode->i_dquot[cnt] != NODQUOT)
+				break;
+		if (cnt < MAXQUOTAS)
+			inode->i_sb->dq_op->drop(inode);
+	}
+}
+
 /*
  * Following four functions update i_blocks+i_bytes fields and
  * quota information (together with appropriate checks)
@@ -1248,6 +1312,7 @@ warn_put_all:
 int dquot_free_space(struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
+	char warntype[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
@@ -1256,6 +1321,7 @@ out_sub:
 		inode_sub_bytes(inode, number);
 		return QUOTA_OK;
 	}
+
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
@@ -1266,6 +1332,7 @@ out_sub:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (inode->i_dquot[cnt] == NODQUOT)
 			continue;
+		warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
 		dquot_decr_space(inode->i_dquot[cnt], number);
 	}
 	inode_sub_bytes(inode, number);
@@ -1274,6 +1341,7 @@ out_sub:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		if (inode->i_dquot[cnt])
 			mark_dquot_dirty(inode->i_dquot[cnt]);
+	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
@@ -1284,11 +1352,13 @@ out_sub:
 int dquot_free_inode(const struct inode *inode, unsigned long number)
 {
 	unsigned int cnt;
+	char warntype[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
+
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
@@ -1299,6 +1369,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (inode->i_dquot[cnt] == NODQUOT)
 			continue;
+		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
 		dquot_decr_inodes(inode->i_dquot[cnt], number);
 	}
 	spin_unlock(&dq_data_lock);
@@ -1306,6 +1377,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		if (inode->i_dquot[cnt])
 			mark_dquot_dirty(inode->i_dquot[cnt]);
+	flush_warnings(inode->i_dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
@@ -1323,7 +1395,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	struct dquot *transfer_to[MAXQUOTAS];
 	int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid,
 	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
-	char warntype[MAXQUOTAS];
+	char warntype_to[MAXQUOTAS];
+	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
@@ -1332,7 +1405,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	/* Clear the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
-		warntype[cnt] = QUOTA_NL_NOWARN;
+		warntype_to[cnt] = QUOTA_NL_NOWARN;
 	}
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Now recheck reliably when holding dqptr_sem */
@@ -1364,8 +1437,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (transfer_to[cnt] == NODQUOT)
 			continue;
 		transfer_from[cnt] = inode->i_dquot[cnt];
-		if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA ||
-		    check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA)
+		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
+		    NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
+		    warntype_to + cnt) == NO_QUOTA)
 			goto warn_put_all;
 	}
 
@@ -1381,6 +1455,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 
 		/* Due to IO error we might not have transfer_from[] structure */
 		if (transfer_from[cnt]) {
+			warntype_from_inodes[cnt] =
+				info_idq_free(transfer_from[cnt], 1);
+			warntype_from_space[cnt] =
+				info_bdq_free(transfer_from[cnt], space);
 			dquot_decr_inodes(transfer_from[cnt], 1);
 			dquot_decr_space(transfer_from[cnt], space);
 		}
@@ -1400,7 +1478,9 @@ warn_put_all:
 		if (transfer_to[cnt])
 			mark_dquot_dirty(transfer_to[cnt]);
 	}
-	flush_warnings(transfer_to, warntype);
+	flush_warnings(transfer_to, warntype_to);
+	flush_warnings(transfer_from, warntype_from_inodes);
+	flush_warnings(transfer_from, warntype_from_space);
 	
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT)
@@ -1412,6 +1492,18 @@ warn_put_all:
 	return ret;
 }
 
+/* Wrapper for transferring ownership of an inode */
+int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+{
+	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+		vfs_dq_init(inode);
+		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
+			return 1;
+	}
+	return 0;
+}
+
+
 /*
  * Write info of quota file to disk
  */
@@ -1697,30 +1789,36 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	return ret;
 }
 
+int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+		      struct path *path)
+{
+	int error = security_quota_on(path->dentry);
+	if (error)
+		return error;
+	/* Quota file not on the same filesystem? */
+	if (path->mnt->mnt_sb != sb)
+		error = -EXDEV;
+	else
+		error = vfs_quota_on_inode(path->dentry->d_inode, type,
+					   format_id);
+	return error;
+}
+
 /* Actual function called from quotactl() */
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path,
+int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
 	if (remount)
 		return vfs_quota_on_remount(sb, type);
 
-	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
-	if (error < 0)
-		return error;
-	error = security_quota_on(nd.path.dentry);
-	if (error)
-		goto out_path;
-	/* Quota file not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb)
-		error = -EXDEV;
-	else
-		error = vfs_quota_on_inode(nd.path.dentry->d_inode, type,
-					   format_id);
-out_path:
-	path_put(&nd.path);
+	error = kern_path(name, LOOKUP_FOLLOW, &path);
+	if (!error) {
+		error = vfs_quota_on_path(sb, type, format_id, &path);
+		path_put(&path);
+	}
 	return error;
 }
 
@@ -1752,6 +1850,22 @@ out:
 	return error;
 }
 
+/* Wrapper to turn on quotas when remounting rw */
+int vfs_dq_quota_on_remount(struct super_block *sb)
+{
+	int cnt;
+	int ret = 0, err;
+
+	if (!sb->s_qcop || !sb->s_qcop->quota_on)
+		return -ENOSYS;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
+		if (err < 0 && !ret)
+			ret = err;
+	}
+	return ret;
+}
+
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
@@ -2073,6 +2187,7 @@ EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
 EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
 EXPORT_SYMBOL(vfs_quota_off);
 EXPORT_SYMBOL(vfs_quota_sync);
@@ -2087,8 +2202,11 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(vfs_dq_drop);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
 EXPORT_SYMBOL(dquot_free_inode);
 EXPORT_SYMBOL(dquot_transfer);
+EXPORT_SYMBOL(vfs_dq_transfer);
+EXPORT_SYMBOL(vfs_dq_quota_on_remount);
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index 1e34a7fd4884..2cc9ee4ad2eb 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
 
-ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o
+ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e2832bc7869a..06db79d05c12 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
 static int
@@ -474,8 +475,8 @@ int ecryptfs_encrypt_page(struct page *page)
 {
 	struct inode *ecryptfs_inode;
 	struct ecryptfs_crypt_stat *crypt_stat;
-	char *enc_extent_virt = NULL;
-	struct page *enc_extent_page;
+	char *enc_extent_virt;
+	struct page *enc_extent_page = NULL;
 	loff_t extent_offset;
 	int rc = 0;
 
@@ -491,14 +492,14 @@ int ecryptfs_encrypt_page(struct page *page)
 			       page->index);
 		goto out;
 	}
-	enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER);
-	if (!enc_extent_virt) {
+	enc_extent_page = alloc_page(GFP_USER);
+	if (!enc_extent_page) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Error allocating memory for "
 				"encrypted extent\n");
 		goto out;
 	}
-	enc_extent_page = virt_to_page(enc_extent_virt);
+	enc_extent_virt = kmap(enc_extent_page);
 	for (extent_offset = 0;
 	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
 	     extent_offset++) {
@@ -526,7 +527,10 @@ int ecryptfs_encrypt_page(struct page *page)
 		}
 	}
 out:
-	kfree(enc_extent_virt);
+	if (enc_extent_page) {
+		kunmap(enc_extent_page);
+		__free_page(enc_extent_page);
+	}
 	return rc;
 }
 
@@ -608,8 +612,8 @@ int ecryptfs_decrypt_page(struct page *page)
 {
 	struct inode *ecryptfs_inode;
 	struct ecryptfs_crypt_stat *crypt_stat;
-	char *enc_extent_virt = NULL;
-	struct page *enc_extent_page;
+	char *enc_extent_virt;
+	struct page *enc_extent_page = NULL;
 	unsigned long extent_offset;
 	int rc = 0;
 
@@ -626,14 +630,14 @@ int ecryptfs_decrypt_page(struct page *page)
 			       page->index);
 		goto out;
 	}
-	enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER);
-	if (!enc_extent_virt) {
+	enc_extent_page = alloc_page(GFP_USER);
+	if (!enc_extent_page) {
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR, "Error allocating memory for "
 				"encrypted extent\n");
 		goto out;
 	}
-	enc_extent_page = virt_to_page(enc_extent_virt);
+	enc_extent_virt = kmap(enc_extent_page);
 	for (extent_offset = 0;
 	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
 	     extent_offset++) {
@@ -661,7 +665,10 @@ int ecryptfs_decrypt_page(struct page *page)
 		}
 	}
 out:
-	kfree(enc_extent_virt);
+	if (enc_extent_page) {
+		kunmap(enc_extent_page);
+		__free_page(enc_extent_page);
+	}
 	return rc;
 }
 
@@ -1032,10 +1039,8 @@ static int contains_ecryptfs_marker(char *data)
 {
 	u32 m_1, m_2;
 
-	memcpy(&m_1, data, 4);
-	m_1 = be32_to_cpu(m_1);
-	memcpy(&m_2, (data + 4), 4);
-	m_2 = be32_to_cpu(m_2);
+	m_1 = get_unaligned_be32(data);
+	m_2 = get_unaligned_be32(data + 4);
 	if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
 		return 1;
 	ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
@@ -1073,8 +1078,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
 	int i;
 	u32 flags;
 
-	memcpy(&flags, page_virt, 4);
-	flags = be32_to_cpu(flags);
+	flags = get_unaligned_be32(page_virt);
 	for (i = 0; i < ((sizeof(ecryptfs_flag_map)
 			  / sizeof(struct ecryptfs_flag_map_elem))); i++)
 		if (flags & ecryptfs_flag_map[i].file_flag) {
@@ -1100,11 +1104,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
 
 	get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
 	m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
-	m_1 = cpu_to_be32(m_1);
-	memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
-	m_2 = cpu_to_be32(m_2);
-	memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
-	       (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+	put_unaligned_be32(m_1, page_virt);
+	page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2);
+	put_unaligned_be32(m_2, page_virt);
 	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
 }
 
@@ -1121,8 +1123,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
 			flags |= ecryptfs_flag_map[i].file_flag;
 	/* Version is in top 8 bits of the 32-bit flag vector */
 	flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
-	flags = cpu_to_be32(flags);
-	memcpy(page_virt, &flags, 4);
+	put_unaligned_be32(flags, page_virt);
 	(*written) = 4;
 }
 
@@ -1238,11 +1239,9 @@ ecryptfs_write_header_metadata(char *virt,
 	num_header_extents_at_front =
 		(u16)(crypt_stat->num_header_bytes_at_front
 		      / crypt_stat->extent_size);
-	header_extent_size = cpu_to_be32(header_extent_size);
-	memcpy(virt, &header_extent_size, 4);
+	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
-	num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
-	memcpy(virt, &num_header_extents_at_front, 2);
+	put_unaligned_be16(num_header_extents_at_front, virt);
 	(*written) = 6;
 }
 
@@ -1410,15 +1409,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
 	u32 header_extent_size;
 	u16 num_header_extents_at_front;
 
-	memcpy(&header_extent_size, virt, sizeof(u32));
-	header_extent_size = be32_to_cpu(header_extent_size);
-	virt += sizeof(u32);
-	memcpy(&num_header_extents_at_front, virt, sizeof(u16));
-	num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
+	header_extent_size = get_unaligned_be32(virt);
+	virt += sizeof(__be32);
+	num_header_extents_at_front = get_unaligned_be16(virt);
 	crypt_stat->num_header_bytes_at_front =
 		(((size_t)num_header_extents_at_front
 		  * (size_t)header_extent_size));
-	(*bytes_read) = (sizeof(u32) + sizeof(u16));
+	(*bytes_read) = (sizeof(__be32) + sizeof(__be16));
 	if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
 	    && (crypt_stat->num_header_bytes_at_front
 		< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c15c25745e05..3504cf9df358 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -79,11 +79,6 @@
 #define ECRYPTFS_MAX_PKI_NAME_BYTES 16
 #define ECRYPTFS_DEFAULT_NUM_USERS 4
 #define ECRYPTFS_MAX_NUM_USERS 32768
-#define ECRYPTFS_TRANSPORT_NETLINK 0
-#define ECRYPTFS_TRANSPORT_CONNECTOR 1
-#define ECRYPTFS_TRANSPORT_RELAYFS 2
-#define ECRYPTFS_TRANSPORT_MISCDEV 3
-#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV
 #define ECRYPTFS_XATTR_NAME "user.ecryptfs"
 
 #define RFC2440_CIPHER_DES3_EDE 0x02
@@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx {
 	struct mutex mux;
 };
 
-extern unsigned int ecryptfs_transport;
-
 struct ecryptfs_daemon;
 
 struct ecryptfs_daemon {
@@ -559,10 +552,25 @@ extern struct kmem_cache *ecryptfs_key_record_cache;
 extern struct kmem_cache *ecryptfs_key_sig_cache;
 extern struct kmem_cache *ecryptfs_global_auth_tok_cache;
 extern struct kmem_cache *ecryptfs_key_tfm_cache;
+extern struct kmem_cache *ecryptfs_open_req_cache;
+
+struct ecryptfs_open_req {
+#define ECRYPTFS_REQ_PROCESSED 0x00000001
+#define ECRYPTFS_REQ_DROPPED   0x00000002
+#define ECRYPTFS_REQ_ZOMBIE    0x00000004
+	u32 flags;
+	struct file **lower_file;
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_mnt;
+	wait_queue_head_t wait;
+	struct mutex mux;
+	struct list_head kthread_ctl_list;
+};
 
+#define ECRYPTFS_INTERPOSE_FLAG_D_ADD                 0x00000001
 int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       struct dentry *this_dentry, struct super_block *sb,
-		       int flag);
+		       u32 flags);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
 int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
 			     const char *name, int length,
@@ -612,31 +620,20 @@ int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		  size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
-int ecryptfs_process_helo(unsigned int transport, uid_t euid,
-			  struct user_namespace *user_ns, struct pid *pid);
+int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
+			  struct pid *pid);
 int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
 			  struct pid *pid);
 int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 			      struct user_namespace *user_ns, struct pid *pid,
 			      u32 seq);
-int ecryptfs_send_message(unsigned int transport, char *data, int data_len,
+int ecryptfs_send_message(char *data, int data_len,
 			  struct ecryptfs_msg_ctx **msg_ctx);
 int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
 			       struct ecryptfs_message **emsg);
-int ecryptfs_init_messaging(unsigned int transport);
-void ecryptfs_release_messaging(unsigned int transport);
+int ecryptfs_init_messaging(void);
+void ecryptfs_release_messaging(void);
 
-int ecryptfs_send_netlink(char *data, int data_len,
-			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			  u16 msg_flags, struct pid *daemon_pid);
-int ecryptfs_init_netlink(void);
-void ecryptfs_release_netlink(void);
-
-int ecryptfs_send_connector(char *data, int data_len,
-			    struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			    u16 msg_flags, struct pid *daemon_pid);
-int ecryptfs_init_connector(void);
-void ecryptfs_release_connector(void);
 void
 ecryptfs_write_header_metadata(char *virt,
 			       struct ecryptfs_crypt_stat *crypt_stat,
@@ -690,5 +687,11 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
 int
 ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
 		      struct user_namespace *user_ns, struct pid *pid);
+int ecryptfs_init_kthread(void);
+void ecryptfs_destroy_kthread(void);
+int ecryptfs_privileged_open(struct file **lower_file,
+			     struct dentry *lower_dentry,
+			     struct vfsmount *lower_mnt);
+int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 24749bf0668f..eb3dc4c7ac06 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback {
 	void *dirent;
 	struct dentry *dentry;
 	filldir_t filldir;
-	int err;
 	int filldir_called;
 	int entries_written;
 };
 
-/* Inspired by generic filldir in fs/readir.c */
+/* Inspired by generic filldir in fs/readdir.c */
 static int
 ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
 		 u64 ino, unsigned int d_type)
@@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 	buf.dirent = dirent;
 	buf.dentry = file->f_path.dentry;
 	buf.filldir = filldir;
-retry:
 	buf.filldir_called = 0;
 	buf.entries_written = 0;
-	buf.err = 0;
 	rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
-	if (buf.err)
-		rc = buf.err;
-	if (buf.filldir_called && !buf.entries_written)
-		goto retry;
 	file->f_pos = lower_file->f_pos;
+	if (rc < 0)
+		goto out;
+	if (buf.filldir_called && !buf.entries_written)
+		goto out;
 	if (rc >= 0)
-		fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode);
+		fsstack_copy_attr_atime(inode,
+					lower_file->f_path.dentry->d_inode);
+out:
 	return rc;
 }
 
@@ -192,6 +191,23 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
 				      | ECRYPTFS_ENCRYPTED);
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
+	if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
+	    && !(file->f_flags & O_RDONLY)) {
+		rc = -EPERM;
+		printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+		       "file must hence be opened RO\n", __func__);
+		goto out;
+	}
+	if (!ecryptfs_inode_to_private(inode)->lower_file) {
+		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to initialize "
+			       "the persistent file for the dentry with name "
+			       "[%s]; rc = [%d]\n", __func__,
+			       ecryptfs_dentry->d_name.name, rc);
+			goto out;
+		}
+	}
 	ecryptfs_set_file_lower(
 		file, ecryptfs_inode_to_private(inode)->lower_file);
 	if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c92cc1c00aae..89209f00f9c7 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/mount.h>
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
+#include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
 static struct dentry *lock_parent(struct dentry *dentry)
@@ -188,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
 				"context; rc = [%d]\n", rc);
 		goto out;
 	}
+	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to initialize "
+			       "the persistent file for the dentry with name "
+			       "[%s]; rc = [%d]\n", __func__,
+			       ecryptfs_dentry->d_name.name, rc);
+			goto out;
+		}
+	}
 	rc = ecryptfs_write_metadata(ecryptfs_dentry);
 	if (rc) {
 		printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
@@ -307,10 +318,11 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		d_add(dentry, NULL);
 		goto out;
 	}
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
+				ECRYPTFS_INTERPOSE_FLAG_D_ADD);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error interposing\n");
-		goto out_dput;
+		goto out;
 	}
 	if (S_ISDIR(lower_inode->i_mode)) {
 		ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
@@ -336,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		rc = -ENOMEM;
 		ecryptfs_printk(KERN_ERR,
 				"Cannot ecryptfs_kmalloc a page\n");
-		goto out_dput;
+		goto out;
 	}
 	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
 	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
 		ecryptfs_set_default_sizes(crypt_stat);
+	if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
+		rc = ecryptfs_init_persistent_file(dentry);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to initialize "
+			       "the persistent file for the dentry with name "
+			       "[%s]; rc = [%d]\n", __func__,
+			       dentry->d_name.name, rc);
+			goto out;
+		}
+	}
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      dentry->d_inode);
 	if (rc) {
@@ -364,8 +386,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 		else
 			file_size = i_size_read(lower_dentry->d_inode);
 	} else {
-		memcpy(&file_size, page_virt, sizeof(file_size));
-		file_size = be64_to_cpu(file_size);
+		file_size = get_unaligned_be64(page_virt);
 	}
 	i_size_write(dentry->d_inode, (loff_t)file_size);
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
@@ -444,7 +465,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	int rc;
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
-	umode_t mode;
 	char *encoded_symname;
 	int encoded_symlen;
 	struct ecryptfs_crypt_stat *crypt_stat = NULL;
@@ -452,7 +472,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	mode = S_IALLUGO;
 	encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
 						  strlen(symname),
 						  &encoded_symname);
@@ -461,7 +480,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_lock;
 	}
 	rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
-			 encoded_symname, mode);
+			 encoded_symname);
 	kfree(encoded_symname);
 	if (rc || !lower_dentry->d_inode)
 		goto out_lock;
@@ -809,22 +828,9 @@ out:
 }
 
 static int
-ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+ecryptfs_permission(struct inode *inode, int mask)
 {
-	int rc;
-
-        if (nd) {
-		struct vfsmount *vfsmnt_save = nd->path.mnt;
-		struct dentry *dentry_save = nd->path.dentry;
-
-		nd->path.mnt = ecryptfs_dentry_to_lower_mnt(nd->path.dentry);
-		nd->path.dentry = ecryptfs_dentry_to_lower(nd->path.dentry);
-		rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
-		nd->path.mnt = vfsmnt_save;
-		nd->path.dentry = dentry_save;
-        } else
-		rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
-        return rc;
+	return inode_permission(ecryptfs_inode_to_lower(inode), mask);
 }
 
 /**
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e82b457180be..e22bc3961345 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -44,15 +44,15 @@ static int process_request_key_err(long err_code)
 	int rc = 0;
 
 	switch (err_code) {
-	case ENOKEY:
+	case -ENOKEY:
 		ecryptfs_printk(KERN_WARNING, "No key\n");
 		rc = -ENOENT;
 		break;
-	case EKEYEXPIRED:
+	case -EKEYEXPIRED:
 		ecryptfs_printk(KERN_WARNING, "Key expired\n");
 		rc = -ETIME;
 		break;
-	case EKEYREVOKED:
+	case -EKEYREVOKED:
 		ecryptfs_printk(KERN_WARNING, "Key revoked\n");
 		rc = -EINVAL;
 		break;
@@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code,
 	}
 	i += data_len;
 	if (message_len < (i + m_size)) {
-		ecryptfs_printk(KERN_ERR, "The received netlink message is "
-				"shorter than expected\n");
+		ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd "
+				"is shorter than expected\n");
 		rc = -EIO;
 		goto out;
 	}
@@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 	struct ecryptfs_msg_ctx *msg_ctx;
 	struct ecryptfs_message *msg = NULL;
 	char *auth_tok_sig;
-	char *netlink_message;
-	size_t netlink_message_length;
+	char *payload;
+	size_t payload_len;
 	int rc;
 
 	rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
@@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
 		goto out;
 	}
 	rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key),
-				 &netlink_message, &netlink_message_length);
+				 &payload, &payload_len);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n");
 		goto out;
 	}
-	rc = ecryptfs_send_message(ecryptfs_transport, netlink_message,
-				   netlink_message_length, &msg_ctx);
+	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error sending netlink message\n");
+		ecryptfs_printk(KERN_ERR, "Error sending message to "
+				"ecryptfsd\n");
 		goto out;
 	}
 	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
 	if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
 		printk(KERN_ERR "Could not find key with description: [%s]\n",
 		       sig);
-		process_request_key_err(PTR_ERR(*auth_tok_key));
-		rc = -EINVAL;
+		rc = process_request_key_err(PTR_ERR(*auth_tok_key));
 		goto out;
 	}
 	(*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key);
@@ -1334,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
 			struct ecryptfs_key_record *key_rec)
 {
 	struct ecryptfs_msg_ctx *msg_ctx = NULL;
-	char *netlink_payload;
-	size_t netlink_payload_length;
+	char *payload = NULL;
+	size_t payload_len;
 	struct ecryptfs_message *msg;
 	int rc;
 
 	rc = write_tag_66_packet(auth_tok->token.private_key.signature,
 				 ecryptfs_code_for_cipher_string(crypt_stat),
-				 crypt_stat, &netlink_payload,
-				 &netlink_payload_length);
+				 crypt_stat, &payload, &payload_len);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
 		goto out;
 	}
-	rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload,
-				   netlink_payload_length, &msg_ctx);
+	rc = ecryptfs_send_message(payload, payload_len, &msg_ctx);
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error sending netlink message\n");
+		ecryptfs_printk(KERN_ERR, "Error sending message to "
+				"ecryptfsd\n");
 		goto out;
 	}
 	rc = ecryptfs_wait_for_response(msg_ctx, &msg);
@@ -1365,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
 		ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n");
 	kfree(msg);
 out:
-	if (netlink_payload)
-		kfree(netlink_payload);
+	kfree(payload);
 	return rc;
 }
 /**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
new file mode 100644
index 000000000000..c440c6b58b2d
--- /dev/null
+++ b/fs/ecryptfs/kthread.c
@@ -0,0 +1,203 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2008 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <linux/mount.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_open_req_cache;
+
+static struct ecryptfs_kthread_ctl {
+#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001
+	u32 flags;
+	struct mutex mux;
+	struct list_head req_list;
+	wait_queue_head_t wait;
+} ecryptfs_kthread_ctl;
+
+static struct task_struct *ecryptfs_kthread;
+
+/**
+ * ecryptfs_threadfn
+ * @ignored: ignored
+ *
+ * The eCryptfs kernel thread that has the responsibility of getting
+ * the lower persistent file with RW permissions.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_threadfn(void *ignored)
+{
+	set_freezable();
+	while (1)  {
+		struct ecryptfs_open_req *req;
+
+		wait_event_freezable(
+			ecryptfs_kthread_ctl.wait,
+			(!list_empty(&ecryptfs_kthread_ctl.req_list)
+			 || kthread_should_stop()));
+		mutex_lock(&ecryptfs_kthread_ctl.mux);
+		if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+			mutex_unlock(&ecryptfs_kthread_ctl.mux);
+			goto out;
+		}
+		while (!list_empty(&ecryptfs_kthread_ctl.req_list)) {
+			req = list_first_entry(&ecryptfs_kthread_ctl.req_list,
+					       struct ecryptfs_open_req,
+					       kthread_ctl_list);
+			mutex_lock(&req->mux);
+			list_del(&req->kthread_ctl_list);
+			if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) {
+				dget(req->lower_dentry);
+				mntget(req->lower_mnt);
+				(*req->lower_file) = dentry_open(
+					req->lower_dentry, req->lower_mnt,
+					(O_RDWR | O_LARGEFILE));
+				req->flags |= ECRYPTFS_REQ_PROCESSED;
+			}
+			wake_up(&req->wait);
+			mutex_unlock(&req->mux);
+		}
+		mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	}
+out:
+	return 0;
+}
+
+int ecryptfs_init_kthread(void)
+{
+	int rc = 0;
+
+	mutex_init(&ecryptfs_kthread_ctl.mux);
+	init_waitqueue_head(&ecryptfs_kthread_ctl.wait);
+	INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list);
+	ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL,
+				       "ecryptfs-kthread");
+	if (IS_ERR(ecryptfs_kthread)) {
+		rc = PTR_ERR(ecryptfs_kthread);
+		printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]"
+		       "\n", __func__, rc);
+	}
+	return rc;
+}
+
+void ecryptfs_destroy_kthread(void)
+{
+	struct ecryptfs_open_req *req;
+
+	mutex_lock(&ecryptfs_kthread_ctl.mux);
+	ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
+	list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list,
+			    kthread_ctl_list) {
+		mutex_lock(&req->mux);
+		req->flags |= ECRYPTFS_REQ_ZOMBIE;
+		wake_up(&req->wait);
+		mutex_unlock(&req->mux);
+	}
+	mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	kthread_stop(ecryptfs_kthread);
+	wake_up(&ecryptfs_kthread_ctl.wait);
+}
+
+/**
+ * ecryptfs_privileged_open
+ * @lower_file: Result of dentry_open by root on lower dentry
+ * @lower_dentry: Lower dentry for file to open
+ * @lower_mnt: Lower vfsmount for file to open
+ *
+ * This function gets a r/w file opened againt the lower dentry.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_privileged_open(struct file **lower_file,
+			     struct dentry *lower_dentry,
+			     struct vfsmount *lower_mnt)
+{
+	struct ecryptfs_open_req *req;
+	int rc = 0;
+
+	/* Corresponding dput() and mntput() are done when the
+	 * persistent file is fput() when the eCryptfs inode is
+	 * destroyed. */
+	dget(lower_dentry);
+	mntget(lower_mnt);
+	(*lower_file) = dentry_open(lower_dentry, lower_mnt,
+				    (O_RDWR | O_LARGEFILE));
+	if (!IS_ERR(*lower_file))
+		goto out;
+	req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
+	if (!req) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	mutex_init(&req->mux);
+	req->lower_file = lower_file;
+	req->lower_dentry = lower_dentry;
+	req->lower_mnt = lower_mnt;
+	init_waitqueue_head(&req->wait);
+	req->flags = 0;
+	mutex_lock(&ecryptfs_kthread_ctl.mux);
+	if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+		rc = -EIO;
+		mutex_unlock(&ecryptfs_kthread_ctl.mux);
+		printk(KERN_ERR "%s: We are in the middle of shutting down; "
+		       "aborting privileged request to open lower file\n",
+			__func__);
+		goto out_free;
+	}
+	list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list);
+	mutex_unlock(&ecryptfs_kthread_ctl.mux);
+	wake_up(&ecryptfs_kthread_ctl.wait);
+	wait_event(req->wait, (req->flags != 0));
+	mutex_lock(&req->mux);
+	BUG_ON(req->flags == 0);
+	if (req->flags & ECRYPTFS_REQ_DROPPED
+	    || req->flags & ECRYPTFS_REQ_ZOMBIE) {
+		rc = -EIO;
+		printk(KERN_WARNING "%s: Privileged open request dropped\n",
+		       __func__);
+		goto out_unlock;
+	}
+	if (IS_ERR(*req->lower_file)) {
+		rc = PTR_ERR(*req->lower_file);
+		dget(lower_dentry);
+		mntget(lower_mnt);
+		(*lower_file) = dentry_open(lower_dentry, lower_mnt,
+					    (O_RDONLY | O_LARGEFILE));
+		if (IS_ERR(*lower_file)) {
+			rc = PTR_ERR(*req->lower_file);
+			(*lower_file) = NULL;
+			printk(KERN_WARNING "%s: Error attempting privileged "
+			       "open of lower file with either RW or RO "
+			       "perms; rc = [%d]. Giving up.\n",
+			       __func__, rc);
+		}
+	}
+out_unlock:
+	mutex_unlock(&req->mux);
+out_free:
+	kmem_cache_free(ecryptfs_open_req_cache, req);
+out:
+	return rc;
+}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d603631601eb..64d2ba980df4 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -30,7 +30,6 @@
 #include <linux/namei.h>
 #include <linux/skbuff.h>
 #include <linux/crypto.h>
-#include <linux/netlink.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/key.h>
@@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity,
 		 "0, which is Quiet)");
 
 /**
- * Module parameter that defines the number of netlink message buffer
- * elements
+ * Module parameter that defines the number of message buffer elements
  */
 unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS;
 
@@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len,
 
 /**
  * Module parameter that defines the maximum guaranteed amount of time to wait
- * for a response through netlink.  The actual sleep time will be, more than
+ * for a response from ecryptfsd.  The actual sleep time will be, more than
  * likely, a small amount greater than this specified value, but only less if
- * the netlink message successfully arrives.
+ * the message successfully arrives.
  */
 signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ;
 
@@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0);
 MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of "
 		 "concurrent users of eCryptfs");
 
-unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT;
-
 void __ecryptfs_printk(const char *fmt, ...)
 {
 	va_list args;
@@ -117,7 +113,7 @@ void __ecryptfs_printk(const char *fmt, ...)
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
+int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 {
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
@@ -130,26 +126,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 			ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
 
 		lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
-		/* Corresponding dput() and mntput() are done when the
-		 * persistent file is fput() when the eCryptfs inode
-		 * is destroyed. */
-		dget(lower_dentry);
-		mntget(lower_mnt);
-		inode_info->lower_file = dentry_open(lower_dentry,
-						     lower_mnt,
-						     (O_RDWR | O_LARGEFILE));
-		if (IS_ERR(inode_info->lower_file)) {
-			dget(lower_dentry);
-			mntget(lower_mnt);
-			inode_info->lower_file = dentry_open(lower_dentry,
-							     lower_mnt,
-							     (O_RDONLY
-							      | O_LARGEFILE));
-		}
-		if (IS_ERR(inode_info->lower_file)) {
+		rc = ecryptfs_privileged_open(&inode_info->lower_file,
+						     lower_dentry, lower_mnt);
+		if (rc || IS_ERR(inode_info->lower_file)) {
 			printk(KERN_ERR "Error opening lower persistent file "
-			       "for lower_dentry [0x%p] and lower_mnt [0x%p]\n",
-			       lower_dentry, lower_mnt);
+			       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
+			       "rc = [%d]\n", lower_dentry, lower_mnt, rc);
 			rc = PTR_ERR(inode_info->lower_file);
 			inode_info->lower_file = NULL;
 		}
@@ -163,14 +145,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
  * @lower_dentry: Existing dentry in the lower filesystem
  * @dentry: ecryptfs' dentry
  * @sb: ecryptfs's super_block
- * @flag: If set to true, then d_add is called, else d_instantiate is called
+ * @flags: flags to govern behavior of interpose procedure
  *
  * Interposes upper and lower dentries.
  *
  * Returns zero on success; non-zero otherwise
  */
 int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-		       struct super_block *sb, int flag)
+		       struct super_block *sb, u32 flags)
 {
 	struct inode *lower_inode;
 	struct inode *inode;
@@ -207,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 		init_special_inode(inode, lower_inode->i_mode,
 				   lower_inode->i_rdev);
 	dentry->d_op = &ecryptfs_dops;
-	if (flag)
+	if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
 		d_add(dentry, inode);
 	else
 		d_instantiate(dentry, inode);
@@ -215,13 +197,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
 	/* This size will be overwritten for real files w/ headers and
 	 * other metadata */
 	fsstack_copy_inode_size(inode, lower_inode);
-	rc = ecryptfs_init_persistent_file(dentry);
-	if (rc) {
-		printk(KERN_ERR "%s: Error attempting to initialize the "
-		       "persistent file for the dentry with name [%s]; "
-		       "rc = [%d]\n", __func__, dentry->d_name.name, rc);
-		goto out;
-	}
 out:
 	return rc;
 }
@@ -232,7 +207,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
        ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
 	{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
 	{ecryptfs_opt_cipher, "cipher=%s"},
@@ -262,10 +237,11 @@ static int ecryptfs_init_global_auth_toks(
 			       "session keyring for sig specified in mount "
 			       "option: [%s]\n", global_auth_tok->sig);
 			global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
-			rc = 0;
+			goto out;
 		} else
 			global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
 	}
+out:
 	return rc;
 }
 
@@ -314,7 +290,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	char *cipher_name_dst;
 	char *cipher_name_src;
 	char *cipher_key_bytes_src;
-	int cipher_name_len;
 
 	if (!options) {
 		rc = -EINVAL;
@@ -395,17 +370,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		goto out;
 	}
 	if (!cipher_name_set) {
-		cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
-		if (unlikely(cipher_name_len
-			     >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
-			rc = -EINVAL;
-			BUG();
-			goto out;
-		}
-		memcpy(mount_crypt_stat->global_default_cipher_name,
-		       ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
-		mount_crypt_stat->global_default_cipher_name[cipher_name_len]
-		    = '\0';
+		int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
+
+		BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+
+		strcpy(mount_crypt_stat->global_default_cipher_name,
+		       ECRYPTFS_DEFAULT_CIPHER);
 	}
 	if (!cipher_key_bytes_set) {
 		mount_crypt_stat->global_default_cipher_key_size = 0;
@@ -430,7 +400,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		printk(KERN_WARNING "One or more global auth toks could not "
 		       "properly register; rc = [%d]\n", rc);
 	}
-	rc = 0;
 out:
 	return rc;
 }
@@ -502,31 +471,26 @@ out:
  */
 static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
 {
+	struct path path;
 	int rc;
-	struct nameidata nd;
-	struct dentry *lower_root;
-	struct vfsmount *lower_mnt;
 
-	memset(&nd, 0, sizeof(struct nameidata));
-	rc = path_lookup(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &nd);
+	rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
 		goto out;
 	}
-	lower_root = nd.path.dentry;
-	lower_mnt = nd.path.mnt;
-	ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
-	sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
-	sb->s_blocksize = lower_root->d_sb->s_blocksize;
-	ecryptfs_set_dentry_lower(sb->s_root, lower_root);
-	ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
-	rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0);
+	ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
+	sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
+	sb->s_blocksize = path.dentry->d_sb->s_blocksize;
+	ecryptfs_set_dentry_lower(sb->s_root, path.dentry);
+	ecryptfs_set_dentry_lower_mnt(sb->s_root, path.mnt);
+	rc = ecryptfs_interpose(path.dentry, sb->s_root, sb, 0);
 	if (rc)
 		goto out_free;
 	rc = 0;
 	goto out;
 out_free:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return rc;
 }
@@ -605,7 +569,7 @@ static struct file_system_type ecryptfs_fs_type = {
  * Initializes the ecryptfs_inode_info_cache when it is created
  */
 static void
-inode_info_init_once(struct kmem_cache *cachep, void *vptr)
+inode_info_init_once(void *vptr)
 {
 	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
 
@@ -616,7 +580,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
-	void (*ctor)(struct kmem_cache *cache, void *obj);
+	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
 		.cache = &ecryptfs_auth_tok_list_item_cache,
@@ -679,6 +643,11 @@ static struct ecryptfs_cache_info {
 		.name = "ecryptfs_key_tfm_cache",
 		.size = sizeof(struct ecryptfs_key_tfm),
 	},
+	{
+		.cache = &ecryptfs_open_req_cache,
+		.name = "ecryptfs_open_req_cache",
+		.size = sizeof(struct ecryptfs_open_req),
+	},
 };
 
 static void ecryptfs_free_kmem_caches(void)
@@ -795,12 +764,19 @@ static int __init ecryptfs_init(void)
 		printk(KERN_ERR "sysfs registration failed\n");
 		goto out_unregister_filesystem;
 	}
-	rc = ecryptfs_init_messaging(ecryptfs_transport);
+	rc = ecryptfs_init_kthread();
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Failure occured while attempting to "
-				"initialize the eCryptfs netlink socket\n");
+		printk(KERN_ERR "%s: kthread initialization failed; "
+		       "rc = [%d]\n", __func__, rc);
 		goto out_do_sysfs_unregistration;
 	}
+	rc = ecryptfs_init_messaging();
+	if (rc) {
+		printk(KERN_ERR "Failure occured while attempting to "
+				"initialize the communications channel to "
+				"ecryptfsd\n");
+		goto out_destroy_kthread;
+	}
 	rc = ecryptfs_init_crypto();
 	if (rc) {
 		printk(KERN_ERR "Failure whilst attempting to init crypto; "
@@ -813,7 +789,9 @@ static int __init ecryptfs_init(void)
 
 	goto out;
 out_release_messaging:
-	ecryptfs_release_messaging(ecryptfs_transport);
+	ecryptfs_release_messaging();
+out_destroy_kthread:
+	ecryptfs_destroy_kthread();
 out_do_sysfs_unregistration:
 	do_sysfs_unregistration();
 out_unregister_filesystem:
@@ -832,7 +810,8 @@ static void __exit ecryptfs_exit(void)
 	if (rc)
 		printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
 		       "rc = [%d]\n", rc);
-	ecryptfs_release_messaging(ecryptfs_transport);
+	ecryptfs_release_messaging();
+	ecryptfs_destroy_kthread();
 	do_sysfs_unregistration();
 	unregister_filesystem(&ecryptfs_fs_type);
 	ecryptfs_free_kmem_caches();
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 1b5c20058acb..c6983978a31e 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -134,12 +134,11 @@ out:
 }
 
 static int
-ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
-			     u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx);
+ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
+			     struct ecryptfs_msg_ctx **msg_ctx);
 
 /**
  * ecryptfs_send_raw_message
- * @transport: Transport type
  * @msg_type: Message type
  * @daemon: Daemon struct for recipient of message
  *
@@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type,
+static int ecryptfs_send_raw_message(u8 msg_type,
 				     struct ecryptfs_daemon *daemon)
 {
 	struct ecryptfs_msg_ctx *msg_ctx;
 	int rc;
 
-	switch(transport) {
-	case ECRYPTFS_TRANSPORT_NETLINK:
-		rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0,
-					   daemon->pid);
-		break;
-	case ECRYPTFS_TRANSPORT_MISCDEV:
-		rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type,
-						  &msg_ctx);
-		if (rc) {
-			printk(KERN_ERR "%s: Error whilst attempting to send "
-			       "message via procfs; rc = [%d]\n", __func__, rc);
-			goto out;
-		}
-		/* Raw messages are logically context-free (e.g., no
-		 * reply is expected), so we set the state of the
-		 * ecryptfs_msg_ctx object to indicate that it should
-		 * be freed as soon as the transport sends out the message. */
-		mutex_lock(&msg_ctx->mux);
-		msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
-		mutex_unlock(&msg_ctx->mux);
-		break;
-	case ECRYPTFS_TRANSPORT_CONNECTOR:
-	case ECRYPTFS_TRANSPORT_RELAYFS:
-	default:
-		rc = -ENOSYS;
+	rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx);
+	if (rc) {
+		printk(KERN_ERR "%s: Error whilst attempting to send "
+		       "message to ecryptfsd; rc = [%d]\n", __func__, rc);
+		goto out;
 	}
+	/* Raw messages are logically context-free (e.g., no
+	 * reply is expected), so we set the state of the
+	 * ecryptfs_msg_ctx object to indicate that it should
+	 * be freed as soon as the message is sent. */
+	mutex_lock(&msg_ctx->mux);
+	msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY;
+	mutex_unlock(&msg_ctx->mux);
 out:
 	return rc;
 }
@@ -227,7 +213,6 @@ out:
 
 /**
  * ecryptfs_process_helo
- * @transport: The underlying transport (netlink, etc.)
  * @euid: The user ID owner of the message
  * @user_ns: The namespace in which @euid applies
  * @pid: The process ID for the userspace program that sent the
@@ -239,8 +224,8 @@ out:
  * Returns zero after adding a new daemon to the hash list;
  * non-zero otherwise.
  */
-int ecryptfs_process_helo(unsigned int transport, uid_t euid,
-			  struct user_namespace *user_ns, struct pid *pid)
+int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
+			  struct pid *pid)
 {
 	struct ecryptfs_daemon *new_daemon;
 	struct ecryptfs_daemon *old_daemon;
@@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid,
 		printk(KERN_WARNING "Received request from user [%d] "
 		       "to register daemon [0x%p]; unregistering daemon "
 		       "[0x%p]\n", euid, pid, old_daemon->pid);
-		rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT,
-					       old_daemon);
+		rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon);
 		if (rc)
 			printk(KERN_WARNING "Failed to send QUIT "
 			       "message to daemon [0x%p]; rc = [%d]\n",
@@ -467,8 +451,6 @@ out:
 
 /**
  * ecryptfs_send_message_locked
- * @transport: The transport over which to send the message (i.e.,
- *             netlink)
  * @data: The data to send
  * @data_len: The length of data
  * @msg_ctx: The message context allocated for the send
@@ -478,8 +460,8 @@ out:
  * Returns zero on success; non-zero otherwise
  */
 static int
-ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
-			     u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx)
+ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
+			     struct ecryptfs_msg_ctx **msg_ctx)
 {
 	struct ecryptfs_daemon *daemon;
 	int rc;
@@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len,
 	ecryptfs_msg_ctx_free_to_alloc(*msg_ctx);
 	mutex_unlock(&(*msg_ctx)->mux);
 	mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
-	switch (transport) {
-	case ECRYPTFS_TRANSPORT_NETLINK:
-		rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type,
-					   0, daemon->pid);
-		break;
-	case ECRYPTFS_TRANSPORT_MISCDEV:
-		rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type,
-					   0, daemon);
-		break;
-	case ECRYPTFS_TRANSPORT_CONNECTOR:
-	case ECRYPTFS_TRANSPORT_RELAYFS:
-	default:
-		rc = -ENOSYS;
-	}
+	rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0,
+				   daemon);
 	if (rc)
 		printk(KERN_ERR "%s: Error attempting to send message to "
 		       "userspace daemon; rc = [%d]\n", __func__, rc);
@@ -526,8 +496,6 @@ out:
 
 /**
  * ecryptfs_send_message
- * @transport: The transport over which to send the message (i.e.,
- *             netlink)
  * @data: The data to send
  * @data_len: The length of data
  * @msg_ctx: The message context allocated for the send
@@ -536,14 +504,14 @@ out:
  *
  * Returns zero on success; non-zero otherwise
  */
-int ecryptfs_send_message(unsigned int transport, char *data, int data_len,
+int ecryptfs_send_message(char *data, int data_len,
 			  struct ecryptfs_msg_ctx **msg_ctx)
 {
 	int rc;
 
 	mutex_lock(&ecryptfs_daemon_hash_mux);
-	rc = ecryptfs_send_message_locked(transport, data, data_len,
-					  ECRYPTFS_MSG_REQUEST, msg_ctx);
+	rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST,
+					  msg_ctx);
 	mutex_unlock(&ecryptfs_daemon_hash_mux);
 	return rc;
 }
@@ -586,7 +554,7 @@ sleep:
 	return rc;
 }
 
-int ecryptfs_init_messaging(unsigned int transport)
+int ecryptfs_init_messaging(void)
 {
 	int i;
 	int rc = 0;
@@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport)
 		mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
 	}
 	mutex_unlock(&ecryptfs_msg_ctx_lists_mux);
-	switch(transport) {
-	case ECRYPTFS_TRANSPORT_NETLINK:
-		rc = ecryptfs_init_netlink();
-		if (rc)
-			ecryptfs_release_messaging(transport);
-		break;
-	case ECRYPTFS_TRANSPORT_MISCDEV:
-		rc = ecryptfs_init_ecryptfs_miscdev();
-		if (rc)
-			ecryptfs_release_messaging(transport);
-		break;
-	case ECRYPTFS_TRANSPORT_CONNECTOR:
-	case ECRYPTFS_TRANSPORT_RELAYFS:
-	default:
-		rc = -ENOSYS;
-	}
+	rc = ecryptfs_init_ecryptfs_miscdev();
+	if (rc)
+		ecryptfs_release_messaging();
 out:
 	return rc;
 }
 
-void ecryptfs_release_messaging(unsigned int transport)
+void ecryptfs_release_messaging(void)
 {
 	if (ecryptfs_msg_ctx_arr) {
 		int i;
@@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport)
 		kfree(ecryptfs_daemon_hash);
 		mutex_unlock(&ecryptfs_daemon_hash_mux);
 	}
-	switch(transport) {
-	case ECRYPTFS_TRANSPORT_NETLINK:
-		ecryptfs_release_netlink();
-		break;
-	case ECRYPTFS_TRANSPORT_MISCDEV:
-		ecryptfs_destroy_ecryptfs_miscdev();
-		break;
-	case ECRYPTFS_TRANSPORT_CONNECTOR:
-	case ECRYPTFS_TRANSPORT_RELAYFS:
-	default:
-		break;
-	}
+	ecryptfs_destroy_ecryptfs_miscdev();
 	return;
 }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 09a4522f65e6..b484792a0996 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -358,46 +358,6 @@ out_unlock_daemon:
 }
 
 /**
- * ecryptfs_miscdev_helo
- * @euid: effective user id of miscdevess sending helo packet
- * @user_ns: The namespace in which @euid applies
- * @pid: miscdevess id of miscdevess sending helo packet
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns,
-				 struct pid *pid)
-{
-	int rc;
-
-	rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns,
-				   pid);
-	if (rc)
-		printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
-	return rc;
-}
-
-/**
- * ecryptfs_miscdev_quit
- * @euid: effective user id of miscdevess sending quit packet
- * @user_ns: The namespace in which @euid applies
- * @pid: miscdevess id of miscdevess sending quit packet
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns,
-				 struct pid *pid)
-{
-	int rc;
-
-	rc = ecryptfs_process_quit(euid, user_ns, pid);
-	if (rc)
-		printk(KERN_WARNING
-		       "Error processing QUIT message; rc = [%d]\n", rc);
-	return rc;
-}
-
-/**
  * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
  * @data: Bytes comprising struct ecryptfs_message
  * @data_size: sizeof(struct ecryptfs_message) + data len
@@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 			       __func__, rc);
 		break;
 	case ECRYPTFS_MSG_HELO:
-		rc = ecryptfs_miscdev_helo(current->euid,
-					   current->nsproxy->user_ns,
-					   task_pid(current));
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to process "
-			       "helo from pid [0x%p]; rc = [%d]\n", __func__,
-			       task_pid(current), rc);
-			goto out_free;
-		}
-		break;
 	case ECRYPTFS_MSG_QUIT:
-		rc = ecryptfs_miscdev_quit(current->euid,
-					   current->nsproxy->user_ns,
-					   task_pid(current));
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to process "
-			       "quit from pid [0x%p]; rc = [%d]\n", __func__,
-			       task_pid(current), rc);
-			goto out_free;
-		}
 		break;
 	default:
 		ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2b6fe1e6e8ba..04d7b3fa1ac6 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -264,22 +265,34 @@ out:
 }
 
 /**
- * ecryptfs_prepare_write
+ * ecryptfs_write_begin
  * @file: The eCryptfs file
- * @page: The eCryptfs page
- * @from: The start byte from which we will write
- * @to: The end byte to which we will write
+ * @mapping: The eCryptfs object
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused)
  *
  * This function must zero any hole we create
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_prepare_write(struct file *file, struct page *page,
-				  unsigned from, unsigned to)
+static int ecryptfs_write_begin(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
 {
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct page *page;
 	loff_t prev_page_end_size;
 	int rc = 0;
 
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(
@@ -288,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
 		    || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
 			rc = ecryptfs_read_lower_page_segment(
-				page, page->index, 0, PAGE_CACHE_SIZE,
-				page->mapping->host);
+				page, index, 0, PAGE_CACHE_SIZE, mapping->host);
 			if (rc) {
 				printk(KERN_ERR "%s: Error attemping to read "
 				       "lower page segment; rc = [%d]\n",
@@ -315,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 				SetPageUptodate(page);
 			} else {
 				rc = ecryptfs_read_lower_page_segment(
-					page, page->index, 0, PAGE_CACHE_SIZE,
-					page->mapping->host);
+					page, index, 0, PAGE_CACHE_SIZE,
+					mapping->host);
 				if (rc) {
 					printk(KERN_ERR "%s: Error reading "
 					       "page; rc = [%d]\n",
@@ -338,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 			SetPageUptodate(page);
 		}
 	}
-	prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT);
+	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
 	/* If creating a page or more of holes, zero them out via truncate.
 	 * Note, this will increase i_size. */
-	if (page->index != 0) {
+	if (index != 0) {
 		if (prev_page_end_size > i_size_read(page->mapping->host)) {
 			rc = ecryptfs_truncate(file->f_path.dentry,
 					       prev_page_end_size);
@@ -356,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page,
 	}
 	/* Writing to a new page, and creating a small hole from start
 	 * of page?  Zero it out. */
-	if ((i_size_read(page->mapping->host) == prev_page_end_size)
-	    && (from != 0))
+	if ((i_size_read(mapping->host) == prev_page_end_size)
+	    && (pos != 0))
 		zero_user(page, 0, PAGE_CACHE_SIZE);
 out:
 	return rc;
@@ -372,7 +384,6 @@ out:
  */
 static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
 {
-	u64 file_size;
 	char *file_size_virt;
 	int rc;
 
@@ -381,9 +392,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
 		rc = -ENOMEM;
 		goto out;
 	}
-	file_size = (u64)i_size_read(ecryptfs_inode);
-	file_size = cpu_to_be64(file_size);
-	memcpy(file_size_virt, &file_size, sizeof(u64));
+	put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt);
 	rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
 				  sizeof(u64));
 	kfree(file_size_virt);
@@ -403,7 +412,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	struct dentry *lower_dentry =
 		ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry;
 	struct inode *lower_inode = lower_dentry->d_inode;
-	u64 file_size;
 	int rc;
 
 	if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
@@ -424,9 +432,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 					   xattr_virt, PAGE_CACHE_SIZE);
 	if (size < 0)
 		size = 8;
-	file_size = (u64)i_size_read(ecryptfs_inode);
-	file_size = cpu_to_be64(file_size);
-	memcpy(xattr_virt, &file_size, sizeof(u64));
+	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
 	rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
 					 xattr_virt, size, 0);
 	mutex_unlock(&lower_inode->i_mutex);
@@ -450,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
 }
 
 /**
- * ecryptfs_commit_write
+ * ecryptfs_write_end
  * @file: The eCryptfs file object
+ * @mapping: The eCryptfs object
+ * @pos: The file position
+ * @len: The length of the data (unused)
+ * @copied: The amount of data copied
  * @page: The eCryptfs page
- * @from: Ignored (we rotate the page IV on each write)
- * @to: Ignored
+ * @fsdata: The fsdata (unused)
  *
  * This is where we encrypt the data and pass the encrypted data to
  * the lower filesystem.  In OpenPGP-compatible mode, we operate on
  * entire underlying packets.
  */
-static int ecryptfs_commit_write(struct file *file, struct page *page,
-				 unsigned from, unsigned to)
+static int ecryptfs_write_end(struct file *file,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
 {
-	loff_t pos;
-	struct inode *ecryptfs_inode = page->mapping->host;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned to = from + copied;
+	struct inode *ecryptfs_inode = mapping->host;
 	struct ecryptfs_crypt_stat *crypt_stat =
 		&ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
 	int rc;
@@ -476,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
 	} else
 		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
 	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
-			"(page w/ index = [0x%.16x], to = [%d])\n", page->index,
-			to);
+			"(page w/ index = [0x%.16x], to = [%d])\n", index, to);
 	/* Fills in zeros if 'to' goes beyond inode size */
 	rc = fill_zeros_to_end_of_page(page, to);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
-				"zeros in page with index = [0x%.16x]\n",
-				page->index);
+			"zeros in page with index = [0x%.16x]\n", index);
 		goto out;
 	}
 	rc = ecryptfs_encrypt_page(page);
 	if (rc) {
 		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
-				"index [0x%.16x])\n", page->index);
+				"index [0x%.16x])\n", index);
 		goto out;
 	}
-	pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to;
-	if (pos > i_size_read(ecryptfs_inode)) {
-		i_size_write(ecryptfs_inode, pos);
+	if (pos + copied > i_size_read(ecryptfs_inode)) {
+		i_size_write(ecryptfs_inode, pos + copied);
 		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
 				"[0x%.16x]\n", i_size_read(ecryptfs_inode));
 	}
@@ -502,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page,
 	if (rc)
 		printk(KERN_ERR "Error writing inode size to metadata; "
 		       "rc = [%d]\n", rc);
+	else
+		rc = copied;
 out:
+	unlock_page(page);
+	page_cache_release(page);
 	return rc;
 }
 
@@ -523,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
 struct address_space_operations ecryptfs_aops = {
 	.writepage = ecryptfs_writepage,
 	.readpage = ecryptfs_readpage,
-	.prepare_write = ecryptfs_prepare_write,
-	.commit_write = ecryptfs_commit_write,
+	.write_begin = ecryptfs_write_begin,
+	.write_end = ecryptfs_write_end,
 	.bmap = ecryptfs_bmap,
 };
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
deleted file mode 100644
index e0abad62b395..000000000000
--- a/fs/ecryptfs/netlink.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/**
- * eCryptfs: Linux filesystem encryption layer
- *
- * Copyright (C) 2004-2006 International Business Machines Corp.
- *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
- *		Tyler Hicks <tyhicks@ou.edu>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
- * 02111-1307, USA.
- */
-
-#include <net/sock.h>
-#include <linux/hash.h>
-#include <linux/random.h>
-#include "ecryptfs_kernel.h"
-
-static struct sock *ecryptfs_nl_sock;
-
-/**
- * ecryptfs_send_netlink
- * @data: The data to include as the payload
- * @data_len: The byte count of the data
- * @msg_ctx: The netlink context that will be used to handle the
- *          response message
- * @msg_type: The type of netlink message to send
- * @msg_flags: The flags to include in the netlink header
- * @daemon_pid: The process id of the daemon to send the message to
- *
- * Sends the data to the specified daemon pid and uses the netlink
- * context element to store the data needed for validation upon
- * receiving the response.  The data and the netlink context can be
- * null if just sending a netlink header is sufficient.  Returns zero
- * upon sending the message; non-zero upon error.
- */
-int ecryptfs_send_netlink(char *data, int data_len,
-			  struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type,
-			  u16 msg_flags, struct pid *daemon_pid)
-{
-	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	struct ecryptfs_message *msg;
-	size_t payload_len;
-	int rc;
-
-	payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0);
-	skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL);
-	if (!skb) {
-		rc = -ENOMEM;
-		ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n");
-		goto out;
-	}
-	nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0,
-			msg_type, payload_len);
-	nlh->nlmsg_flags = msg_flags;
-	if (msg_ctx && payload_len) {
-		msg = (struct ecryptfs_message *)NLMSG_DATA(nlh);
-		msg->index = msg_ctx->index;
-		msg->data_len = data_len;
-		memcpy(msg->data, data, data_len);
-	}
-	rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0);
-	if (rc < 0) {
-		ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink "
-				"message; rc = [%d]\n", rc);
-		goto out;
-	}
-	rc = 0;
-	goto out;
-nlmsg_failure:
-	rc = -EMSGSIZE;
-	kfree_skb(skb);
-out:
-	return rc;
-}
-
-/**
- * ecryptfs_process_nl_reponse
- * @skb: The socket buffer containing the netlink message of state
- *       RESPONSE
- *
- * Processes a response message after sending a operation request to
- * userspace.  Attempts to assign the msg to a netlink context element
- * at the index specified in the msg.  The sk_buff and nlmsghdr must
- * be validated before this function. Returns zero upon delivery to
- * desired context element; non-zero upon delivery failure or error.
- */
-static int ecryptfs_process_nl_response(struct sk_buff *skb)
-{
-	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	struct ecryptfs_message *msg = NLMSG_DATA(nlh);
-	struct pid *pid;
-	int rc;
-
-	if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) {
-		rc = -EINVAL;
-		ecryptfs_printk(KERN_ERR, "Received netlink message with "
-				"incorrectly specified data length\n");
-		goto out;
-	}
-	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
-	rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL,
-				       pid, nlh->nlmsg_seq);
-	put_pid(pid);
-	if (rc)
-		printk(KERN_ERR
-		       "Error processing response message; rc = [%d]\n", rc);
-out:
-	return rc;
-}
-
-/**
- * ecryptfs_process_nl_helo
- * @skb: The socket buffer containing the nlmsghdr in HELO state
- *
- * Gets uid and pid of the skb and adds the values to the daemon id
- * hash. Returns zero after adding a new daemon id to the hash list;
- * non-zero otherwise.
- */
-static int ecryptfs_process_nl_helo(struct sk_buff *skb)
-{
-	struct pid *pid;
-	int rc;
-
-	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
-	rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK,
-				   NETLINK_CREDS(skb)->uid, NULL, pid);
-	put_pid(pid);
-	if (rc)
-		printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
-	return rc;
-}
-
-/**
- * ecryptfs_process_nl_quit
- * @skb: The socket buffer containing the nlmsghdr in QUIT state
- *
- * Gets uid and pid of the skb and deletes the corresponding daemon
- * id, if it is the registered that is requesting the
- * deletion. Returns zero after deleting the desired daemon id;
- * non-zero otherwise.
- */
-static int ecryptfs_process_nl_quit(struct sk_buff *skb)
-{
-	struct pid *pid;
-	int rc;
-
-	pid = find_get_pid(NETLINK_CREDS(skb)->pid);
-	rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid);
-	put_pid(pid);
-	if (rc)
-		printk(KERN_WARNING
-		       "Error processing QUIT message; rc = [%d]\n", rc);
-	return rc;
-}
-
-/**
- * ecryptfs_receive_nl_message
- *
- * Callback function called by netlink system when a message arrives.
- * If the message looks to be valid, then an attempt is made to assign
- * it to its desired netlink context element and wake up the process
- * that is waiting for a response.
- */
-static void ecryptfs_receive_nl_message(struct sk_buff *skb)
-{
-	struct nlmsghdr *nlh;
-
-	nlh = nlmsg_hdr(skb);
-	if (!NLMSG_OK(nlh, skb->len)) {
-		ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
-				"message\n");
-		goto free;
-	}
-	switch (nlh->nlmsg_type) {
-		case ECRYPTFS_MSG_RESPONSE:
-			if (ecryptfs_process_nl_response(skb)) {
-				ecryptfs_printk(KERN_WARNING, "Failed to "
-						"deliver netlink response to "
-						"requesting operation\n");
-			}
-			break;
-		case ECRYPTFS_MSG_HELO:
-			if (ecryptfs_process_nl_helo(skb)) {
-				ecryptfs_printk(KERN_WARNING, "Failed to "
-						"fulfill HELO request\n");
-			}
-			break;
-		case ECRYPTFS_MSG_QUIT:
-			if (ecryptfs_process_nl_quit(skb)) {
-				ecryptfs_printk(KERN_WARNING, "Failed to "
-						"fulfill QUIT request\n");
-			}
-			break;
-		default:
-			ecryptfs_printk(KERN_WARNING, "Dropping netlink "
-					"message of unrecognized type [%d]\n",
-					nlh->nlmsg_type);
-			break;
-	}
-free:
-	kfree_skb(skb);
-}
-
-/**
- * ecryptfs_init_netlink
- *
- * Initializes the daemon id hash list, netlink context array, and
- * necessary locks.  Returns zero upon success; non-zero upon error.
- */
-int ecryptfs_init_netlink(void)
-{
-	int rc;
-
-	ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0,
-						 ecryptfs_receive_nl_message,
-						 NULL, THIS_MODULE);
-	if (!ecryptfs_nl_sock) {
-		rc = -EIO;
-		ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
-		goto out;
-	}
-	ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT;
-	rc = 0;
-out:
-	return rc;
-}
-
-/**
- * ecryptfs_release_netlink
- *
- * Frees all memory used by the netlink context array and releases the
- * netlink socket.
- */
-void ecryptfs_release_netlink(void)
-{
-	netlink_kernel_release(ecryptfs_nl_sock);
-	ecryptfs_nl_sock = NULL;
-}
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 3a404e7fad53..c3fb5f9c4a44 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
 	}
 	unlock_kernel();
 
-	d_add(dentry, inode);
-	return NULL;
+	return d_splice_alias(inode, dentry);
 }
 
 static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
@@ -113,35 +112,14 @@ struct dentry *efs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 struct dentry *efs_get_parent(struct dentry *child)
 {
-	struct dentry *parent;
-	struct inode *inode;
+	struct dentry *parent = ERR_PTR(-ENOENT);
 	efs_ino_t ino;
-	long error;
 
 	lock_kernel();
-
-	error = -ENOENT;
 	ino = efs_find_entry(child->d_inode, "..", 2);
-	if (!ino)
-		goto fail;
-
-	inode = efs_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		goto fail;
-	}
-
-	error = -ENOMEM;
-	parent = d_alloc_anon(inode);
-	if (!parent)
-		goto fail_iput;
-
+	if (ino)
+		parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino));
 	unlock_kernel();
-	return parent;
 
- fail_iput:
-	iput(inode);
- fail:
-	unlock_kernel();
-	return ERR_PTR(error);
+	return parent;
 }
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d733531b55e2..73b19cfc91fc 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode)
 	kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct efs_inode_info *ei = (struct efs_inode_info *) foo;
 
@@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
 			sb->inode_blocks *
 			(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
 	buf->f_ffree   = sb->inode_free;	/* free inodes */
-	buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */
-	buf->f_fsid.val[1] =  sb->fs_magic        & 0xffff; /* fs ID */
 	buf->f_namelen = EFS_MAXNAMELEN;	/* max filename length */
 
 	return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 343942deeec1..08bf558d0408 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,11 +198,18 @@ struct file *eventfd_fget(int fd)
 	return file;
 }
 
-asmlinkage long sys_eventfd(unsigned int count)
+asmlinkage long sys_eventfd2(unsigned int count, int flags)
 {
 	int fd;
 	struct eventfd_ctx *ctx;
 
+	/* Check the EFD_* constants for consistency.  */
+	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+		return -EINVAL;
+
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
@@ -214,9 +221,15 @@ asmlinkage long sys_eventfd(unsigned int count)
 	 * When we call this, the initialization must be complete, since
 	 * anon_inode_getfd() will install the fd.
 	 */
-	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx);
+	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
+			      flags & (O_CLOEXEC | O_NONBLOCK));
 	if (fd < 0)
 		kfree(ctx);
 	return fd;
 }
 
+asmlinkage long sys_eventfd(unsigned int count)
+{
+	return sys_eventfd2(count, 0);
+}
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 990c01d2d66b..aec5c13f6341 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -927,12 +927,16 @@ errxit:
 	/*
 	 * During the time we spent in the loop above, some other events
 	 * might have been queued by the poll callback. We re-insert them
-	 * here (in case they are not already queued, or they're one-shot).
+	 * inside the main ready-list here.
 	 */
 	for (nepi = ep->ovflist; (epi = nepi) != NULL;
 	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
-		if (!ep_is_linked(&epi->rdllink) &&
-		    (epi->event.events & ~EP_PRIVATE_BITS))
+		/*
+		 * If the above loop quit with errors, the epoll item might still
+		 * be linked to "txlist", and the list_splice() done below will
+		 * take care of those cases.
+		 */
+		if (!ep_is_linked(&epi->rdllink))
 			list_add_tail(&epi->rdllink, &ep->rdllist);
 	}
 	/*
@@ -1041,25 +1045,27 @@ retry:
 }
 
 /*
- * It opens an eventpoll file descriptor. The "size" parameter is there
- * for historical reasons, when epoll was using an hash instead of an
- * RB tree. With the current implementation, the "size" parameter is ignored
- * (besides sanity checks).
+ * Open an eventpoll file descriptor.
  */
-asmlinkage long sys_epoll_create(int size)
+asmlinkage long sys_epoll_create1(int flags)
 {
 	int error, fd = -1;
 	struct eventpoll *ep;
 
+	/* Check the EPOLL_* constant for consistency.  */
+	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
+
+	if (flags & ~EPOLL_CLOEXEC)
+		return -EINVAL;
+
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-		     current, size));
+		     current, flags));
 
 	/*
-	 * Sanity check on the size parameter, and create the internal data
-	 * structure ( "struct eventpoll" ).
+	 * Create the internal data structure ( "struct eventpoll" ).
 	 */
-	error = -EINVAL;
-	if (size <= 0 || (error = ep_alloc(&ep)) < 0) {
+	error = ep_alloc(&ep);
+	if (error < 0) {
 		fd = error;
 		goto error_return;
 	}
@@ -1068,17 +1074,26 @@ asmlinkage long sys_epoll_create(int size)
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure and a free file descriptor.
 	 */
-	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
+	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, size, fd));
+		     current, flags, fd));
 
 	return fd;
 }
 
+asmlinkage long sys_epoll_create(int size)
+{
+	if (size < 0)
+		return -EINVAL;
+
+	return sys_epoll_create1(0);
+}
+
 /*
  * The following function implements the controller interface for
  * the eventpoll file that enables the insertion/removal/change of
diff --git a/fs/exec.c b/fs/exec.c
index fd9234379e8d..4e834f16d9da 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -25,10 +25,11 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
-#include <linux/mman.h>
+#include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/smp_lock.h>
+#include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
@@ -37,29 +38,24 @@
 #include <linux/key.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
-#include <linux/swap.h>
 #include <linux/utsname.h>
 #include <linux/pid_namespace.h>
 #include <linux/module.h>
 #include <linux/namei.h>
 #include <linux/proc_fs.h>
-#include <linux/ptrace.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
-#include <linux/rmap.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
 
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
 #ifdef __alpha__
 /* for /sbin/loader handling in search_binary_handler() */
 #include <linux/a.out.h>
@@ -108,11 +104,17 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
  */
 asmlinkage long sys_uselib(const char __user * library)
 {
-	struct file * file;
+	struct file *file;
 	struct nameidata nd;
-	int error;
-
-	error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
+	char *tmp = getname(library);
+	int error = PTR_ERR(tmp);
+
+	if (!IS_ERR(tmp)) {
+		error = path_lookup_open(AT_FDCWD, tmp,
+					 LOOKUP_FOLLOW, &nd,
+					 FMODE_READ|FMODE_EXEC);
+		putname(tmp);
+	}
 	if (error)
 		goto out;
 
@@ -120,7 +122,11 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
 		goto exit;
 
-	error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
+	error = -EACCES;
+	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
+		goto exit;
+
+	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
 
@@ -382,7 +388,7 @@ static int count(char __user * __user * argv, int max)
 			if (!p)
 				break;
 			argv++;
-			if(++i > max)
+			if (i++ >= max)
 				return -E2BIG;
 			cond_resched();
 		}
@@ -541,7 +547,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
-		free_pgd_range(&tlb, new_end, old_end, new_end,
+		free_pgd_range(tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	} else {
 		/*
@@ -550,7 +556,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
-		free_pgd_range(&tlb, old_start, old_end, new_end,
+		free_pgd_range(tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
 	tlb_finish_mmu(tlb, new_end, old_end);
@@ -658,38 +664,43 @@ EXPORT_SYMBOL(setup_arg_pages);
 struct file *open_exec(const char *name)
 {
 	struct nameidata nd;
-	int err;
 	struct file *file;
+	int err;
 
-	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
-	file = ERR_PTR(err);
-
-	if (!err) {
-		struct inode *inode = nd.path.dentry->d_inode;
-		file = ERR_PTR(-EACCES);
-		if (S_ISREG(inode->i_mode)) {
-			int err = vfs_permission(&nd, MAY_EXEC);
-			file = ERR_PTR(err);
-			if (!err) {
-				file = nameidata_to_filp(&nd,
-							O_RDONLY|O_LARGEFILE);
-				if (!IS_ERR(file)) {
-					err = deny_write_access(file);
-					if (err) {
-						fput(file);
-						file = ERR_PTR(err);
-					}
-				}
-out:
-				return file;
-			}
-		}
-		release_open_intent(&nd);
-		path_put(&nd.path);
+	err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd,
+				FMODE_READ|FMODE_EXEC);
+	if (err)
+		goto out;
+
+	err = -EACCES;
+	if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
+		goto out_path_put;
+
+	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
+		goto out_path_put;
+
+	err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+	if (err)
+		goto out_path_put;
+
+	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
+	if (IS_ERR(file))
+		return file;
+
+	err = deny_write_access(file);
+	if (err) {
+		fput(file);
+		goto out;
 	}
-	goto out;
-}
 
+	return file;
+
+ out_path_put:
+	release_open_intent(&nd);
+	path_put(&nd.path);
+ out:
+	return ERR_PTR(err);
+}
 EXPORT_SYMBOL(open_exec);
 
 int kernel_read(struct file *file, unsigned long offset,
@@ -724,12 +735,10 @@ static int exec_mmap(struct mm_struct *mm)
 		 * Make sure that if there is a core dump in progress
 		 * for the old mm, we get out and die instead of going
 		 * through with the exec.  We must hold mmap_sem around
-		 * checking core_waiters and changing tsk->mm.  The
-		 * core-inducing thread will increment core_waiters for
-		 * each thread whose ->mm == old_mm.
+		 * checking core_state and changing tsk->mm.
 		 */
 		down_read(&old_mm->mmap_sem);
-		if (unlikely(old_mm->core_waiters)) {
+		if (unlikely(old_mm->core_state)) {
 			up_read(&old_mm->mmap_sem);
 			return -EINTR;
 		}
@@ -740,11 +749,11 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
-	mm_update_next_owner(old_mm);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
+		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 		return 0;
 	}
@@ -813,8 +822,6 @@ static int de_thread(struct task_struct *tsk)
 			schedule();
 		}
 
-		if (unlikely(task_child_reaper(tsk) == leader))
-			task_active_pid_ns(tsk)->child_reaper = tsk;
 		/*
 		 * The only record we have of the real-time age of a
 		 * process, regardless of execs it's done, is start_time.
@@ -1075,13 +1082,8 @@ EXPORT_SYMBOL(prepare_binprm);
 
 static int unsafe_exec(struct task_struct *p)
 {
-	int unsafe = 0;
-	if (p->ptrace & PT_PTRACED) {
-		if (p->ptrace & PT_PTRACE_CAP)
-			unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	int unsafe = tracehook_unsafe_exec(p);
+
 	if (atomic_read(&p->fs->count) > 1 ||
 	    atomic_read(&p->files->count) > 1 ||
 	    atomic_read(&p->sighand->count) > 1)
@@ -1182,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			return retval;
 
 		/* Remember if the application is TASO.  */
-		bprm->sh_bang = eh->ah.entry < 0x100000000UL;
+		bprm->taso = eh->ah.entry < 0x100000000UL;
 
 		bprm->file = file;
 		bprm->loader = loader;
@@ -1218,6 +1220,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			read_unlock(&binfmt_lock);
 			retval = fn(bprm, regs);
 			if (retval >= 0) {
+				tracehook_report_exec(fmt, bprm, regs);
 				put_binfmt(fmt);
 				allow_write_access(bprm->file);
 				if (bprm->file)
@@ -1239,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		read_unlock(&binfmt_lock);
 		if (retval != -ENOEXEC || bprm->mm == NULL) {
 			break;
-#ifdef CONFIG_KMOD
-		}else{
+#ifdef CONFIG_MODULES
+		} else {
 #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
 			if (printable(bprm->buf[0]) &&
 			    printable(bprm->buf[1]) &&
@@ -1328,6 +1331,7 @@ int do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	current->flags &= ~PF_KTHREAD;
 	retval = search_binary_handler(bprm,regs);
 	if (retval >= 0) {
 		/* execve success */
@@ -1382,17 +1386,14 @@ EXPORT_SYMBOL(set_binfmt);
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(char *corename, const char *pattern, long signr)
+static int format_corename(char *corename, long signr)
 {
-	const char *pat_ptr = pattern;
+	const char *pat_ptr = core_pattern;
+	int ispipe = (*pat_ptr == '|');
 	char *out_ptr = corename;
 	char *const out_end = corename + CORENAME_MAX_SIZE;
 	int rc;
 	int pid_in_pattern = 0;
-	int ispipe = 0;
-
-	if (*pattern == '|')
-		ispipe = 1;
 
 	/* Repeat as long as we have more pattern to process and more output
 	   space */
@@ -1492,8 +1493,7 @@ static int format_corename(char *corename, const char *pattern, long signr)
 	 * If core_pattern does not include a %p (as is the default)
 	 * and core_uses_pid is set, then .%pid will be appended to
 	 * the filename. Do not do this for piped commands. */
-	if (!ispipe && !pid_in_pattern
-            && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
+	if (!ispipe && !pid_in_pattern && core_uses_pid) {
 		rc = snprintf(out_ptr, out_end - out_ptr,
 			      ".%d", task_tgid_vnr(current));
 		if (rc > out_end - out_ptr)
@@ -1505,9 +1505,10 @@ out:
 	return ispipe;
 }
 
-static void zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
+	int nr = 0;
 
 	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_stop_count = 0;
@@ -1515,72 +1516,99 @@ static void zap_process(struct task_struct *start)
 	t = start;
 	do {
 		if (t != current && t->mm) {
-			t->mm->core_waiters++;
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
+			nr++;
 		}
-	} while ((t = next_thread(t)) != start);
+	} while_each_thread(start, t);
+
+	return nr;
 }
 
 static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-				int exit_code)
+				struct core_state *core_state, int exit_code)
 {
 	struct task_struct *g, *p;
 	unsigned long flags;
-	int err = -EAGAIN;
+	int nr = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!signal_group_exit(tsk->signal)) {
+		mm->core_state = core_state;
 		tsk->signal->group_exit_code = exit_code;
-		zap_process(tsk);
-		err = 0;
+		nr = zap_process(tsk);
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
-	if (err)
-		return err;
+	if (unlikely(nr < 0))
+		return nr;
 
-	if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+	if (atomic_read(&mm->mm_users) == nr + 1)
 		goto done;
-
+	/*
+	 * We should find and kill all tasks which use this mm, and we should
+	 * count them correctly into ->nr_threads. We don't take tasklist
+	 * lock, but this is safe wrt:
+	 *
+	 * fork:
+	 *	None of sub-threads can fork after zap_process(leader). All
+	 *	processes which were created before this point should be
+	 *	visible to zap_threads() because copy_process() adds the new
+	 *	process to the tail of init_task.tasks list, and lock/unlock
+	 *	of ->siglock provides a memory barrier.
+	 *
+	 * do_exit:
+	 *	The caller holds mm->mmap_sem. This means that the task which
+	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
+	 *	its ->mm.
+	 *
+	 * de_thread:
+	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
+	 *	we must see either old or new leader, this does not matter.
+	 *	However, it can change p->sighand, so lock_task_sighand(p)
+	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem
+	 *	it can't fail.
+	 *
+	 *	Note also that "g" can be the old leader with ->mm == NULL
+	 *	and already unhashed and thus removed from ->thread_group.
+	 *	This is OK, __unhash_process()->list_del_rcu() does not
+	 *	clear the ->next pointer, we will find the new leader via
+	 *	next_thread().
+	 */
 	rcu_read_lock();
 	for_each_process(g) {
 		if (g == tsk->group_leader)
 			continue;
-
+		if (g->flags & PF_KTHREAD)
+			continue;
 		p = g;
 		do {
 			if (p->mm) {
-				if (p->mm == mm) {
-					/*
-					 * p->sighand can't disappear, but
-					 * may be changed by de_thread()
-					 */
+				if (unlikely(p->mm == mm)) {
 					lock_task_sighand(p, &flags);
-					zap_process(p);
+					nr += zap_process(p);
 					unlock_task_sighand(p, &flags);
 				}
 				break;
 			}
-		} while ((p = next_thread(p)) != g);
+		} while_each_thread(g, p);
 	}
 	rcu_read_unlock();
 done:
-	return mm->core_waiters;
+	atomic_set(&core_state->nr_threads, nr);
+	return nr;
 }
 
-static int coredump_wait(int exit_code)
+static int coredump_wait(int exit_code, struct core_state *core_state)
 {
 	struct task_struct *tsk = current;
 	struct mm_struct *mm = tsk->mm;
-	struct completion startup_done;
 	struct completion *vfork_done;
 	int core_waiters;
 
-	init_completion(&mm->core_done);
-	init_completion(&startup_done);
-	mm->core_startup_done = &startup_done;
-
-	core_waiters = zap_threads(tsk, mm, exit_code);
+	init_completion(&core_state->startup);
+	core_state->dumper.task = tsk;
+	core_state->dumper.next = NULL;
+	core_waiters = zap_threads(tsk, mm, core_state, exit_code);
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(core_waiters < 0))
@@ -1597,12 +1625,32 @@ static int coredump_wait(int exit_code)
 	}
 
 	if (core_waiters)
-		wait_for_completion(&startup_done);
+		wait_for_completion(&core_state->startup);
 fail:
-	BUG_ON(mm->core_waiters);
 	return core_waiters;
 }
 
+static void coredump_finish(struct mm_struct *mm)
+{
+	struct core_thread *curr, *next;
+	struct task_struct *task;
+
+	next = mm->core_state->dumper.next;
+	while ((curr = next) != NULL) {
+		next = curr->next;
+		task = curr->task;
+		/*
+		 * see exit_mm(), curr->task must not see
+		 * ->task == NULL before we read ->next.
+		 */
+		smp_mb();
+		curr->task = NULL;
+		wake_up_process(task);
+	}
+
+	mm->core_state = NULL;
+}
+
 /*
  * set_dumpable converts traditional three-value dumpable to two flags and
  * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
@@ -1654,6 +1702,7 @@ int get_dumpable(struct mm_struct *mm)
 
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 {
+	struct core_state core_state;
 	char corename[CORENAME_MAX_SIZE + 1];
 	struct mm_struct *mm = current->mm;
 	struct linux_binfmt * binfmt;
@@ -1677,7 +1726,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	/*
 	 * If another thread got here first, or we are not dumpable, bail out.
 	 */
-	if (mm->core_waiters || !get_dumpable(mm)) {
+	if (mm->core_state || !get_dumpable(mm)) {
 		up_write(&mm->mmap_sem);
 		goto fail;
 	}
@@ -1692,7 +1741,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		current->fsuid = 0;	/* Dump root private */
 	}
 
-	retval = coredump_wait(exit_code);
+	retval = coredump_wait(exit_code, &core_state);
 	if (retval < 0)
 		goto fail;
 
@@ -1707,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 * uses lock_kernel()
 	 */
  	lock_kernel();
-	ispipe = format_corename(corename, core_pattern, signr);
+	ispipe = format_corename(corename, signr);
 	unlock_kernel();
 	/*
 	 * Don't bother to check the RLIMIT_CORE value if core_pattern points
@@ -1786,7 +1835,7 @@ fail_unlock:
 		argv_free(helper_argv);
 
 	current->fsuid = fsuid;
-	complete_all(&mm->core_done);
+	coredump_finish(mm);
 fail:
 	return retval;
 }
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cc91227d3bb8..80246bad1b7f 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -94,9 +94,8 @@ find_disconnected_root(struct dentry *dentry)
  * It may already be, as the flag isn't always updated when connection happens.
  */
 static int
-reconnect_path(struct vfsmount *mnt, struct dentry *target_dir)
+reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 {
-	char nbuf[NAME_MAX+1];
 	int noprogress = 0;
 	int err = -ESTALE;
 
@@ -281,13 +280,14 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry,
 		int old_seq = buffer.sequence;
 
 		error = vfs_readdir(file, filldir_one, &buffer);
+		if (buffer.found) {
+			error = 0;
+			break;
+		}
 
 		if (error < 0)
 			break;
 
-		error = 0;
-		if (buffer.found)
-			break;
 		error = -ENOENT;
 		if (old_seq == buffer.sequence)
 			break;
@@ -360,14 +360,13 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 {
 	const struct export_operations *nop = mnt->mnt_sb->s_export_op;
 	struct dentry *result, *alias;
+	char nbuf[NAME_MAX+1];
 	int err;
 
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
-	if (!result)
-		result = ERR_PTR(-ESTALE);
 	if (IS_ERR(result))
 		return result;
 
@@ -381,7 +380,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * filesystem root.
 		 */
 		if (result->d_flags & DCACHE_DISCONNECTED) {
-			err = reconnect_path(mnt, result);
+			err = reconnect_path(mnt, result, nbuf);
 			if (err)
 				goto err_result;
 		}
@@ -397,7 +396,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * It's not a directory.  Life is a little more complicated.
 		 */
 		struct dentry *target_dir, *nresult;
-		char nbuf[NAME_MAX+1];
 
 		/*
 		 * See if either the dentry we just got from the filesystem
@@ -422,8 +420,6 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 
 		target_dir = nop->fh_to_parent(mnt->mnt_sb, fid,
 				fh_len, fileid_type);
-		if (!target_dir)
-			goto err_result;
 		err = PTR_ERR(target_dir);
 		if (IS_ERR(target_dir))
 			goto err_result;
@@ -433,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 * connected to the filesystem root.  The VFS really doesn't
 		 * like disconnected directories..
 		 */
-		err = reconnect_path(mnt, target_dir);
+		err = reconnect_path(mnt, target_dir, nbuf);
 		if (err) {
 			dput(target_dir);
 			goto err_result;
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
new file mode 100644
index 000000000000..14a6780fd034
--- /dev/null
+++ b/fs/ext2/Kconfig
@@ -0,0 +1,55 @@
+config EXT2_FS
+	tristate "Second extended fs support"
+	help
+	  Ext2 is a standard Linux file system for hard disks.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext2.
+
+	  If unsure, say Y.
+
+config EXT2_FS_XATTR
+	bool "Ext2 extended attributes"
+	depends on EXT2_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EXT2_FS_POSIX_ACL
+	bool "Ext2 POSIX Access Control Lists"
+	depends on EXT2_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT2_FS_SECURITY
+	bool "Ext2 Security Labels"
+	depends on EXT2_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config EXT2_FS_XIP
+	bool "Ext2 execute in place support"
+	depends on EXT2_FS && MMU
+	help
+	  Execute in place can be used on memory-backed block devices. If you
+	  enable this option, you can select to mount block devices which are
+	  capable of this feature without using the page cache.
+
+	  If you do not use a block device that is capable of using this,
+	  or if unsure, say N.
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index e58669e1b87c..ae8c4f850b27 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -294,7 +294,7 @@ ext2_check_acl(struct inode *inode, int mask)
 }
 
 int
-ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext2_permission(struct inode *inode, int mask)
 {
 	return generic_permission(inode, mask, ext2_check_acl);
 }
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 0bde85bafe38..b42cf578554b 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -58,7 +58,7 @@ static inline int ext2_acl_count(size_t size)
 #define EXT2_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext2_permission (struct inode *, int, struct nameidata *);
+extern int ext2_permission (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 10bb02c3f25c..6dac7ba2d22d 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1295,6 +1295,7 @@ retry_alloc:
 	 * turn off reservation for this allocation
 	 */
 	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
 		&& (rsv_is_empty(&my_rsv->rsv_window)))
 		my_rsv = NULL;
 
@@ -1332,7 +1333,7 @@ retry_alloc:
 		 * free blocks is less than half of the reservation
 		 * window size.
 		 */
-		if (free_blocks <= (windowsz/2))
+		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
 		brelse(bitmap_bh);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index a78c6b4af060..9a0fc400f91c 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 	return err;
 }
 
-static void ext2_check_page(struct page *page)
+static void ext2_check_page(struct page *page, int quiet)
 {
 	struct inode *dir = page->mapping->host;
 	struct super_block *sb = dir->i_sb;
@@ -146,10 +146,10 @@ out:
 	/* Too bad, we had an error */
 
 Ebadsize:
-	ext2_error(sb, "ext2_check_page",
-		"size of directory #%lu is not a multiple of chunk size",
-		dir->i_ino
-	);
+	if (!quiet)
+		ext2_error(sb, __func__,
+			"size of directory #%lu is not a multiple "
+			"of chunk size", dir->i_ino);
 	goto fail;
 Eshort:
 	error = "rec_len is smaller than minimal";
@@ -166,32 +166,36 @@ Espan:
 Einumber:
 	error = "inode out of bounds";
 bad_entry:
-	ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - "
-		"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
-		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
-		(unsigned long) le32_to_cpu(p->inode),
-		rec_len, p->name_len);
+	if (!quiet)
+		ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
+			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			(unsigned long) le32_to_cpu(p->inode),
+			rec_len, p->name_len);
 	goto fail;
 Eend:
-	p = (ext2_dirent *)(kaddr + offs);
-	ext2_error (sb, "ext2_check_page",
-		"entry in directory #%lu spans the page boundary"
-		"offset=%lu, inode=%lu",
-		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
-		(unsigned long) le32_to_cpu(p->inode));
+	if (!quiet) {
+		p = (ext2_dirent *)(kaddr + offs);
+		ext2_error(sb, "ext2_check_page",
+			"entry in directory #%lu spans the page boundary"
+			"offset=%lu, inode=%lu",
+			dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			(unsigned long) le32_to_cpu(p->inode));
+	}
 fail:
 	SetPageChecked(page);
 	SetPageError(page);
 }
 
-static struct page * ext2_get_page(struct inode *dir, unsigned long n)
+static struct page * ext2_get_page(struct inode *dir, unsigned long n,
+				   int quiet)
 {
 	struct address_space *mapping = dir->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (!PageChecked(page))
-			ext2_check_page(page);
+			ext2_check_page(page, quiet);
 		if (PageError(page))
 			goto fail;
 	}
@@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		ext2_dirent *de;
-		struct page *page = ext2_get_page(inode, n);
+		struct page *page = ext2_get_page(inode, n, 0);
 
 		if (IS_ERR(page)) {
 			ext2_error(sb, __func__,
@@ -350,17 +354,18 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
  * (as a parameter - res_dir). Page is returned mapped and unlocked.
  * Entry is guaranteed to be valid.
  */
-struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
-			struct dentry *dentry, struct page ** res_page)
+struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
+			struct qstr *child, struct page ** res_page)
 {
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = child->name;
+	int namelen = child->len;
 	unsigned reclen = EXT2_DIR_REC_LEN(namelen);
 	unsigned long start, n;
 	unsigned long npages = dir_pages(dir);
 	struct page *page = NULL;
 	struct ext2_inode_info *ei = EXT2_I(dir);
 	ext2_dirent * de;
+	int dir_has_error = 0;
 
 	if (npages == 0)
 		goto out;
@@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
 	n = start;
 	do {
 		char *kaddr;
-		page = ext2_get_page(dir, n);
+		page = ext2_get_page(dir, n, dir_has_error);
 		if (!IS_ERR(page)) {
 			kaddr = page_address(page);
 			de = (ext2_dirent *) kaddr;
@@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir,
 				de = ext2_next_entry(de);
 			}
 			ext2_put_page(page);
-		}
+		} else
+			dir_has_error = 1;
+
 		if (++n >= npages)
 			n = 0;
 		/* next page is past the blocks we've got */
@@ -414,7 +421,7 @@ found:
 
 struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
 {
-	struct page *page = ext2_get_page(dir, 0);
+	struct page *page = ext2_get_page(dir, 0, 0);
 	ext2_dirent *de = NULL;
 
 	if (!IS_ERR(page)) {
@@ -424,13 +431,13 @@ struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p)
 	return de;
 }
 
-ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
+ino_t ext2_inode_by_name(struct inode *dir, struct qstr *child)
 {
 	ino_t res = 0;
-	struct ext2_dir_entry_2 * de;
+	struct ext2_dir_entry_2 *de;
 	struct page *page;
 	
-	de = ext2_find_entry (dir, dentry, &page);
+	de = ext2_find_entry (dir, child, &page);
 	if (de) {
 		res = le32_to_cpu(de->inode);
 		ext2_put_page(page);
@@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 	for (n = 0; n <= npages; n++) {
 		char *dir_end;
 
-		page = ext2_get_page(dir, n);
+		page = ext2_get_page(dir, n, 0);
 		err = PTR_ERR(page);
 		if (IS_ERR(page))
 			goto out;
@@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode)
 {
 	struct page *page = NULL;
 	unsigned long i, npages = dir_pages(inode);
+	int dir_has_error = 0;
 
 	for (i = 0; i < npages; i++) {
 		char *kaddr;
 		ext2_dirent * de;
-		page = ext2_get_page(inode, i);
+		page = ext2_get_page(inode, i, dir_has_error);
 
-		if (IS_ERR(page))
+		if (IS_ERR(page)) {
+			dir_has_error = 1;
 			continue;
+		}
 
 		kaddr = page_address(page);
 		de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33b..3203042b36ef 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -105,9 +105,9 @@ extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_wind
 
 /* dir.c */
 extern int ext2_add_link (struct dentry *, struct inode *);
-extern ino_t ext2_inode_by_name(struct inode *, struct dentry *);
+extern ino_t ext2_inode_by_name(struct inode *, struct qstr *);
 extern int ext2_make_empty(struct inode *, struct inode *);
-extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct dentry *, struct page **);
+extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct qstr *, struct page **);
 extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len);
 int __ext2_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c36293..45ed07122182 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.permission	= ext2_permission,
+	.fiemap		= ext2_fiemap,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d1dd74..7658b33e2653 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
+#include <linux/fiemap.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
 
 }
 
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext2_get_block);
+}
+
 static int ext2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, ext2_get_block, wbc);
@@ -791,6 +799,7 @@ const struct address_space_operations ext2_aops = {
 	.direct_IO		= ext2_direct_IO,
 	.writepages		= ext2_writepages,
 	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate	= block_is_partially_uptodate,
 };
 
 const struct address_space_operations ext2_aops_xip = {
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 80c97fd8c571..2a747252ec12 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -60,7 +60,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 	if (dentry->d_name.len > EXT2_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	ino = ext2_inode_by_name(dir, dentry);
+	ino = ext2_inode_by_name(dir, &dentry->d_name);
 	inode = NULL;
 	if (ino) {
 		inode = ext2_iget(dir->i_sb, ino);
@@ -72,27 +72,11 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
 
 struct dentry *ext2_get_parent(struct dentry *child)
 {
-	unsigned long ino;
-	struct dentry *parent;
-	struct inode *inode;
-	struct dentry dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-
-	ino = ext2_inode_by_name(child->d_inode, &dotdot);
+	struct qstr dotdot = {.name = "..", .len = 2};
+	unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
-	inode = ext2_iget(child->d_inode->i_sb, ino);
-
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino));
 } 
 
 /*
@@ -257,7 +241,7 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
 	struct page * page;
 	int err = -ENOENT;
 
-	de = ext2_find_entry (dir, dentry, &page);
+	de = ext2_find_entry (dir, &dentry->d_name, &page);
 	if (!de)
 		goto out;
 
@@ -299,7 +283,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	struct ext2_dir_entry_2 * old_de;
 	int err = -ENOENT;
 
-	old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
+	old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
 	if (!old_de)
 		goto out;
 
@@ -319,7 +303,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			goto out_dir;
 
 		err = -ENOENT;
-		new_de = ext2_find_entry (new_dir, new_dentry, &new_page);
+		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ef50cbc792db..647cd888ac87 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -31,6 +31,7 @@
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/log2.h>
+#include <linux/quotaops.h>
 #include <asm/uaccess.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -158,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
 
@@ -392,7 +393,7 @@ enum {
 	Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index eaa23d2d5213..70c0dbdcdcb7 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -14,7 +14,7 @@ static size_t
 ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 			 const char *name, size_t name_len)
 {
-	const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (list && total_len <= list_size) {
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 83ee149f353d..e8219f8eae9f 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -12,13 +12,11 @@
 #include <linux/ext2_fs.h>
 #include "xattr.h"
 
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t
 ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 			const char *name, size_t name_len)
 {
-	const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+	const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index f383e7c3a7b5..92495d28c62f 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -11,13 +11,11 @@
 #include "ext2.h"
 #include "xattr.h"
 
-#define XATTR_USER_PREFIX "user."
-
 static size_t
 ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 		     const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 4fb94c20041b..b72b85884223 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -11,6 +11,7 @@
 #include <linux/buffer_head.h>
 #include <linux/ext2_fs_sb.h>
 #include <linux/ext2_fs.h>
+#include <linux/blkdev.h>
 #include "ext2.h"
 #include "xip.h"
 
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
new file mode 100644
index 000000000000..8e0cfe44b0fc
--- /dev/null
+++ b/fs/ext3/Kconfig
@@ -0,0 +1,67 @@
+config EXT3_FS
+	tristate "Ext3 journalling file system support"
+	select JBD
+	help
+	  This is the journalling version of the Second extended file system
+	  (often called ext3), the de facto standard Linux file system
+	  (method to organize files on a storage device) for hard disks.
+
+	  The journalling code included in this driver means you do not have
+	  to run e2fsck (file system checker) on your file systems after a
+	  crash.  The journal keeps track of any changes that were being made
+	  at the time the system crashed, and can ensure that your file system
+	  is consistent without the need for a lengthy check.
+
+	  Other than adding the journal to the file system, the on-disk format
+	  of ext3 is identical to ext2.  It is possible to freely switch
+	  between using the ext3 driver and the ext2 driver, as long as the
+	  file system has been cleanly unmounted, or e2fsck is run on the file
+	  system.
+
+	  To add a journal on an existing ext2 file system or change the
+	  behavior of ext3 file systems, you can use the tune2fs utility ("man
+	  tune2fs").  To modify attributes of files and directories on ext3
+	  file systems, use chattr ("man chattr").  You need to be using
+	  e2fsprogs version 1.20 or later in order to create ext3 journals
+	  (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ext3.
+
+config EXT3_FS_XATTR
+	bool "Ext3 extended attributes"
+	depends on EXT3_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+	bool "Ext3 POSIX Access Control Lists"
+	depends on EXT3_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT3_FS_SECURITY
+	bool "Ext3 Security Labels"
+	depends on EXT3_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext3 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index a754d1848173..b60bb241880c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -299,7 +299,7 @@ ext3_check_acl(struct inode *inode, int mask)
 }
 
 int
-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext3_permission(struct inode *inode, int mask)
 {
 	return generic_permission(inode, mask, ext3_check_acl);
 }
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 0d1e6279cbfd..42da16b8cac0 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -58,7 +58,7 @@ static inline int ext3_acl_count(size_t size)
 #define EXT3_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext3_permission (struct inode *, int, struct nameidata *);
+extern int ext3_permission (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 92fd0338a6eb..f5b57a2ca35a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1547,6 +1547,7 @@ retry_alloc:
 	 * turn off reservation for this allocation
 	 */
 	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
 		&& (rsv_is_empty(&my_rsv->rsv_window)))
 		my_rsv = NULL;
 
@@ -1585,7 +1586,7 @@ retry_alloc:
 		 * free blocks is less than half of the reservation
 		 * window size.
 		 */
-		if (free_blocks <= (windowsz/2))
+		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
 		brelse(bitmap_bh);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 8ca3bfd72427..5853f4440af4 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp,
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext3_error (sb, "ext3_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext3_error(sb, __func__, "directory #%lu "
+					"contains a hole at offset %lld",
+					inode->i_ino, filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
@@ -272,7 +276,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 
 	while (n) {
 		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
 			n = n->rb_left;
 			continue;
 		}
@@ -301,24 +305,18 @@ static void free_rb_tree_fname(struct rb_root *root)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	root->rb_node = NULL;
 }
 
 
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
 {
 	struct dir_private_info *p;
 
-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
 	if (!p)
 		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
 	p->curr_hash = pos2maj_hash(pos);
 	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
 	return p;
 }
 
@@ -416,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent,
 				get_dtype(sb, fname->file_type));
 		if (error) {
 			filp->f_pos = curr_pos;
-			info->extra_fname = fname->next;
+			info->extra_fname = fname;
 			return error;
 		}
 		fname = fname->next;
@@ -433,7 +431,7 @@ static int ext3_dx_readdir(struct file * filp,
 	int	ret;
 
 	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext3_htree_create_dir_info(filp->f_pos);
 		if (!info)
 			return -ENOMEM;
 		filp->private_data = info;
@@ -455,11 +453,12 @@ static int ext3_dx_readdir(struct file * filp,
 	 * If there are any leftover names on the hash collision
 	 * chain, return them first.
 	 */
-	if (info->extra_fname &&
-	    call_filldir(filp, dirent, filldir, info->extra_fname))
-		goto finished;
-
-	if (!info->curr_node)
+	if (info->extra_fname) {
+		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+			goto finished;
+		info->extra_fname = NULL;
+		goto next_node;
+	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
 	while (1) {
@@ -490,9 +489,14 @@ static int ext3_dx_readdir(struct file * filp,
 		info->curr_minor_hash = fname->minor_hash;
 		if (call_filldir(filp, dirent, filldir, fname))
 			break;
-
+	next_node:
 		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
 			if (info->next_hash == ~0) {
 				filp->f_pos = EXT3_HTREE_EOF;
 				break;
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d3019..3be1e0689c9a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
+	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 77126821b2e9..47b678d73e7a 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
 	if (IS_ERR(inode))
 		goto iget_failed;
 
+	/*
+	 * If the orphans has i_nlinks > 0 then it should be able to be
+	 * truncated, otherwise it won't be removed from the orphan list
+	 * during processing and an infinite loop will result.
+	 */
+	if (inode->i_nlink && !ext3_can_truncate(inode))
+		goto bad_orphan;
+
 	if (NEXT_ORPHAN(inode) > max_ino)
 		goto bad_orphan;
 	brelse(bitmap_bh);
@@ -690,6 +698,7 @@ bad_orphan:
 		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
 		       NEXT_ORPHAN(inode));
 		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
 		/* Avoid freeing blocks if we got a bad deleted inode */
 		if (inode->i_nlink == 0)
 			inode->i_blocks = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 6ae4ecf3ce40..f8424ad89971 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/fiemap.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -981,6 +982,13 @@ out:
 	return ret;
 }
 
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		u64 start, u64 len)
+{
+	return generic_block_fiemap(inode, fieinfo, start, len,
+				    ext3_get_block);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
@@ -1178,6 +1186,13 @@ write_begin_failed:
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -1767,44 +1782,47 @@ static int ext3_journalled_set_page_dirty(struct page *page)
 }
 
 static const struct address_space_operations ext3_ordered_aops = {
-	.readpage	= ext3_readpage,
-	.readpages	= ext3_readpages,
-	.writepage	= ext3_ordered_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext3_write_begin,
-	.write_end	= ext3_ordered_write_end,
-	.bmap		= ext3_bmap,
-	.invalidatepage	= ext3_invalidatepage,
-	.releasepage	= ext3_releasepage,
-	.direct_IO	= ext3_direct_IO,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_ordered_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_ordered_write_end,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.direct_IO		= ext3_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations ext3_writeback_aops = {
-	.readpage	= ext3_readpage,
-	.readpages	= ext3_readpages,
-	.writepage	= ext3_writeback_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext3_write_begin,
-	.write_end	= ext3_writeback_write_end,
-	.bmap		= ext3_bmap,
-	.invalidatepage	= ext3_invalidatepage,
-	.releasepage	= ext3_releasepage,
-	.direct_IO	= ext3_direct_IO,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_writeback_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_writeback_write_end,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.direct_IO		= ext3_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations ext3_journalled_aops = {
-	.readpage	= ext3_readpage,
-	.readpages	= ext3_readpages,
-	.writepage	= ext3_journalled_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext3_write_begin,
-	.write_end	= ext3_journalled_write_end,
-	.set_page_dirty	= ext3_journalled_set_page_dirty,
-	.bmap		= ext3_bmap,
-	.invalidatepage	= ext3_invalidatepage,
-	.releasepage	= ext3_releasepage,
+	.readpage		= ext3_readpage,
+	.readpages		= ext3_readpages,
+	.writepage		= ext3_journalled_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext3_write_begin,
+	.write_end		= ext3_journalled_write_end,
+	.set_page_dirty		= ext3_journalled_set_page_dirty,
+	.bmap			= ext3_bmap,
+	.invalidatepage		= ext3_invalidatepage,
+	.releasepage		= ext3_releasepage,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 void ext3_set_aops(struct inode *inode)
@@ -2127,7 +2145,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 
 	if (this_bh) {
 		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
-		ext3_journal_dirty_metadata(handle, this_bh);
+
+		/*
+		 * The buffer head should have an attached journal head at this
+		 * point. However, if the data is corrupted and an indirect
+		 * block pointed to itself, it would have been detached when
+		 * the block was cleared. Check for this instead of OOPSing.
+		 */
+		if (bh2jh(this_bh))
+			ext3_journal_dirty_metadata(handle, this_bh);
+		else
+			ext3_error(inode->i_sb, "ext3_free_data",
+				   "circular indirect block detected, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)this_bh->b_blocknr);
 	}
 }
 
@@ -2253,6 +2285,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 	}
 }
 
+int ext3_can_truncate(struct inode *inode)
+{
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return 0;
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext3_inode_is_fast_symlink(inode);
+	return 0;
+}
+
 /*
  * ext3_truncate()
  *
@@ -2297,12 +2342,7 @@ void ext3_truncate(struct inode *inode)
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	struct page *page;
 
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	    S_ISLNK(inode->i_mode)))
-		return;
-	if (ext3_inode_is_fast_symlink(inode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+	if (!ext3_can_truncate(inode))
 		return;
 
 	/*
@@ -2513,6 +2553,16 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	}
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
+
+		/*
+		 * If the buffer has the write error flag, we have failed
+		 * to write out another inode in the same block.  In this
+		 * case, we don't have to read the block because we may
+		 * read the old inode data successfully.
+		 */
+		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+			set_buffer_uptodate(bh);
+
 		if (buffer_uptodate(bh)) {
 			/* someone brought it uptodate while we waited */
 			unlock_buffer(bh);
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 0d0c70151642..b7394d05ee8e 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -239,7 +239,7 @@ setrsvsz_out:
 	case EXT3_IOC_GROUP_EXTEND: {
 		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -254,8 +254,10 @@ setrsvsz_out:
 		}
 		err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		journal_flush(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 group_extend_out:
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
@@ -263,7 +265,7 @@ group_extend_out:
 	case EXT3_IOC_GROUP_ADD: {
 		struct ext3_new_group_data input;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -280,8 +282,10 @@ group_extend_out:
 
 		err = ext3_group_add(sb, &input);
 		journal_lock_updates(EXT3_SB(sb)->s_journal);
-		journal_flush(EXT3_SB(sb)->s_journal);
+		err2 = journal_flush(EXT3_SB(sb)->s_journal);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 group_add_out:
 		mnt_drop_write(filp->f_path.mnt);
 		return err;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0b8cf80154f1..3e5edc92aa0b 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -159,7 +159,7 @@ static void dx_set_count (struct dx_entry *entries, unsigned value);
 static void dx_set_limit (struct dx_entry *entries, unsigned value);
 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
 static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static struct dx_frame *dx_probe(struct qstr *entry,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
@@ -176,8 +176,9 @@ static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-		       struct ext3_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err);
 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -240,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
 		EXT3_DIR_REC_LEN(2) - infosize;
-	return 0? 20: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 static inline unsigned dx_node_limit (struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
-	return 0? 22: entry_space / sizeof(struct dx_entry);
+	return entry_space / sizeof(struct dx_entry);
 }
 
 /*
@@ -342,7 +343,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(struct qstr *entry, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
 {
 	unsigned count, indirect;
@@ -353,8 +354,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (dentry)
-		dir = dentry->d_parent->d_inode;
 	if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
 		goto fail;
 	root = (struct dx_root *) bh->b_data;
@@ -370,8 +369,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	}
 	hinfo->hash_version = root->info.hash_version;
 	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
-	if (dentry)
-		ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	if (entry)
+		ext3fs_dirhash(entry->name, entry->len, hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -803,15 +802,15 @@ static inline int ext3_match (int len, const char * const name,
  */
 static inline int search_dirblock(struct buffer_head * bh,
 				  struct inode *dir,
-				  struct dentry *dentry,
+				  struct qstr *child,
 				  unsigned long offset,
 				  struct ext3_dir_entry_2 ** res_dir)
 {
 	struct ext3_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = child->name;
+	int namelen = child->len;
 
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -850,8 +849,9 @@ static inline int search_dirblock(struct buffer_head * bh,
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext3_find_entry (struct dentry *dentry,
-					struct ext3_dir_entry_2 ** res_dir)
+static struct buffer_head *ext3_find_entry(struct inode *dir,
+					struct qstr *entry,
+					struct ext3_dir_entry_2 **res_dir)
 {
 	struct super_block * sb;
 	struct buffer_head * bh_use[NAMEI_RA_SIZE];
@@ -863,16 +863,15 @@ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
 				   buffer */
 	int num = 0;
 	int nblocks, i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = dentry->d_name.len;
+	namelen = entry->len;
 	if (namelen > EXT3_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-		bh = ext3_dx_find_entry(dentry, res_dir, &err);
+		bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -923,7 +922,7 @@ restart:
 			brelse(bh);
 			goto next;
 		}
-		i = search_dirblock(bh, dir, dentry,
+		i = search_dirblock(bh, dir, entry,
 			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT3_I(dir)->i_dir_start_lookup = block;
@@ -957,8 +956,9 @@ cleanup_and_exit:
 	return ret;
 }
 
-static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
-		       struct ext3_dir_entry_2 **res_dir, int *err)
+static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
+			struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
+			int *err)
 {
 	struct super_block * sb;
 	struct dx_hash_info	hinfo;
@@ -968,14 +968,13 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 	struct buffer_head *bh;
 	unsigned long block;
 	int retval;
-	int namelen = dentry->d_name.len;
-	const u8 *name = dentry->d_name.name;
-	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen = entry->len;
+	const u8 *name = entry->name;
 
 	sb = dir->i_sb;
 	/* NFS may look up ".." - look at dx_root directory block */
-	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+	if (namelen > 2 || name[0] != '.'|| (namelen == 2 && name[1] != '.')) {
+		if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
 			return NULL;
 	} else {
 		frame = frames;
@@ -991,19 +990,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 		de = (struct ext3_dir_entry_2 *) bh->b_data;
 		top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
 				       EXT3_DIR_REC_LEN(0));
-		for (; de < top; de = ext3_next_entry(de))
-		if (ext3_match (namelen, name, de)) {
-			if (!ext3_check_dir_entry("ext3_find_entry",
-						  dir, de, bh,
-				  (block<<EXT3_BLOCK_SIZE_BITS(sb))
-					  +((char *)de - bh->b_data))) {
-				brelse (bh);
+		for (; de < top; de = ext3_next_entry(de)) {
+			int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
+				  + ((char *) de - bh->b_data);
+
+			if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
+				brelse(bh);
 				*err = ERR_BAD_DX_DIR;
 				goto errout;
 			}
-			*res_dir = de;
-			dx_release (frames);
-			return bh;
+
+			if (ext3_match(namelen, name, de)) {
+				*res_dir = de;
+				dx_release(frames);
+				return bh;
+			}
 		}
 		brelse (bh);
 		/* Check to see if we should continue to search */
@@ -1034,7 +1035,7 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 	if (dentry->d_name.len > EXT3_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext3_find_entry(dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
@@ -1055,18 +1056,11 @@ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, str
 struct dentry *ext3_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry *parent;
-	struct inode *inode;
-	struct dentry dotdot;
+	struct qstr dotdot = {.name = "..", .len = 2};
 	struct ext3_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_parent = child; /* confusing, isn't it! */
-
-	bh = ext3_find_entry(&dotdot, &de);
-	inode = NULL;
+	bh = ext3_find_entry(child->d_inode, &dotdot, &de);
 	if (!bh)
 		return ERR_PTR(-ENOENT);
 	ino = le32_to_cpu(de->inode);
@@ -1078,16 +1072,7 @@ struct dentry *ext3_get_parent(struct dentry *child)
 		return ERR_PTR(-EIO);
 	}
 
-	inode = ext3_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
 }
 
 #define S_SHIFT 12
@@ -1501,7 +1486,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct ext3_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -2054,7 +2039,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext3_find_entry (dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2116,7 +2101,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext3_find_entry (dentry, &de);
+	bh = ext3_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2274,7 +2259,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext3_find_entry (old_dentry, &old_de);
+	old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2287,7 +2272,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext3_find_entry (new_dentry, &new_de);
+	new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
 			brelse (new_bh);
@@ -2353,7 +2338,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext3_dir_entry_2 *old_de2;
 
-		old_bh2 = ext3_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
+					  &old_de2);
 		if (old_bh2) {
 			retval = ext3_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 77278e947e94..78fdf3836370 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT3_HAS_COMPAT_FEATURE(sb,
-					     EXT3_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT3_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext3_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2845425077e8..18eaa78ecb4e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ fail:
 static int ext3_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -393,7 +393,8 @@ static void ext3_put_super (struct super_block * sb)
 	int i;
 
 	ext3_xattr_put_super(sb);
-	journal_destroy(sbi->s_journal);
+	if (journal_destroy(sbi->s_journal) < 0)
+		ext3_abort(sb, __func__, "Couldn't clean up the journal");
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -472,7 +473,7 @@ static void ext3_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
 
@@ -625,6 +626,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -754,13 +758,14 @@ enum {
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
@@ -796,6 +801,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -842,7 +849,7 @@ static int parse_options (char *options, struct super_block *sb,
 	int data_opt = 0;
 	int option;
 #ifdef CONFIG_QUOTA
-	int qtype;
+	int qtype, qfmt;
 	char *qname;
 #endif
 
@@ -1011,6 +1018,12 @@ static int parse_options (char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1018,9 +1031,11 @@ static int parse_options (char *options, struct super_block *sb,
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 set_qf_name:
-			if (sb_any_quota_enabled(sb)) {
+			if ((sb_any_quota_enabled(sb) ||
+			     sb_any_quota_suspended(sb)) &&
+			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
-					"EXT3-fs: Cannot change journalled "
+					"EXT3-fs: Cannot change journaled "
 					"quota options when quota turned on.\n");
 				return 0;
 			}
@@ -1056,9 +1071,11 @@ set_qf_name:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 clear_qf_name:
-			if (sb_any_quota_enabled(sb)) {
+			if ((sb_any_quota_enabled(sb) ||
+			     sb_any_quota_suspended(sb)) &&
+			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
-					"journalled quota options when "
+					"journaled quota options when "
 					"quota turned on.\n");
 				return 0;
 			}
@@ -1069,10 +1086,20 @@ clear_qf_name:
 			sbi->s_qf_names[qtype] = NULL;
 			break;
 		case Opt_jqfmt_vfsold:
-			sbi->s_jquota_fmt = QFMT_VFS_OLD;
-			break;
+			qfmt = QFMT_VFS_OLD;
+			goto set_qf_format;
 		case Opt_jqfmt_vfsv0:
-			sbi->s_jquota_fmt = QFMT_VFS_V0;
+			qfmt = QFMT_VFS_V0;
+set_qf_format:
+			if ((sb_any_quota_enabled(sb) ||
+			     sb_any_quota_suspended(sb)) &&
+			    sbi->s_jquota_fmt != qfmt) {
+				printk(KERN_ERR "EXT3-fs: Cannot change "
+					"journaled quota options when "
+					"quota turned on.\n");
+				return 0;
+			}
+			sbi->s_jquota_fmt = qfmt;
 			break;
 		case Opt_quota:
 		case Opt_usrquota:
@@ -1084,7 +1111,8 @@ clear_qf_name:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb)) {
+			if (sb_any_quota_enabled(sb) ||
+			    sb_any_quota_suspended(sb)) {
 				printk(KERN_ERR "EXT3-fs: Cannot change quota "
 					"options when quota turned on.\n");
 				return 0;
@@ -1169,14 +1197,14 @@ clear_qf_name:
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT3-fs: journalled quota format "
+			printk(KERN_ERR "EXT3-fs: journaled quota format "
 					"not specified.\n");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT3-fs: journalled quota format "
-					"specified with no journalling "
+			printk(KERN_ERR "EXT3-fs: journaled quota format "
+					"specified with no journaling "
 					"enabled.\n");
 			return 0;
 		}
@@ -1370,7 +1398,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 			int ret = ext3_quota_on_mount(sb, i);
 			if (ret < 0)
 				printk(KERN_ERR
-					"EXT3-fs: Cannot turn on journalled "
+					"EXT3-fs: Cannot turn on journaled "
 					"quota: error %d\n", ret);
 		}
 	}
@@ -1971,6 +1999,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JFS_BARRIER;
 	else
 		journal->j_flags &= ~JFS_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
@@ -2035,7 +2067,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 		        "EXT3: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
@@ -2265,7 +2297,9 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	journal_t *journal = EXT3_SB(sb)->s_journal;
 
 	journal_lock_updates(journal);
-	journal_flush(journal);
+	if (journal_flush(journal) < 0)
+		goto out;
+
 	lock_super(sb);
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
@@ -2274,6 +2308,8 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
+
+out:
 	journal_unlock_updates(journal);
 }
 
@@ -2373,7 +2409,13 @@ static void ext3_write_super_lockfs(struct super_block *sb)
 
 		/* Now we set up the journal barrier. */
 		journal_lock_updates(journal);
-		journal_flush(journal);
+
+		/*
+		 * We don't want to clear needs_recovery flag when we failed
+		 * to flush the journal.
+		 */
+		if (journal_flush(journal) < 0)
+			return;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
@@ -2712,7 +2754,7 @@ static int ext3_release_dquot(struct dquot *dquot)
 
 static int ext3_mark_dquot_dirty(struct dquot *dquot)
 {
-	/* Are we journalling quotas? */
+	/* Are we journaling quotas? */
 	if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
 	    EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
 		dquot_mark_dquot_dirty(dquot);
@@ -2752,32 +2794,56 @@ static int ext3_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount)
+			 char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* Not journalling quota or remount? */
-	if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
-	    !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount)
-		return vfs_quota_on(sb, type, format_id, path, remount);
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	/* When remounting, no checks are needed and in fact, name is NULL */
+	if (remount)
+		return vfs_quota_on(sb, type, format_id, name, remount);
+
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
+
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
-		path_put(&nd.path);
+	if (path.mnt->mnt_sb != sb) {
+		path_put(&path);
 		return -EXDEV;
 	}
-	/* Quotafile not in fs root? */
-	if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
-		printk(KERN_WARNING
-			"EXT3-fs: Quota file not on filesystem root. "
-			"Journalled quota will not work.\n");
-	path_put(&nd.path);
-	return vfs_quota_on(sb, type, format_id, path, remount);
+	/* Journaling quota? */
+	if (EXT3_SB(sb)->s_qf_names[type]) {
+		/* Quotafile not of fs root? */
+		if (path.dentry->d_parent != sb->s_root)
+			printk(KERN_WARNING
+				"EXT3-fs: Quota file not on filesystem root. "
+				"Journaled quota will not work.\n");
+	}
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to see
+	 * all updates to the file when we bypass pagecache...
+	 */
+	if (ext3_should_journal_data(path.dentry->d_inode)) {
+		/*
+		 * We don't need to lock updates but journal_flush() could
+		 * otherwise be livelocked...
+		 */
+		journal_lock_updates(EXT3_SB(sb)->s_journal);
+		err = journal_flush(EXT3_SB(sb)->s_journal);
+		journal_unlock_updates(EXT3_SB(sb)->s_journal);
+		if (err) {
+			path_put(&path);
+			return err;
+		}
+	}
+
+	err = vfs_quota_on_path(sb, type, format_id, &path);
+	path_put(&path);
+	return err;
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 821efaf2b94e..37b81097bdf2 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -15,7 +15,7 @@ static size_t
 ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 			 const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 0327497a55ce..c7c41a410c4b 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -13,13 +13,11 @@
 #include <linux/ext3_fs.h>
 #include "xattr.h"
 
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t
 ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 			const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 1abd8f92c440..430fe63b31b3 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -12,13 +12,11 @@
 #include <linux/ext3_fs.h>
 #include "xattr.h"
 
-#define XATTR_USER_PREFIX "user."
-
 static size_t
 ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 		     const char *name, size_t name_len)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 000000000000..7505482a08fa
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,79 @@
+config EXT4_FS
+	tristate "The Extended 4 (ext4) filesystem"
+	select JBD2
+	select CRC16
+	help
+	  This is the next generation of the ext3 filesystem.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4 is not forwards compatible with
+	  ext3; it is based on extent maps and it supports 48-bit
+	  physical block numbers.  The ext4 filesystem also supports delayed
+	  allocation, persistent preallocation, high resolution time stamps,
+	  and a number of other features to improve performance and speed
+	  up fsck time.  For more information, please see the web pages at
+	  http://ext4.wiki.kernel.org.
+
+	  The ext4 filesystem will support mounting an ext3
+	  filesystem; while there will be some performance gains from
+	  the delayed allocation and inode table readahead, the best
+	  performance gains will require enabling ext4 features in the
+	  filesystem, or formating a new filesystem as an ext4
+	  filesystem initially.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4.
+
+	  If unsure, say N.
+
+config EXT4DEV_COMPAT
+	bool "Enable ext4dev compatibility"
+	depends on EXT4_FS
+	help
+	  Starting with 2.6.28, the name of the ext4 filesystem was
+	  renamed from ext4dev to ext4.  Unfortunately there are some
+	  legacy userspace programs (such as klibc's fstype) have
+	  "ext4dev" hardcoded.
+
+	  To enable backwards compatibility so that systems that are
+	  still expecting to mount ext4 filesystems using ext4dev,
+	  chose Y here.   This feature will go away by 2.6.31, so
+	  please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+	bool "Ext4 extended attributes"
+	depends on EXT4_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+	bool "Ext4 POSIX Access Control Lists"
+	depends on EXT4_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+	bool "Ext4 Security Labels"
+	depends on EXT4_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2f..a8ff003a00f7 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
 # Makefile for the linux ext4-filesystem routines.
 #
 
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
 
-ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		   ext4_jbd2.o migrate.o mballoc.o
 
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY)	+= xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3c8dab880d91..694ed6fadcc8 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
 	acl = posix_acl_alloc(count, GFP_NOFS);
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
-	for (n=0; n < count; n++) {
+	for (n = 0; n < count; n++) {
 		ext4_acl_entry *entry =
 			(ext4_acl_entry *)value;
 		if ((char *)value + sizeof(ext4_acl_entry_short) > end)
 			goto fail;
 		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
 		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch(acl->a_entries[n].e_tag) {
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				value = (char *)value +
-					sizeof(ext4_acl_entry_short);
-				acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
-				break;
-
-			case ACL_USER:
-			case ACL_GROUP:
-				value = (char *)value + sizeof(ext4_acl_entry);
-				if ((char *)value > end)
-					goto fail;
-				acl->a_entries[n].e_id =
-					le32_to_cpu(entry->e_id);
-				break;
-
-			default:
+
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			value = (char *)value +
+				sizeof(ext4_acl_entry_short);
+			acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+			break;
+
+		case ACL_USER:
+		case ACL_GROUP:
+			value = (char *)value + sizeof(ext4_acl_entry);
+			if ((char *)value > end)
 				goto fail;
+			acl->a_entries[n].e_id =
+				le32_to_cpu(entry->e_id);
+			break;
+
+		default:
+			goto fail;
 		}
 	}
 	if (value != end)
@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
 		return ERR_PTR(-ENOMEM);
 	ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
 	e = (char *)ext_acl + sizeof(ext4_acl_header);
-	for (n=0; n < acl->a_count; n++) {
+	for (n = 0; n < acl->a_count; n++) {
 		ext4_acl_entry *entry = (ext4_acl_entry *)e;
 		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
 		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
-		switch(acl->a_entries[n].e_tag) {
-			case ACL_USER:
-			case ACL_GROUP:
-				entry->e_id =
-					cpu_to_le32(acl->a_entries[n].e_id);
-				e += sizeof(ext4_acl_entry);
-				break;
-
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				e += sizeof(ext4_acl_entry_short);
-				break;
-
-			default:
-				goto fail;
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+			e += sizeof(ext4_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			e += sizeof(ext4_acl_entry_short);
+			break;
+
+		default:
+			goto fail;
 		}
 	}
 	return (char *)ext_acl;
@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return NULL;
 
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			acl = ext4_iget_acl(inode, &ei->i_acl);
-			if (acl != EXT4_ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			acl = ext4_iget_acl(inode, &ei->i_default_acl);
-			if (acl != EXT4_ACL_NOT_CACHED)
-				return acl;
-			name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
-			break;
-
-		default:
-			return ERR_PTR(-EINVAL);
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		acl = ext4_iget_acl(inode, &ei->i_acl);
+		if (acl != EXT4_ACL_NOT_CACHED)
+			return acl;
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+
+	case ACL_TYPE_DEFAULT:
+		acl = ext4_iget_acl(inode, &ei->i_default_acl);
+		if (acl != EXT4_ACL_NOT_CACHED)
+			return acl;
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+
+	default:
+		return ERR_PTR(-EINVAL);
 	}
 	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
 	if (retval > 0) {
@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
 	kfree(value);
 
 	if (!IS_ERR(acl)) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext4_iset_acl(inode, &ei->i_acl, acl);
-				break;
-
-			case ACL_TYPE_DEFAULT:
-				ext4_iset_acl(inode, &ei->i_default_acl, acl);
-				break;
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			ext4_iset_acl(inode, &ei->i_acl, acl);
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			ext4_iset_acl(inode, &ei->i_default_acl, acl);
+			break;
 		}
 	}
 	return acl;
@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	switch(type) {
-		case ACL_TYPE_ACCESS:
-			name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
-			if (acl) {
-				mode_t mode = inode->i_mode;
-				error = posix_acl_equiv_mode(acl, &mode);
-				if (error < 0)
-					return error;
-				else {
-					inode->i_mode = mode;
-					ext4_mark_inode_dirty(handle, inode);
-					if (error == 0)
-						acl = NULL;
-				}
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0)
+				return error;
+			else {
+				inode->i_mode = mode;
+				ext4_mark_inode_dirty(handle, inode);
+				if (error == 0)
+					acl = NULL;
 			}
-			break;
+		}
+		break;
 
-		case ACL_TYPE_DEFAULT:
-			name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
-			if (!S_ISDIR(inode->i_mode))
-				return acl ? -EACCES : 0;
-			break;
+	case ACL_TYPE_DEFAULT:
+		name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
 
-		default:
-			return -EINVAL;
+	default:
+		return -EINVAL;
 	}
 	if (acl) {
 		value = ext4_acl_to_disk(acl, &size);
@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 
 	kfree(value);
 	if (!error) {
-		switch(type) {
-			case ACL_TYPE_ACCESS:
-				ext4_iset_acl(inode, &ei->i_acl, acl);
-				break;
-
-			case ACL_TYPE_DEFAULT:
-				ext4_iset_acl(inode, &ei->i_default_acl, acl);
-				break;
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			ext4_iset_acl(inode, &ei->i_acl, acl);
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			ext4_iset_acl(inode, &ei->i_default_acl, acl);
+			break;
 		}
 	}
 	return error;
@@ -299,7 +299,7 @@ ext4_check_acl(struct inode *inode, int mask)
 }
 
 int
-ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext4_permission(struct inode *inode, int mask)
 {
 	return generic_permission(inode, mask, ext4_check_acl);
 }
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 26a5c1abf147..cb45257a246e 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
 	}
 }
 
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 
 /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
    if the ACL has not been cached */
 #define EXT4_ACL_NOT_CACHED ((void *)-1)
 
 /* acl.c */
-extern int ext4_permission (struct inode *, int, struct nameidata *);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
-#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_permission NULL
 
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
 	return 0;
 }
-#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif  /* CONFIG_EXT4_FS_POSIX_ACL */
 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 495ab21b9832..b9821be709bd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
 	}
 	return used_blocks;
 }
+
 /* Initializes an uninitialized block bitmap if given, and returns the
  * number of blocks free in the group. */
 unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  * @bh:			pointer to the buffer head to store the block
  *			group descriptor
  */
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
-					     struct buffer_head ** bh)
+					     struct buffer_head **bh)
 {
 	unsigned long group_desc;
 	unsigned long offset;
-	struct ext4_group_desc * desc;
+	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (block_group >= sbi->s_groups_count) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "block_group >= groups_count - "
-			    "block_group = %lu, groups_count = %lu",
-			    block_group, sbi->s_groups_count);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "block_group >= groups_count - "
+			   "block_group = %lu, groups_count = %lu",
+			   block_group, sbi->s_groups_count);
 
 		return NULL;
 	}
@@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 	if (!sbi->s_group_desc[group_desc]) {
-		ext4_error (sb, "ext4_get_group_desc",
-			    "Group descriptor not loaded - "
-			    "block_group = %lu, group_desc = %lu, desc = %lu",
-			     block_group, group_desc, offset);
+		ext4_error(sb, "ext4_get_group_desc",
+			   "Group descriptor not loaded - "
+			   "block_group = %lu, group_desc = %lu, desc = %lu",
+			   block_group, group_desc, offset);
 		return NULL;
 	}
 
@@ -302,8 +303,8 @@ err_out:
 struct buffer_head *
 ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
-	struct ext4_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -314,25 +315,30 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (unlikely(!bh)) {
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %llu",
-			    (int)block_group, (unsigned long long)bitmap_blk);
+			    "block_group = %lu, block_bitmap = %llu",
+			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (bh_uptodate_or_lock(bh))
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 		return bh;
 
+	lock_buffer(bh);
+	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
+		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 		return bh;
 	}
+	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %llu",
-			    (int)block_group, (unsigned long long)bitmap_blk);
+			    "block_group = %lu, block_bitmap = %llu",
+			    block_group, bitmap_blk);
 		return NULL;
 	}
 	ext4_valid_block_bitmap(sb, desc, block_group, bh);
@@ -342,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	 */
 	return bh;
 }
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root:		root of per-filesystem reservation rb tree
- * @verbose:		verbose mode
- * @fn:			function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end).	Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
-			      const char *fn)
-{
-	struct rb_node *n;
-	struct ext4_reserve_window_node *rsv, *prev;
-	int bad;
-
-restart:
-	n = rb_first(root);
-	bad = 0;
-	prev = NULL;
-
-	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
-	while (n) {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-		if (verbose)
-			printk("reservation window 0x%p "
-			       "start:  %llu, end:  %llu\n",
-			       rsv, rsv->rsv_start, rsv->rsv_end);
-		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
-			printk("Bad reservation %p (start >= end)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (prev && prev->rsv_end >= rsv->rsv_start) {
-			printk("Bad reservation %p (prev->end >= start)\n",
-			       rsv);
-			bad = 1;
-		}
-		if (bad) {
-			if (!verbose) {
-				printk("Restarting reservation walk in verbose mode\n");
-				verbose = 1;
-				goto restart;
-			}
-		}
-		n = rb_next(n);
-		prev = rsv;
-	}
-	printk("Window map complete.\n");
-	BUG_ON(bad);
-}
-#define rsv_window_dump(root, verbose) \
-	__rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv:		inode's reservation window
- * @grp_goal:		given goal block relative to the allocation block group
- * @group:		the current allocation block group
- * @sb:			filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
-			ext4_group_t group, struct super_block *sb)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if ((rsv->_rsv_start > group_last_block) ||
-	    (rsv->_rsv_end < group_first_block))
-		return 0;
-	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
-		|| (grp_goal + group_first_block > rsv->_rsv_end)))
-		return 0;
-	return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root:		root of reservation tree
- * @goal:		target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
-	struct rb_node *n = root->rb_node;
-	struct ext4_reserve_window_node *rsv;
-
-	if (!n)
-		return NULL;
-
-	do {
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
-		if (goal < rsv->rsv_start)
-			n = n->rb_left;
-		else if (goal > rsv->rsv_end)
-			n = n->rb_right;
-		else
-			return rsv;
-	} while (n);
-	/*
-	 * We've fallen off the end of the tree: the goal wasn't inside
-	 * any particular node.  OK, the previous node must be to one
-	 * side of the interval containing the goal.  If it's the RHS,
-	 * we need to back up one.
-	 */
-	if (rsv->rsv_start > goal) {
-		n = rb_prev(&rsv->rsv_node);
-		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-	}
-	return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb:			super block
- * @rsv:		reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
-		    struct ext4_reserve_window_node *rsv)
-{
-	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
-	struct rb_node *node = &rsv->rsv_node;
-	ext4_fsblk_t start = rsv->rsv_start;
-
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
-	struct ext4_reserve_window_node *this;
-
-	while (*p)
-	{
-		parent = *p;
-		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
-		if (start < this->rsv_start)
-			p = &(*p)->rb_left;
-		else if (start > this->rsv_end)
-			p = &(*p)->rb_right;
-		else {
-			rsv_window_dump(root, 1);
-			BUG();
-		}
-	}
-
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb:			super block
- * @rsv:		reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
-			      struct ext4_reserve_window_node *rsv)
-{
-	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	rsv->rsv_alloc_hit = 0;
-	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv:		given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
-	/* a valid reservation end block could not be 0 */
-	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode:		file inode structure
- *
- * Allocate and initialize the	reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct super_block *sb = inode->i_sb;
-
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
-	if (block_i) {
-		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
-		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
-		/*
-		 * if filesystem is mounted with NORESERVATION, the goal
-		 * reservation window size is set to zero to indicate
-		 * block reservation is off
-		 */
-		if (!test_opt(sb, RESERVATION))
-			rsv->rsv_goal_size = 0;
-		else
-			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
-		rsv->rsv_alloc_hit = 0;
-		block_i->last_alloc_logical_block = 0;
-		block_i->last_alloc_physical_block = 0;
-	}
-	ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode:		inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- *	ext4_release_file(): last writer close the file
- *	ext4_clear_inode(): last iput(), when nobody link to this file.
- *	ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
-	struct ext4_reserve_window_node *rsv;
-	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
-	ext4_mb_discard_inode_preallocations(inode);
-
-	if (!block_i)
-		return;
-
-	rsv = &block_i->rsv_window_node;
-	if (!rsv_is_empty(&rsv->rsv_window)) {
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&rsv->rsv_window))
-			rsv_window_remove(inode->i_sb, rsv);
-		spin_unlock(rsv_lock);
-	}
-}
 
 /**
  * ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -645,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
  * @block:			start physcial block to free
  * @count:			number of blocks to free
  * @pdquot_freed_blocks:	pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant.  There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code.  The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
  */
 void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext4_fsblk_t block, unsigned long count,
@@ -656,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	ext4_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
-	struct ext4_group_desc * desc;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *desc;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
 	ext4_grpblk_t group_freed;
@@ -668,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > ext4_blocks_count(es)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks not in datazone - "
-			    "block = %llu, count = %lu", block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks not in datazone - "
+			   "block = %llu, count = %lu", block, count);
 		goto error_return;
 	}
 
-	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
 
 do_more:
 	overflow = 0;
@@ -691,7 +409,7 @@ do_more:
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
-	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
 	if (!desc)
 		goto error_return;
 
@@ -700,10 +418,10 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error (sb, "ext4_free_blocks",
-			    "Freeing blocks in system zones - "
-			    "Block = %llu, count = %lu",
-			    block, count);
+		ext4_error(sb, "ext4_free_blocks",
+			   "Freeing blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
 		goto error_return;
 	}
 
@@ -845,759 +563,71 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count,
 			int metadata)
 {
-	struct super_block * sb;
+	struct super_block *sb;
 	unsigned long dquot_freed_blocks;
 
 	/* this isn't the right place to decide whether block is metadata
 	 * inode.c/extents.c knows better, but for safety ... */
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
-			ext4_should_journal_data(inode))
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		metadata = 1;
+
+	/* We need to make sure we don't reuse
+	 * block released untill the transaction commit.
+	 * writeback mode have weak data consistency so
+	 * don't force data as metadata when freeing block
+	 * for writeback mode.
+	 */
+	if (metadata == 0 && !ext4_should_writeback_data(inode))
 		metadata = 1;
 
 	sb = inode->i_sb;
 
-	if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
-		ext4_free_blocks_sb(handle, sb, block, count,
-						&dquot_freed_blocks);
-	else
-		ext4_mb_free_blocks(handle, inode, block, count,
-						metadata, &dquot_freed_blocks);
+	ext4_mb_free_blocks(handle, inode, block, count,
+			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
 	return;
 }
 
-/**
- * ext4_test_allocatable()
- * @nr:			given allocation block group
- * @bh:			bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy.  This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+						s64 nblocks)
 {
-	int ret;
-	struct journal_head *jh = bh2jh(bh);
-
-	if (ext4_test_bit(nr, bh->b_data))
-		return 0;
-
-	jbd_lock_bh_state(bh);
-	if (!jh->b_committed_data)
-		ret = 1;
-	else
-		ret = !ext4_test_bit(nr, jh->b_committed_data);
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-/**
- * bitmap_search_next_usable_block()
- * @start:		the starting block (group relative) of the search
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-					ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t next;
-	struct journal_head *jh = bh2jh(bh);
-
-	while (start < maxblocks) {
-		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
-		if (next >= maxblocks)
-			return -1;
-		if (ext4_test_allocatable(next, bh))
-			return next;
-		jbd_lock_bh_state(bh);
-		if (jh->b_committed_data)
-			start = ext4_find_next_zero_bit(jh->b_committed_data,
-							maxblocks, next);
-		jbd_unlock_bh_state(bh);
-	}
-	return -1;
-}
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
-/**
- * find_next_usable_block()
- * @start:		the starting block (group relative) to find next
- *			allocatable block in bitmap.
- * @bh:			bufferhead contains the block group bitmap
- * @maxblocks:		the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap.  We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
-			ext4_grpblk_t maxblocks)
-{
-	ext4_grpblk_t here, next;
-	char *p, *r;
-
-	if (start > 0) {
-		/*
-		 * The goal was occupied; search forward for a free
-		 * block within the next XX blocks.
-		 *
-		 * end_goal is more or less random, but it has to be
-		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
-		 * next 64-bit boundary is simple..
-		 */
-		ext4_grpblk_t end_goal = (start + 63) & ~63;
-		if (end_goal > maxblocks)
-			end_goal = maxblocks;
-		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
-		if (here < end_goal && ext4_test_allocatable(here, bh))
-			return here;
-		ext4_debug("Bit not found near goal\n");
-	}
-
-	here = start;
-	if (here < 0)
-		here = 0;
-
-	p = ((char *)bh->b_data) + (here >> 3);
-	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
-
-	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
-		return next;
-
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
-	return here;
-}
-
-/**
- * claim_block()
- * @block:		the free block (group relative) to allocate
- * @bh:			the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap.  Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data.  If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
-	struct journal_head *jh = bh2jh(bh);
-	int ret;
-
-	if (ext4_set_bit_atomic(lock, block, bh->b_data))
-		return 0;
-	jbd_lock_bh_state(bh);
-	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
-		ext4_clear_bit_atomic(lock, block, bh->b_data);
-		ret = 0;
-	} else {
-		ret = 1;
-	}
-	jbd_unlock_bh_state(bh);
-	return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- *	if there is a reservation window, only try to allocate block(s) from the
- *	file's own reservation window;
- *	Otherwise, the allocation range starts from the give goal block, ends at
- *	the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap.  In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal, unsigned long *count,
-			struct ext4_reserve_window *my_rsv)
-{
-	ext4_fsblk_t group_first_block;
-	ext4_grpblk_t start, end;
-	unsigned long num = 0;
-
-	/* we do allocation within the reservation window if we have a window */
-	if (my_rsv) {
-		group_first_block = ext4_group_first_block_no(sb, group);
-		if (my_rsv->_rsv_start >= group_first_block)
-			start = my_rsv->_rsv_start - group_first_block;
-		else
-			/* reservation window cross group boundary */
-			start = 0;
-		end = my_rsv->_rsv_end - group_first_block + 1;
-		if (end > EXT4_BLOCKS_PER_GROUP(sb))
-			/* reservation window crosses group boundary */
-			end = EXT4_BLOCKS_PER_GROUP(sb);
-		if ((start <= grp_goal) && (grp_goal < end))
-			start = grp_goal;
-		else
-			grp_goal = -1;
-	} else {
-		if (grp_goal > 0)
-			start = grp_goal;
-		else
-			start = 0;
-		end = EXT4_BLOCKS_PER_GROUP(sb);
-	}
-
-	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
-	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
-		if (grp_goal < 0)
-			goto fail_access;
-		if (!my_rsv) {
-			int i;
-
-			for (i = 0; i < 7 && grp_goal > start &&
-					ext4_test_allocatable(grp_goal - 1,
-								bitmap_bh);
-					i++, grp_goal--)
-				;
-		}
-	}
-	start = grp_goal;
-
-	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-		grp_goal, bitmap_bh)) {
-		/*
-		 * The block was allocated by another thread, or it was
-		 * allocated and then freed by another thread
-		 */
-		start++;
-		grp_goal++;
-		if (start >= end)
-			goto fail_access;
-		goto repeat;
-	}
-	num++;
-	grp_goal++;
-	while (num < *count && grp_goal < end
-		&& ext4_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
-				grp_goal, bitmap_bh)) {
-		num++;
-		grp_goal++;
-	}
-	*count = num;
-	return grp_goal - num;
-fail_access:
-	*count = num;
-	return -1;
-}
-
-/**
- *	find_next_reservable_window():
- *		find a reservable space within the given range.
- *		It does not allocate the reservation window for now:
- *		alloc_new_reservation() will do the work later.
- *
- *	@search_head: the head of the searching list;
- *		This is not necessarily the list head of the whole filesystem
- *
- *		We have both head and start_block to assist the search
- *		for the reservable space. The list starts from head,
- *		but we will shift to the place where start_block is,
- *		then start from there, when looking for a reservable space.
- *
- *	@size: the target new reservation window size
- *
- *	@group_first_block: the first block we consider to start
- *			the real search from
- *
- *	@last_block:
- *		the maximum block number that our goal reservable space
- *		could start from. This is normally the last block in this
- *		group. The search will end when we found the start of next
- *		possible reservable space is out of this boundary.
- *		This could handle the cross boundary reservation window
- *		request.
- *
- *	basically we search from the given range, rather than the whole
- *	reservation double linked list, (start_block, last_block)
- *	to find a free region that is of my size and has not
- *	been reserved.
- *
- */
-static int find_next_reservable_window(
-				struct ext4_reserve_window_node *search_head,
-				struct ext4_reserve_window_node *my_rsv,
-				struct super_block * sb,
-				ext4_fsblk_t start_block,
-				ext4_fsblk_t last_block)
-{
-	struct rb_node *next;
-	struct ext4_reserve_window_node *rsv, *prev;
-	ext4_fsblk_t cur;
-	int size = my_rsv->rsv_goal_size;
-
-	/* TODO: make the start of the reservation window byte-aligned */
-	/* cur = *start_block & ~7;*/
-	cur = start_block;
-	rsv = search_head;
-	if (!rsv)
-		return -1;
-
-	while (1) {
-		if (cur <= rsv->rsv_end)
-			cur = rsv->rsv_end + 1;
-
-		/* TODO?
-		 * in the case we could not find a reservable space
-		 * that is what is expected, during the re-search, we could
-		 * remember what's the largest reservable space we could have
-		 * and return that one.
-		 *
-		 * For now it will fail if we could not find the reservable
-		 * space with expected-size (or more)...
-		 */
-		if (cur > last_block)
-			return -1;		/* fail */
-
-		prev = rsv;
-		next = rb_next(&rsv->rsv_node);
-		rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
+	if (!capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-		/*
-		 * Reached the last reservation, we can just append to the
-		 * previous one.
-		 */
-		if (!next)
-			break;
-
-		if (cur + size <= rsv->rsv_start) {
-			/*
-			 * Found a reserveable space big enough.  We could
-			 * have a reservation across the group boundary here
-			 */
-			break;
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
 		}
 	}
-	/*
-	 * we come here either :
-	 * when we reach the end of the whole list,
-	 * and there is empty reservable space after last entry in the list.
-	 * append it to the end of the list.
-	 *
-	 * or we found one reservable space in the middle of the list,
-	 * return the reservation window that we could append to.
-	 * succeed.
+	/* Check whether we have space after
+	 * accounting for current dirty blocks
 	 */
+	if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+		/* we don't have free space */
+		return -ENOSPC;
 
-	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
-		rsv_window_remove(sb, my_rsv);
-
-	/*
-	 * Let's book the whole avaliable window for now.  We will check the
-	 * disk bitmap later and then, if there are free blocks then we adjust
-	 * the window size if it's larger than requested.
-	 * Otherwise, we will remove this node from the tree next time
-	 * call find_next_reservable_window.
-	 */
-	my_rsv->rsv_start = cur;
-	my_rsv->rsv_end = cur + size - 1;
-	my_rsv->rsv_alloc_hit = 0;
-
-	if (prev != my_rsv)
-		ext4_rsv_window_add(sb, my_rsv);
-
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
 	return 0;
 }
 
 /**
- *	alloc_new_reservation()--allocate a new reservation window
- *
- *		To make a new reservation, we search part of the filesystem
- *		reservation list (the list that inside the group). We try to
- *		allocate a new reservation window near the allocation goal,
- *		or the beginning of the group, if there is no goal.
- *
- *		We first find a reservable space after the goal, then from
- *		there, we check the bitmap for the first free block after
- *		it. If there is no free block until the end of group, then the
- *		whole group is full, we failed. Otherwise, check if the free
- *		block is inside the expected reservable space, if so, we
- *		succeed.
- *		If the first free block is outside the reservable space, then
- *		start from the first free block, we search for next available
- *		space, and go on.
- *
- *	on succeed, a new reservation will be found and inserted into the list
- *	It contains at least one free block, and it does not overlap with other
- *	reservation windows.
- *
- *	failed: we failed to find a reservation window in this group
- *
- *	@rsv: the reservation
- *
- *	@grp_goal: The goal (group-relative).  It is where the search for a
- *		free reservable space should start from.
- *		if we have a grp_goal(grp_goal >0 ), then start from there,
- *		no grp_goal(grp_goal = -1), we start from the first block
- *		of the group.
- *
- *	@sb: the super block
- *	@group: the group we are trying to allocate in
- *	@bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
-		ext4_grpblk_t grp_goal, struct super_block *sb,
-		ext4_group_t group, struct buffer_head *bitmap_bh)
-{
-	struct ext4_reserve_window_node *search_head;
-	ext4_fsblk_t group_first_block, group_end_block, start_block;
-	ext4_grpblk_t first_free_block;
-	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
-	unsigned long size;
-	int ret;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	if (grp_goal < 0)
-		start_block = group_first_block;
-	else
-		start_block = grp_goal + group_first_block;
-
-	size = my_rsv->rsv_goal_size;
-
-	if (!rsv_is_empty(&my_rsv->rsv_window)) {
-		/*
-		 * if the old reservation is cross group boundary
-		 * and if the goal is inside the old reservation window,
-		 * we will come here when we just failed to allocate from
-		 * the first part of the window. We still have another part
-		 * that belongs to the next group. In this case, there is no
-		 * point to discard our window and try to allocate a new one
-		 * in this group(which will fail). we should
-		 * keep the reservation window, just simply move on.
-		 *
-		 * Maybe we could shift the start block of the reservation
-		 * window to the first block of next group.
-		 */
-
-		if ((my_rsv->rsv_start <= group_end_block) &&
-				(my_rsv->rsv_end > group_end_block) &&
-				(start_block >= my_rsv->rsv_start))
-			return -1;
-
-		if ((my_rsv->rsv_alloc_hit >
-		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
-			/*
-			 * if the previously allocation hit ratio is
-			 * greater than 1/2, then we double the size of
-			 * the reservation window the next time,
-			 * otherwise we keep the same size window
-			 */
-			size = size * 2;
-			if (size > EXT4_MAX_RESERVE_BLOCKS)
-				size = EXT4_MAX_RESERVE_BLOCKS;
-			my_rsv->rsv_goal_size= size;
-		}
-	}
-
-	spin_lock(rsv_lock);
-	/*
-	 * shift the search start to the window near the goal block
-	 */
-	search_head = search_reserve_window(fs_rsv_root, start_block);
-
-	/*
-	 * find_next_reservable_window() simply finds a reservable window
-	 * inside the given range(start_block, group_end_block).
-	 *
-	 * To make sure the reservation window has a free bit inside it, we
-	 * need to check the bitmap after we found a reservable window.
-	 */
-retry:
-	ret = find_next_reservable_window(search_head, my_rsv, sb,
-						start_block, group_end_block);
-
-	if (ret == -1) {
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;
-	}
-
-	/*
-	 * On success, find_next_reservable_window() returns the
-	 * reservation window where there is a reservable space after it.
-	 * Before we reserve this reservable space, we need
-	 * to make sure there is at least a free block inside this region.
-	 *
-	 * searching the first free bit on the block bitmap and copy of
-	 * last committed bitmap alternatively, until we found a allocatable
-	 * block. Search start from the start block of the reservable space
-	 * we just found.
-	 */
-	spin_unlock(rsv_lock);
-	first_free_block = bitmap_search_next_usable_block(
-			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
-
-	if (first_free_block < 0) {
-		/*
-		 * no free block left on the bitmap, no point
-		 * to reserve the space. return failed.
-		 */
-		spin_lock(rsv_lock);
-		if (!rsv_is_empty(&my_rsv->rsv_window))
-			rsv_window_remove(sb, my_rsv);
-		spin_unlock(rsv_lock);
-		return -1;		/* failed */
-	}
-
-	start_block = first_free_block + group_first_block;
-	/*
-	 * check if the first free block is within the
-	 * free space we just reserved
-	 */
-	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
-		return 0;		/* success */
-	/*
-	 * if the first free bit we found is out of the reservable space
-	 * continue search for next reservable space,
-	 * start from where the free block is,
-	 * we also shift the list head to where we stopped last time
-	 */
-	search_head = my_rsv;
-	spin_lock(rsv_lock);
-	goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv:		given reservation window
- * @sb:			super block
- * @size:		the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
-			struct super_block *sb, int size)
-{
-	struct ext4_reserve_window_node *next_rsv;
-	struct rb_node *next;
-	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
-	if (!spin_trylock(rsv_lock))
-		return;
-
-	next = rb_next(&my_rsv->rsv_node);
-
-	if (!next)
-		my_rsv->rsv_end += size;
-	else {
-		next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
-		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
-			my_rsv->rsv_end += size;
-		else
-			my_rsv->rsv_end = next_rsv->rsv_start - 1;
-	}
-	spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb:			superblock
- * @handle:		handle to this transaction
- * @group:		given allocation block group
- * @bitmap_bh:		bufferhead holds the block bitmap
- * @grp_goal:		given target block within the group
- * @count:		target number of blocks to allocate
- * @my_rsv:		reservation window
- * @errp:		pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation.  If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
- *
- */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-			ext4_group_t group, struct buffer_head *bitmap_bh,
-			ext4_grpblk_t grp_goal,
-			struct ext4_reserve_window_node * my_rsv,
-			unsigned long *count, int *errp)
-{
-	ext4_fsblk_t group_first_block, group_last_block;
-	ext4_grpblk_t ret = 0;
-	int fatal;
-	unsigned long num = *count;
-
-	*errp = 0;
-
-	/*
-	 * Make sure we use undo access for the bitmap, because it is critical
-	 * that we do the frozen_data COW on bitmap buffers in all cases even
-	 * if the buffer is in BJ_Forget state in the committing transaction.
-	 */
-	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (fatal) {
-		*errp = fatal;
-		return -1;
-	}
-
-	/*
-	 * we don't deal with reservation when
-	 * filesystem is mounted without reservation
-	 * or the file is not a regular file
-	 * or last attempt to allocate a block with reservation turned on failed
-	 */
-	if (my_rsv == NULL ) {
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-						grp_goal, count, NULL);
-		goto out;
-	}
-	/*
-	 * grp_goal is a group relative block number (if there is a goal)
-	 * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-	 * first block is a filesystem wide block number
-	 * first block is the block number of the first block in this group
-	 */
-	group_first_block = ext4_group_first_block_no(sb, group);
-	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
-	/*
-	 * Basically we will allocate a new block from inode's reservation
-	 * window.
-	 *
-	 * We need to allocate a new reservation window, if:
-	 * a) inode does not have a reservation window; or
-	 * b) last attempt to allocate a block from existing reservation
-	 *    failed; or
-	 * c) we come here with a goal and with a reservation window
-	 *
-	 * We do not need to allocate a new reservation window if we come here
-	 * at the beginning with a goal and the goal is inside the window, or
-	 * we don't have a goal but already have a reservation window.
-	 * then we could go to allocate from the reservation window directly.
-	 */
-	while (1) {
-		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window,
-						grp_goal, group, sb)) {
-			if (my_rsv->rsv_goal_size < *count)
-				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
-			if (ret < 0)
-				break;			/* failed */
-
-			if (!goal_in_my_reservation(&my_rsv->rsv_window,
-							grp_goal, group, sb))
-				grp_goal = -1;
-		} else if (grp_goal >= 0) {
-			int curr = my_rsv->rsv_end -
-					(grp_goal + group_first_block) + 1;
-
-			if (curr < *count)
-				try_to_extend_reservation(my_rsv, sb,
-							*count - curr);
-		}
-
-		if ((my_rsv->rsv_start > group_last_block) ||
-				(my_rsv->rsv_end < group_first_block)) {
-			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
-			BUG();
-		}
-		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-					   grp_goal, &num, &my_rsv->rsv_window);
-		if (ret >= 0) {
-			my_rsv->rsv_alloc_hit += num;
-			*count = num;
-			break;				/* succeed */
-		}
-		num = *count;
-	}
-out:
-	if (ret >= 0) {
-		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
-					"bitmap block");
-		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
-		if (fatal) {
-			*errp = fatal;
-			return -1;
-		}
-		return ret;
-	}
-
-	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
-	ext4_journal_release_buffer(handle, bitmap_bh);
-	return ret;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of neeed blocks
@@ -1607,26 +637,34 @@ out:
  * On success, return nblocks
  */
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks)
+						s64 nblocks)
 {
-	ext4_fsblk_t free_blocks;
-	ext4_fsblk_t root_blocks = 0;
+	s64 free_blocks, dirty_blocks;
+	s64 root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks  = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-	if (free_blocks - root_blocks < FBC_BATCH)
-		free_blocks =
-			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
-#endif
-	if (free_blocks - root_blocks < nblocks)
-		return free_blocks - root_blocks;
+
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks  = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
+		/* we don't have free space */
+		return 0;
+
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+		return free_blocks - (root_blocks + dirty_blocks);
 	return nblocks;
- }
+}
 
 
 /**
@@ -1651,303 +689,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-/**
- * ext4_old_new_blocks() -- core block bitmap based block allocation function
- *
- * @handle:		handle to this transaction
- * @inode:		file inode
- * @goal:		given target block(filesystem wide)
- * @count:		target number of blocks to allocate
- * @errp:		error code
- *
- * ext4_old_new_blocks uses a goal block to assist allocation and look up
- * the block bitmap directly to do block allocation.  It tries to
- * allocate block(s) from the block group contains the goal block first. If
- * that fails, it will try to allocate block(s) from other block groups
- * without any specific goal block.
- *
- * This function is called when -o nomballoc mount option is enabled
- *
- */
-ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gdp_bh;
-	ext4_group_t group_no;
-	ext4_group_t goal_group;
-	ext4_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
-	ext4_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
-	ext4_fsblk_t ret_block;		/* filesyetem-wide allocated block */
-	ext4_group_t bgi;			/* blockgroup iteration index */
-	int fatal = 0, err;
-	int performed_allocation = 0;
-	ext4_grpblk_t free_blocks;	/* number of free blocks in a group */
-	struct super_block *sb;
-	struct ext4_group_desc *gdp;
-	struct ext4_super_block *es;
-	struct ext4_sb_info *sbi;
-	struct ext4_reserve_window_node *my_rsv = NULL;
-	struct ext4_block_alloc_info *block_i;
-	unsigned short windowsz = 0;
-	ext4_group_t ngroups;
-	unsigned long num = *count;
-
-	sb = inode->i_sb;
-	if (!sb) {
-		*errp = -ENODEV;
-		printk("ext4_new_block: nonexistent device");
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
-		/*
-		 * With delalloc we already reserved the blocks
-		 */
-		*count = ext4_has_free_blocks(sbi, *count);
-	}
-	if (*count == 0) {
-		*errp = -ENOSPC;
-		return 0;	/*return with ENOSPC error */
-	}
-	num = *count;
-
-	/*
-	 * Check quota for allocation of this block.
-	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
-		*errp = -EDQUOT;
-		return 0;
-	}
-
-	sbi = EXT4_SB(sb);
-	es = EXT4_SB(sb)->s_es;
-	ext4_debug("goal=%llu.\n", goal);
-	/*
-	 * Allocate a block from reservation only when
-	 * filesystem is mounted with reservation(default,-o reservation), and
-	 * it's a regular file, and
-	 * the desired window size is greater than 0 (One could use ioctl
-	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
-	 * reservation on that particular file)
-	 */
-	block_i = EXT4_I(inode)->i_block_alloc_info;
-	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
-		my_rsv = &block_i->rsv_window_node;
-
-	/*
-	 * First, test whether the goal block is free.
-	 */
-	if (goal < le32_to_cpu(es->s_first_data_block) ||
-	    goal >= ext4_blocks_count(es))
-		goal = le32_to_cpu(es->s_first_data_block);
-	ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
-	goal_group = group_no;
-retry_alloc:
-	gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-	if (!gdp)
-		goto io_error;
-
-	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-	/*
-	 * if there is not enough free blocks to make a new resevation
-	 * turn off reservation for this allocation
-	 */
-	if (my_rsv && (free_blocks < windowsz)
-		&& (rsv_is_empty(&my_rsv->rsv_window)))
-		my_rsv = NULL;
-
-	if (free_blocks > 0) {
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, grp_target_blk,
-					my_rsv,	&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-
-	ngroups = EXT4_SB(sb)->s_groups_count;
-	smp_rmb();
-
-	/*
-	 * Now search the rest of the groups.  We assume that
-	 * group_no and gdp correctly point to the last group visited.
-	 */
-	for (bgi = 0; bgi < ngroups; bgi++) {
-		group_no++;
-		if (group_no >= ngroups)
-			group_no = 0;
-		gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-		if (!gdp)
-			goto io_error;
-		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-		/*
-		 * skip this group if the number of
-		 * free blocks is less than half of the reservation
-		 * window size.
-		 */
-		if (free_blocks <= (windowsz/2))
-			continue;
-
-		brelse(bitmap_bh);
-		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
-			goto io_error;
-		/*
-		 * try to allocate block(s) from this group, without a goal(-1).
-		 */
-		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-					group_no, bitmap_bh, -1, my_rsv,
-					&num, &fatal);
-		if (fatal)
-			goto out;
-		if (grp_alloc_blk >= 0)
-			goto allocated;
-	}
-	/*
-	 * We may end up a bogus ealier ENOSPC error due to
-	 * filesystem is "full" of reservations, but
-	 * there maybe indeed free blocks avaliable on disk
-	 * In this case, we just forget about the reservations
-	 * just do block allocation as without reservations.
-	 */
-	if (my_rsv) {
-		my_rsv = NULL;
-		windowsz = 0;
-		group_no = goal_group;
-		goto retry_alloc;
-	}
-	/* No space left on the device */
-	*errp = -ENOSPC;
-	goto out;
-
-allocated:
-
-	ext4_debug("using block group %lu(%d)\n",
-			group_no, gdp->bg_free_blocks_count);
-
-	BUFFER_TRACE(gdp_bh, "get_write_access");
-	fatal = ext4_journal_get_write_access(handle, gdp_bh);
-	if (fatal)
-		goto out;
-
-	ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
-
-	if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
-	    in_range(ret_block, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group) ||
-	    in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group)) {
-		ext4_error(sb, "ext4_new_block",
-			    "Allocating block in system zone - "
-			    "blocks from %llu, length %lu",
-			     ret_block, num);
-		/*
-		 * claim_block marked the blocks we allocated
-		 * as in use. So we may want to selectively
-		 * mark some of the blocks as free
-		 */
-		goto retry_alloc;
-	}
-
-	performed_allocation = 1;
-
-#ifdef CONFIG_JBD2_DEBUG
-	{
-		struct buffer_head *debug_bh;
-
-		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, ret_block);
-		if (debug_bh) {
-			BUFFER_TRACE(debug_bh, "state when allocated");
-			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
-			brelse(debug_bh);
-		}
-	}
-	jbd_lock_bh_state(bitmap_bh);
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
-		int i;
-
-		for (i = 0; i < num; i++) {
-			if (ext4_test_bit(grp_alloc_blk+i,
-					bh2jh(bitmap_bh)->b_committed_data)) {
-				printk("%s: block was unexpectedly set in "
-					"b_committed_data\n", __func__);
-			}
-		}
-	}
-	ext4_debug("found bit %d\n", grp_alloc_blk);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	jbd_unlock_bh_state(bitmap_bh);
-#endif
-
-	if (ret_block + num - 1 >= ext4_blocks_count(es)) {
-		ext4_error(sb, "ext4_new_block",
-			    "block(%llu) >= blocks count(%llu) - "
-			    "block_group = %lu, es == %p ", ret_block,
-			ext4_blocks_count(es), group_no, es);
-		goto out;
-	}
-
-	/*
-	 * It is up to the caller to add the new buffer to a journal
-	 * list of some description.  We don't know in advance whether
-	 * the caller wants to use it as metadata or data.
-	 */
-	spin_lock(sb_bgl_lock(sbi, group_no));
-	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
-	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
-		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks -= num;
-		spin_unlock(sb_bgl_lock(sbi, flex_group));
-	}
-
-	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-	err = ext4_journal_dirty_metadata(handle, gdp_bh);
-	if (!fatal)
-		fatal = err;
-
-	sb->s_dirt = 1;
-	if (fatal)
-		goto out;
-
-	*errp = 0;
-	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
-	*count = num;
-	return ret_block;
-
-io_error:
-	*errp = -EIO;
-out:
-	if (fatal) {
-		*errp = fatal;
-		ext4_std_error(sb, fatal);
-	}
-	/*
-	 * Undo the block allocation
-	 */
-	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
-	brelse(bitmap_bh);
-	return 0;
-}
-
 #define EXT4_META_BLOCK 0x1
 
 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
@@ -1957,10 +698,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		return ext4_old_new_blocks(handle, inode, goal, count, errp);
-	}
-
 	memset(&ar, 0, sizeof(ar));
 	/* Fill with neighbour allocated blocks */
 
@@ -2002,7 +739,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Account for the allocated meta blocks
 	 */
-	if (!(*errp)) {
+	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += *count;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2087,10 +824,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_blocks: stored = %llu"
-		", computed = %llu, %llu\n",
-		ext4_free_blocks_count(es),
-		desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
+		", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+	       desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
@@ -2177,8 +913,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
 			metagroup < first_meta_bg)
-		return ext4_bg_num_gdb_nometa(sb,group);
+		return ext4_bg_num_gdb_nometa(sb, group);
 
 	return ext4_bg_num_gdb_meta(sb,group);
 
 }
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea6750454..0a7a6663c190 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
 	unsigned int i;
 	unsigned long sum = 0;
 
 	if (!map)
-		return (0);
+		return 0;
 	for (i = 0; i < numchars; i++)
 		sum += nibblemap[map->b_data[i] & 0xf] +
 			nibblemap[(map->b_data[i] >> 4) & 0xf];
-	return (sum);
+	return sum;
 }
 
 #endif  /*  EXT4FS_DEBUG  */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d3d23d73c08b..fed5b610df5a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
 };
 
 static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
-static int ext4_release_dir (struct inode * inode,
-				struct file * filp);
+static int ext4_dx_readdir(struct file *filp,
+			   void *dirent, filldir_t filldir);
+static int ext4_release_dir(struct inode *inode,
+				struct file *filp);
 
 const struct file_operations ext4_dir_operations = {
 	.llseek		= generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 }
 
 
-int ext4_check_dir_entry (const char * function, struct inode * dir,
-			  struct ext4_dir_entry_2 * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
+int ext4_check_dir_entry(const char *function, struct inode *dir,
+			 struct ext4_dir_entry_2 *de,
+			 struct buffer_head *bh,
+			 unsigned long offset)
 {
-	const char * error_msg = NULL;
+	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len);
 
 	if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		ext4_error (dir->i_sb, function,
+		ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
 			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext4_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	int error = 0;
 	unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext4_error (sb, "ext4_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext4_error(sb, __func__, "directory #%lu "
+					   "contains a hole at offset %Lu",
+					   inode->i_ino,
+					   (unsigned long long) filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
@@ -187,14 +192,14 @@ revalidate:
 		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
-						   bh, offset)) {
+			if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+						  bh, offset)) {
 				/*
 				 * On error, skip the f_pos to the next block
 				 */
 				filp->f_pos = (filp->f_pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
+				brelse(bh);
 				ret = stored;
 				goto out;
 			}
@@ -218,12 +223,12 @@ revalidate:
 					break;
 				if (version != filp->f_version)
 					goto revalidate;
-				stored ++;
+				stored++;
 			}
 			filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
-		brelse (bh);
+		brelse(bh);
 	}
 out:
 	return ret;
@@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
 		parent = rb_parent(n);
 		fname = rb_entry(n, struct fname, rb_hash);
 		while (fname) {
-			struct fname * old = fname;
+			struct fname *old = fname;
 			fname = fname->next;
-			kfree (old);
+			kfree(old);
 		}
 		if (!parent)
 			root->rb_node = NULL;
@@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 			     struct ext4_dir_entry_2 *dirent)
 {
 	struct rb_node **p, *parent = NULL;
-	struct fname * fname, *new_fn;
+	struct fname *fname, *new_fn;
 	struct dir_private_info *info;
 	int len;
 
@@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
+static int call_filldir(struct file *filp, void *dirent,
 			filldir_t filldir, struct fname *fname)
 {
 	struct dir_private_info *info = filp->private_data;
 	loff_t	curr_pos;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct super_block * sb;
+	struct super_block *sb;
 	int error;
 
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk("call_filldir: called with null fname?!?\n");
+		printk(KERN_ERR "ext4: call_filldir: called with "
+		       "null fname?!?\n");
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -411,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent,
 				get_dtype(sb, fname->file_type));
 		if (error) {
 			filp->f_pos = curr_pos;
-			info->extra_fname = fname->next;
+			info->extra_fname = fname;
 			return error;
 		}
 		fname = fname->next;
@@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *filp,
+			 void *dirent, filldir_t filldir)
 {
 	struct dir_private_info *info = filp->private_data;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -450,11 +456,12 @@ static int ext4_dx_readdir(struct file * filp,
 	 * If there are any leftover names on the hash collision
 	 * chain, return them first.
 	 */
-	if (info->extra_fname &&
-	    call_filldir(filp, dirent, filldir, info->extra_fname))
-		goto finished;
-
-	if (!info->curr_node)
+	if (info->extra_fname) {
+		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+			goto finished;
+		info->extra_fname = NULL;
+		goto next_node;
+	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
 	while (1) {
@@ -485,9 +492,14 @@ static int ext4_dx_readdir(struct file * filp,
 		info->curr_minor_hash = fname->minor_hash;
 		if (call_filldir(filp, dirent, filldir, fname))
 			break;
-
+	next_node:
 		info->curr_node = rb_next(info->curr_node);
-		if (!info->curr_node) {
+		if (info->curr_node) {
+			fname = rb_entry(info->curr_node, struct fname,
+					 rb_hash);
+			info->curr_hash = fname->hash;
+			info->curr_minor_hash = fname->minor_hash;
+		} else {
 			if (info->next_hash == ~0) {
 				filp->f_pos = EXT4_HTREE_EOF;
 				break;
@@ -501,7 +513,7 @@ finished:
 	return 0;
 }
 
-static int ext4_release_dir (struct inode * inode, struct file * filp)
+static int ext4_release_dir(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 303e41cf7b14..4880cc3e6727 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -44,9 +44,9 @@
 #ifdef EXT4FS_DEBUG
 #define ext4_debug(f, a...)						\
 	do {								\
-		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
+		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
 			__FILE__, __LINE__, __func__);			\
-		printk (KERN_DEBUG f, ## a);				\
+		printk(KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
 #define ext4_debug(f, a...)	do {} while (0)
@@ -128,7 +128,7 @@ struct ext4_allocation_request {
 #else
 # define EXT4_BLOCK_SIZE(s)		(EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
 #endif
-#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof (__u32))
+#define	EXT4_ADDR_PER_BLOCK(s)		(EXT4_BLOCK_SIZE(s) / sizeof(__u32))
 #ifdef __KERNEL__
 # define EXT4_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
 #else
@@ -245,7 +245,7 @@ struct flex_groups {
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
 #define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE		0x000B80FF /* User modifiable flags */
 
 /*
  * Inode dynamic state flags
@@ -291,8 +291,6 @@ struct ext4_new_group_data {
 #define	EXT4_IOC_SETFLAGS		FS_IOC_SETFLAGS
 #define	EXT4_IOC_GETVERSION		_IOR('f', 3, long)
 #define	EXT4_IOC_SETVERSION		_IOW('f', 4, long)
-#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD		_IOW('f', 8,struct ext4_new_group_input)
 #define	EXT4_IOC_GETVERSION_OLD		FS_IOC_GETVERSION
 #define	EXT4_IOC_SETVERSION_OLD		FS_IOC_SETVERSION
 #ifdef CONFIG_JBD2_DEBUG
@@ -300,7 +298,10 @@ struct ext4_new_group_data {
 #endif
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
-#define EXT4_IOC_MIGRATE		_IO('f', 7)
+#define EXT4_IOC_GROUP_EXTEND		_IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD		_IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE		_IO('f', 9)
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 
 /*
  * ioctl commands in 32 bit emulation
@@ -510,7 +511,6 @@ do {									       \
 /*
  * Mount flags
  */
-#define EXT4_MOUNT_CHECK		0x00001	/* Do mount-time checks */
 #define EXT4_MOUNT_OLDALLOC		0x00002  /* Don't use the new Orlov allocator */
 #define EXT4_MOUNT_GRPID		0x00004	/* Create files with directory's group */
 #define EXT4_MOUNT_DEBUG		0x00008	/* Some debugging messages */
@@ -538,8 +538,9 @@ do {									       \
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
-#define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
@@ -667,7 +668,7 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
-static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb)
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
@@ -725,11 +726,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+	(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
 	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
@@ -789,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESGID		0
 
+#define EXT4_DEF_INODE_READAHEAD_BLKS	32
+
 /*
  * Default mount options
  */
@@ -954,6 +957,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 			unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
 
+extern struct proc_dir_entry *ext4_proc_root;
+
+#ifdef CONFIG_PROC_FS
+extern const struct file_operations ext4_ui_proc_fops;
+
+#define	EXT4_PROC_HANDLER(name, var)					\
+do {									\
+	proc = proc_create_data(name, mode, sbi->s_proc,		\
+				&ext4_ui_proc_fops, &sbi->s_##var);	\
+	if (proc == NULL) {						\
+		printk(KERN_ERR "EXT4-fs: can't create %s\n", name);	\
+		goto err_out;						\
+	}								\
+} while (0)
+#else
+#define EXT4_PROC_HANDLER(name, var)
+#endif
+
 /*
  * Function prototypes
  */
@@ -981,23 +1002,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
 					ext4_lblk_t iblock, ext4_fsblk_t goal,
 					unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
-						ext4_fsblk_t nblocks);
-extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
+					 s64 nblocks);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
-				 ext4_fsblk_t block, unsigned long count,
+extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count,
 				unsigned long *pdquot_freed_blocks);
-extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
-extern void ext4_check_blocks_bitmap (struct super_block *);
+extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext4_init_block_alloc_info(struct inode *);
-extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1009,20 +1027,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 
 /* fsync.c */
-extern int ext4_sync_file (struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, struct dentry *, int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
 			  dx_hash_info *hinfo);
 
 /* ialloc.c */
-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
-extern void ext4_free_inode (handle_t *, struct inode *);
-extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext4_count_free_inodes (struct super_block *);
-extern unsigned long ext4_count_dirs (struct super_block *);
-extern void ext4_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
+extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1032,7 +1050,7 @@ extern int ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
 				struct ext4_allocation_request *, int *);
 extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_mb_discard_inode_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *);
 extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
@@ -1044,46 +1062,47 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
 
 
 /* inode.c */
-void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create);
 int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 				ext4_lblk_t iblock, unsigned long maxblocks,
 				struct buffer_head *bh_result,
 				int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int  ext4_write_inode (struct inode *, int);
-extern int  ext4_setattr (struct dentry *, struct iattr *);
+extern int  ext4_write_inode(struct inode *, int);
+extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 				struct kstat *stat);
-extern void ext4_delete_inode (struct inode *);
-extern int  ext4_sync_inode (handle_t *, struct inode *);
-extern void ext4_discard_reservation (struct inode *);
+extern void ext4_delete_inode(struct inode *);
+extern int  ext4_sync_inode(handle_t *, struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate (struct inode *);
+extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* migrate.c */
-extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
-		       unsigned long);
+extern int ext4_ext_migrate(struct inode *);
 /* namei.c */
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1098,14 +1117,14 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
-extern void ext4_error (struct super_block *, const char *, const char *, ...)
+extern void ext4_error(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_std_error (struct super_block *, const char *, int);
-extern void ext4_abort (struct super_block *, const char *, const char *, ...)
+extern void __ext4_std_error(struct super_block *, const char *, int);
+extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning (struct super_block *, const char *, const char *, ...)
+extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
-extern void ext4_update_dynamic_rev (struct super_block *sb);
+extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1178,7 +1197,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
 
 static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
-							ext4_group_t group)
+					    ext4_group_t group)
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
@@ -1206,6 +1225,28 @@ do {								\
 		__ext4_std_error((sb), __func__, (errno));	\
 } while (0)
 
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+	/*
+	 * XXX: replace with spinlock if seen contended -bzzz
+	 */
+	down_write(&EXT4_I(inode)->i_data_sem);
+	if (newsize > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+	up_write(&EXT4_I(inode)->i_data_sem);
+	return ;
+}
+
 /*
  * Inodes and files operations
  */
@@ -1228,6 +1269,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
+				       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned long max_blocks, struct buffer_head *bh_result,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6c166c0a54b7..bec7ce59fc0d 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
 #define EXT4_EXT_CACHE_GAP	1
 #define EXT4_EXT_CACHE_EXTENT	2
 
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+					struct ext4_ext_cache *,
+					struct ext4_extent *, void *);
+
+#define EXT_CONTINUE   0
+#define EXT_BREAK      1
+#define EXT_REPEAT     2
 
 #define EXT_MAX_BLOCK	0xffffffff
 
@@ -216,12 +229,16 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+						   int num,
+						   struct ext4_ext_path *path);
 extern int ext4_ext_try_to_merge(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
+							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 							struct ext4_ext_path *);
 extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index ef7409f0e7e4..5c124c0ac6d3 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned long ext4_group_t;
 
-struct ext4_reserve_window {
-	ext4_fsblk_t	_rsv_start;	/* First byte reserved */
-	ext4_fsblk_t	_rsv_end;	/* Last byte reserved or 0 */
-};
-
-struct ext4_reserve_window_node {
-	struct rb_node		rsv_node;
-	__u32			rsv_goal_size;
-	__u32			rsv_alloc_hit;
-	struct ext4_reserve_window	rsv_window;
-};
-
-struct ext4_block_alloc_info {
-	/* information about reservation window */
-	struct ext4_reserve_window_node rsv_window_node;
-	/*
-	 * was i_next_alloc_block in ext4_inode_info
-	 * is the logical (file-relative) number of the
-	 * most-recently-allocated block in this file.
-	 * We use this for detecting linearly ascending allocation requests.
-	 */
-	ext4_lblk_t last_alloc_logical_block;
-	/*
-	 * Was i_next_alloc_goal in ext4_inode_info
-	 * is the *physical* companion to i_next_alloc_block.
-	 * it the physical block number of the block which was most-recentl
-	 * allocated to this file.  This give us the goal (target) for the next
-	 * allocation when we detect linearly ascending requests.
-	 */
-	ext4_fsblk_t last_alloc_physical_block;
-};
-
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
@@ -97,11 +65,8 @@ struct ext4_inode_info {
 	ext4_group_t	i_block_group;
 	__u32	i_state;		/* Dynamic state flags for ext4 */
 
-	/* block reservation info */
-	struct ext4_block_alloc_info *i_block_alloc_info;
-
 	ext4_lblk_t		i_dir_start_lookup;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	/*
 	 * Extended attributes can be read independently of the main file
 	 * data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
 	 */
 	struct rw_semaphore xattr_sem;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
 #endif
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index eb8bc3afe6e9..b455c685a98b 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
 					 EXT4_XATTR_TRANS_BLOCKS - 2 + \
 					 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
 
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This include super block, inode block, quota blocks and xattr blocks
+ */
+#define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \
+					2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+
 /* Delete operations potentially hit one directory's namespace plus an
  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
  * generous.  We can grow the delete transaction later if necessary. */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6300226d5531..445fde603df8 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -40,8 +40,8 @@ struct ext4_sb_info {
 	unsigned long s_blocks_last;    /* Last seen block count */
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext4_super_block * s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head ** s_group_desc;
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
 	unsigned long  s_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
 	int s_desc_per_block_bits;
 	int s_inode_size;
 	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
 	spinlock_t s_next_gen_lock;
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
 
 	/* root of the per fs reservation window tree */
 	spinlock_t s_rsv_window_lock;
 	struct rb_root s_rsv_window_root;
-	struct ext4_reserve_window_node s_rsv_window_head;
 
 	/* Journaling */
-	struct inode * s_journal_inode;
-	struct journal_s * s_journal;
+	struct inode *s_journal_inode;
+	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
 	struct block_device *journal_bdev;
@@ -97,21 +99,18 @@ struct ext4_sb_info {
 	struct inode *s_buddy_cache;
 	long s_blocks_reserved;
 	spinlock_t s_reserve_lock;
-	struct list_head s_active_transaction;
-	struct list_head s_closed_transaction;
-	struct list_head s_committed_transaction;
 	spinlock_t s_md_lock;
 	tid_t s_last_transaction;
 	unsigned short *s_mb_offsets, *s_mb_maxs;
 
 	/* tunables */
 	unsigned long s_stripe;
-	unsigned long s_mb_stream_request;
-	unsigned long s_mb_max_to_scan;
-	unsigned long s_mb_min_to_scan;
-	unsigned long s_mb_stats;
-	unsigned long s_mb_order2_reqs;
-	unsigned long s_mb_group_prealloc;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
@@ -121,7 +120,6 @@ struct ext4_sb_info {
 	int s_mb_history_cur;
 	int s_mb_history_max;
 	int s_mb_history_num;
-	struct proc_dir_entry *s_mb_proc;
 	spinlock_t s_mb_history_lock;
 	int s_mb_history_filter;
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 42c4c0c892ed..ea2ce3c0ae66 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/falloc.h>
 #include <asm/uaccess.h>
+#include <linux/fiemap.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
@@ -99,7 +100,7 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 	if (handle->h_buffer_credits > needed)
 		return 0;
 	err = ext4_journal_extend(handle, needed);
-	if (err)
+	if (err <= 0)
 		return err;
 	return ext4_journal_restart(handle, needed);
 }
@@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
 	ext_debug("\n");
 }
 #else
-#define ext4_ext_show_path(inode,path)
-#define ext4_ext_show_leaf(inode,path)
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
 #endif
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
 		  if (k != 0 &&
 		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
-				printk("k=%d, ix=0x%p, first=0x%p\n", k,
-					ix, EXT_FIRST_INDEX(eh));
-				printk("%u <= %u\n",
+				printk(KERN_DEBUG "k=%d, ix=0x%p, "
+				       "first=0x%p\n", k,
+				       ix, EXT_FIRST_INDEX(eh));
+				printk(KERN_DEBUG "%u <= %u\n",
 				       le32_to_cpu(ix->ei_block),
 				       le32_to_cpu(ix[-1].ei_block));
 			}
@@ -1441,7 +1443,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
 
 	/*
 	 * get the next allocated block if the extent in the path
-	 * is before the requested block(s) 
+	 * is before the requested block(s)
 	 */
 	if (b2 < b1) {
 		b2 = ext4_ext_next_allocated_block(path);
@@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path,
 				struct ext4_extent *newext)
 {
-	struct ext4_extent_header * eh;
+	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
 	struct ext4_extent *nearex; /* nearest extent */
 	struct ext4_ext_path *npath = NULL;
@@ -1625,6 +1627,113 @@ cleanup:
 	return err;
 }
 
+int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+			ext4_lblk_t num, ext_prepare_callback func,
+			void *cbdata)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_ext_cache cbex;
+	struct ext4_extent *ex;
+	ext4_lblk_t next, start = 0, end = 0;
+	ext4_lblk_t last = block + num;
+	int depth, exists, err = 0;
+
+	BUG_ON(func == NULL);
+	BUG_ON(inode == NULL);
+
+	while (block < last && block != EXT_MAX_BLOCK) {
+		num = last - block;
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, block, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		BUG_ON(path[depth].p_hdr == NULL);
+		ex = path[depth].p_ext;
+		next = ext4_ext_next_allocated_block(path);
+
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >= le32_to_cpu(ex->ee_block)
+					+ ext4_ext_get_actual_len(ex)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block)
+				+ ext4_ext_get_actual_len(ex);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			cbex.ec_block = start;
+			cbex.ec_len = end - start;
+			cbex.ec_start = 0;
+			cbex.ec_type = EXT4_EXT_CACHE_GAP;
+		} else {
+			cbex.ec_block = le32_to_cpu(ex->ee_block);
+			cbex.ec_len = ext4_ext_get_actual_len(ex);
+			cbex.ec_start = ext_pblock(ex);
+			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+		}
+
+		BUG_ON(cbex.ec_len == 0);
+		err = func(inode, path, &cbex, ex, cbdata);
+		ext4_ext_drop_refs(path);
+
+		if (err < 0)
+			break;
+
+		if (err == EXT_REPEAT)
+			continue;
+		else if (err == EXT_BREAK) {
+			err = 0;
+			break;
+		}
+
+		if (ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
+		block = cbex.ec_block + cbex.ec_len;
+	}
+
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+
+	return err;
+}
+
 static void
 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
 			__u32 len, ext4_fsblk_t start, int type)
@@ -1747,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 }
 
 /*
- * ext4_ext_calc_credits_for_insert:
- * This routine returns max. credits that the extent tree can consume.
- * It should be OK for low-performance paths like ->writepage()
- * To allow many writing processes to fit into a single transaction,
- * the caller should calculate credits under i_data_sem and
- * pass the actual path.
+ * ext4_ext_calc_credits_for_single_extent:
+ * This routine returns max. credits that needed to insert an extent
+ * to the extent tree.
+ * When pass the actual path, the caller should calculate credits
+ * under i_data_sem.
  */
-int ext4_ext_calc_credits_for_insert(struct inode *inode,
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 						struct ext4_ext_path *path)
 {
-	int depth, needed;
-
 	if (path) {
+		int depth = ext_depth(inode);
+		int ret = 0;
+
 		/* probably there is space in leaf? */
-		depth = ext_depth(inode);
 		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
-				< le16_to_cpu(path[depth].p_hdr->eh_max))
-			return 1;
-	}
-
-	/*
-	 * given 32-bit logical block (4294967296 blocks), max. tree
-	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
-	 * Let's also add one more level for imbalance.
-	 */
-	depth = 5;
+				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
 
-	/* allocation of new data block(s) */
-	needed = 2;
+			/*
+			 *  There are some space in the leaf tree, no
+			 *  need to account for leaf block credit
+			 *
+			 *  bitmaps and block group descriptor blocks
+			 *  and other metadat blocks still need to be
+			 *  accounted.
+			 */
+			/* 1 bitmap, 1 block group descriptor */
+			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+		}
+	}
 
-	/*
-	 * tree can be full, so it would need to grow in depth:
-	 * we need one credit to modify old root, credits for
-	 * new root will be added in split accounting
-	 */
-	needed += 1;
+	return ext4_chunk_trans_blocks(inode, nrblocks);
+}
 
-	/*
-	 * Index split can happen, we would need:
-	 *    allocate intermediate indexes (bitmap + group)
-	 *  + change two blocks at each level, but root (already included)
-	 */
-	needed += (depth * 2) + (depth * 2);
+/*
+ * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ *
+ * if nrblocks are fit in a single extent (chunk flag is 1), then
+ * in the worse case, each tree level index/leaf need to be changed
+ * if the tree split due to insert a new extent, then the old tree
+ * index/leaf need to be updated too
+ *
+ * If the nrblocks are discontiguous, they could cause
+ * the whole tree split more than once, but this is really rare.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	int index;
+	int depth = ext_depth(inode);
 
-	/* any allocation modifies superblock */
-	needed += 1;
+	if (chunk)
+		index = depth * 2;
+	else
+		index = depth * 3;
 
-	return needed;
+	return index;
 }
 
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
@@ -1910,16 +2026,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
 		}
 
-		/* at present, extent can't cross block group: */
-		/* leaf + bitmap + group desc + sb + inode */
-		credits = 5;
+		/*
+		 * 3 for leaf, sb, and inode plus 2 (bmap and group
+		 * descriptor) for each block group; assume two block
+		 * groups plus ex_ee_len/blocks_per_block_group for
+		 * the worst case
+		 */
+		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
 		if (ex == EXT_FIRST_EXTENT(eh)) {
 			correct_index = 1;
 			credits += (ext_depth(inode)) + 1;
 		}
-#ifdef CONFIG_QUOTA
 		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
 
 		err = ext4_ext_journal_restart(handle, credits);
 		if (err)
@@ -2133,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
 	 */
 
 	if (test_opt(sb, EXTENTS)) {
-		printk("EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
 #endif
@@ -2323,7 +2441,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		unsigned int newdepth;
 		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
 		if (allocated <= EXT4_EXT_ZERO_LEN) {
-			/* Mark first half uninitialized.
+			/*
+			 * iblock == ee_block is handled by the zerouout
+			 * at the beginning.
+			 * Mark first half uninitialized.
 			 * Mark second half initialized and zero out the
 			 * initialized extent
 			 */
@@ -2346,7 +2467,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				ex->ee_len   = orig_ex.ee_len;
 				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
-				/* zeroed the full extent */
+				/* blocks available from iblock */
 				return allocated;
 
 			} else if (err)
@@ -2374,6 +2495,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 					err = PTR_ERR(path);
 					return err;
 				}
+				/* get the second half extent details */
 				ex = path[depth].p_ext;
 				err = ext4_ext_get_access(handle, inode,
 								path + depth);
@@ -2403,6 +2525,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
+			/* blocks available from iblock */
 			return allocated;
 
 		} else if (err)
@@ -2418,23 +2541,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 */
 		orig_ex.ee_len = cpu_to_le16(ee_len -
 						ext4_ext_get_actual_len(ex3));
-		if (newdepth != depth) {
-			depth = newdepth;
-			ext4_ext_drop_refs(path);
-			path = ext4_ext_find_extent(inode, iblock, path);
-			if (IS_ERR(path)) {
-				err = PTR_ERR(path);
-				goto out;
-			}
-			eh = path[depth].p_hdr;
-			ex = path[depth].p_ext;
-			if (ex2 != &newex)
-				ex2 = ex;
-
-			err = ext4_ext_get_access(handle, inode, path + depth);
-			if (err)
-				goto out;
+		depth = newdepth;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, iblock, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
 		}
+		eh = path[depth].p_hdr;
+		ex = path[depth].p_ext;
+		if (ex2 != &newex)
+			ex2 = ex;
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
 		allocated = max_blocks;
 
 		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
@@ -2452,6 +2574,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
+			/* blocks available from iblock */
 			return allocated;
 		}
 	}
@@ -2682,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		goto out2;
 	}
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary.
+	 * Okay, we need to do block allocation.
 	 */
-	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
 
 	/* find neighbour allocated blocks */
 	ar.lleft = iblock;
@@ -2746,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
-		ext4_mb_discard_inode_preallocations(inode);
+		ext4_discard_preallocations(inode);
 		ext4_free_blocks(handle, inode, ext_pblock(&newex),
 					ext4_ext_get_actual_len(&newex), 0);
 		goto out2;
@@ -2796,7 +2916,7 @@ void ext4_ext_truncate(struct inode *inode)
 	/*
 	 * probably first extent we're gonna free will be last in block
 	 */
-	err = ext4_writepage_trans_blocks(inode) + 3;
+	err = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, err);
 	if (IS_ERR(handle))
 		return;
@@ -2810,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode)
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 
-	ext4_mb_discard_inode_preallocations(inode);
+	ext4_discard_preallocations(inode);
 
 	/*
 	 * TODO: optimization is possible here.
@@ -2849,27 +2969,6 @@ out_stop:
 	ext4_journal_stop(handle);
 }
 
-/*
- * ext4_ext_writepage_trans_blocks:
- * calculate max number of blocks we could modify
- * in order to allocate new block for an inode
- */
-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
-{
-	int needed;
-
-	needed = ext4_ext_calc_credits_for_insert(inode, NULL);
-
-	/* caller wants to allocate num blocks, but note it includes sb */
-	needed = needed * num - (num - 1);
-
-#ifdef CONFIG_QUOTA
-	needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
-	return needed;
-}
-
 static void ext4_falloc_update_inode(struct inode *inode,
 				int mode, loff_t new_size, int update_ctime)
 {
@@ -2884,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
 	 * Update only when preallocation was requested beyond
 	 * the file size.
 	 */
-	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-				new_size > i_size_read(inode)) {
-		i_size_write(inode, new_size);
-		EXT4_I(inode)->i_disksize = new_size;
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		if (new_size > i_size_read(inode))
+			i_size_write(inode, new_size);
+		if (new_size > EXT4_I(inode)->i_disksize)
+			ext4_update_i_disksize(inode, new_size);
 	}
 
 }
@@ -2930,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
 							- block;
 	/*
-	 * credits to insert 1 extent into extent tree + buffers to be able to
-	 * modify 1 super block, 1 block bitmap and 1 group descriptor.
+	 * credits to insert 1 extent into extent tree
 	 */
-	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	mutex_lock(&inode->i_mutex);
 retry:
 	while (ret >= 0 && ret < max_blocks) {
@@ -2980,3 +3079,143 @@ retry:
 	mutex_unlock(&inode->i_mutex);
 	return ret > 0 ? ret2 : ret;
 }
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
+		       void *data)
+{
+	struct fiemap_extent_info *fieinfo = data;
+	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+	__u64	logical;
+	__u64	physical;
+	__u64	length;
+	__u32	flags = 0;
+	int	error;
+
+	logical =  (__u64)newex->ec_block << blksize_bits;
+
+	if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+		pgoff_t offset;
+		struct page *page;
+		struct buffer_head *bh = NULL;
+
+		offset = logical >> PAGE_SHIFT;
+		page = find_get_page(inode->i_mapping, offset);
+		if (!page || !page_has_buffers(page))
+			return EXT_CONTINUE;
+
+		bh = page_buffers(page);
+
+		if (!bh)
+			return EXT_CONTINUE;
+
+		if (buffer_delay(bh)) {
+			flags |= FIEMAP_EXTENT_DELALLOC;
+			page_cache_release(page);
+		} else {
+			page_cache_release(page);
+			return EXT_CONTINUE;
+		}
+	}
+
+	physical = (__u64)newex->ec_start << blksize_bits;
+	length =   (__u64)newex->ec_len << blksize_bits;
+
+	if (ex && ext4_ext_is_uninitialized(ex))
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/*
+	 * If this extent reaches EXT_MAX_BLOCK, it must be last.
+	 *
+	 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
+	 * this also indicates no more allocated blocks.
+	 *
+	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
+	 */
+	if (logical + length - 1 == EXT_MAX_BLOCK ||
+	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+		flags |= FIEMAP_EXTENT_LAST;
+
+	error = fiemap_fill_next_extent(fieinfo, logical, physical,
+					length, flags);
+	if (error < 0)
+		return error;
+	if (error == 1)
+		return EXT_BREAK;
+
+	return EXT_CONTINUE;
+}
+
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+{
+	__u64 physical = 0;
+	__u64 length;
+	__u32 flags = FIEMAP_EXTENT_LAST;
+	int blockbits = inode->i_sb->s_blocksize_bits;
+	int error = 0;
+
+	/* in-inode? */
+	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+		struct ext4_iloc iloc;
+		int offset;	/* offset of xattr in inode */
+
+		error = ext4_get_inode_loc(inode, &iloc);
+		if (error)
+			return error;
+		physical = iloc.bh->b_blocknr << blockbits;
+		offset = EXT4_GOOD_OLD_INODE_SIZE +
+				EXT4_I(inode)->i_extra_isize;
+		physical += offset;
+		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+		flags |= FIEMAP_EXTENT_DATA_INLINE;
+	} else { /* external block */
+		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		length = inode->i_sb->s_blocksize;
+	}
+
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, 0, physical,
+						length, flags);
+	return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	ext4_lblk_t start_blk;
+	ext4_lblk_t len_blks;
+	int error = 0;
+
+	/* fallback to generic here if not in extents fmt */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return generic_block_fiemap(inode, fieinfo, start, len,
+			ext4_get_block);
+
+	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+		return -EBADR;
+
+	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+		error = ext4_xattr_fiemap(inode, fieinfo);
+	} else {
+		start_blk = start >> inode->i_sb->s_blocksize_bits;
+		len_blks = len >> inode->i_sb->s_blocksize_bits;
+
+		/*
+		 * Walk the extent tree gathering extent information.
+		 * ext4_ext_fiemap_cb will push extents back to user.
+		 */
+		down_write(&EXT4_I(inode)->i_data_sem);
+		error = ext4_ext_walk_space(inode, start_blk, len_blks,
+					  ext4_ext_fiemap_cb, fieinfo);
+		up_write(&EXT4_I(inode)->i_data_sem);
+	}
+
+	return error;
+}
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 430eb7978db4..6bd11fba71f7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
  * from ext4_file_open: open gets called at every open, but release
  * gets called only when /all/ the files are closed.
  */
-static int ext4_release_file (struct inode * inode, struct file * filp)
+static int ext4_release_file(struct inode *inode, struct file *filp)
 {
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
 			(atomic_read(&inode->i_writecount) == 1))
 	{
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_reservation(inode);
+		ext4_discard_preallocations(inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 	if (is_dx(inode) && filp->private_data)
@@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len);
+
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.truncate	= ext4_truncate,
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
 #endif
 	.permission	= ext4_permission,
 	.fallocate	= ext4_fallocate,
+	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a45c3737ad31..5afe4370840b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/jbd2.h>
 #include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -43,7 +44,7 @@
  * inode to disk.
  */
 
-int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
@@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
 
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
+	trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+		   inode->i_sb->s_id, datasync, inode->i_ino,
+		   dentry->d_parent->d_inode->i_ino);
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe390..556ca8eba3db 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 		sum += DELTA;
 		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
 		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-	} while(--n);
+	} while (--n);
 
 	buf[0] += b0;
 	buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash(const char *name, int len)
 {
 	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
 	while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	val = pad;
 	if (len > num*4)
 		len = num * 4;
-	for (i=0; i < len; i++) {
+	for (i = 0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
 		val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 
 	/* Check to see if the seed is all zero's */
 	if (hinfo->seed) {
-		for (i=0; i < 4; i++) {
+		for (i = 0; i < 4; i++) {
 			if (hinfo->seed[i])
 				break;
 		}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index a92eb305344f..fe34d74cfb19 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -97,34 +97,46 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
  * Return buffer_head of bitmap on success or NULL.
  */
 static struct buffer_head *
-read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc *desc;
 	struct buffer_head *bh = NULL;
+	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
-		goto error_out;
+		return NULL;
+	bitmap_blk = ext4_inode_bitmap(sb, desc);
+	bh = sb_getblk(sb, bitmap_blk);
+	if (unlikely(!bh)) {
+		ext4_error(sb, __func__,
+			    "Cannot read inode bitmap - "
+			    "block_group = %lu, inode_bitmap = %llu",
+			    block_group, bitmap_blk);
+		return NULL;
+	}
+	if (buffer_uptodate(bh) &&
+	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+		return bh;
+
+	lock_buffer(bh);
+	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-		bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
-		if (!buffer_uptodate(bh)) {
-			lock_buffer(bh);
-			if (!buffer_uptodate(bh)) {
-				ext4_init_inode_bitmap(sb, bh, block_group,
-						       desc);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-		}
-	} else {
-		bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
+		ext4_init_inode_bitmap(sb, bh, block_group, desc);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		return bh;
 	}
-	if (!bh)
-		ext4_error(sb, "read_inode_bitmap",
+	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	if (bh_submit_read(bh) < 0) {
+		put_bh(bh);
+		ext4_error(sb, __func__,
 			    "Cannot read inode bitmap - "
 			    "block_group = %lu, inode_bitmap = %llu",
-			    block_group, ext4_inode_bitmap(sb, desc));
-error_out:
+			    block_group, bitmap_blk);
+		return NULL;
+	}
 	return bh;
 }
 
@@ -144,39 +156,40 @@ error_out:
  * though), and then we'd have two inodes sharing the
  * same inode number and space on the harddisk.
  */
-void ext4_free_inode (handle_t *handle, struct inode * inode)
+void ext4_free_inode(handle_t *handle, struct inode *inode)
 {
-	struct super_block * sb = inode->i_sb;
+	struct super_block *sb = inode->i_sb;
 	int is_directory;
 	unsigned long ino;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t block_group;
 	unsigned long bit;
-	struct ext4_group_desc * gdp;
-	struct ext4_super_block * es;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
-		printk ("ext4_free_inode: inode has count=%d\n",
-					atomic_read(&inode->i_count));
+		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
+		       atomic_read(&inode->i_count));
 		return;
 	}
 	if (inode->i_nlink) {
-		printk ("ext4_free_inode: inode has nlink=%d\n",
-			inode->i_nlink);
+		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
+		       inode->i_nlink);
 		return;
 	}
 	if (!sb) {
-		printk("ext4_free_inode: inode on nonexistent device\n");
+		printk(KERN_ERR "ext4_free_inode: inode on "
+		       "nonexistent device\n");
 		return;
 	}
 	sbi = EXT4_SB(sb);
 
 	ino = inode->i_ino;
-	ext4_debug ("freeing inode %lu\n", ino);
+	ext4_debug("freeing inode %lu\n", ino);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -190,17 +203,17 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	is_directory = S_ISDIR(inode->i_mode);
 
 	/* Do this BEFORE marking the inode not in use or returning an error */
-	clear_inode (inode);
+	clear_inode(inode);
 
 	es = EXT4_SB(sb)->s_es;
 	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
-		ext4_error (sb, "ext4_free_inode",
-			    "reserved or nonexistent inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "reserved or nonexistent inode %lu", ino);
 		goto error_return;
 	}
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
-	bitmap_bh = read_inode_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
 
@@ -212,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
 	/* Ok, now we can actually update the inode bitmaps.. */
 	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 					bit, bitmap_bh->b_data))
-		ext4_error (sb, "ext4_free_inode",
-			      "bit already cleared for inode %lu", ino);
+		ext4_error(sb, "ext4_free_inode",
+			   "bit already cleared for inode %lu", ino);
 	else {
-		gdp = ext4_get_group_desc (sb, block_group, &bh2);
+		gdp = ext4_get_group_desc(sb, block_group, &bh2);
 
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
@@ -277,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 	avefreei = freei / ngroups;
 
 	for (group = 0; group < ngroups; group++) {
-		desc = ext4_get_group_desc (sb, group, NULL);
+		desc = ext4_get_group_desc(sb, group, NULL);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
 		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -341,7 +354,7 @@ find_close_to_parent:
 			goto found_flexbg;
 		}
 
-		if (best_flex < 0 ||
+		if (flex_group[best_flex].free_inodes == 0 ||
 		    (flex_group[i].free_blocks >
 		     flex_group[best_flex].free_blocks &&
 		     flex_group[i].free_inodes))
@@ -566,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *bh2;
 	ext4_group_t group = 0;
 	unsigned long ino = 0;
-	struct inode * inode;
-	struct ext4_group_desc * gdp = NULL;
-	struct ext4_super_block * es;
+	struct inode *inode;
+	struct ext4_group_desc *gdp = NULL;
+	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
 	int ret2, err = 0;
@@ -603,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
 	}
 
 	if (S_ISDIR(mode)) {
-		if (test_opt (sb, OLDALLOC))
+		if (test_opt(sb, OLDALLOC))
 			ret2 = find_group_dir(sb, dir, &group);
 		else
 			ret2 = find_group_orlov(sb, dir, &group);
@@ -623,7 +636,7 @@ got_group:
 			goto fail;
 
 		brelse(bitmap_bh);
-		bitmap_bh = read_inode_bitmap(sb, group);
+		bitmap_bh = ext4_read_inode_bitmap(sb, group);
 		if (!bitmap_bh)
 			goto fail;
 
@@ -728,7 +741,7 @@ got:
 
 			/* When marking the block group with
 			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unsed even though
+			 * on the value of bg_itable_unused even though
 			 * mke2fs could have initialized the same for us.
 			 * Instead we calculated the value below
 			 */
@@ -773,7 +786,7 @@ got:
 	}
 
 	inode->i_uid = current->fsuid;
-	if (test_opt (sb, GRPID))
+	if (test_opt(sb, GRPID))
 		inode->i_gid = dir->i_gid;
 	else if (dir->i_mode & S_ISGID) {
 		inode->i_gid = dir->i_gid;
@@ -806,7 +819,6 @@ got:
 		ei->i_flags &= ~EXT4_DIRSYNC_FL;
 	ei->i_file_acl = 0;
 	ei->i_dtime = 0;
-	ei->i_block_alloc_info = NULL;
 	ei->i_block_group = group;
 
 	ext4_set_inode_flags(inode);
@@ -822,7 +834,7 @@ got:
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if(DQUOT_ALLOC_INODE(inode)) {
+	if (DQUOT_ALLOC_INODE(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -831,7 +843,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext4_init_security(handle,inode, dir);
+	err = ext4_init_security(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
 
@@ -891,7 +903,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
-	bitmap_bh = read_inode_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
 	if (!bitmap_bh) {
 		ext4_warning(sb, __func__,
 			     "inode bitmap error for orphan %lu", ino);
@@ -949,7 +961,7 @@ error:
 	return ERR_PTR(err);
 }
 
-unsigned long ext4_count_free_inodes (struct super_block * sb)
+unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
@@ -964,12 +976,12 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 	bitmap_count = 0;
 	gdp = NULL;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
 		brelse(bitmap_bh);
-		bitmap_bh = read_inode_bitmap(sb, i);
+		bitmap_bh = ext4_read_inode_bitmap(sb, i);
 		if (!bitmap_bh)
 			continue;
 
@@ -979,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
-		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	printk(KERN_DEBUG "ext4_count_free_inodes: "
+	       "stored = %u, computed = %lu, %lu\n",
+	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
 	return desc_count;
 #else
 	desc_count = 0;
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		gdp = ext4_get_group_desc (sb, i, NULL);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -996,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
 }
 
 /* Called at mount-time, super-block is locked */
-unsigned long ext4_count_dirs (struct super_block * sb)
+unsigned long ext4_count_dirs(struct super_block * sb)
 {
 	unsigned long count = 0;
 	ext4_group_t i;
 
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
 		count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8ca2763df091..8dbf6953845b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -41,6 +41,8 @@
 #include "acl.h"
 #include "ext4_extents.h"
 
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
@@ -188,9 +190,10 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 /*
  * Called at the last iput() if i_nlink is zero.
  */
-void ext4_delete_inode (struct inode * inode)
+void ext4_delete_inode(struct inode *inode)
 {
 	handle_t *handle;
+	int err;
 
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
@@ -199,8 +202,9 @@ void ext4_delete_inode (struct inode * inode)
 	if (is_bad_inode(inode))
 		goto no_delete;
 
-	handle = start_transaction(inode);
+	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
+		ext4_std_error(inode->i_sb, PTR_ERR(handle));
 		/*
 		 * If we're going to skip the normal cleanup, we still need to
 		 * make sure that the in-core orphan linked list is properly
@@ -213,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
 	if (IS_SYNC(inode))
 		handle->h_sync = 1;
 	inode->i_size = 0;
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext4_warning(inode->i_sb, __func__,
+			     "couldn't mark inode dirty (err %d)", err);
+		goto stop_handle;
+	}
 	if (inode->i_blocks)
 		ext4_truncate(inode);
+
+	/*
+	 * ext4_ext_truncate() doesn't reserve any slop when it
+	 * restarts journal transactions; therefore there may not be
+	 * enough credits left in the handle to remove the inode from
+	 * the orphan list and set the dtime field.
+	 */
+	if (handle->h_buffer_credits < 3) {
+		err = ext4_journal_extend(handle, 3);
+		if (err > 0)
+			err = ext4_journal_restart(handle, 3);
+		if (err != 0) {
+			ext4_warning(inode->i_sb, __func__,
+				     "couldn't extend journal (err %d)", err);
+		stop_handle:
+			ext4_journal_stop(handle);
+			goto no_delete;
+		}
+	}
+
 	/*
 	 * Kill off the orphan record which ext4_truncate created.
 	 * AKPM: I think this can be inside the above `if'.
@@ -300,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
 	int final = 0;
 
 	if (i_block < 0) {
-		ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
 		offsets[n++] = i_block;
 		final = direct_blocks;
-	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
 		offsets[n++] = EXT4_IND_BLOCK;
 		offsets[n++] = i_block;
 		final = ptrs;
@@ -370,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
 
 	*err = 0;
 	/* i_data is not going away, no lock needed */
-	add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
 	if (!p->key)
 		goto no_block;
 	while (--depth) {
 		bh = sb_bread(sb, le32_to_cpu(p->key));
 		if (!bh)
 			goto failure;
-		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
 		/* Reader: end */
 		if (!p->key)
 			goto no_block;
@@ -413,7 +443,7 @@ no_block:
 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
 	__le32 *p;
 	ext4_fsblk_t bg_start;
 	ext4_fsblk_t last_block;
@@ -456,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
 		Indirect *partial)
 {
-	struct ext4_block_alloc_info *block_i;
-
-	block_i =  EXT4_I(inode)->i_block_alloc_info;
-
 	/*
-	 * try the heuristic for sequential allocation,
-	 * failing that at least try to get decent locality.
+	 * XXX need to get goal block from mballoc's data structures
 	 */
-	if (block_i && (block == block_i->last_alloc_logical_block + 1)
-		&& (block_i->last_alloc_physical_block != 0)) {
-		return block_i->last_alloc_physical_block + 1;
-	}
 
 	return ext4_find_near(inode, partial);
 }
@@ -600,7 +621,7 @@ allocated:
 	*err = 0;
 	return ret;
 failed_out:
-	for (i = 0; i <index; i++)
+	for (i = 0; i < index; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 	return ret;
 }
@@ -673,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		branch[n].p = (__le32 *) bh->b_data + offsets[n];
 		branch[n].key = cpu_to_le32(new_blocks[n]);
 		*branch[n].p = branch[n].key;
-		if ( n == indirect_blks) {
+		if (n == indirect_blks) {
 			current_block = new_blocks[n];
 			/*
 			 * End of chain, update the last new metablock of
@@ -700,7 +721,7 @@ failed:
 		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
 		ext4_journal_forget(handle, branch[i].bh);
 	}
-	for (i = 0; i <indirect_blks; i++)
+	for (i = 0; i < indirect_blks; i++)
 		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
 
 	ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -727,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 {
 	int i;
 	int err = 0;
-	struct ext4_block_alloc_info *block_i;
 	ext4_fsblk_t current_block;
 
-	block_i = EXT4_I(inode)->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
@@ -753,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 	if (num == 0 && blks > 1) {
 		current_block = le32_to_cpu(where->key) + 1;
 		for (i = 1; i < blks; i++)
-			*(where->p + i ) = cpu_to_le32(current_block++);
-	}
-
-	/*
-	 * update the most recently allocated logical & physical block
-	 * in i_block_alloc_info, to assist find the proper goal block for next
-	 * allocation
-	 */
-	if (block_i) {
-		block_i->last_alloc_logical_block = block + blks - 1;
-		block_i->last_alloc_physical_block =
-				le32_to_cpu(where[num].key) + blks - 1;
+			*(where->p + i) = cpu_to_le32(current_block++);
 	}
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -884,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 		goto cleanup;
 
 	/*
-	 * Okay, we need to do block allocation.  Lazily initialize the block
-	 * allocation info here if necessary
+	 * Okay, we need to do block allocation.
 	*/
-	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
-		ext4_init_block_alloc_info(inode);
-
 	goal = ext4_find_goal(inode, iblock, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -952,23 +956,75 @@ out:
 	return err;
 }
 
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
 /*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate @blocks for non extent file based file
  */
-#define DIO_CREDITS 25
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ind_blks, dind_blks, tind_blks;
+
+	/* number of new indirect blocks needed */
+	ind_blks = (blocks + icap - 1) / icap;
+
+	dind_blks = (ind_blks + icap - 1) / icap;
+
+	tind_blks = 1;
 
+	return ind_blks + dind_blks + tind_blks;
+}
 
 /*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+	if (!blocks)
+		return 0;
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_calc_metadata_amount(inode, blocks);
+
+	return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static void ext4_da_update_reserve_space(struct inode *inode, int used)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int total, mdb, mdb_free;
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	/* recalculate the number of metablocks still need to be reserved */
+	total = EXT4_I(inode)->i_reserved_data_blocks - used;
+	mdb = ext4_calc_metadata_amount(inode, total);
+
+	/* figure out how many metablocks to release */
+	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+	if (mdb_free) {
+		/* Account for allocated meta_blocks */
+		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+		/* update fs dirty blocks counter */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+		EXT4_I(inode)->i_allocated_meta_blocks = 0;
+		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+	}
+
+	/* update per-inode reservations */
+	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
+	EXT4_I(inode)->i_reserved_data_blocks -= used;
+
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+/*
+ * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * and returns if the blocks are already mapped.
  *
- *
- * ext4_ext4 get_block() wrapper function
- * It will do a look up first, and returns if the blocks already mapped.
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
  * and store the allocated blocks in the result buffer head and mark it
  * mapped.
@@ -1069,26 +1125,30 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		 * which were deferred till now
 		 */
 		if ((retval > 0) && buffer_delay(bh))
-			ext4_da_release_space(inode, retval, 0);
+			ext4_da_update_reserve_space(inode, retval);
 	}
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
 }
 
-static int ext4_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
 	int ret = 0, started = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int dio_credits;
 
 	if (create && !handle) {
 		/* Direct IO write... */
 		if (max_blocks > DIO_MAX_BLOCKS)
 			max_blocks = DIO_MAX_BLOCKS;
-		handle = ext4_journal_start(inode, DIO_CREDITS +
-			      2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		handle = ext4_journal_start(inode, dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			goto out;
@@ -1156,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 			BUFFER_TRACE(bh, "call get_create_access");
 			fatal = ext4_journal_get_create_access(handle, bh);
 			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data,0,inode->i_sb->s_blocksize);
+				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
@@ -1181,7 +1241,7 @@ err:
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t block, int create, int *err)
 {
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 
 	bh = ext4_getblk(handle, inode, block, create, err);
 	if (!bh)
@@ -1197,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
 	return NULL;
 }
 
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+static int walk_page_buffers(handle_t *handle,
+			     struct buffer_head *head,
+			     unsigned from,
+			     unsigned to,
+			     int *partial,
+			     int (*fn)(handle_t *handle,
+				       struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -1211,9 +1271,9 @@ static int walk_page_buffers(	handle_t *handle,
 	int err, ret = 0;
 	struct buffer_head *next;
 
-	for (	bh = head, block_start = 0;
-		ret == 0 && (bh != head || !block_start);
-		block_start = block_end, bh = next)
+	for (bh = head, block_start = 0;
+	     ret == 0 && (bh != head || !block_start);
+	     block_start = block_end, bh = next)
 	{
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
@@ -1266,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
- 	struct inode *inode = mapping->host;
+	struct inode *inode = mapping->host;
 	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle_t *handle;
 	int retries = 0;
- 	struct page *page;
+	struct page *page;
  	pgoff_t index;
- 	unsigned from, to;
+	unsigned from, to;
 
  	index = pos >> PAGE_CACHE_SHIFT;
- 	from = pos & (PAGE_CACHE_SIZE - 1);
- 	to = from + len;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + len;
 
 retry:
-  	handle = ext4_journal_start(inode, needed_blocks);
-  	if (IS_ERR(handle)) {
-  		ret = PTR_ERR(handle);
-  		goto out;
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
 	}
 
 	page = __grab_cache_page(mapping, index);
@@ -1302,9 +1362,16 @@ retry:
 	}
 
 	if (ret) {
- 		unlock_page(page);
+		unlock_page(page);
 		ext4_journal_stop(handle);
- 		page_cache_release(page);
+		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1336,25 +1403,23 @@ static int ext4_ordered_write_end(struct file *file,
 {
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
-	unsigned from, to;
 	int ret = 0, ret2;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
-	to = from + len;
-
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
-		/*
-		 * generic_write_end() will run mark_inode_dirty() if i_size
-		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
-		 * into that.
-		 */
 		loff_t new_i_size;
 
 		new_i_size = pos + copied;
-		if (new_i_size > EXT4_I(inode)->i_disksize)
-			EXT4_I(inode)->i_disksize = new_i_size;
+		if (new_i_size > EXT4_I(inode)->i_disksize) {
+			ext4_update_i_disksize(inode, new_i_size);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
+		}
+
 		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
@@ -1379,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file,
 	loff_t new_i_size;
 
 	new_i_size = pos + copied;
-	if (new_i_size > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = new_i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
+		/* We need to mark inode dirty even if
+		 * new_i_size is less that inode->i_size
+		 * bu greater than i_disksize.(hint delalloc)
+		 */
+		ext4_mark_inode_dirty(handle, inode);
+	}
 
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
@@ -1405,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
 	int ret = 0, ret2;
 	int partial = 0;
 	unsigned from, to;
+	loff_t new_i_size;
 
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1419,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file,
 				to, &partial, write_end_fn);
 	if (!partial)
 		SetPageUptodate(page);
-	if (pos+copied > inode->i_size)
+	new_i_size = pos + copied;
+	if (new_i_size > inode->i_size)
 		i_size_write(inode, pos+copied);
 	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
-	if (inode->i_size > EXT4_I(inode)->i_disksize) {
-		EXT4_I(inode)->i_disksize = inode->i_size;
+	if (new_i_size > EXT4_I(inode)->i_disksize) {
+		ext4_update_i_disksize(inode, new_i_size);
 		ret2 = ext4_mark_inode_dirty(handle, inode);
 		if (!ret)
 			ret = ret2;
@@ -1437,39 +1510,10 @@ static int ext4_journalled_write_end(struct file *file,
 
 	return ret ? ret : copied;
 }
-/*
- * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
- */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
-{
-	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
-	int ind_blks, dind_blks, tind_blks;
-
-	/* number of new indirect blocks needed */
-	ind_blks = (blocks + icap - 1) / icap;
-
-	dind_blks = (ind_blks + icap - 1) / icap;
-
-	tind_blks = 1;
-
-	return ind_blks + dind_blks + tind_blks;
-}
-
-/*
- * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
- */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
-{
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-		return ext4_ext_calc_metadata_amount(inode, blocks);
-
-	return ext4_indirect_calc_metadata_amount(inode, blocks);
-}
 
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
+	int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned long md_needed, mdblocks, total = 0;
 
@@ -1478,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
+repeat:
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
 	mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1486,14 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
-	if (ext4_has_free_blocks(sbi, total) < total) {
+	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+			yield();
+			goto repeat;
+		}
 		return -ENOSPC;
 	}
-
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
 	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
@@ -1501,35 +1546,49 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 	return 0;       /* success */
 }
 
-void ext4_da_release_space(struct inode *inode, int used, int to_free)
+static void ext4_da_release_space(struct inode *inode, int to_free)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int total, mdb, mdb_free, release;
 
+	if (!to_free)
+		return;		/* Nothing to release, exit */
+
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	if (!EXT4_I(inode)->i_reserved_data_blocks) {
+		/*
+		 * if there is no reserved blocks, but we try to free some
+		 * then the counter is messed up somewhere.
+		 * but since this function is called from invalidate
+		 * page, it's harmless to return without any action
+		 */
+		printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+			    "blocks for inode %lu, but there is no reserved "
+			    "data blocks\n", to_free, inode->i_ino);
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		return;
+	}
+
 	/* recalculate the number of metablocks still need to be reserved */
-	total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+	total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
 	mdb = ext4_calc_metadata_amount(inode, total);
 
 	/* figure out how many metablocks to release */
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
 
-	/* Account for allocated meta_blocks */
-	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
 	release = to_free + mdb_free;
 
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, release);
+	/* update fs dirty blocks counter for truncate case */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
 
 	/* update per-inode reservations */
-	BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
-	EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+	EXT4_I(inode)->i_reserved_data_blocks -= to_free;
 
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-	EXT4_I(inode)->i_allocated_meta_blocks = 0;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
@@ -1551,7 +1610,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 		}
 		curr_off = next_off;
 	} while ((bh = bh->b_this_page) != head);
-	ext4_da_release_space(page->mapping->host, 0, to_release);
+	ext4_da_release_space(page->mapping->host, to_release);
 }
 
 /*
@@ -1564,11 +1623,14 @@ struct mpage_da_data {
 	unsigned long first_page, next_page;	/* extent of pages */
 	get_block_t *get_block;
 	struct writeback_control *wbc;
+	int io_done;
+	long pages_written;
+	int retval;
 };
 
 /*
  * mpage_da_submit_io - walks through extent of pages and try to write
- * them with __mpage_writepage()
+ * them with writepage() call back
  *
  * @mpd->inode: inode
  * @mpd->first_page: first page of the extent
@@ -1583,37 +1645,42 @@ struct mpage_da_data {
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
 	struct address_space *mapping = mpd->inode->i_mapping;
-	struct mpage_data mpd_pp = {
-		.bio = NULL,
-		.last_block_in_bio = 0,
-		.get_block = mpd->get_block,
-		.use_writepage = 1,
-	};
 	int ret = 0, err, nr_pages, i;
 	unsigned long index, end;
 	struct pagevec pvec;
+	long pages_skipped;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-
 	pagevec_init(&pvec, 0);
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
 
 	while (index <= end) {
-		/* XXX: optimize tail */
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		/*
+		 * We can use PAGECACHE_TAG_DIRTY lookup here because
+		 * even though we have cleared the dirty flag on the page
+		 * We still keep the page in the radix tree with tag
+		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+		 * which is called via the below writepage callback.
+		 */
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY,
+					min(end - index,
+					(pgoff_t)PAGEVEC_SIZE-1) + 1);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
-			index = page->index;
-			if (index > end)
-				break;
-			index++;
-
-			err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
-
+			pages_skipped = mpd->wbc->pages_skipped;
+			err = mapping->a_ops->writepage(page, mpd->wbc);
+			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+				/*
+				 * have successfully written the page
+				 * without skipping the same
+				 */
+				mpd->pages_written++;
 			/*
 			 * In error case, we have to continue because
 			 * remaining pages are still locked
@@ -1624,9 +1691,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 		}
 		pagevec_release(&pvec);
 	}
-	if (mpd_pp.bio)
-		mpage_bio_submit(WRITE, mpd_pp.bio);
-
 	return ret;
 }
 
@@ -1649,7 +1713,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 	int blocks = exbh->b_size >> inode->i_blkbits;
 	sector_t pblock = exbh->b_blocknr, cur_logical;
 	struct buffer_head *head, *bh;
-	unsigned long index, end;
+	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
 
@@ -1692,6 +1756,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				if (buffer_delay(bh)) {
 					bh->b_blocknr = pblock;
 					clear_buffer_delay(bh);
+					bh->b_bdev = inode->i_sb->s_bdev;
+				} else if (buffer_unwritten(bh)) {
+					bh->b_blocknr = pblock;
+					clear_buffer_unwritten(bh);
+					set_buffer_mapped(bh);
+					set_buffer_new(bh);
+					bh->b_bdev = inode->i_sb->s_bdev;
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
@@ -1719,6 +1790,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
 		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
 }
 
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
+					sector_t logical, long blk_cnt)
+{
+	int nr_pages, i;
+	pgoff_t index, end;
+	struct pagevec pvec;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end   = (logical + blk_cnt - 1) >>
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+	while (index <= end) {
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+			block_invalidatepage(page, 0);
+			ClearPageUptodate(page);
+			unlock_page(page);
+		}
+	}
+	return;
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	printk(KERN_EMERG "Total free blocks count %lld\n",
+			ext4_count_free_blocks(inode->i_sb));
+	printk(KERN_EMERG "Free/Dirty block details\n");
+	printk(KERN_EMERG "free_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_freeblocks_counter));
+	printk(KERN_EMERG "dirty_blocks=%lld\n",
+			percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+	printk(KERN_EMERG "Block reservation details\n");
+	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_data_blocks);
+	printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+			EXT4_I(inode)->i_reserved_meta_blocks);
+	return;
+}
+
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -1727,54 +1849,87 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  *
  * The function skips space we know is already mapped to disk blocks.
  *
- * The function ignores errors ->get_block() returns, thus real
- * error handling is postponed to __mpage_writepage()
  */
-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-	struct buffer_head *lbh = &mpd->lbh;
-	int err = 0, remain = lbh->b_size;
-	sector_t next = lbh->b_blocknr;
+	int err = 0;
 	struct buffer_head new;
+	struct buffer_head *lbh = &mpd->lbh;
+	sector_t next;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if (buffer_mapped(lbh) && !buffer_delay(lbh))
-		return;
+		return 0;
+	new.b_state = lbh->b_state;
+	new.b_blocknr = 0;
+	new.b_size = lbh->b_size;
+	next = lbh->b_blocknr;
+	/*
+	 * If we didn't accumulate anything
+	 * to write simply return
+	 */
+	if (!new.b_size)
+		return 0;
+	err = mpd->get_block(mpd->inode, next, &new, 1);
+	if (err) {
 
-	while (remain) {
-		new.b_state = lbh->b_state;
-		new.b_blocknr = 0;
-		new.b_size = remain;
-		err = mpd->get_block(mpd->inode, next, &new, 1);
-		if (err) {
-			/*
-			 * Rather than implement own error handling
-			 * here, we just leave remaining blocks
-			 * unallocated and try again with ->writepage()
-			 */
-			break;
-		}
-		BUG_ON(new.b_size == 0);
+		/* If get block returns with error
+		 * we simply return. Later writepage
+		 * will redirty the page and writepages
+		 * will find the dirty page again
+		 */
+		if (err == -EAGAIN)
+			return 0;
 
-		if (buffer_new(&new))
-			__unmap_underlying_blocks(mpd->inode, &new);
+		if (err == -ENOSPC &&
+				ext4_count_free_blocks(mpd->inode->i_sb)) {
+			mpd->retval = err;
+			return 0;
+		}
 
 		/*
-		 * If blocks are delayed marked, we need to
-		 * put actual blocknr and drop delayed bit
+		 * get block failure will cause us
+		 * to loop in writepages. Because
+		 * a_ops->writepage won't be able to
+		 * make progress. The page will be redirtied
+		 * by writepage and writepages will again
+		 * try to write the same.
 		 */
-		if (buffer_delay(lbh))
-			mpage_put_bnr_to_bhs(mpd, next, &new);
-
-		/* go for the remaining blocks */
-		next += new.b_size >> mpd->inode->i_blkbits;
-		remain -= new.b_size;
+		printk(KERN_EMERG "%s block allocation failed for inode %lu "
+				  "at logical offset %llu with max blocks "
+				  "%zd with error %d\n",
+				  __func__, mpd->inode->i_ino,
+				  (unsigned long long)next,
+				  lbh->b_size >> mpd->inode->i_blkbits, err);
+		printk(KERN_EMERG "This should not happen.!! "
+					"Data will be lost\n");
+		if (err == -ENOSPC) {
+			ext4_print_free_blocks(mpd->inode);
+		}
+		/* invlaidate all the pages */
+		ext4_da_block_invalidatepages(mpd, next,
+				lbh->b_size >> mpd->inode->i_blkbits);
+		return err;
 	}
+	BUG_ON(new.b_size == 0);
+
+	if (buffer_new(&new))
+		__unmap_underlying_blocks(mpd->inode, &new);
+
+	/*
+	 * If blocks are delayed marked, we need to
+	 * put actual blocknr and drop delayed bit
+	 */
+	if (buffer_delay(lbh) || buffer_unwritten(lbh))
+		mpage_put_bnr_to_bhs(mpd, next, &new);
+
+	return 0;
 }
 
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+		(1 << BH_Delay) | (1 << BH_Unwritten))
 
 /*
  * mpage_add_bh_to_extent - try to add one more block to extent of blocks
@@ -1788,41 +1943,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 				   sector_t logical, struct buffer_head *bh)
 {
-	struct buffer_head *lbh = &mpd->lbh;
 	sector_t next;
+	size_t b_size = bh->b_size;
+	struct buffer_head *lbh = &mpd->lbh;
+	int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
 
-	next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
-
+	/* check if thereserved journal credits might overflow */
+	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+			/*
+			 * With non-extent format we are limited by the journal
+			 * credit available.  Total credit needed to insert
+			 * nrblocks contiguous blocks is dependent on the
+			 * nrblocks.  So limit nrblocks.
+			 */
+			goto flush_it;
+		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
+				EXT4_MAX_TRANS_DATA) {
+			/*
+			 * Adding the new buffer_head would make it cross the
+			 * allowed limit for which we have journal credit
+			 * reserved. So limit the new bh->b_size
+			 */
+			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
+						mpd->inode->i_blkbits;
+			/* we will do mpage_da_submit_io in the next loop */
+		}
+	}
 	/*
 	 * First block in the extent
 	 */
 	if (lbh->b_size == 0) {
 		lbh->b_blocknr = logical;
-		lbh->b_size = bh->b_size;
+		lbh->b_size = b_size;
 		lbh->b_state = bh->b_state & BH_FLAGS;
 		return;
 	}
 
+	next = lbh->b_blocknr + nrblocks;
 	/*
 	 * Can we merge the block to our big extent?
 	 */
 	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
-		lbh->b_size += bh->b_size;
+		lbh->b_size += b_size;
 		return;
 	}
 
+flush_it:
 	/*
 	 * We couldn't merge the block to our extent, so we
 	 * need to flush current  extent and start new one
 	 */
-	mpage_da_map_blocks(mpd);
-
-	/*
-	 * Now start a new extent
-	 */
-	lbh->b_size = bh->b_size;
-	lbh->b_state = bh->b_state & BH_FLAGS;
-	lbh->b_blocknr = logical;
+	if (mpage_da_map_blocks(mpd) == 0)
+		mpage_da_submit_io(mpd);
+	mpd->io_done = 1;
+	return;
 }
 
 /*
@@ -1842,17 +2017,35 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head, fake;
 	sector_t logical;
 
+	if (mpd->io_done) {
+		/*
+		 * Rest of the page in the page_vec
+		 * redirty then and skip then. We will
+		 * try to to write them again after
+		 * starting a new transaction
+		 */
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return MPAGE_DA_EXTENT_TAIL;
+	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
 	if (mpd->next_page != page->index) {
 		/*
 		 * Nope, we can't. So, we map non-allocated blocks
-		 * and start IO on them using __mpage_writepage()
+		 * and start IO on them using writepage()
 		 */
 		if (mpd->next_page != mpd->first_page) {
-			mpage_da_map_blocks(mpd);
-			mpage_da_submit_io(mpd);
+			if (mpage_da_map_blocks(mpd) == 0)
+				mpage_da_submit_io(mpd);
+			/*
+			 * skip rest of the page in the page_vec
+			 */
+			mpd->io_done = 1;
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return MPAGE_DA_EXTENT_TAIL;
 		}
 
 		/*
@@ -1883,6 +2076,8 @@ static int __mpage_da_writepage(struct page *page,
 		set_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
 		mpage_add_bh_to_extent(mpd, logical, bh);
+		if (mpd->io_done)
+			return MPAGE_DA_EXTENT_TAIL;
 	} else {
 		/*
 		 * Page with regular buffer heads, just add all dirty ones
@@ -1891,8 +2086,12 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
-			if (buffer_dirty(bh))
+			if (buffer_dirty(bh) &&
+				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
+				if (mpd->io_done)
+					return MPAGE_DA_EXTENT_TAIL;
+			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
 	}
@@ -1911,46 +2110,37 @@ static int __mpage_da_writepage(struct page *page,
  *
  * This is a library function, which implements the writepages()
  * address_space_operation.
- *
- * In order to avoid duplication of logic that deals with partial pages,
- * multiple bio per page, etc, we find non-allocated blocks, allocate
- * them with minimal calls to ->get_block() and re-use __mpage_writepage()
- *
- * It's important that we call __mpage_writepage() only once for each
- * involved page, otherwise we'd have to implement more complicated logic
- * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
- *
- * See comments to mpage_writepages()
  */
 static int mpage_da_writepages(struct address_space *mapping,
 			       struct writeback_control *wbc,
-			       get_block_t get_block)
+			       struct mpage_da_data *mpd)
 {
-	struct mpage_da_data mpd;
 	int ret;
 
-	if (!get_block)
+	if (!mpd->get_block)
 		return generic_writepages(mapping, wbc);
 
-	mpd.wbc = wbc;
-	mpd.inode = mapping->host;
-	mpd.lbh.b_size = 0;
-	mpd.lbh.b_state = 0;
-	mpd.lbh.b_blocknr = 0;
-	mpd.first_page = 0;
-	mpd.next_page = 0;
-	mpd.get_block = get_block;
-
-	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+	mpd->lbh.b_size = 0;
+	mpd->lbh.b_state = 0;
+	mpd->lbh.b_blocknr = 0;
+	mpd->first_page = 0;
+	mpd->next_page = 0;
+	mpd->io_done = 0;
+	mpd->pages_written = 0;
+	mpd->retval = 0;
 
+	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
 	/*
 	 * Handle last extent of pages
 	 */
-	if (mpd.next_page != mpd.first_page) {
-		mpage_da_map_blocks(&mpd);
-		mpage_da_submit_io(&mpd);
-	}
+	if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+		if (mpage_da_map_blocks(mpd) == 0)
+			mpage_da_submit_io(mpd);
 
+		mpd->io_done = 1;
+		ret = MPAGE_DA_EXTENT_TAIL;
+	}
+	wbc->nr_to_write -= mpd->pages_written;
 	return ret;
 }
 
@@ -2003,18 +2193,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle_t *handle = NULL;
 
 	handle = ext4_journal_current_handle();
-	if (!handle) {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, 0, 0, 0);
-		BUG_ON(!ret);
-	} else {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
-	}
-
+	BUG_ON(!handle);
+	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+			bh_result, create, 0, EXT4_DELALLOC_RSVED);
 	if (ret > 0) {
+
 		bh_result->b_size = (ret << inode->i_blkbits);
 
+		if (ext4_should_order_data(inode)) {
+			int retval;
+			retval = ext4_jbd2_file_inode(handle, inode);
+			if (retval)
+				/*
+				 * Failed to add inode for ordered
+				 * mode. Don't update file size
+				 */
+				return retval;
+		}
+
 		/*
 		 * Update on-disk size along with block allocation
 		 * we don't use 'extend_disksize' as size may change
@@ -2024,18 +2220,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
 		if (disksize > EXT4_I(inode)->i_disksize) {
-			/*
-			 * XXX: replace with spinlock if seen contended -bzzz
-			 */
-			down_write(&EXT4_I(inode)->i_data_sem);
-			if (disksize > EXT4_I(inode)->i_disksize)
-				EXT4_I(inode)->i_disksize = disksize;
-			up_write(&EXT4_I(inode)->i_data_sem);
-
-			if (EXT4_I(inode)->i_disksize == disksize) {
-				ret = ext4_mark_inode_dirty(handle, inode);
-				return ret;
-			}
+			ext4_update_i_disksize(inode, disksize);
+			ret = ext4_mark_inode_dirty(handle, inode);
+			return ret;
 		}
 		ret = 0;
 	}
@@ -2155,102 +2342,177 @@ static int ext4_da_writepage(struct page *page,
 }
 
 /*
- * For now just follow the DIO way to estimate the max credits
- * needed to write out EXT4_MAX_WRITEBACK_PAGES.
- * todo: need to calculate the max credits need for
- * extent based files, currently the DIO credits is based on
- * indirect-blocks mapping way.
- *
- * Probably should have a generic way to calculate credits
- * for DIO, writepages, and truncate
+ * This is called via ext4_da_writepages() to
+ * calulate the total number of credits to reserve to fit
+ * a single extent allocation into a single transaction,
+ * ext4_da_writpeages() will loop calling this before
+ * the block allocation.
  */
-#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
-#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+	/*
+	 * With non-extent format the journal credit needed to
+	 * insert nrblocks contiguous block is dependent on
+	 * number of contiguous block. So we will limit
+	 * number of contiguous block to a sane value
+	 */
+	if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+	    (max_blocks > EXT4_MAX_TRANS_DATA))
+		max_blocks = EXT4_MAX_TRANS_DATA;
+
+	return ext4_chunk_trans_blocks(inode, max_blocks);
+}
 
 static int ext4_da_writepages(struct address_space *mapping,
-				struct writeback_control *wbc)
+			      struct writeback_control *wbc)
 {
-	struct inode *inode = mapping->host;
+	pgoff_t	index;
+	int range_whole = 0;
 	handle_t *handle = NULL;
-	int needed_blocks;
-	int ret = 0;
-	long to_write;
-	loff_t range_start = 0;
+	struct mpage_da_data mpd;
+	struct inode *inode = mapping->host;
+	int no_nrwrite_index_update;
+	long pages_written = 0, pages_skipped;
+	int needed_blocks, ret = 0, nr_to_writebump = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
 	 * because that could violate lock ordering on umount
 	 */
-	if (!mapping->nrpages)
+	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
+	/*
+	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
+	 * This make sure small files blocks are allocated in
+	 * single attempt. This ensure that small files
+	 * get less fragmented.
+	 */
+	if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+		wbc->nr_to_write = sbi->s_mb_stream_request;
+	}
+	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+		range_whole = 1;
+
+	if (wbc->range_cyclic)
+		index = mapping->writeback_index;
+	else
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+
+	mpd.wbc = wbc;
+	mpd.inode = mapping->host;
 
 	/*
-	 * Estimate the worse case needed credits to write out
-	 * EXT4_MAX_BUF_BLOCKS pages
+	 * we don't want write_cache_pages to update
+	 * nr_to_write and writeback_index
 	 */
-	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+	wbc->no_nrwrite_index_update = 1;
+	pages_skipped = wbc->pages_skipped;
+
+	while (!ret && wbc->nr_to_write > 0) {
 
-	to_write = wbc->nr_to_write;
-	if (!wbc->range_cyclic) {
 		/*
-		 * If range_cyclic is not set force range_cont
-		 * and save the old writeback_index
+		 * we  insert one extent at a time. So we need
+		 * credit needed for single extent allocation.
+		 * journalled mode is currently not supported
+		 * by delalloc
 		 */
-		wbc->range_cont = 1;
-		range_start =  wbc->range_start;
-	}
+		BUG_ON(ext4_should_journal_data(inode));
+		needed_blocks = ext4_da_writepages_trans_blocks(inode);
 
-	while (!ret && to_write) {
 		/* start a new transaction*/
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
+			printk(KERN_EMERG "%s: jbd2_start: "
+			       "%ld pages, ino %lu; err %d\n", __func__,
+				wbc->nr_to_write, inode->i_ino, ret);
+			dump_stack();
 			goto out_writepages;
 		}
-		if (ext4_should_order_data(inode)) {
-			/*
-			 * With ordered mode we need to add
-			 * the inode to the journal handle
-			 * when we do block allocation.
-			 */
-			ret = ext4_jbd2_file_inode(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
-				goto out_writepages;
-			}
-
-		}
-		/*
-		 * set the max dirty pages could be write at a time
-		 * to fit into the reserved transaction credits
-		 */
-		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
-			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+		mpd.get_block = ext4_da_get_block_write;
+		ret = mpage_da_writepages(mapping, wbc, &mpd);
 
-		to_write -= wbc->nr_to_write;
-		ret = mpage_da_writepages(mapping, wbc,
-						ext4_da_get_block_write);
 		ext4_journal_stop(handle);
-		if (wbc->nr_to_write) {
+
+		if (mpd.retval == -ENOSPC) {
+			/* commit the transaction which would
+			 * free blocks released in the transaction
+			 * and try again
+			 */
+			jbd2_journal_force_commit_nested(sbi->s_journal);
+			wbc->pages_skipped = pages_skipped;
+			ret = 0;
+		} else if (ret == MPAGE_DA_EXTENT_TAIL) {
+			/*
+			 * got one extent now try with
+			 * rest of the pages
+			 */
+			pages_written += mpd.pages_written;
+			wbc->pages_skipped = pages_skipped;
+			ret = 0;
+		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
 			 * or we requested for a noblocking writeout
 			 * and we found the device congested
 			 */
-			to_write += wbc->nr_to_write;
 			break;
-		}
-		wbc->nr_to_write = to_write;
 	}
+	if (pages_skipped != wbc->pages_skipped)
+		printk(KERN_EMERG "This should not happen leaving %s "
+				"with nr_to_write = %ld ret = %d\n",
+				__func__, wbc->nr_to_write, ret);
+
+	/* Update index */
+	index += pages_written;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		/*
+		 * set the writeback_index so that range_cyclic
+		 * mode will write it back later
+		 */
+		mapping->writeback_index = index;
 
 out_writepages:
-	wbc->nr_to_write = to_write;
-	if (range_start)
-		wbc->range_start = range_start;
+	if (!no_nrwrite_index_update)
+		wbc->no_nrwrite_index_update = 0;
+	wbc->nr_to_write -= nr_to_writebump;
 	return ret;
 }
 
+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+	s64 free_blocks, dirty_blocks;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	/*
+	 * switch to non delalloc mode if we are running low
+	 * on free block. The free block accounting via percpu
+	 * counters can get slightly wrong with FBC_BATCH getting
+	 * accumulated on each CPU without updating global counters
+	 * Delalloc need an accurate free block accounting. So switch
+	 * to non delalloc when we are near to error range.
+	 */
+	free_blocks  = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+	if (2 * free_blocks < 3 * dirty_blocks ||
+		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+		/*
+		 * free block count is less that 150% of dirty blocks
+		 * or free blocks is less that watermark
+		 */
+		return 1;
+	}
+	return 0;
+}
+
 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
@@ -2266,6 +2528,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
+	if (ext4_nonda_switch(inode->i_sb)) {
+		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+		return ext4_write_begin(file, mapping, pos,
+					len, flags, pagep, fsdata);
+	}
+	*fsdata = (void *)0;
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2280,8 +2548,11 @@ retry:
 	}
 
 	page = __grab_cache_page(mapping, index);
-	if (!page)
-		return -ENOMEM;
+	if (!page) {
+		ext4_journal_stop(handle);
+		ret = -ENOMEM;
+		goto out;
+	}
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
@@ -2290,6 +2561,13 @@ retry:
 		unlock_page(page);
 		ext4_journal_stop(handle);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -2313,7 +2591,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 	bh = page_buffers(page);
 	idx = offset >> inode->i_blkbits;
 
-	for (i=0; i < idx; i++)
+	for (i = 0; i < idx; i++)
 		bh = bh->b_this_page;
 
 	if (!buffer_mapped(bh) || (buffer_delay(bh)))
@@ -2331,9 +2609,22 @@ static int ext4_da_write_end(struct file *file,
 	handle_t *handle = ext4_journal_current_handle();
 	loff_t new_i_size;
 	unsigned long start, end;
+	int write_mode = (int)(unsigned long)fsdata;
+
+	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+		if (ext4_should_order_data(inode)) {
+			return ext4_ordered_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else if (ext4_should_writeback_data(inode)) {
+			return ext4_writeback_write_end(file, mapping, pos,
+					len, copied, page, fsdata);
+		} else {
+			BUG();
+		}
+	}
 
 	start = pos & (PAGE_CACHE_SIZE - 1);
-	end = start + copied -1;
+	end = start + copied - 1;
 
 	/*
 	 * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2357,6 +2648,11 @@ static int ext4_da_write_end(struct file *file,
 				EXT4_I(inode)->i_disksize = new_i_size;
 			}
 			up_write(&EXT4_I(inode)->i_data_sem);
+			/* We need to mark inode dirty even if
+			 * new_i_size is less that inode->i_size
+			 * bu greater than i_disksize.(hint delalloc)
+			 */
+			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
 	ret2 = generic_write_end(file, mapping, pos, len, copied,
@@ -2448,7 +2744,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 			return 0;
 	}
 
-	return generic_block_bmap(mapping,block,ext4_get_block);
+	return generic_block_bmap(mapping, block, ext4_get_block);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2806,59 +3102,63 @@ static int ext4_journalled_set_page_dirty(struct page *page)
 }
 
 static const struct address_space_operations ext4_ordered_aops = {
-	.readpage	= ext4_readpage,
-	.readpages	= ext4_readpages,
-	.writepage	= ext4_normal_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext4_write_begin,
-	.write_end	= ext4_ordered_write_end,
-	.bmap		= ext4_bmap,
-	.invalidatepage	= ext4_invalidatepage,
-	.releasepage	= ext4_releasepage,
-	.direct_IO	= ext4_direct_IO,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_normal_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_write_begin,
+	.write_end		= ext4_ordered_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations ext4_writeback_aops = {
-	.readpage	= ext4_readpage,
-	.readpages	= ext4_readpages,
-	.writepage	= ext4_normal_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext4_write_begin,
-	.write_end	= ext4_writeback_write_end,
-	.bmap		= ext4_bmap,
-	.invalidatepage	= ext4_invalidatepage,
-	.releasepage	= ext4_releasepage,
-	.direct_IO	= ext4_direct_IO,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_normal_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_write_begin,
+	.write_end		= ext4_writeback_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations ext4_journalled_aops = {
-	.readpage	= ext4_readpage,
-	.readpages	= ext4_readpages,
-	.writepage	= ext4_journalled_writepage,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext4_write_begin,
-	.write_end	= ext4_journalled_write_end,
-	.set_page_dirty	= ext4_journalled_set_page_dirty,
-	.bmap		= ext4_bmap,
-	.invalidatepage	= ext4_invalidatepage,
-	.releasepage	= ext4_releasepage,
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_journalled_writepage,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_write_begin,
+	.write_end		= ext4_journalled_write_end,
+	.set_page_dirty		= ext4_journalled_set_page_dirty,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations ext4_da_aops = {
-	.readpage	= ext4_readpage,
-	.readpages	= ext4_readpages,
-	.writepage	= ext4_da_writepage,
-	.writepages	= ext4_da_writepages,
-	.sync_page	= block_sync_page,
-	.write_begin	= ext4_da_write_begin,
-	.write_end	= ext4_da_write_end,
-	.bmap		= ext4_bmap,
-	.invalidatepage	= ext4_da_invalidatepage,
-	.releasepage	= ext4_releasepage,
-	.direct_IO	= ext4_direct_IO,
-	.migratepage	= buffer_migrate_page,
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_da_writepage,
+	.writepages		= ext4_da_writepages,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_da_write_begin,
+	.write_end		= ext4_da_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_da_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+	.is_partially_uptodate  = block_is_partially_uptodate,
 };
 
 void ext4_set_aops(struct inode *inode)
@@ -3050,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
 	if (!partial->key && *partial->p)
 		/* Writer: end */
 		goto no_top;
-	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+	for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
 		;
 	/*
 	 * OK, we've found the last block that must survive. The rest of our
@@ -3069,7 +3369,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
 	}
 	/* Writer: end */
 
-	while(partial > p) {
+	while (partial > p) {
 		brelse(partial->bh);
 		partial--;
 	}
@@ -3261,9 +3561,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			/* This zaps the entire block.  Bottom up. */
 			BUFFER_TRACE(bh, "free child branches");
 			ext4_free_branches(handle, inode, bh,
-					   (__le32*)bh->b_data,
-					   (__le32*)bh->b_data + addr_per_block,
-					   depth);
+					(__le32 *) bh->b_data,
+					(__le32 *) bh->b_data + addr_per_block,
+					depth);
 
 			/*
 			 * We've probably journalled the indirect block several
@@ -3430,6 +3730,9 @@ void ext4_truncate(struct inode *inode)
 	 * modify the block allocation tree.
 	 */
 	down_write(&ei->i_data_sem);
+
+	ext4_discard_preallocations(inode);
+
 	/*
 	 * The orphan list entry will now protect us from any crash which
 	 * occurs before the truncate completes, so it is now safe to propagate
@@ -3499,8 +3802,6 @@ do_indirects:
 		;
 	}
 
-	ext4_discard_reservation(inode);
-
 	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
@@ -3525,41 +3826,6 @@ out_stop:
 	ext4_journal_stop(handle);
 }
 
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
-		unsigned long ino, struct ext4_iloc *iloc)
-{
-	ext4_group_t block_group;
-	unsigned long offset;
-	ext4_fsblk_t block;
-	struct ext4_group_desc *gdp;
-
-	if (!ext4_valid_inum(sb, ino)) {
-		/*
-		 * This error is already checked for in namei.c unless we are
-		 * looking at an NFS filehandle, in which case no error
-		 * report is needed
-		 */
-		return 0;
-	}
-
-	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
-	gdp = ext4_get_group_desc(sb, block_group, NULL);
-	if (!gdp)
-		return 0;
-
-	/*
-	 * Figure out the offset within the block group inode table
-	 */
-	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
-		EXT4_INODE_SIZE(sb);
-	block = ext4_inode_table(sb, gdp) +
-		(offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
-	iloc->block_group = block_group;
-	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
-	return block;
-}
-
 /*
  * ext4_get_inode_loc returns with an extra refcount against the inode's
  * underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -3569,23 +3835,49 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
 static int __ext4_get_inode_loc(struct inode *inode,
 				struct ext4_iloc *iloc, int in_mem)
 {
-	ext4_fsblk_t block;
-	struct buffer_head *bh;
+	struct ext4_group_desc	*gdp;
+	struct buffer_head	*bh;
+	struct super_block	*sb = inode->i_sb;
+	ext4_fsblk_t		block;
+	int			inodes_per_block, inode_offset;
+
+	iloc->bh = 0;
+	if (!ext4_valid_inum(sb, inode->i_ino))
+		return -EIO;
 
-	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
-	if (!block)
+	iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
 		return -EIO;
 
-	bh = sb_getblk(inode->i_sb, block);
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+	inode_offset = ((inode->i_ino - 1) %
+			EXT4_INODES_PER_GROUP(sb));
+	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error (inode->i_sb, "ext4_get_inode_loc",
-				"unable to read inode block - "
-				"inode=%lu, block=%llu",
-				 inode->i_ino, block);
+		ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+			   "inode block - inode=%lu, block=%llu",
+			   inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
 		lock_buffer(bh);
+
+		/*
+		 * If the buffer has the write error flag, we have failed
+		 * to write out another inode in the same block.  In this
+		 * case, we don't have to read the block because we may
+		 * read the old inode data successfully.
+		 */
+		if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+			set_buffer_uptodate(bh);
+
 		if (buffer_uptodate(bh)) {
 			/* someone brought it uptodate while we waited */
 			unlock_buffer(bh);
@@ -3599,28 +3891,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
 		 */
 		if (in_mem) {
 			struct buffer_head *bitmap_bh;
-			struct ext4_group_desc *desc;
-			int inodes_per_buffer;
-			int inode_offset, i;
-			ext4_group_t block_group;
-			int start;
-
-			block_group = (inode->i_ino - 1) /
-					EXT4_INODES_PER_GROUP(inode->i_sb);
-			inodes_per_buffer = bh->b_size /
-				EXT4_INODE_SIZE(inode->i_sb);
-			inode_offset = ((inode->i_ino - 1) %
-					EXT4_INODES_PER_GROUP(inode->i_sb));
-			start = inode_offset & ~(inodes_per_buffer - 1);
+			int i, start;
 
-			/* Is the inode bitmap in cache? */
-			desc = ext4_get_group_desc(inode->i_sb,
-						block_group, NULL);
-			if (!desc)
-				goto make_io;
+			start = inode_offset & ~(inodes_per_block - 1);
 
-			bitmap_bh = sb_getblk(inode->i_sb,
-				ext4_inode_bitmap(inode->i_sb, desc));
+			/* Is the inode bitmap in cache? */
+			bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
 			if (!bitmap_bh)
 				goto make_io;
 
@@ -3633,14 +3909,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
 				brelse(bitmap_bh);
 				goto make_io;
 			}
-			for (i = start; i < start + inodes_per_buffer; i++) {
+			for (i = start; i < start + inodes_per_block; i++) {
 				if (i == inode_offset)
 					continue;
 				if (ext4_test_bit(i, bitmap_bh->b_data))
 					break;
 			}
 			brelse(bitmap_bh);
-			if (i == start + inodes_per_buffer) {
+			if (i == start + inodes_per_block) {
 				/* all other inodes are free, so skip I/O */
 				memset(bh->b_data, 0, bh->b_size);
 				set_buffer_uptodate(bh);
@@ -3651,6 +3927,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 make_io:
 		/*
+		 * If we need to do any I/O, try to pre-readahead extra
+		 * blocks from the inode table.
+		 */
+		if (EXT4_SB(sb)->s_inode_readahead_blks) {
+			ext4_fsblk_t b, end, table;
+			unsigned num;
+
+			table = ext4_inode_table(sb, gdp);
+			/* Make sure s_inode_readahead_blks is a power of 2 */
+			while (EXT4_SB(sb)->s_inode_readahead_blks &
+			       (EXT4_SB(sb)->s_inode_readahead_blks-1))
+				EXT4_SB(sb)->s_inode_readahead_blks = 
+				   (EXT4_SB(sb)->s_inode_readahead_blks &
+				    (EXT4_SB(sb)->s_inode_readahead_blks-1));
+			b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+			if (table > b)
+				b = table;
+			end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+			num = EXT4_INODES_PER_GROUP(sb);
+			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+				num -= le16_to_cpu(gdp->bg_itable_unused);
+			table += num / inodes_per_block;
+			if (end > table)
+				end = table;
+			while (b <= end)
+				sb_breadahead(sb, b++);
+		}
+
+		/*
 		 * There are other valid inodes in the buffer, this inode
 		 * has in-inode xattrs, or we don't have this inode in memory.
 		 * Read the block from disk.
@@ -3660,10 +3966,9 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, "ext4_get_inode_loc",
-					"unable to read inode block - "
-					"inode=%lu, block=%llu",
-					inode->i_ino, block);
+			ext4_error(sb, __func__,
+				   "unable to read inode block - inode=%lu, "
+				   "block=%llu", inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -3755,11 +4060,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 
 	ret = __ext4_get_inode_loc(inode, &iloc, 0);
 	if (ret < 0)
@@ -3769,7 +4073,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
-	if(!(test_opt (inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
@@ -3787,7 +4091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		if (inode->i_mode == 0 ||
 		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
 			/* this inode is deleted */
-			brelse (bh);
+			brelse(bh);
 			ret = -ESTALE;
 			goto bad_inode;
 		}
@@ -3820,7 +4124,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {
-			brelse (bh);
+			brelse(bh);
 			ret = -EIO;
 			goto bad_inode;
 		}
@@ -3873,7 +4177,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	}
-	brelse (iloc.bh);
+	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
 	unlock_new_inode(inode);
 	return inode;
@@ -3890,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
 	struct inode *inode = &(ei->vfs_inode);
 	u64 i_blocks = inode->i_blocks;
 	struct super_block *sb = inode->i_sb;
-	int err = 0;
 
 	if (i_blocks <= ~0U) {
 		/*
@@ -3900,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = 0;
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
-	} else if (i_blocks <= 0xffffffffffffULL) {
+		return 0;
+	}
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+		return -EFBIG;
+
+	if (i_blocks <= 0xffffffffffffULL) {
 		/*
 		 * i_blocks can be represented in a 48 bit variable
 		 * as multiple of 512 bytes
 		 */
-		err = ext4_update_rocompat_feature(handle, sb,
-					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-		if (err)
-			goto  err_out;
-		/* i_block is stored in the split  48 bit fields */
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 		ei->i_flags &= ~EXT4_HUGE_FILE_FL;
 	} else {
-		/*
-		 * i_blocks should be represented in a 48 bit variable
-		 * as multiple of  file system block size
-		 */
-		err = ext4_update_rocompat_feature(handle, sb,
-					    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-		if (err)
-			goto  err_out;
 		ei->i_flags |= EXT4_HUGE_FILE_FL;
 		/* i_block is stored in file system block size */
 		i_blocks = i_blocks >> (inode->i_blkbits - 9);
 		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
 		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
 	}
-err_out:
-	return err;
+	return 0;
 }
 
 /*
@@ -3955,14 +4249,14 @@ static int ext4_do_update_inode(handle_t *handle,
 
 	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
-	if(!(test_opt(inode->i_sb, NO_UID32))) {
+	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
 /*
  * Fix up interoperability with old kernels. Otherwise, old inodes get
  * re-used with the upper 16 bits of the uid/gid intact
  */
-		if(!ei->i_dtime) {
+		if (!ei->i_dtime) {
 			raw_inode->i_uid_high =
 				cpu_to_le16(high_16_bits(inode->i_uid));
 			raw_inode->i_gid_high =
@@ -4050,7 +4344,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
-	brelse (bh);
+	brelse(bh);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -4258,57 +4552,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
+static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
+				      int chunk)
+{
+	int indirects;
+
+	/* if nrblocks are contiguous */
+	if (chunk) {
+		/*
+		 * With N contiguous data blocks, it need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
+		 * 2 dindirect blocks
+		 * 1 tindirect block
+		 */
+		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
+		return indirects + 3;
+	}
+	/*
+	 * if nrblocks are not contiguous, worse case, each block touch
+	 * a indirect block, and each indirect block touch a double indirect
+	 * block, plus a triple indirect block
+	 */
+	indirects = nrblocks * 2 + 1;
+	return indirects;
+}
+
+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return ext4_indirect_trans_blocks(inode, nrblocks, 0);
+	return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+}
 /*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
+ * Account for index blocks, block groups bitmaps and block group
+ * descriptor blocks if modify datablocks and index blocks
+ * worse case, the indexs blocks spread over different block groups
  *
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
+ * If datablocks are discontiguous, they are possible to spread over
+ * different block groups too. If they are contiugous, with flexbg,
+ * they could still across block group boundary.
  *
- * With ordered or writeback data it's the same, less the N data blocks.
+ * Also account for superblock, inode, quota and xattr blocks
+ */
+int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+	int groups, gdpblocks;
+	int idxblocks;
+	int ret = 0;
+
+	/*
+	 * How many index blocks need to touch to modify nrblocks?
+	 * The "Chunk" flag indicating whether the nrblocks is
+	 * physically contiguous on disk
+	 *
+	 * For Direct IO and fallocate, they calls get_block to allocate
+	 * one single extent at a time, so they could set the "Chunk" flag
+	 */
+	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+
+	ret = idxblocks;
+
+	/*
+	 * Now let's see how many group bitmaps and group descriptors need
+	 * to account
+	 */
+	groups = idxblocks;
+	if (chunk)
+		groups += 1;
+	else
+		groups += nrblocks;
+
+	gdpblocks = groups;
+	if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
+		groups = EXT4_SB(inode->i_sb)->s_groups_count;
+	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+
+	/* bitmaps and block group descriptor blocks */
+	ret += groups + gdpblocks;
+
+	/* Blocks for super block, inode, quota and xattr blocks */
+	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+	return ret;
+}
+
+/*
+ * Calulate the total number of credits to reserve to fit
+ * the modification of a single pages into a single transaction,
+ * which may include multiple chunks of block allocations.
  *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
+ * This could be called via ext4_write_begin()
  *
- * This still overestimates under most circumstances.  If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched.  Pah.
+ * We need to consider the worse case, when
+ * one new block per extent.
  */
-
 int ext4_writepage_trans_blocks(struct inode *inode)
 {
 	int bpp = ext4_journal_blocks_per_page(inode);
-	int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
 	int ret;
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-		return ext4_ext_writepage_trans_blocks(inode, bpp);
+	ret = ext4_meta_trans_blocks(inode, bpp, 0);
 
+	/* Account for data blocks for journalled mode */
 	if (ext4_should_journal_data(inode))
-		ret = 3 * (bpp + indirects) + 2;
-	else
-		ret = 2 * (bpp + indirects) + 2;
-
-#ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during DQUOT_INIT so
-	 * we will be updating only the data blocks + inodes */
-	ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
+		ret += bpp;
 	return ret;
 }
 
 /*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calling
+ * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do no need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+	return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
  * The caller must have previously called ext4_reserve_inode_write().
  * Give this, we know that the caller already has write access to iloc->bh.
  */
@@ -4581,6 +4947,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	loff_t size;
 	unsigned long len;
 	int ret = -EINVAL;
+	void *fsdata;
 	struct file *file = vma->vm_file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -4619,11 +4986,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	 * on the same page though
 	 */
 	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
-			len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
-			len, len, page, NULL);
+			len, len, page, fsdata);
 	if (ret < 0)
 		goto out_unlock;
 	ret = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba6..dc99b4776d58 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int flags;
-	unsigned short rsv_window_size;
 
-	ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+	ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
 		handle_t *handle = NULL;
-		int err;
+		int err, migrate = 0;
 		struct ext4_iloc iloc;
 		unsigned int oldflags;
 		unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (!capable(CAP_SYS_RESOURCE))
 				goto flags_out;
 		}
+		if (oldflags & EXT4_EXTENTS_FL) {
+			/* We don't support clearning extent flags */
+			if (!(flags & EXT4_EXTENTS_FL)) {
+				err = -EOPNOTSUPP;
+				goto flags_out;
+			}
+		} else if (flags & EXT4_EXTENTS_FL) {
+			/* migrate the file */
+			migrate = 1;
+			flags &= ~EXT4_EXTENTS_FL;
+		}
 
 		handle = ext4_journal_start(inode, 1);
 		if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@ flags_err:
 
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
 			err = ext4_change_inode_journal_flag(inode, jflag);
+		if (err)
+			goto flags_out;
+		if (migrate)
+			err = ext4_ext_migrate(inode);
 flags_out:
 		mutex_unlock(&inode->i_mutex);
 		mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
 			return ret;
 		}
 #endif
-	case EXT4_IOC_GETRSVSZ:
-		if (test_opt(inode->i_sb, RESERVATION)
-			&& S_ISREG(inode->i_mode)
-			&& ei->i_block_alloc_info) {
-			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
-			return put_user(rsv_window_size, (int __user *)arg);
-		}
-		return -ENOTTY;
-	case EXT4_IOC_SETRSVSZ: {
-		int err;
-
-		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
-			return -ENOTTY;
-
-		if (!is_owner_or_cap(inode))
-			return -EACCES;
-
-		if (get_user(rsv_window_size, (int __user *)arg))
-			return -EFAULT;
-
-		err = mnt_want_write(filp->f_path.mnt);
-		if (err)
-			return err;
-
-		if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
-			rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
-
-		/*
-		 * need to allocate reservation structure for this inode
-		 * before set the window size
-		 */
-		down_write(&ei->i_data_sem);
-		if (!ei->i_block_alloc_info)
-			ext4_init_block_alloc_info(inode);
-
-		if (ei->i_block_alloc_info){
-			struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
-			rsv->rsv_goal_size = rsv_window_size;
-		}
-		up_write(&ei->i_data_sem);
-		mnt_drop_write(filp->f_path.mnt);
-		return 0;
-	}
 	case EXT4_IOC_GROUP_EXTEND: {
 		ext4_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
 
 		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
@@ -244,7 +217,7 @@ setversion_out:
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
-		int err;
+		int err, err2;
 
 		if (!capable(CAP_SYS_RESOURCE))
 			return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
 
 		err = ext4_group_add(sb, &input);
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err == 0)
+			err = err2;
 		mnt_drop_write(filp->f_path.mnt);
 
 		return err;
 	}
 
 	case EXT4_IOC_MIGRATE:
-		return ext4_ext_migrate(inode, filp, cmd, arg);
+	{
+		int err;
+		if (!is_owner_or_cap(inode))
+			return -EACCES;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			return err;
+		/*
+		 * inode_mutex prevent write and truncate on the file.
+		 * Read still goes through. We take i_data_sem in
+		 * ext4_ext_swap_inode_data before we switch the
+		 * inode format to prevent read.
+		 */
+		mutex_lock(&(inode->i_mutex));
+		err = ext4_ext_migrate(inode);
+		mutex_unlock(&(inode->i_mutex));
+		mnt_drop_write(filp->f_path.mnt);
+		return err;
+	}
 
 	default:
 		return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d141a25bbee..dfe17a134052 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 		b2 = (unsigned char *) bitmap;
 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 			if (b1[i] != b2[i]) {
-				printk("corruption in group %lu at byte %u(%u):"
-				       " %x in copy != %x on disk/prealloc\n",
-					e4b->bd_group, i, i * 8, b1[i], b2[i]);
+				printk(KERN_ERR "corruption in group %lu "
+				       "at byte %u(%u): %x in copy != %x "
+				       "on disk/prealloc\n",
+				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
 				BUG();
 			}
 		}
@@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 	void *buddy;
 	void *buddy2;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	{
 		static int mb_check_counter;
 		if (mb_check_counter++ % 100 != 0)
@@ -784,16 +782,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (bh[i] == NULL)
 			goto out;
 
-		if (bh_uptodate_or_lock(bh[i]))
+		if (buffer_uptodate(bh[i]) &&
+		    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
 			continue;
 
+		lock_buffer(bh[i]);
+		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
 			set_buffer_uptodate(bh[i]);
 			unlock_buffer(bh[i]);
+			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 			continue;
 		}
+		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		get_bh(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
@@ -2166,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	remove_proc_entry("mb_groups", sbi->s_mb_proc);
-	remove_proc_entry("mb_history", sbi->s_mb_proc);
-
+	if (sbi->s_proc != NULL) {
+		remove_proc_entry("mb_groups", sbi->s_proc);
+		remove_proc_entry("mb_history", sbi->s_proc);
+	}
 	kfree(sbi->s_mb_history);
 }
 
@@ -2177,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int i;
 
-	if (sbi->s_mb_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc,
+	if (sbi->s_proc != NULL) {
+		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_history_fops, sb);
-		proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc,
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}
 
@@ -2296,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	}
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+	meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 #ifdef DOUBLE_CHECK
 	{
@@ -2477,24 +2482,19 @@ err_freesgi:
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned i;
+	unsigned i, j;
 	unsigned offset;
 	unsigned max;
 	int ret;
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
 	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		return -ENOMEM;
 	}
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
@@ -2517,16 +2517,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return ret;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
-	INIT_LIST_HEAD(&sbi->s_active_transaction);
-	INIT_LIST_HEAD(&sbi->s_closed_transaction);
-	INIT_LIST_HEAD(&sbi->s_committed_transaction);
 	spin_lock_init(&sbi->s_bal_lock);
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2537,26 +2533,27 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
 	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
 
-	i = sizeof(struct ext4_locality_group) * NR_CPUS;
-	sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		clear_opt(sbi->s_mount_opt, MBALLOC);
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
 		return -ENOMEM;
 	}
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
-		lg = &sbi->s_locality_groups[i];
+		lg = per_cpu_ptr(sbi->s_locality_groups, i);
 		mutex_init(&lg->lg_mutex);
-		INIT_LIST_HEAD(&lg->lg_prealloc_list);
+		for (j = 0; j < PREALLOC_TB_SIZE; j++)
+			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
-	printk("EXT4-fs: mballoc enabled\n");
+	sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
+	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
 }
 
@@ -2571,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 		list_del(&pa->pa_group_list);
 		count++;
-		kfree(pa);
+		kmem_cache_free(ext4_pspace_cachep, pa);
 	}
 	if (count)
 		mb_debug("mballoc: %u PAs left\n", count);
@@ -2585,18 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC))
-		return 0;
-
-	/* release freed, non-committed blocks */
-	spin_lock(&sbi->s_md_lock);
-	list_splice_init(&sbi->s_closed_transaction,
-			&sbi->s_committed_transaction);
-	list_splice_init(&sbi->s_active_transaction,
-			&sbi->s_committed_transaction);
-	spin_unlock(&sbi->s_md_lock);
-	ext4_mb_free_committed_blocks(sb);
-
 	if (sbi->s_group_info) {
 		for (i = 0; i < sbi->s_groups_count; i++) {
 			grinfo = ext4_get_group_info(sb, i);
@@ -2643,69 +2628,64 @@ int ext4_mb_release(struct super_block *sb)
 				atomic_read(&sbi->s_mb_discarded));
 	}
 
-	kfree(sbi->s_locality_groups);
-
+	free_percpu(sbi->s_locality_groups);
 	ext4_mb_history_release(sb);
 	ext4_mb_destroy_per_dev_proc(sb);
 
 	return 0;
 }
 
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 {
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err;
-	int i;
-	int count = 0;
-	int count2 = 0;
-	struct ext4_free_metadata *md;
+	struct super_block *sb = journal->j_private;
 	struct ext4_buddy e4b;
+	struct ext4_group_info *db;
+	int err, count = 0, count2 = 0;
+	struct ext4_free_data *entry;
+	ext4_fsblk_t discard_block;
+	struct list_head *l, *ltmp;
 
-	if (list_empty(&sbi->s_committed_transaction))
-		return;
-
-	/* there is committed blocks to be freed yet */
-	do {
-		/* get next array of blocks */
-		md = NULL;
-		spin_lock(&sbi->s_md_lock);
-		if (!list_empty(&sbi->s_committed_transaction)) {
-			md = list_entry(sbi->s_committed_transaction.next,
-					struct ext4_free_metadata, list);
-			list_del(&md->list);
-		}
-		spin_unlock(&sbi->s_md_lock);
-
-		if (md == NULL)
-			break;
+	list_for_each_safe(l, ltmp, &txn->t_private_list) {
+		entry = list_entry(l, struct ext4_free_data, list);
 
 		mb_debug("gonna free %u blocks in group %lu (0x%p):",
-				md->num, md->group, md);
+			 entry->count, entry->group, entry);
 
-		err = ext4_mb_load_buddy(sb, md->group, &e4b);
+		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 		/* we expect to find existing buddy because it's pinned */
 		BUG_ON(err != 0);
 
+		db = e4b.bd_info;
 		/* there are blocks to put in buddy to make them really free */
-		count += md->num;
+		count += entry->count;
 		count2++;
-		ext4_lock_group(sb, md->group);
-		for (i = 0; i < md->num; i++) {
-			mb_debug(" %u", md->blocks[i]);
-			mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+		ext4_lock_group(sb, entry->group);
+		/* Take it out of per group rb tree */
+		rb_erase(&entry->node, &(db->bb_free_root));
+		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+		if (!db->bb_free_root.rb_node) {
+			/* No more items in the per group rb tree
+			 * balance refcounts from ext4_mb_free_metadata()
+			 */
+			page_cache_release(e4b.bd_buddy_page);
+			page_cache_release(e4b.bd_bitmap_page);
 		}
-		mb_debug("\n");
-		ext4_unlock_group(sb, md->group);
-
-		/* balance refcounts from ext4_mb_free_metadata() */
-		page_cache_release(e4b.bd_buddy_page);
-		page_cache_release(e4b.bd_bitmap_page);
-
-		kfree(md);
+		ext4_unlock_group(sb, entry->group);
+		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+			+ entry->start_blk
+			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+			   (unsigned long long) discard_block, entry->count);
+		sb_issue_discard(sb, discard_block, entry->count);
+
+		kmem_cache_free(ext4_free_ext_cachep, entry);
 		ext4_mb_release_desc(&e4b);
-
-	} while (md);
+	}
 
 	mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
@@ -2717,119 +2697,52 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
 #define EXT4_MB_STREAM_REQ		"stream_req"
 #define EXT4_MB_GROUP_PREALLOC		"group_prealloc"
 
-
-
-#define MB_PROC_FOPS(name)					\
-static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v)	\
-{								\
-	struct ext4_sb_info *sbi = m->private;			\
-								\
-	seq_printf(m, "%ld\n", sbi->s_mb_##name);		\
-	return 0;						\
-}								\
-								\
-static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
-{								\
-	return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
-}								\
-								\
-static ssize_t ext4_mb_##name##_proc_write(struct file *file,	\
-		const char __user *buf, size_t cnt, loff_t *ppos)	\
-{								\
-	struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
-	char str[32];						\
-	long value;						\
-	if (cnt >= sizeof(str))					\
-		return -EINVAL;					\
-	if (copy_from_user(str, buf, cnt))			\
-		return -EFAULT;					\
-	value = simple_strtol(str, NULL, 0);			\
-	if (value <= 0)						\
-		return -ERANGE;					\
-	sbi->s_mb_##name = value;				\
-	return cnt;						\
-}								\
-								\
-static const struct file_operations ext4_mb_##name##_proc_fops = {	\
-	.owner		= THIS_MODULE,				\
-	.open		= ext4_mb_##name##_proc_open,		\
-	.read		= seq_read,				\
-	.llseek		= seq_lseek,				\
-	.release	= single_release,			\
-	.write		= ext4_mb_##name##_proc_write,		\
-};
-
-MB_PROC_FOPS(stats);
-MB_PROC_FOPS(max_to_scan);
-MB_PROC_FOPS(min_to_scan);
-MB_PROC_FOPS(order2_reqs);
-MB_PROC_FOPS(stream_request);
-MB_PROC_FOPS(group_prealloc);
-
-#define	MB_PROC_HANDLER(name, var)					\
-do {									\
-	proc = proc_create_data(name, mode, sbi->s_mb_proc,		\
-				&ext4_mb_##var##_proc_fops, sbi);	\
-	if (proc == NULL) {						\
-		printk(KERN_ERR "EXT4-fs: can't to create %s\n", name);	\
-		goto err_out;						\
-	}								\
-} while (0)
-
 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
 	mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct proc_dir_entry *proc;
-	char devname[64];
 
-	if (proc_root_ext4 == NULL) {
-		sbi->s_mb_proc = NULL;
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
-	}
-	bdevname(sb->s_bdev, devname);
-	sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
-
-	MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
-	MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
-	MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
-	MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
-	MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
 
+	EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
+	EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
+	EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
+	EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
+	EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
 	return 0;
 
 err_out:
-	printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
-	sbi->s_mb_proc = NULL;
-
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
 	return -ENOMEM;
+#else
+	return 0;
+#endif
 }
 
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
 {
+#ifdef CONFIG_PROC_FS
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	char devname[64];
 
-	if (sbi->s_mb_proc == NULL)
+	if (sbi->s_proc == NULL)
 		return -EINVAL;
 
-	bdevname(sb->s_bdev, devname);
-	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
-	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
-	remove_proc_entry(devname, proc_root_ext4);
-
+	remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+	remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
+#endif
 	return 0;
 }
 
@@ -2850,11 +2763,16 @@ int __init init_ext4_mballoc(void)
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
-#ifdef CONFIG_PROC_FS
-	proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
-	if (proc_root_ext4 == NULL)
-		printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
-#endif
+
+	ext4_free_ext_cachep =
+		kmem_cache_create("ext4_free_block_extents",
+				     sizeof(struct ext4_free_data),
+				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	if (ext4_free_ext_cachep == NULL) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		kmem_cache_destroy(ext4_ac_cachep);
+		return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -2863,9 +2781,7 @@ void exit_ext4_mballoc(void)
 	/* XXX: synchronize_rcu(); */
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
-#ifdef CONFIG_PROC_FS
-	remove_proc_entry("fs/ext4", NULL);
-#endif
+	kmem_cache_destroy(ext4_free_ext_cachep);
 }
 
 
@@ -2875,7 +2791,7 @@ void exit_ext4_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-				handle_t *handle)
+				handle_t *handle, unsigned long reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_super_block *es;
@@ -2964,15 +2880,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
-	 * free blocks account has already be reduced/reserved
-	 * at write_begin() time for delayed allocation
-	 * do not double accounting
+	 * Now reduce the dirty block count also. Should not go negative
 	 */
 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
-		percpu_counter_sub(&sbi->s_freeblocks_counter,
-					ac->ac_b_ex.fe_len);
+		/* release all the reserved blocks if non delalloc */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+	else
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+						ac->ac_b_ex.fe_len);
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3260,6 +3177,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 				struct ext4_prealloc_space *pa)
 {
 	unsigned int len = ac->ac_o_ex.fe_len;
+
 	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
 					&ac->ac_b_ex.fe_group,
 					&ac->ac_b_ex.fe_start);
@@ -3277,14 +3195,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 }
 
 /*
+ * Return the prealloc space that have minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space that is having currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+			struct ext4_prealloc_space *pa,
+			struct ext4_prealloc_space *cpa)
+{
+	ext4_fsblk_t cur_distance, new_distance;
+
+	if (cpa == NULL) {
+		atomic_inc(&pa->pa_count);
+		return pa;
+	}
+	cur_distance = abs(goal_block - cpa->pa_pstart);
+	new_distance = abs(goal_block - pa->pa_pstart);
+
+	if (cur_distance < new_distance)
+		return cpa;
+
+	/* drop the previous reference */
+	atomic_dec(&cpa->pa_count);
+	atomic_inc(&pa->pa_count);
+	return pa;
+}
+
+/*
  * search goal blocks in preallocated space
  */
 static noinline_for_stack int
 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 {
+	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
-	struct ext4_prealloc_space *pa;
+	struct ext4_prealloc_space *pa, *cpa = NULL;
+	ext4_fsblk_t goal_block;
 
 	/* only data can be preallocated */
 	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3322,22 +3271,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	lg = ac->ac_lg;
 	if (lg == NULL)
 		return 0;
+	order  = fls(ac->ac_o_ex.fe_len) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+
+	goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
+		     ac->ac_g_ex.fe_start +
+		     le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+	/*
+	 * search for the prealloc space that is having
+	 * minimal distance from the goal block.
+	 */
+	for (i = order; i < PREALLOC_TB_SIZE; i++) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+					pa_inode_list) {
+			spin_lock(&pa->pa_lock);
+			if (pa->pa_deleted == 0 &&
+					pa->pa_free >= ac->ac_o_ex.fe_len) {
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
-			atomic_inc(&pa->pa_count);
-			ext4_mb_use_group_pa(ac, pa);
+				cpa = ext4_mb_check_group_pa(goal_block,
+								pa, cpa);
+			}
 			spin_unlock(&pa->pa_lock);
-			ac->ac_criteria = 20;
-			rcu_read_unlock();
-			return 1;
 		}
-		spin_unlock(&pa->pa_lock);
+		rcu_read_unlock();
+	}
+	if (cpa) {
+		ext4_mb_use_group_pa(ac, cpa);
+		ac->ac_criteria = 20;
+		return 1;
 	}
-	rcu_read_unlock();
-
 	return 0;
 }
 
@@ -3560,6 +3525,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
@@ -3580,10 +3546,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
-	spin_lock(pa->pa_obj_lock);
-	list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
-	spin_unlock(pa->pa_obj_lock);
-
+	/*
+	 * We will later add the new pa to the right bucket
+	 * after updating the pa_free in ext4_mb_release_context
+	 */
 	return 0;
 }
 
@@ -3733,20 +3699,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
-		/* error handling here */
-		ext4_mb_release_desc(&e4b);
-		BUG_ON(bitmap_bh == NULL);
+		ext4_error(sb, __func__, "Error in reading block "
+				"bitmap for %lu\n", group);
+		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
-	BUG_ON(err != 0); /* error handling here */
+	if (err) {
+		ext4_error(sb, __func__, "Error in loading buddy "
+				"information for %lu\n", group);
+		put_bh(bitmap_bh);
+		return 0;
+	}
 
 	if (needed == 0)
 		needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
-	grp = ext4_get_group_info(sb, group);
 	INIT_LIST_HEAD(&list);
-
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 repeat:
 	ext4_lock_group(sb, group);
@@ -3828,7 +3797,7 @@ out:
  *
  * FIXME!! Make sure it is valid at all the call sites
  */
-void ext4_mb_discard_inode_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -3840,7 +3809,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
 	struct ext4_buddy e4b;
 	int err;
 
-	if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+	if (!S_ISREG(inode->i_mode)) {
 		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
 		return;
 	}
@@ -3903,13 +3872,18 @@ repeat:
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 
 		err = ext4_mb_load_buddy(sb, group, &e4b);
-		BUG_ON(err != 0); /* error handling here */
+		if (err) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
-			/* error handling here */
+			ext4_error(sb, __func__, "Error in reading block "
+					"bitmap for %lu\n", group);
 			ext4_mb_release_desc(&e4b);
-			BUG_ON(bitmap_bh == NULL);
+			continue;
 		}
 
 		ext4_lock_group(sb, group);
@@ -4033,8 +4007,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	 * per cpu locality group is to reduce the contention between block
 	 * request from multiple CPUs.
 	 */
-	ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
-	put_cpu();
+	ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
 
 	/* we're going to use group allocation */
 	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4112,22 +4085,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 
 }
 
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+					struct ext4_locality_group *lg,
+					int order, int total_entries)
+{
+	ext4_group_t group = 0;
+	struct ext4_buddy e4b;
+	struct list_head discard_list;
+	struct ext4_prealloc_space *pa, *tmp;
+	struct ext4_allocation_context *ac;
+
+	mb_debug("discard locality group preallocation\n");
+
+	INIT_LIST_HEAD(&discard_list);
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	spin_lock(&lg->lg_prealloc_lock);
+	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&pa->pa_lock);
+		if (atomic_read(&pa->pa_count)) {
+			/*
+			 * This is the pa that we just used
+			 * for block allocation. So don't
+			 * free that
+			 */
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		/* only lg prealloc space */
+		BUG_ON(!pa->pa_linear);
+
+		/* seems this one can be freed ... */
+		pa->pa_deleted = 1;
+		spin_unlock(&pa->pa_lock);
+
+		list_del_rcu(&pa->pa_inode_list);
+		list_add(&pa->u.pa_tmp_list, &discard_list);
+
+		total_entries--;
+		if (total_entries <= 5) {
+			/*
+			 * we want to keep only 5 entries
+			 * allowing it to grow to 8. This
+			 * mak sure we don't call discard
+			 * soon for this list.
+			 */
+			break;
+		}
+	}
+	spin_unlock(&lg->lg_prealloc_lock);
+
+	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+		if (ext4_mb_load_buddy(sb, group, &e4b)) {
+			ext4_error(sb, __func__, "Error in loading buddy "
+					"information for %lu\n", group);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		list_del(&pa->pa_group_list);
+		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_unlock_group(sb, group);
+
+		ext4_mb_release_desc(&e4b);
+		list_del(&pa->u.pa_tmp_list);
+		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+	}
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+	int order, added = 0, lg_prealloc_count = 1;
+	struct super_block *sb = ac->ac_sb;
+	struct ext4_locality_group *lg = ac->ac_lg;
+	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+	order = fls(pa->pa_free) - 1;
+	if (order > PREALLOC_TB_SIZE - 1)
+		/* The max size of hash table is PREALLOC_TB_SIZE */
+		order = PREALLOC_TB_SIZE - 1;
+	/* Add the prealloc space to lg */
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+						pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&pa->pa_lock);
+			continue;
+		}
+		if (!added && pa->pa_free < tmp_pa->pa_free) {
+			/* Add to the tail of the previous entry */
+			list_add_tail_rcu(&pa->pa_inode_list,
+						&tmp_pa->pa_inode_list);
+			added = 1;
+			/*
+			 * we want to count the total
+			 * number of entries in the list
+			 */
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+		lg_prealloc_count++;
+	}
+	if (!added)
+		list_add_tail_rcu(&pa->pa_inode_list,
+					&lg->lg_prealloc_list[order]);
+	rcu_read_unlock();
+
+	/* Now trim the list to be not more than 8 elements */
+	if (lg_prealloc_count > 8) {
+		ext4_mb_discard_lg_preallocations(sb, lg,
+						order, lg_prealloc_count);
+		return;
+	}
+	return ;
+}
+
 /*
  * release all resource we used in allocation
  */
 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
-	if (ac->ac_pa) {
-		if (ac->ac_pa->pa_linear) {
+	struct ext4_prealloc_space *pa = ac->ac_pa;
+	if (pa) {
+		if (pa->pa_linear) {
 			/* see comment in ext4_mb_use_group_pa() */
-			spin_lock(&ac->ac_pa->pa_lock);
-			ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
-			ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
-			spin_unlock(&ac->ac_pa->pa_lock);
+			spin_lock(&pa->pa_lock);
+			pa->pa_pstart += ac->ac_b_ex.fe_len;
+			pa->pa_lstart += ac->ac_b_ex.fe_len;
+			pa->pa_free -= ac->ac_b_ex.fe_len;
+			pa->pa_len -= ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+			/*
+			 * We want to add the pa to the right bucket.
+			 * Remove it from the list and while adding
+			 * make sure the list to which we are adding
+			 * doesn't grow big.
+			 */
+			if (likely(pa->pa_free)) {
+				spin_lock(pa->pa_obj_lock);
+				list_del_rcu(&pa->pa_inode_list);
+				spin_unlock(pa->pa_obj_lock);
+				ext4_mb_add_n_trim(ac);
+			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
@@ -4162,33 +4281,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 				 struct ext4_allocation_request *ar, int *errp)
 {
+	int freed;
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	int freed;
-	int inquota;
+	unsigned long inquota;
+	unsigned long reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
-	if (!test_opt(sb, MBALLOC)) {
-		block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
-					    &(ar->len), errp);
-		return block;
-	}
 	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
 		/*
 		 * With delalloc we already reserved the blocks
 		 */
-		ar->len = ext4_has_free_blocks(sbi, ar->len);
-	}
-
-	if (ar->len == 0) {
-		*errp = -ENOSPC;
-		return 0;
+		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+			/* let others to free the space */
+			yield();
+			ar->len = ar->len >> 1;
+		}
+		if (!ar->len) {
+			*errp = -ENOSPC;
+			return 0;
+		}
+		reserv_blks = ar->len;
 	}
-
 	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
 		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
 		ar->len--;
@@ -4209,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		goto out1;
 	}
 
-	ext4_mb_poll_new_transaction(sb, handle);
-
 	*errp = ext4_mb_initialize_context(ac, ar);
 	if (*errp) {
 		ar->len = 0;
@@ -4234,7 +4350,7 @@ repeat:
 	}
 
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
-		*errp = ext4_mb_mark_diskspace_used(ac, handle);
+		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
 		if (*errp ==  -EAGAIN) {
 			ac->ac_b_ex.fe_group = 0;
 			ac->ac_b_ex.fe_start = 0;
@@ -4269,35 +4385,20 @@ out1:
 
 	return block;
 }
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
-						handle_t *handle)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_last_transaction == handle->h_transaction->t_tid)
-		return;
 
-	/* new transaction! time to close last one and free blocks for
-	 * committed transaction. we know that only transaction can be
-	 * active, so previos transaction can be being logged and we
-	 * know that transaction before previous is known to be already
-	 * logged. this means that now we may free blocks freed in all
-	 * transactions before previous one. hope I'm clear enough ... */
-
-	spin_lock(&sbi->s_md_lock);
-	if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
-		mb_debug("new transaction %lu, old %lu\n",
-				(unsigned long) handle->h_transaction->t_tid,
-				(unsigned long) sbi->s_last_transaction);
-		list_splice_init(&sbi->s_closed_transaction,
-				&sbi->s_committed_transaction);
-		list_splice_init(&sbi->s_active_transaction,
-				&sbi->s_closed_transaction);
-		sbi->s_last_transaction = handle->h_transaction->t_tid;
-	}
-	spin_unlock(&sbi->s_md_lock);
-
-	ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+			struct ext4_free_data *entry2)
+{
+	if ((entry1->t_tid == entry2->t_tid) &&
+	    (entry1->group == entry2->group) &&
+	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
+		return 1;
+	return 0;
 }
 
 static noinline_for_stack int
@@ -4307,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_metadata *md;
-	int i;
+	struct ext4_free_data *entry, *new_entry;
+	struct rb_node **n = &db->bb_free_root.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node;
+
 
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
+	new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+	new_entry->start_blk = block;
+	new_entry->group  = group;
+	new_entry->count = count;
+	new_entry->t_tid = handle->h_transaction->t_tid;
+	new_node = &new_entry->node;
+
 	ext4_lock_group(sb, group);
-	for (i = 0; i < count; i++) {
-		md = db->bb_md_cur;
-		if (md && db->bb_tid != handle->h_transaction->t_tid) {
-			db->bb_md_cur = NULL;
-			md = NULL;
+	if (!*n) {
+		/* first free block exent. We need to
+		   protect buddy cache from being freed,
+		 * otherwise we'll refresh it from
+		 * on-disk bitmap and lose not-yet-available
+		 * blocks */
+		page_cache_get(e4b->bd_buddy_page);
+		page_cache_get(e4b->bd_bitmap_page);
+	}
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_free_data, node);
+		if (block < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (block >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {
+			ext4_error(sb, __func__,
+			    "Double free of blocks %d (%d %d)\n",
+			    block, entry->start_blk, entry->count);
+			return 0;
 		}
+	}
 
-		if (md == NULL) {
-			ext4_unlock_group(sb, group);
-			md = kmalloc(sizeof(*md), GFP_NOFS);
-			if (md == NULL)
-				return -ENOMEM;
-			md->num = 0;
-			md->group = group;
-
-			ext4_lock_group(sb, group);
-			if (db->bb_md_cur == NULL) {
-				spin_lock(&sbi->s_md_lock);
-				list_add(&md->list, &sbi->s_active_transaction);
-				spin_unlock(&sbi->s_md_lock);
-				/* protect buddy cache from being freed,
-				 * otherwise we'll refresh it from
-				 * on-disk bitmap and lose not-yet-available
-				 * blocks */
-				page_cache_get(e4b->bd_buddy_page);
-				page_cache_get(e4b->bd_bitmap_page);
-				db->bb_md_cur = md;
-				db->bb_tid = handle->h_transaction->t_tid;
-				mb_debug("new md 0x%p for group %lu\n",
-						md, md->group);
-			} else {
-				kfree(md);
-				md = db->bb_md_cur;
-			}
+	rb_link_node(new_node, parent, n);
+	rb_insert_color(new_node, &db->bb_free_root);
+
+	/* Now try to see the extent can be merged to left and right */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, node);
+		if (can_merge(entry, new_entry)) {
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &(db->bb_free_root));
+			spin_lock(&sbi->s_md_lock);
+			list_del(&entry->list);
+			spin_unlock(&sbi->s_md_lock);
+			kmem_cache_free(ext4_free_ext_cachep, entry);
 		}
+	}
 
-		BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
-		md->blocks[md->num] = block + i;
-		md->num++;
-		if (md->num == EXT4_BB_MAX_BLOCKS) {
-			/* no more space, put full container on a sb's list */
-			db->bb_md_cur = NULL;
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_free_data, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &(db->bb_free_root));
+			spin_lock(&sbi->s_md_lock);
+			list_del(&entry->list);
+			spin_unlock(&sbi->s_md_lock);
+			kmem_cache_free(ext4_free_ext_cachep, entry);
 		}
 	}
+	/* Add the extent to transaction's private list */
+	spin_lock(&sbi->s_md_lock);
+	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
 	ext4_unlock_group(sb, group);
 	return 0;
 }
@@ -4385,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
 	*freed = 0;
 
-	ext4_mb_poll_new_transaction(sb, handle);
-
 	sbi = EXT4_SB(sb);
 	es = EXT4_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
@@ -4420,11 +4542,15 @@ do_more:
 		count -= overflow;
 	}
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
+	if (!bitmap_bh) {
+		err = -EIO;
 		goto error_return;
+	}
 	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!gdp)
+	if (!gdp) {
+		err = -EIO;
 		goto error_return;
+	}
 
 	if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
 	    in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index bfe6add46bcf..b5dff1fff1e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -98,23 +100,29 @@
 
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
 
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS	30
+struct ext4_free_data {
+	/* this links the free block information from group_info */
+	struct rb_node node;
 
-struct ext4_free_metadata {
-	ext4_group_t group;
-	unsigned short num;
-	ext4_grpblk_t  blocks[EXT4_BB_MAX_BLOCKS];
+	/* this links the free block information from ext4_sb_info */
 	struct list_head list;
+
+	/* group which free block extent belongs */
+	ext4_group_t group;
+
+	/* free block extent */
+	ext4_grpblk_t start_blk;
+	ext4_grpblk_t count;
+
+	/* transaction which freed this extent */
+	tid_t	t_tid;
 };
 
 struct ext4_group_info {
 	unsigned long	bb_state;
-	unsigned long	bb_tid;
-	struct ext4_free_metadata *bb_md_cur;
+	struct rb_root  bb_free_root;
 	unsigned short	bb_first_free;
 	unsigned short	bb_free;
 	unsigned short	bb_fragments;
@@ -164,11 +172,17 @@ struct ext4_free_extent {
  * Locality group:
  *   we try to group all related changes together
  *   so that writeback can flush/allocate them together as well
+ *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ *   (512). We store prealloc space into the hash based on the pa_free blocks
+ *   order value.ie, fls(pa_free)-1;
  */
+#define PREALLOC_TB_SIZE 10
 struct ext4_locality_group {
 	/* for allocator */
-	struct mutex		lg_mutex;	/* to serialize allocates */
-	struct list_head	lg_prealloc_list;/* list of preallocations */
+	/* to serialize allocates */
+	struct mutex		lg_mutex;
+	/* list of preallocations */
+	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
 	spinlock_t		lg_prealloc_lock;
 };
 
@@ -251,13 +265,10 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
-static struct proc_dir_entry *proc_root_ext4;
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
 static void ext4_mb_return_to_preallocation(struct inode *inode,
 					struct ext4_buddy *e4b, sector_t block,
 					int count);
@@ -265,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
 			struct super_block *, struct ext4_prealloc_space *pa);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 
 
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e9..f2a9cf498ecd 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	 * credit. But below we try to not accumalate too much
 	 * of them by restarting the journal.
 	 */
-	needed = ext4_ext_calc_credits_for_insert(inode, path);
+	needed = ext4_ext_calc_credits_for_single_extent(inode,
+		    lb->last_block - lb->first_block + 1, path);
 
 	/*
 	 * Make sure the credit we accumalated is not really high
@@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 
 }
 
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
-				unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
 {
 	handle_t *handle;
 	int retval = 0, i;
@@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * inode_mutex prevent write and truncate on the file. Read still goes
-	 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
-	 * switch the inode format to prevent read.
-	 */
-	mutex_lock(&(inode->i_mutex));
-	/*
 	 * Even though we take i_mutex we can still cause block allocation
 	 * via mmap write to holes. If we have allocated new blocks we fail
 	 * migrate.  New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -622,7 +616,6 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-	mutex_unlock(&(inode->i_mutex));
 
 	if (tmp_inode)
 		iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 387ad98350c3..63adcb792988 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,34 +151,36 @@ struct dx_map_entry
 
 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
 static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
 				 struct inode *dir,
 				 struct dx_hash_info *hinfo,
 				 struct dx_frame *frame,
 				 int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
-			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+		       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
 		struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
 static void dx_insert_block(struct dx_frame *frame,
 					u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
 				 struct dx_frame *frames,
 				 __u32 *start_hash);
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
-		       struct ext4_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+		const struct qstr *d_name,
+		struct ext4_dir_entry_2 **res_dir,
+		int *err);
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode);
 
@@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
 	entry->block = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_hash (struct dx_entry *entry)
+static inline unsigned dx_get_hash(struct dx_entry *entry)
 {
 	return le32_to_cpu(entry->hash);
 }
 
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
 {
 	entry->hash = cpu_to_le32(value);
 }
 
-static inline unsigned dx_get_count (struct dx_entry *entries)
+static inline unsigned dx_get_count(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
 }
 
-static inline unsigned dx_get_limit (struct dx_entry *entries)
+static inline unsigned dx_get_limit(struct dx_entry *entries)
 {
 	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
 }
 
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
 }
 
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
 {
 	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
 }
 
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
 		EXT4_DIR_REC_LEN(2) - infosize;
 	return entry_space / sizeof(struct dx_entry);
 }
 
-static inline unsigned dx_node_limit (struct inode *dir)
+static inline unsigned dx_node_limit(struct inode *dir)
 {
 	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
 	return entry_space / sizeof(struct dx_entry);
@@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir)
  * Debug
  */
 #ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
+static void dx_show_index(char * label, struct dx_entry *entries)
 {
 	int i, n = dx_get_count (entries);
-	printk("%s index ", label);
+	printk(KERN_DEBUG "%s index ", label);
 	for (i = 0; i < n; i++) {
-		printk("%x->%lu ", i? dx_get_hash(entries + i) :
+		printk("%x->%lu ", i ? dx_get_hash(entries + i) :
 				0, (unsigned long)dx_get_block(entries + i));
 	}
 	printk("\n");
@@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 			     struct dx_entry *entries, int levels)
 {
 	unsigned blocksize = dir->i_sb->s_blocksize;
-	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+	unsigned count = dx_get_count(entries), names = 0, space = 0, i;
 	unsigned bcount = 0;
 	struct buffer_head *bh;
 	int err;
@@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		names += stats.names;
 		space += stats.space;
 		bcount += stats.bcount;
-		brelse (bh);
+		brelse(bh);
 	}
 	if (bcount)
-		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
-			names, space/bcount,(space/bcount)*100/blocksize);
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		       levels ? "" : "   ", names, space/bcount,
+		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
 }
 #endif /* DX_DEBUG */
@@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
  * back to userspace.
  */
 static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(const struct qstr *d_name, struct inode *dir,
 	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
 {
 	unsigned count, indirect;
@@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	u32 hash;
 
 	frame->bh = NULL;
-	if (dentry)
-		dir = dentry->d_parent->d_inode;
 	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
 		goto fail;
 	root = (struct dx_root *) bh->b_data;
@@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 	}
 	hinfo->hash_version = root->info.hash_version;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
-	if (dentry)
-		ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	if (d_name)
+		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
 	hash = hinfo->hash;
 
 	if (root->info.unused_flags & 1) {
@@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
 		goto fail;
 	}
 
-	dxtrace (printk("Look up %x", hash));
+	dxtrace(printk("Look up %x", hash));
 	while (1)
 	{
 		count = dx_get_count(entries);
@@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
 				      0, &err)))
 			return err; /* Failure */
 		p++;
-		brelse (p->bh);
+		brelse(p->bh);
 		p->bh = bh;
 		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
 	}
@@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 			/* On error, skip the f_pos to the next block. */
 			dir_file->f_pos = (dir_file->f_pos |
 					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
+			brelse(bh);
 			return count;
 		}
 		ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
-		       start_minor_hash));
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	}
 	hinfo.hash = start_hash;
 	hinfo.minor_hash = 0;
-	frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+	frame = dx_probe(NULL, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 
@@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 			break;
 	}
 	dx_release(frames);
-	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
-		       count, *next_hash));
+	dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+		       "next hash: %x\n", count, *next_hash));
 	return count;
 errout:
 	dx_release(frames);
@@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-static inline int search_dirblock(struct buffer_head * bh,
+static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
-				  struct dentry *dentry,
+				  const struct qstr *d_name,
 				  unsigned long offset,
 				  struct ext4_dir_entry_2 ** res_dir)
 {
 	struct ext4_dir_entry_2 * de;
 	char * dlimit;
 	int de_len;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	const char *name = d_name->name;
+	int namelen = d_name->len;
 
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
  * The returned buffer_head has ->b_count elevated.  The caller is expected
  * to brelse() it when appropriate.
  */
-static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+					const struct qstr *d_name,
 					struct ext4_dir_entry_2 ** res_dir)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh, *ret = NULL;
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
 	ext4_lblk_t start, block, b;
 	int ra_max = 0;		/* Number of bh's in the readahead
 				   buffer, bh_use[] */
@@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 	int num = 0;
 	ext4_lblk_t  nblocks;
 	int i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
 	int namelen;
 
 	*res_dir = NULL;
 	sb = dir->i_sb;
-	namelen = dentry->d_name.len;
+	namelen = d_name->len;
 	if (namelen > EXT4_NAME_LEN)
 		return NULL;
 	if (is_dx(dir)) {
-		bh = ext4_dx_find_entry(dentry, res_dir, &err);
+		bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
 		/*
 		 * On success, or if the error was file not found,
 		 * return.  Otherwise, fall back to doing a search the
@@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
 		 */
 		if (bh || (err != ERR_BAD_DX_DIR))
 			return bh;
-		dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+		dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+			       "falling back\n"));
 	}
 	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
 	start = EXT4_I(dir)->i_dir_start_lookup;
@@ -926,7 +928,7 @@ restart:
 			brelse(bh);
 			goto next;
 		}
-		i = search_dirblock(bh, dir, dentry,
+		i = search_dirblock(bh, dir, d_name,
 			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
 		if (i == 1) {
 			EXT4_I(dir)->i_dir_start_lookup = block;
@@ -956,11 +958,11 @@ restart:
 cleanup_and_exit:
 	/* Clean up the read-ahead blocks */
 	for (; ra_ptr < ra_max; ra_ptr++)
-		brelse (bh_use[ra_ptr]);
+		brelse(bh_use[ra_ptr]);
 	return ret;
 }
 
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
 		       struct ext4_dir_entry_2 **res_dir, int *err)
 {
 	struct super_block * sb;
@@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 	struct buffer_head *bh;
 	ext4_lblk_t block;
 	int retval;
-	int namelen = dentry->d_name.len;
-	const u8 *name = dentry->d_name.name;
-	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen = d_name->len;
+	const u8 *name = d_name->name;
 
 	sb = dir->i_sb;
 	/* NFS may look up ".." - look at dx_root directory block */
 	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+		if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
 			return NULL;
 	} else {
 		frame = frames;
@@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 				return bh;
 			}
 		}
-		brelse (bh);
+		brelse(bh);
 		/* Check to see if we should continue to search */
 		retval = ext4_htree_next_block(dir, hash, frame,
 					       frames, NULL);
@@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
 
 	*err = -ENOENT;
 errout:
-	dxtrace(printk("%s not found\n", name));
+	dxtrace(printk(KERN_DEBUG "%s not found\n", name));
 	dx_release (frames);
 	return NULL;
 }
 
-static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	struct inode * inode;
-	struct ext4_dir_entry_2 * de;
-	struct buffer_head * bh;
+	struct inode *inode;
+	struct ext4_dir_entry_2 *de;
+	struct buffer_head *bh;
 
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	bh = ext4_find_entry(dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
 		unsigned long ino = le32_to_cpu(de->inode);
-		brelse (bh);
+		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			ext4_error(dir->i_sb, "ext4_lookup",
 				   "bad inode number: %lu", ino);
@@ -1060,17 +1061,15 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str
 struct dentry *ext4_get_parent(struct dentry *child)
 {
 	unsigned long ino;
-	struct dentry *parent;
 	struct inode *inode;
-	struct dentry dotdot;
+	static const struct qstr dotdot = {
+		.name = "..",
+		.len = 2,
+	};
 	struct ext4_dir_entry_2 * de;
 	struct buffer_head *bh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_parent = child; /* confusing, isn't it! */
-
-	bh = ext4_find_entry(&dotdot, &de);
+	bh = ext4_find_entry(child->d_inode, &dotdot, &de);
 	inode = NULL;
 	if (!bh)
 		return ERR_PTR(-ENOENT);
@@ -1083,16 +1082,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 		return ERR_PTR(-EIO);
 	}
 
-	inode = ext4_iget(child->d_inode->i_sb, ino);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
 }
 
 #define S_SHIFT 12
@@ -1201,10 +1191,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	/* create map in the end of data2 block */
 	map = (struct dx_map_entry *) (data2 + blocksize);
-	count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+	count = dx_make_map((struct ext4_dir_entry_2 *) data1,
 			     blocksize, hinfo, map);
 	map -= count;
-	dx_sort_map (map, count);
+	dx_sort_map(map, count);
 	/* Split the existing block in the middle, size-wise */
 	size = 0;
 	move = 0;
@@ -1225,7 +1215,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 
 	/* Fancy dance to stay within two buffers */
 	de2 = dx_move_dirents(data1, data2, map + split, count - split);
-	de = dx_pack_dirents(data1,blocksize);
+	de = dx_pack_dirents(data1, blocksize);
 	de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
 	de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
 	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1237,15 +1227,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		swap(*bh, bh2);
 		de = de2;
 	}
-	dx_insert_block (frame, hash2 + continued, newblock);
-	err = ext4_journal_dirty_metadata (handle, bh2);
+	dx_insert_block(frame, hash2 + continued, newblock);
+	err = ext4_journal_dirty_metadata(handle, bh2);
 	if (err)
 		goto journal_error;
-	err = ext4_journal_dirty_metadata (handle, frame->bh);
+	err = ext4_journal_dirty_metadata(handle, frame->bh);
 	if (err)
 		goto journal_error;
-	brelse (bh2);
-	dxtrace(dx_show_index ("frame", frame->entries));
+	brelse(bh2);
+	dxtrace(dx_show_index("frame", frame->entries));
 	return de;
 
 journal_error:
@@ -1271,7 +1261,7 @@ errout:
  */
 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 			     struct inode *inode, struct ext4_dir_entry_2 *de,
-			     struct buffer_head * bh)
+			     struct buffer_head *bh)
 {
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
@@ -1288,11 +1278,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 		while ((char *) de <= top) {
 			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
 						  bh, offset)) {
-				brelse (bh);
+				brelse(bh);
 				return -EIO;
 			}
-			if (ext4_match (namelen, name, de)) {
-				brelse (bh);
+			if (ext4_match(namelen, name, de)) {
+				brelse(bh);
 				return -EEXIST;
 			}
 			nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1329,7 +1319,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	} else
 		de->inode = 0;
 	de->name_len = namelen;
-	memcpy (de->name, name, namelen);
+	memcpy(de->name, name, namelen);
 	/*
 	 * XXX shouldn't update any times until successful
 	 * completion of syscall, but too many callers depend
@@ -1377,7 +1367,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk("Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index\n"));
 	retval = ext4_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext4_std_error(dir->i_sb, retval);
@@ -1386,7 +1376,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
-	bh2 = ext4_append (handle, dir, &block, &retval);
+	bh2 = ext4_append(handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
 		return retval;
@@ -1412,9 +1402,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	root->info.info_length = sizeof(root->info);
 	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 	entries = root->entries;
-	dx_set_block (entries, 1);
-	dx_set_count (entries, 1);
-	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+	dx_set_block(entries, 1);
+	dx_set_count(entries, 1);
+	dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
@@ -1443,14 +1433,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
  * may not sleep between calling this and putting something into
  * the entry, as someone else might have used it while you slept.
  */
-static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
-	struct inode *inode)
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+			  struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	unsigned long offset;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
-	struct super_block * sb;
+	struct super_block *sb;
 	int	retval;
 	int	dx_fallback=0;
 	unsigned blocksize;
@@ -1500,13 +1490,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct dx_hash_info hinfo;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct inode *dir = dentry->d_parent->d_inode;
-	struct super_block * sb = dir->i_sb;
+	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
 
-	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
 	if (!frame)
 		return err;
 	entries = frame->entries;
@@ -1527,7 +1517,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	/* Block full, should compress but for now just split */
-	dxtrace(printk("using %u of %u node entries\n",
+	dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
 		       dx_get_count(entries), dx_get_limit(entries)));
 	/* Need to split index? */
 	if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1559,7 +1549,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 		if (levels) {
 			unsigned icount1 = icount/2, icount2 = icount - icount1;
 			unsigned hash2 = dx_get_hash(entries + icount1);
-			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+			dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+				       icount1, icount2));
 
 			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
 			err = ext4_journal_get_write_access(handle,
@@ -1567,11 +1558,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 
-			memcpy ((char *) entries2, (char *) (entries + icount1),
-				icount2 * sizeof(struct dx_entry));
-			dx_set_count (entries, icount1);
-			dx_set_count (entries2, icount2);
-			dx_set_limit (entries2, dx_node_limit(dir));
+			memcpy((char *) entries2, (char *) (entries + icount1),
+			       icount2 * sizeof(struct dx_entry));
+			dx_set_count(entries, icount1);
+			dx_set_count(entries2, icount2);
+			dx_set_limit(entries2, dx_node_limit(dir));
 
 			/* Which index block gets the new entry? */
 			if (at - entries >= icount1) {
@@ -1579,16 +1570,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 				frame->entries = entries = entries2;
 				swap(frame->bh, bh2);
 			}
-			dx_insert_block (frames + 0, hash2, newblock);
-			dxtrace(dx_show_index ("node", frames[1].entries));
-			dxtrace(dx_show_index ("node",
+			dx_insert_block(frames + 0, hash2, newblock);
+			dxtrace(dx_show_index("node", frames[1].entries));
+			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
 			err = ext4_journal_dirty_metadata(handle, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
 		} else {
-			dxtrace(printk("Creating second level index...\n"));
+			dxtrace(printk(KERN_DEBUG
+				       "Creating second level index...\n"));
 			memcpy((char *) entries2, (char *) entries,
 			       icount * sizeof(struct dx_entry));
 			dx_set_limit(entries2, dx_node_limit(dir));
@@ -1630,12 +1622,12 @@ cleanup:
  * ext4_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext4_delete_entry (handle_t *handle,
-			      struct inode * dir,
-			      struct ext4_dir_entry_2 * de_del,
-			      struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+			     struct inode *dir,
+			     struct ext4_dir_entry_2 *de_del,
+			     struct buffer_head *bh)
 {
-	struct ext4_dir_entry_2 * de, * pde;
+	struct ext4_dir_entry_2 *de, *pde;
 	int i;
 
 	i = 0;
@@ -1716,11 +1708,11 @@ static int ext4_add_nondir(handle_t *handle,
  * If the create succeeds, we fill in the inode information
  * with d_instantiate().
  */
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
-		struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int err, retries = 0;
 
 retry:
@@ -1747,8 +1739,8 @@ retry:
 	return err;
 }
 
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
-			int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+		      int mode, dev_t rdev)
 {
 	handle_t *handle;
 	struct inode *inode;
@@ -1767,11 +1759,11 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, mode);
+	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		inode->i_op = &ext4_special_inode_operations;
 #endif
 		err = ext4_add_nondir(handle, dentry, inode);
@@ -1782,12 +1774,12 @@ retry:
 	return err;
 }
 
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	handle_t *handle;
-	struct inode * inode;
-	struct buffer_head * dir_block;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *dir_block;
+	struct ext4_dir_entry_2 *de;
 	int err, retries = 0;
 
 	if (EXT4_DIR_LINK_MAX(dir))
@@ -1803,7 +1795,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -1811,7 +1803,7 @@ retry:
 	inode->i_op = &ext4_dir_inode_operations;
 	inode->i_fop = &ext4_dir_operations;
 	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
-	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	dir_block = ext4_bread(handle, inode, 0, 1, &err);
 	if (!dir_block)
 		goto out_clear_inode;
 	BUFFER_TRACE(dir_block, "get_write_access");
@@ -1820,26 +1812,26 @@ retry:
 	de->inode = cpu_to_le32(inode->i_ino);
 	de->name_len = 1;
 	de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
-	strcpy (de->name, ".");
+	strcpy(de->name, ".");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	de = ext4_next_entry(de);
 	de->inode = cpu_to_le32(dir->i_ino);
 	de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
 						EXT4_DIR_REC_LEN(1));
 	de->name_len = 2;
-	strcpy (de->name, "..");
+	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
 	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
 	ext4_journal_dirty_metadata(handle, dir_block);
-	brelse (dir_block);
+	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
-	err = ext4_add_entry (handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
 		ext4_mark_inode_dirty(handle, inode);
-		iput (inode);
+		iput(inode);
 		goto out_stop;
 	}
 	ext4_inc_count(handle, dir);
@@ -1856,17 +1848,17 @@ out_stop:
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
 {
 	unsigned long offset;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de, * de1;
-	struct super_block * sb;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *de1;
+	struct super_block *sb;
 	int err = 0;
 
 	sb = inode->i_sb;
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
-	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
 			ext4_error(inode->i_sb, __func__,
 				   "error %d reading directory #%lu offset 0",
@@ -1881,23 +1873,23 @@ static int empty_dir (struct inode * inode)
 	de1 = ext4_next_entry(de);
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
 			!le32_to_cpu(de1->inode) ||
-			strcmp (".", de->name) ||
-			strcmp ("..", de1->name)) {
-		ext4_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		brelse (bh);
+			strcmp(".", de->name) ||
+			strcmp("..", de1->name)) {
+		ext4_warning(inode->i_sb, "empty_dir",
+			     "bad directory (dir #%lu) - no `.' or `..'",
+			     inode->i_ino);
+		brelse(bh);
 		return 1;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len) +
 		 ext4_rec_len_from_disk(de1->rec_len);
 	de = ext4_next_entry(de1);
-	while (offset < inode->i_size ) {
+	while (offset < inode->i_size) {
 		if (!bh ||
 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
 			err = 0;
-			brelse (bh);
-			bh = ext4_bread (NULL, inode,
+			brelse(bh);
+			bh = ext4_bread(NULL, inode,
 				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
 			if (!bh) {
 				if (err)
@@ -1917,13 +1909,13 @@ static int empty_dir (struct inode * inode)
 			continue;
 		}
 		if (le32_to_cpu(de->inode)) {
-			brelse (bh);
+			brelse(bh);
 			return 0;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len);
 		de = ext4_next_entry(de);
 	}
-	brelse (bh);
+	brelse(bh);
 	return 1;
 }
 
@@ -1954,8 +1946,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
 	 */
-	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2069,12 +2061,12 @@ out_brelse:
 	goto out_err;
 }
 
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go in
@@ -2085,7 +2077,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_rmdir;
 
@@ -2099,16 +2091,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	retval = -ENOTEMPTY;
-	if (!empty_dir (inode))
+	if (!empty_dir(inode))
 		goto end_rmdir;
 
 	retval = ext4_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_rmdir;
 	if (!EXT4_DIR_LINK_EMPTY(inode))
-		ext4_warning (inode->i_sb, "ext4_rmdir",
-			      "empty directory has too many links (%d)",
-			      inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_rmdir",
+			     "empty directory has too many links (%d)",
+			     inode->i_nlink);
 	inode->i_version++;
 	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2124,16 +2116,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
 
 end_rmdir:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int retval;
-	struct inode * inode;
-	struct buffer_head * bh;
-	struct ext4_dir_entry_2 * de;
+	struct inode *inode;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
 	handle_t *handle;
 
 	/* Initialize quotas before so that eventual writes go
@@ -2147,7 +2139,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		handle->h_sync = 1;
 
 	retval = -ENOENT;
-	bh = ext4_find_entry (dentry, &de);
+	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	if (!bh)
 		goto end_unlink;
 
@@ -2158,9 +2150,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 		goto end_unlink;
 
 	if (!inode->i_nlink) {
-		ext4_warning (inode->i_sb, "ext4_unlink",
-			      "Deleting nonexistent file (%lu), %d",
-			      inode->i_ino, inode->i_nlink);
+		ext4_warning(inode->i_sb, "ext4_unlink",
+			     "Deleting nonexistent file (%lu), %d",
+			     inode->i_ino, inode->i_nlink);
 		inode->i_nlink = 1;
 	}
 	retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2178,15 +2170,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
 
 end_unlink:
 	ext4_journal_stop(handle);
-	brelse (bh);
+	brelse(bh);
 	return retval;
 }
 
-static int ext4_symlink (struct inode * dir,
-		struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+			struct dentry *dentry, const char *symname)
 {
 	handle_t *handle;
-	struct inode * inode;
+	struct inode *inode;
 	int l, err, retries = 0;
 
 	l = strlen(symname)+1;
@@ -2203,12 +2195,12 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
 
-	if (l > sizeof (EXT4_I(inode)->i_data)) {
+	if (l > sizeof(EXT4_I(inode)->i_data)) {
 		inode->i_op = &ext4_symlink_inode_operations;
 		ext4_set_aops(inode);
 		/*
@@ -2221,14 +2213,14 @@ retry:
 		if (err) {
 			clear_nlink(inode);
 			ext4_mark_inode_dirty(handle, inode);
-			iput (inode);
+			iput(inode);
 			goto out_stop;
 		}
 	} else {
 		/* clear the extent format for fast symlink */
 		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 		inode->i_op = &ext4_fast_symlink_inode_operations;
-		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2240,8 +2232,8 @@ out_stop:
 	return err;
 }
 
-static int ext4_link (struct dentry * old_dentry,
-		struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+		     struct inode *dir, struct dentry *dentry)
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
@@ -2284,13 +2276,13 @@ retry:
  * Anybody can rename anything with this: the permission checks are left to the
  * higher-level routines.
  */
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
 {
 	handle_t *handle;
-	struct inode * old_inode, * new_inode;
-	struct buffer_head * old_bh, * new_bh, * dir_bh;
-	struct ext4_dir_entry_2 * old_de, * new_de;
+	struct inode *old_inode, *new_inode;
+	struct buffer_head *old_bh, *new_bh, *dir_bh;
+	struct ext4_dir_entry_2 *old_de, *new_de;
 	int retval;
 
 	old_bh = new_bh = dir_bh = NULL;
@@ -2308,7 +2300,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
 		handle->h_sync = 1;
 
-	old_bh = ext4_find_entry (old_dentry, &old_de);
+	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
 	 *  We might rmdir the source, keep it as pwd of some process
@@ -2321,32 +2313,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		goto end_rename;
 
 	new_inode = new_dentry->d_inode;
-	new_bh = ext4_find_entry (new_dentry, &new_de);
+	new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
 	if (new_bh) {
 		if (!new_inode) {
-			brelse (new_bh);
+			brelse(new_bh);
 			new_bh = NULL;
 		}
 	}
 	if (S_ISDIR(old_inode->i_mode)) {
 		if (new_inode) {
 			retval = -ENOTEMPTY;
-			if (!empty_dir (new_inode))
+			if (!empty_dir(new_inode))
 				goto end_rename;
 		}
 		retval = -EIO;
-		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
 		if (!dir_bh)
 			goto end_rename;
 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
 			goto end_rename;
 		retval = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
+		if (!new_inode && new_dir != old_dir &&
 				new_dir->i_nlink >= EXT4_LINK_MAX)
 			goto end_rename;
 	}
 	if (!new_bh) {
-		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		retval = ext4_add_entry(handle, new_dentry, old_inode);
 		if (retval)
 			goto end_rename;
 	} else {
@@ -2388,7 +2380,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 		struct buffer_head *old_bh2;
 		struct ext4_dir_entry_2 *old_de2;
 
-		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
 		if (old_bh2) {
 			retval = ext4_delete_entry(handle, old_dir,
 						   old_de2, old_bh2);
@@ -2433,9 +2425,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
-	brelse (dir_bh);
-	brelse (old_bh);
-	brelse (new_bh);
+	brelse(dir_bh);
+	brelse(old_bh);
+	brelse(new_bh);
 	ext4_journal_stop(handle);
 	return retval;
 }
@@ -2454,7 +2446,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.mknod		= ext4_mknod,
 	.rename		= ext4_rename,
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -2465,7 +2457,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 
 const struct inode_operations ext4_special_inode_operations = {
 	.setattr	= ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f000fbe2cd93..b6ec1843a015 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
 			     "Inode bitmap not in group (block %llu)",
 			     (unsigned long long)input->inode_bitmap);
 	else if (outside(input->inode_table, start, end) ||
-	         outside(itend - 1, start, end))
+		 outside(itend - 1, start, end))
 		ext4_warning(sb, __func__,
 			     "Inode table not in group (blocks %llu-%llu)",
 			     (unsigned long long)input->inode_table, itend - 1);
@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
 			     (unsigned long long)input->inode_bitmap,
 			     start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
-	         inside(itend - 1, start, metaend))
+		 inside(itend - 1, start, metaend))
 		ext4_warning(sb, __func__,
 			     "Inode table (%llu-%llu) overlaps"
 			     "GDT table (%llu-%llu)",
@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
 	if (err) {
 		if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
 			return err;
-	        if ((err = ext4_journal_get_write_access(handle, bh)))
+		if ((err = ext4_journal_get_write_access(handle, bh)))
 			return err;
-        }
+	}
 
 	return 0;
 }
@@ -418,9 +418,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 
 	/*
 	 * If we are not using the primary superblock/GDT copy don't resize,
-	 * because the user tools have no way of handling this.  Probably a
-	 * bad time to do it anyways.
-	 */
+         * because the user tools have no way of handling this.  Probably a
+         * bad time to do it anyways.
+         */
 	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
 	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
 		ext4_warning(sb, __func__,
@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	return 0;
 
 exit_inode:
-	//ext4_journal_release_buffer(handle, iloc.bh);
+	/* ext4_journal_release_buffer(handle, iloc.bh); */
 	brelse(iloc.bh);
 exit_dindj:
-	//ext4_journal_release_buffer(handle, dind);
+	/* ext4_journal_release_buffer(handle, dind); */
 exit_primary:
-	//ext4_journal_release_buffer(handle, *primary);
+	/* ext4_journal_release_buffer(handle, *primary); */
 exit_sbh:
-	//ext4_journal_release_buffer(handle, *primary);
+	/* ext4_journal_release_buffer(handle, *primary); */
 exit_dind:
 	brelse(dind);
 exit_bh:
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT4_HAS_COMPAT_FEATURE(sb,
-					     EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT4_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
@@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
 		goto exit_journal;
 
-	/*
-	 * We will only either add reserved group blocks to a backup group
-	 * or remove reserved blocks for the first group in a new group block.
-	 * Doing both would be mean more complex code, and sane people don't
-	 * use non-sparse filesystems anymore.  This is already checked above.
-	 */
+        /*
+         * We will only either add reserved group blocks to a backup group
+         * or remove reserved blocks for the first group in a new group block.
+         * Doing both would be mean more complex code, and sane people don't
+         * use non-sparse filesystems anymore.  This is already checked above.
+         */
 	if (gdb_off) {
 		primary = sbi->s_group_desc[gdb_num];
 		if ((err = ext4_journal_get_write_access(handle, primary)))
@@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	} else if ((err = add_new_gdb(handle, inode, input, &primary)))
 		goto exit_journal;
 
-	/*
-	 * OK, now we've set up the new group.  Time to make it active.
-	 *
-	 * Current kernels don't lock all allocations via lock_super(),
-	 * so we have to be safe wrt. concurrent accesses the group
-	 * data.  So we need to be careful to set all of the relevant
-	 * group descriptor data etc. *before* we enable the group.
-	 *
-	 * The key field here is sbi->s_groups_count: as long as
-	 * that retains its old value, nobody is going to access the new
-	 * group.
-	 *
-	 * So first we update all the descriptor metadata for the new
-	 * group; then we update the total disk blocks count; then we
-	 * update the groups count to enable the group; then finally we
-	 * update the free space counts so that the system can start
-	 * using the new disk blocks.
-	 */
+        /*
+         * OK, now we've set up the new group.  Time to make it active.
+         *
+         * Current kernels don't lock all allocations via lock_super(),
+         * so we have to be safe wrt. concurrent accesses the group
+         * data.  So we need to be careful to set all of the relevant
+         * group descriptor data etc. *before* we enable the group.
+         *
+         * The key field here is sbi->s_groups_count: as long as
+         * that retains its old value, nobody is going to access the new
+         * group.
+         *
+         * So first we update all the descriptor metadata for the new
+         * group; then we update the total disk blocks count; then we
+         * update the groups count to enable the group; then finally we
+         * update the free space counts so that the system can start
+         * using the new disk blocks.
+         */
 
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
@@ -869,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	if (test_opt(sb, MBALLOC)) {
-		err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-		if (err)
-			goto exit_journal;
-	}
+	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+	if (err)
+		goto exit_journal;
+
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
 	 * increasing the group count so that once the group is enabled,
@@ -928,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+		ext4_group_t flex_group;
+		flex_group = ext4_flex_group(sbi, input->group);
+		sbi->s_flex_groups[flex_group].free_blocks +=
+			input->free_blocks_count;
+		sbi->s_flex_groups[flex_group].free_inodes +=
+			EXT4_INODES_PER_GROUP(sb);
+	}
+
 	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
 	sb->s_dirt = 1;
 
@@ -946,7 +955,8 @@ exit_put:
 	return err;
 } /* ext4_group_add */
 
-/* Extend the filesystem to the new number of blocks specified.  This entry
+/*
+ * Extend the filesystem to the new number of blocks specified.  This entry
  * point is only used to extend the current filesystem to the end of the last
  * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
  * for emergencies (because it has no dependencies on reserved blocks).
@@ -962,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	ext4_group_t o_groups_count;
 	ext4_grpblk_t last;
 	ext4_grpblk_t add;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
 	unsigned long freed_blocks;
@@ -1024,7 +1034,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
-	bh = sb_bread(sb, o_blocks_count + add -1);
+	bh = sb_bread(sb, o_blocks_count + add - 1);
 	if (!bh) {
 		ext4_warning(sb, __func__,
 			     "can't read last block, resize aborted");
@@ -1075,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	/*
 	 * Mark mballoc pages as not up to date so that they will be updated
 	 * next time they are loaded by ext4_mb_load_buddy.
+	 *
+	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
+	 * Uptodate flag, particularly on thte bitmap bh, as way of
+	 * hinting to ext4_mb_load_buddy() that it needs to be
+	 * overloaded.  A user could take a LVM snapshot, then do an
+	 * on-line fsck, and clear the uptodate flag, and this would
+	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
 	 */
-	if (test_opt(sb, MBALLOC)) {
+	{
 		struct ext4_sb_info *sbi = EXT4_SB(sb);
 		struct inode *inode = sbi->s_buddy_cache;
 		int blocks_per_page;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1cb371dcd609..bdddea14e782 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <asm/uaccess.h>
@@ -45,24 +47,25 @@
 #include "namei.h"
 #include "group.h"
 
+struct proc_dir_entry *ext4_proc_root;
+
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
 			       unsigned int);
-static void ext4_commit_super (struct super_block * sb,
-			       struct ext4_super_block * es,
-			       int sync);
-static void ext4_mark_recovery_complete(struct super_block * sb,
-					struct ext4_super_block * es);
-static void ext4_clear_journal_err(struct super_block * sb,
-				   struct ext4_super_block * es);
+static void ext4_commit_super(struct super_block *sb,
+			      struct ext4_super_block *es, int sync);
+static void ext4_mark_recovery_complete(struct super_block *sb,
+					struct ext4_super_block *es);
+static void ext4_clear_journal_err(struct super_block *sb,
+				   struct ext4_super_block *es);
 static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block * sb, int errno,
+static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
-static int ext4_remount (struct super_block * sb, int * flags, char * data);
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static void ext4_unlockfs(struct super_block *sb);
-static void ext4_write_super (struct super_block * sb);
+static void ext4_write_super(struct super_block *sb);
 static void ext4_write_super_lockfs(struct super_block *sb);
 
 
@@ -211,15 +214,15 @@ static void ext4_handle_error(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	if (!test_opt (sb, ERRORS_CONT)) {
+	if (!test_opt(sb, ERRORS_CONT)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
 		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
-	if (test_opt (sb, ERRORS_RO)) {
-		printk (KERN_CRIT "Remounting filesystem read-only\n");
+	if (test_opt(sb, ERRORS_RO)) {
+		printk(KERN_CRIT "Remounting filesystem read-only\n");
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext4_commit_super(sb, es, 1);
@@ -228,13 +231,13 @@ static void ext4_handle_error(struct super_block *sb)
 			sb->s_id);
 }
 
-void ext4_error (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext4_error(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
 	va_list args;
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -242,7 +245,7 @@ void ext4_error (struct super_block * sb, const char * function,
 	ext4_handle_error(sb);
 }
 
-static const char *ext4_decode_error(struct super_block * sb, int errno,
+static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16])
 {
 	char *errstr = NULL;
@@ -278,8 +281,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
 /* __ext4_std_error decodes expected errors from journaling functions
  * automatically and invokes the appropriate error response.  */
 
-void __ext4_std_error (struct super_block * sb, const char * function,
-		       int errno)
+void __ext4_std_error(struct super_block *sb, const char *function, int errno)
 {
 	char nbuf[16];
 	const char *errstr;
@@ -292,8 +294,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
 		return;
 
 	errstr = ext4_decode_error(sb, errno, nbuf);
-	printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
-		sb->s_id, function, errstr);
+	printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
+	       sb->s_id, function, errstr);
 
 	ext4_handle_error(sb);
 }
@@ -308,15 +310,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
  * case we take the easy way out and panic immediately.
  */
 
-void ext4_abort (struct super_block * sb, const char * function,
-		 const char * fmt, ...)
+void ext4_abort(struct super_block *sb, const char *function,
+		const char *fmt, ...)
 {
 	va_list args;
 
-	printk (KERN_CRIT "ext4_abort called.\n");
+	printk(KERN_CRIT "ext4_abort called.\n");
 
 	va_start(args, fmt);
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
 	printk("\n");
 	va_end(args);
@@ -334,8 +336,8 @@ void ext4_abort (struct super_block * sb, const char * function,
 	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
-void ext4_warning (struct super_block * sb, const char * function,
-		   const char * fmt, ...)
+void ext4_warning(struct super_block *sb, const char *function,
+		  const char *fmt, ...)
 {
 	va_list args;
 
@@ -372,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 	 */
 }
 
-int ext4_update_compat_feature(handle_t *handle,
-					struct super_block *sb, __u32 compat)
-{
-	int err = 0;
-	if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_COMPAT_FEATURE(sb, compat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
-					struct super_block *sb, __u32 rocompat)
-{
-	int err = 0;
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
-					struct super_block *sb, __u32 incompat)
-{
-	int err = 0;
-	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
-		err = ext4_journal_get_write_access(handle,
-				EXT4_SB(sb)->s_sbh);
-		if (err)
-			return err;
-		EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
-		sb->s_dirt = 1;
-		handle->h_sync = 1;
-		BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
-					"call ext4_journal_dirty_met adata");
-		err = ext4_journal_dirty_metadata(handle,
-				EXT4_SB(sb)->s_sbh);
-	}
-	return err;
-}
-
 /*
  * Open the external journal device
  */
@@ -457,7 +399,7 @@ fail:
 static int ext4_blkdev_put(struct block_device *bdev)
 {
 	bd_release(bdev);
-	return blkdev_put(bdev);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -496,7 +438,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 	}
 }
 
-static void ext4_put_super (struct super_block * sb)
+static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
@@ -505,15 +447,18 @@ static void ext4_put_super (struct super_block * sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	jbd2_journal_destroy(sbi->s_journal);
+	if (jbd2_journal_destroy(sbi->s_journal) < 0)
+		ext4_abort(sb, __func__, "Couldn't clean up the journal");
 	sbi->s_journal = NULL;
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
-		mark_buffer_dirty(sbi->s_sbh);
 		ext4_commit_super(sb, es, 1);
 	}
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -522,6 +467,7 @@ static void ext4_put_super (struct super_block * sb)
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -564,12 +510,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
 #endif
-	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
+	ei->vfs_inode.i_data.writeback_index = 0;
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
@@ -595,12 +541,12 @@ static void ext4_destroy_inode(struct inode *inode)
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	init_rwsem(&ei->xattr_sem);
 #endif
 	init_rwsem(&ei->i_data_sem);
@@ -626,8 +572,7 @@ static void destroy_inodecache(void)
 
 static void ext4_clear_inode(struct inode *inode)
 {
-	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (EXT4_I(inode)->i_acl &&
 			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
 		posix_acl_release(EXT4_I(inode)->i_acl);
@@ -639,22 +584,20 @@ static void ext4_clear_inode(struct inode *inode)
 		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
 	}
 #endif
-	ext4_discard_reservation(inode);
-	EXT4_I(inode)->i_block_alloc_info = NULL;
-	if (unlikely(rsv))
-		kfree(rsv);
+	ext4_discard_preallocations(inode);
 	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
 
-static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
+static inline void ext4_show_quota_options(struct seq_file *seq,
+					   struct super_block *sb)
 {
 #if defined(CONFIG_QUOTA)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (sbi->s_jquota_fmt)
 		seq_printf(seq, ",jqfmt=%s",
-		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
 
 	if (sbi->s_qf_names[USRQUOTA])
 		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -718,7 +661,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",debug");
 	if (test_opt(sb, OLDALLOC))
 		seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (test_opt(sb, XATTR_USER) &&
 		!(def_mount_opts & EXT4_DEFM_XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -727,7 +670,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nouser_xattr");
 	}
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -752,8 +695,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nobh");
 	if (!test_opt(sb, EXTENTS))
 		seq_puts(seq, ",noextents");
-	if (!test_opt(sb, MBALLOC))
-		seq_puts(seq, ",nomballoc");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -773,6 +714,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		seq_printf(seq, ",inode_readahead_blks=%u",
+			   sbi->s_inode_readahead_blks);
+
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -822,8 +770,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
 static int ext4_dquot_initialize(struct inode *inode, int type);
 static int ext4_dquot_drop(struct inode *inode);
@@ -896,20 +844,22 @@ static const struct export_operations ext4_export_ops = {
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_inode_readahead_blks
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_bsd_df, "bsddf"},
 	{Opt_minix_df, "minixdf"},
 	{Opt_grpid, "grpid"},
@@ -923,8 +873,6 @@ static match_table_t tokens = {
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_nouid32, "nouid32"},
-	{Opt_nocheck, "nocheck"},
-	{Opt_nocheck, "check=none"},
 	{Opt_debug, "debug"},
 	{Opt_oldalloc, "oldalloc"},
 	{Opt_orlov, "orlov"},
@@ -947,6 +895,8 @@ static match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -961,12 +911,11 @@ static match_table_t tokens = {
 	{Opt_extents, "extents"},
 	{Opt_noextents, "noextents"},
 	{Opt_i_version, "i_version"},
-	{Opt_mballoc, "mballoc"},
-	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_err, NULL},
 };
 
@@ -981,7 +930,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	/*todo: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
-		printk("EXT4-fs: Invalid sb specification: %s\n",
+		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
 		       (char *) *data);
 		return 1;
 	}
@@ -991,12 +940,12 @@ static ext4_fsblk_t get_sb_block(void **data)
 	return sb_block;
 }
 
-static int parse_options (char *options, struct super_block *sb,
-			  unsigned int *inum, unsigned long *journal_devnum,
-			  ext4_fsblk_t *n_blocks_count, int is_remount)
+static int parse_options(char *options, struct super_block *sb,
+			 unsigned int *inum, unsigned long *journal_devnum,
+			 ext4_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	char * p;
+	char *p;
 	substring_t args[MAX_OPT_ARGS];
 	int data_opt = 0;
 	int option;
@@ -1009,7 +958,7 @@ static int parse_options (char *options, struct super_block *sb,
 	if (!options)
 		return 1;
 
-	while ((p = strsep (&options, ",")) != NULL) {
+	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
 		if (!*p)
 			continue;
@@ -1017,16 +966,16 @@ static int parse_options (char *options, struct super_block *sb,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_bsd_df:
-			clear_opt (sbi->s_mount_opt, MINIX_DF);
+			clear_opt(sbi->s_mount_opt, MINIX_DF);
 			break;
 		case Opt_minix_df:
-			set_opt (sbi->s_mount_opt, MINIX_DF);
+			set_opt(sbi->s_mount_opt, MINIX_DF);
 			break;
 		case Opt_grpid:
-			set_opt (sbi->s_mount_opt, GRPID);
+			set_opt(sbi->s_mount_opt, GRPID);
 			break;
 		case Opt_nogrpid:
-			clear_opt (sbi->s_mount_opt, GRPID);
+			clear_opt(sbi->s_mount_opt, GRPID);
 			break;
 		case Opt_resuid:
 			if (match_int(&args[0], &option))
@@ -1043,49 +992,47 @@ static int parse_options (char *options, struct super_block *sb,
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt (sbi->s_mount_opt, ERRORS_RO);
-			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
-			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			clear_opt (sbi->s_mount_opt, ERRORS_RO);
-			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
-			set_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt(sbi->s_mount_opt, ERRORS_RO);
+			clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt(sbi->s_mount_opt, ERRORS_CONT);
 			break;
 		case Opt_nouid32:
-			set_opt (sbi->s_mount_opt, NO_UID32);
-			break;
-		case Opt_nocheck:
-			clear_opt (sbi->s_mount_opt, CHECK);
+			set_opt(sbi->s_mount_opt, NO_UID32);
 			break;
 		case Opt_debug:
-			set_opt (sbi->s_mount_opt, DEBUG);
+			set_opt(sbi->s_mount_opt, DEBUG);
 			break;
 		case Opt_oldalloc:
-			set_opt (sbi->s_mount_opt, OLDALLOC);
+			set_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
 		case Opt_orlov:
-			clear_opt (sbi->s_mount_opt, OLDALLOC);
+			clear_opt(sbi->s_mount_opt, OLDALLOC);
 			break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 		case Opt_user_xattr:
-			set_opt (sbi->s_mount_opt, XATTR_USER);
+			set_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
 		case Opt_nouser_xattr:
-			clear_opt (sbi->s_mount_opt, XATTR_USER);
+			clear_opt(sbi->s_mount_opt, XATTR_USER);
 			break;
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk("EXT4 (no)user_xattr options not supported\n");
+			printk(KERN_ERR "EXT4 (no)user_xattr options "
+			       "not supported\n");
 			break;
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 		case Opt_acl:
 			set_opt(sbi->s_mount_opt, POSIX_ACL);
 			break;
@@ -1095,7 +1042,8 @@ static int parse_options (char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk("EXT4 (no)acl options not supported\n");
+			printk(KERN_ERR "EXT4 (no)acl options "
+			       "not supported\n");
 			break;
 #endif
 		case Opt_reservation:
@@ -1115,7 +1063,7 @@ static int parse_options (char *options, struct super_block *sb,
 				       "journal on remount\n");
 				return 0;
 			}
-			set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
+			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_inum:
 			if (is_remount) {
@@ -1145,7 +1093,7 @@ static int parse_options (char *options, struct super_block *sb,
 			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
-			set_opt (sbi->s_mount_opt, NOLOAD);
+			set_opt(sbi->s_mount_opt, NOLOAD);
 			break;
 		case Opt_commit:
 			if (match_int(&args[0], &option))
@@ -1178,6 +1126,12 @@ static int parse_options (char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1189,8 +1143,8 @@ set_qf_name:
 			     sb_any_quota_suspended(sb)) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
-					"EXT4-fs: Cannot change journaled "
-					"quota options when quota turned on.\n");
+				       "EXT4-fs: Cannot change journaled "
+				       "quota options when quota turned on.\n");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
@@ -1331,7 +1285,7 @@ set_qf_format:
 					"on this filesystem, use tune2fs\n");
 				return 0;
 			}
-			set_opt (sbi->s_mount_opt, EXTENTS);
+			set_opt(sbi->s_mount_opt, EXTENTS);
 			break;
 		case Opt_noextents:
 			/*
@@ -1348,7 +1302,7 @@ set_qf_format:
 						"-o noextents options\n");
 				return 0;
 			}
-			clear_opt (sbi->s_mount_opt, EXTENTS);
+			clear_opt(sbi->s_mount_opt, EXTENTS);
 			break;
 		case Opt_i_version:
 			set_opt(sbi->s_mount_opt, I_VERSION);
@@ -1357,12 +1311,6 @@ set_qf_format:
 		case Opt_nodelalloc:
 			clear_opt(sbi->s_mount_opt, DELALLOC);
 			break;
-		case Opt_mballoc:
-			set_opt(sbi->s_mount_opt, MBALLOC);
-			break;
-		case Opt_nomballoc:
-			clear_opt(sbi->s_mount_opt, MBALLOC);
-			break;
 		case Opt_stripe:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -1373,10 +1321,17 @@ set_qf_format:
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_inode_readahead_blks:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > (1 << 30))
+				return 0;
+			sbi->s_inode_readahead_blks = option;
+			break;
 		default:
-			printk (KERN_ERR
-				"EXT4-fs: Unrecognized mount option \"%s\" "
-				"or missing value\n", p);
+			printk(KERN_ERR
+			       "EXT4-fs: Unrecognized mount option \"%s\" "
+			       "or missing value\n", p);
 			return 0;
 		}
 	}
@@ -1423,31 +1378,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-		printk (KERN_ERR "EXT4-fs warning: revision level too high, "
-			"forcing read-only mode\n");
+		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
+		       "forcing read-only mode\n");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
-		printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-			"running e2fsck is recommended\n");
+		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
+		       "running e2fsck is recommended\n");
 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-		printk (KERN_WARNING
-			"EXT4-fs warning: mounting fs with errors, "
-			"running e2fsck is recommended\n");
+		printk(KERN_WARNING
+		       "EXT4-fs warning: mounting fs with errors, "
+		       "running e2fsck is recommended\n");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk (KERN_WARNING
-			"EXT4-fs warning: maximal mount count reached, "
-			"running e2fsck is recommended\n");
+		printk(KERN_WARNING
+		       "EXT4-fs warning: maximal mount count reached, "
+		       "running e2fsck is recommended\n");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk (KERN_WARNING
-			"EXT4-fs warning: checktime reached, "
-			"running e2fsck is recommended\n");
+		printk(KERN_WARNING
+		       "EXT4-fs warning: checktime reached, "
+		       "running e2fsck is recommended\n");
 #if 0
 		/* @@@ We _will_ want to clear the valid bit if we find
 		 * inconsistencies, to force a fsck at reboot.  But for
@@ -1473,15 +1428,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
-	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
-		char b[BDEVNAME_SIZE];
-
-		printk("external journal on %s\n",
-			bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
-	} else {
-		printk("internal journal\n");
-	}
+	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+	       "external", EXT4_SB(sb)->s_journal->j_devname);
 	return res;
 }
 
@@ -1504,16 +1453,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
 
-	flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
-		groups_per_flex;
-	sbi->s_flex_groups = kmalloc(flex_group_count *
+	/* We allocate both existing and potentially added groups */
+	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+			    ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
+			      EXT4_DESC_PER_BLOCK_BITS(sb))) /
+			   groups_per_flex;
+	sbi->s_flex_groups = kzalloc(flex_group_count *
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory\n");
+		printk(KERN_ERR "EXT4-fs: not enough memory for "
+				"%lu flex groups\n", flex_group_count);
 		goto failed;
 	}
-	memset(sbi->s_flex_groups, 0, flex_group_count *
-	       sizeof(struct flex_groups));
 
 	gdp = ext4_get_group_desc(sb, 1, &bh);
 	block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
@@ -1585,7 +1536,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		flexbg_flag = 1;
 
-	ext4_debug ("Checking group descriptors");
+	ext4_debug("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1597,43 +1548,45 @@ static int ext4_check_descriptors(struct super_block *sb)
 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
 
 		block_bitmap = ext4_block_bitmap(sb, gdp);
-		if (block_bitmap < first_block || block_bitmap > last_block)
-		{
+		if (block_bitmap < first_block || block_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Block bitmap for group %lu not in group "
-			       "(block %llu)!", i, block_bitmap);
+			       "(block %llu)!\n", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
-		if (inode_bitmap < first_block || inode_bitmap > last_block)
-		{
+		if (inode_bitmap < first_block || inode_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Inode bitmap for group %lu not in group "
-			       "(block %llu)!", i, inode_bitmap);
+			       "(block %llu)!\n", i, inode_bitmap);
 			return 0;
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		if (inode_table < first_block ||
-		    inode_table + sbi->s_itb_per_group - 1 > last_block)
-		{
+		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Inode table for group %lu not in group "
-			       "(block %llu)!", i, inode_table);
+			       "(block %llu)!\n", i, inode_table);
 			return 0;
 		}
+		spin_lock(sb_bgl_lock(sbi, i));
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Checksum for group %lu failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
-			return 0;
+			if (!(sb->s_flags & MS_RDONLY)) {
+				spin_unlock(sb_bgl_lock(sbi, i));
+				return 0;
+			}
 		}
+		spin_unlock(sb_bgl_lock(sbi, i));
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
 	return 1;
 }
 
@@ -1654,8 +1607,8 @@ static int ext4_check_descriptors(struct super_block *sb)
  * e2fsck was run on this filesystem, and it must have already done the orphan
  * inode cleanup for us, so we can safely abort without any further action.
  */
-static void ext4_orphan_cleanup (struct super_block * sb,
-				 struct ext4_super_block * es)
+static void ext4_orphan_cleanup(struct super_block *sb,
+				struct ext4_super_block *es)
 {
 	unsigned int s_flags = sb->s_flags;
 	int nr_orphans = 0, nr_truncates = 0;
@@ -1715,9 +1668,9 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 		DQUOT_INIT(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %Ld bytes\n",
+				"%s: truncating inode %lu to %lld bytes\n",
 				__func__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
@@ -1732,7 +1685,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
 		iput(inode);  /* The delete magic happens here! */
 	}
 
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
 	if (nr_orphans)
 		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
@@ -1758,13 +1711,13 @@ static void ext4_orphan_cleanup (struct super_block * sb,
  *
  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
  */
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
 {
 	loff_t res;
 	loff_t upper_limit = MAX_LFS_FILESIZE;
 
 	/* small i_blocks in vfs inode? */
-	if (sizeof(blkcnt_t) < sizeof(u64)) {
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
 		 * CONFIG_LSF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
@@ -1794,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
  * We need to be 1 filesystem block less than the 2^48 sector limit.
  */
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 {
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
@@ -1807,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
 	 * total number of  512 bytes blocks of the file
 	 */
 
-	if (sizeof(blkcnt_t) < sizeof(u64)) {
+	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LSF is not enabled implies the inode
-		 * i_block represent total blocks in 512 bytes
-		 * 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LSF is not enabled
+		 * implies the inode i_block represent total blocks in
+		 * 512 bytes 32 == size of vfs inode i_blocks * 8
 		 */
 		upper_limit = (1LL << 32) - 1;
 
@@ -1899,12 +1852,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 	return 0;
 }
 
-static int ext4_fill_super (struct super_block *sb, void *data, int silent)
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
 
 {
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
 	ext4_fsblk_t block;
@@ -1915,11 +1868,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
+	char *cp;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
 	int i;
-	int needs_recovery;
+	int needs_recovery, has_huge_files;
 	__le32 features;
 	__u64 blocks_count;
 	int err;
@@ -1931,10 +1885,15 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT4_DEF_RESUID;
 	sbi->s_resgid = EXT4_DEF_RESGID;
+	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sb_block = sb_block;
 
 	unlock_kernel();
 
+	/* Cleanup superblock name */
+	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+		*cp = '!';
+
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
 		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
@@ -1953,7 +1912,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logical_sb_block))) {
-		printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
+		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
 		goto out_fail;
 	}
 	/*
@@ -1974,11 +1933,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, GRPID);
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
 		set_opt(sbi->s_mount_opt, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	if (def_mount_opts & EXT4_DEFM_ACL)
 		set_opt(sbi->s_mount_opt, POSIX_ACL);
 #endif
@@ -2013,11 +1972,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
 			"use tune2fs.\n");
-	/*
-	 * turn on mballoc code by default in ext4 filesystem
-	 * Use -o nomballoc to turn it off
-	 */
-	set_opt(sbi->s_mount_opt, MBALLOC);
 
 	/*
 	 * enable delayed allocation by default
@@ -2026,8 +1980,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
 
-	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
-			    NULL, 0))
+	if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
+			   NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2042,16 +1996,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		       "running e2fsck is recommended\n");
 
 	/*
-	 * Since ext4 is still considered development code, we require
-	 * that the TEST_FILESYS flag in s->flags be set.
-	 */
-	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
-		printk(KERN_WARNING "EXT4-fs: %s: not marked "
-		       "OK to use with test code.\n", sb->s_id);
-		goto failed_mount;
-	}
-
-	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2070,7 +2014,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		       sb->s_id, le32_to_cpu(features));
 		goto failed_mount;
 	}
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				    EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
 		 * mount if kernel is build with CONFIG_LSF
@@ -2102,7 +2048,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 			goto failed_mount;
 		}
 
-		brelse (bh);
+		brelse(bh);
 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
@@ -2114,14 +2060,15 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			printk (KERN_ERR
-				"EXT4-fs: Magic mismatch, very weird !\n");
+			printk(KERN_ERR
+			       "EXT4-fs: Magic mismatch, very weird !\n");
 			goto failed_mount;
 		}
 	}
 
-	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
-	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+						      has_huge_files);
+	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
 
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2132,9 +2079,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk (KERN_ERR
-				"EXT4-fs: unsupported inode size: %d\n",
-				sbi->s_inode_size);
+			printk(KERN_ERR
+			       "EXT4-fs: unsupported inode size: %d\n",
+			       sbi->s_inode_size);
 			goto failed_mount;
 		}
 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
@@ -2166,20 +2113,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_mount_state = le16_to_cpu(es->s_state);
 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
-	for (i=0; i < 4; i++)
+	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
-		printk (KERN_ERR
-			"EXT4-fs: #blocks per group too big: %lu\n",
-			sbi->s_blocks_per_group);
+		printk(KERN_ERR
+		       "EXT4-fs: #blocks per group too big: %lu\n",
+		       sbi->s_blocks_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_inodes_per_group > blocksize * 8) {
-		printk (KERN_ERR
-			"EXT4-fs: #inodes per group too big: %lu\n",
-			sbi->s_inodes_per_group);
+		printk(KERN_ERR
+		       "EXT4-fs: #inodes per group too big: %lu\n",
+		       sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
 
@@ -2213,26 +2160,36 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_groups_count = blocks_count;
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
-	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		printk (KERN_ERR "EXT4-fs: not enough memory\n");
+		printk(KERN_ERR "EXT4-fs: not enough memory\n");
 		goto failed_mount;
 	}
 
+#ifdef CONFIG_PROC_FS
+	if (ext4_proc_root)
+		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+	if (sbi->s_proc)
+		proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+				 &ext4_ui_proc_fops,
+				 &sbi->s_inode_readahead_blks);
+#endif
+
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logical_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
 		if (!sbi->s_group_desc[i]) {
-			printk (KERN_ERR "EXT4-fs: "
-				"can't read group descriptor %d\n", i);
+			printk(KERN_ERR "EXT4-fs: "
+			       "can't read group descriptor %d\n", i);
 			db_count = i;
 			goto failed_mount2;
 		}
 	}
-	if (!ext4_check_descriptors (sb)) {
+	if (!ext4_check_descriptors(sb)) {
 		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
 		goto failed_mount2;
 	}
@@ -2258,24 +2215,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
 	}
 
-	/* per fileystem reservation list head & lock */
-	spin_lock_init(&sbi->s_rsv_window_lock);
-	sbi->s_rsv_window_root = RB_ROOT;
-	/* Add a single, static dummy reservation to the start of the
-	 * reservation window list --- it gives us a placeholder for
-	 * append-at-start-of-list which makes the allocation logic
-	 * _much_ simpler. */
-	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
-	sbi->s_rsv_window_head.rsv_goal_size = 0;
-	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 
 	/*
@@ -2308,11 +2255,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		    EXT4_SB(sb)->s_journal->j_failed_commit) {
 			printk(KERN_CRIT "EXT4-fs error (device %s): "
 			       "ext4_fill_super: Journal transaction "
-			       "%u is corrupt\n", sb->s_id, 
+			       "%u is corrupt\n", sb->s_id,
 			       EXT4_SB(sb)->s_journal->j_failed_commit);
-			if (test_opt (sb, ERRORS_RO)) {
-				printk (KERN_CRIT
-					"Mounting filesystem read-only\n");
+			if (test_opt(sb, ERRORS_RO)) {
+				printk(KERN_CRIT
+				       "Mounting filesystem read-only\n");
 				sb->s_flags |= MS_RDONLY;
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2332,9 +2279,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 			goto failed_mount3;
 	} else {
 		if (!silent)
-			printk (KERN_ERR
-				"ext4: No journal on filesystem on %s\n",
-				sb->s_id);
+			printk(KERN_ERR
+			       "ext4: No journal on filesystem on %s\n",
+			       sb->s_id);
 		goto failed_mount3;
 	}
 
@@ -2418,7 +2365,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	}
 
-	ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
 
 	/* determine the minimum size of new large inodes, if present */
 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -2445,6 +2392,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 			"available.\n");
 	}
 
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+				"requested data journaling mode\n");
+		clear_opt(sbi->s_mount_opt, DELALLOC);
+	} else if (test_opt(sb, DELALLOC))
+		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+	ext4_ext_init(sb);
+	err = ext4_mb_init(sb, needs_recovery);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+		       err);
+		goto failed_mount4;
+	}
+
 	/*
 	 * akpm: core read_super() calls in here with the superblock locked.
 	 * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2457,22 +2419,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
 	if (needs_recovery)
-		printk (KERN_INFO "EXT4-fs: recovery complete.\n");
+		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
 	ext4_mark_recovery_complete(sb, es);
-	printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
-		test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
-		test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
-		"writeback");
-
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-				"requested data journaling mode\n");
-		clear_opt(sbi->s_mount_opt, DELALLOC);
-	} else if (test_opt(sb, DELALLOC))
-		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
-	ext4_ext_init(sb);
-	ext4_mb_init(sb, needs_recovery);
+	printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
+	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
+	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+	       "writeback");
 
 	lock_kernel();
 	return 0;
@@ -2490,11 +2442,16 @@ failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
 failed_mount:
+	if (sbi->s_proc) {
+		remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+		remove_proc_entry(sb->s_id, ext4_proc_root);
+	}
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -2528,6 +2485,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
@@ -2553,7 +2514,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 		return NULL;
 	}
 
-	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
 		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2575,14 +2536,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 static journal_t *ext4_get_dev_journal(struct super_block *sb,
 				       dev_t j_dev)
 {
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	journal_t *journal;
 	ext4_fsblk_t start;
 	ext4_fsblk_t len;
 	int hblock, blocksize;
 	ext4_fsblk_t sb_block;
 	unsigned long offset;
-	struct ext4_super_block * es;
+	struct ext4_super_block *es;
 	struct block_device *bdev;
 
 	bdev = ext4_blkdev_get(j_dev);
@@ -2592,7 +2553,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
 			"EXT4: failed to claim external journal device.\n");
-		blkdev_put(bdev);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
 
@@ -2697,8 +2658,8 @@ static int ext4_load_journal(struct super_block *sb,
 					"unavailable, cannot proceed.\n");
 				return -EROFS;
 			}
-			printk (KERN_INFO "EXT4-fs: write access will "
-					"be enabled during recovery.\n");
+			printk(KERN_INFO "EXT4-fs: write access will "
+			       "be enabled during recovery.\n");
 		}
 	}
 
@@ -2716,6 +2677,11 @@ static int ext4_load_journal(struct super_block *sb,
 			return -EINVAL;
 	}
 
+	if (journal->j_flags & JBD2_BARRIER)
+		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+	else
+		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = jbd2_journal_update_format(journal);
 		if (err)  {
@@ -2751,8 +2717,8 @@ static int ext4_load_journal(struct super_block *sb,
 	return 0;
 }
 
-static int ext4_create_journal(struct super_block * sb,
-			       struct ext4_super_block * es,
+static int ext4_create_journal(struct super_block *sb,
+			       struct ext4_super_block *es,
 			       unsigned int journal_inum)
 {
 	journal_t *journal;
@@ -2793,21 +2759,41 @@ static int ext4_create_journal(struct super_block * sb,
 	return 0;
 }
 
-static void ext4_commit_super (struct super_block * sb,
-			       struct ext4_super_block * es,
-			       int sync)
+static void ext4_commit_super(struct super_block *sb,
+			      struct ext4_super_block *es, int sync)
 {
 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
 
 	if (!sbh)
 		return;
+	if (buffer_write_io_error(sbh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "ext4: previous I/O error to "
+		       "superblock detected for %s.\n", sb->s_id);
+		clear_buffer_write_io_error(sbh);
+		set_buffer_uptodate(sbh);
+	}
 	es->s_wtime = cpu_to_le32(get_seconds());
 	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
-	if (sync)
+	if (sync) {
 		sync_dirty_buffer(sbh);
+		if (buffer_write_io_error(sbh)) {
+			printk(KERN_ERR "ext4: I/O error while writing "
+			       "superblock for %s.\n", sb->s_id);
+			clear_buffer_write_io_error(sbh);
+			set_buffer_uptodate(sbh);
+		}
+	}
 }
 
 
@@ -2816,13 +2802,15 @@ static void ext4_commit_super (struct super_block * sb,
  * remounting) the filesystem readonly, then we will end up with a
  * consistent fs on disk.  Record that fact.
  */
-static void ext4_mark_recovery_complete(struct super_block * sb,
-					struct ext4_super_block * es)
+static void ext4_mark_recovery_complete(struct super_block *sb,
+					struct ext4_super_block *es)
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
 	jbd2_journal_lock_updates(journal);
-	jbd2_journal_flush(journal);
+	if (jbd2_journal_flush(journal) < 0)
+		goto out;
+
 	lock_super(sb);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
@@ -2831,6 +2819,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
 		ext4_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
+
+out:
 	jbd2_journal_unlock_updates(journal);
 }
 
@@ -2839,8 +2829,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
  * has recorded an error from a previous lifetime, move that error to the
  * main filesystem now.
  */
-static void ext4_clear_journal_err(struct super_block * sb,
-				   struct ext4_super_block * es)
+static void ext4_clear_journal_err(struct super_block *sb,
+				   struct ext4_super_block *es)
 {
 	journal_t *journal;
 	int j_errno;
@@ -2865,7 +2855,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
 
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super (sb, es, 1);
+		ext4_commit_super(sb, es, 1);
 
 		jbd2_journal_clear_err(journal);
 	}
@@ -2898,7 +2888,7 @@ int ext4_force_commit(struct super_block *sb)
  * This implicitly triggers the writebehind on sync().
  */
 
-static void ext4_write_super (struct super_block * sb)
+static void ext4_write_super(struct super_block *sb)
 {
 	if (mutex_trylock(&sb->s_lock) != 0)
 		BUG();
@@ -2909,6 +2899,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;
 
+	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
 		if (wait)
@@ -2930,7 +2921,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 
 		/* Now we set up the journal barrier. */
 		jbd2_journal_lock_updates(journal);
-		jbd2_journal_flush(journal);
+
+		/*
+		 * We don't want to clear needs_recovery flag when we failed
+		 * to flush the journal.
+		 */
+		if (jbd2_journal_flush(journal) < 0)
+			return;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2954,13 +2951,14 @@ static void ext4_unlockfs(struct super_block *sb)
 	}
 }
 
-static int ext4_remount (struct super_block * sb, int * flags, char * data)
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct ext4_super_block * es;
+	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
+	ext4_group_t g;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -3039,6 +3037,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
 			}
 
 			/*
+			 * Make sure the group descriptor checksums
+			 * are sane.  If they aren't, refuse to
+			 * remount r/w.
+			 */
+			for (g = 0; g < sbi->s_groups_count; g++) {
+				struct ext4_group_desc *gdp =
+					ext4_get_group_desc(sb, g, NULL);
+
+				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+					printk(KERN_ERR
+	       "EXT4-fs: ext4_remount: "
+		"Checksum for group %lu failed (%u!=%u)\n",
+		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+					       le16_to_cpu(gdp->bg_checksum));
+					err = -EINVAL;
+					goto restore_opts;
+				}
+			}
+
+			/*
 			 * If we have an unprocessed orphan list hanging
 			 * around from a previously readonly bdev mount,
 			 * require a full umount/remount for now.
@@ -3063,7 +3081,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
 			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
 				goto restore_opts;
-			if (!ext4_setup_super (sb, es, 0))
+			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
@@ -3093,7 +3111,7 @@ restore_opts:
 	return err;
 }
 
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3143,7 +3161,8 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 	ext4_free_blocks_count_set(es, buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3309,51 +3328,56 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount)
+			 char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 
 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, path is NULL */
+	/* When remounting, no checks are needed and in fact, name is NULL */
 	if (remount)
-		return vfs_quota_on(sb, type, format_id, path, remount);
+		return vfs_quota_on(sb, type, format_id, name, remount);
 
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
 
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
-		path_put(&nd.path);
+	if (path.mnt->mnt_sb != sb) {
+		path_put(&path);
 		return -EXDEV;
 	}
 	/* Journaling quota? */
 	if (EXT4_SB(sb)->s_qf_names[type]) {
-		/* Quotafile not of fs root? */
-		if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		/* Quotafile not in fs root? */
+		if (path.dentry->d_parent != sb->s_root)
 			printk(KERN_WARNING
 				"EXT4-fs: Quota file not on filesystem root. "
 				"Journaled quota will not work.\n");
- 	}
+	}
 
 	/*
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext4_should_journal_data(nd.path.dentry->d_inode)) {
+	if (ext4_should_journal_data(path.dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
 		 */
 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		if (err) {
+			path_put(&path);
+			return err;
+		}
 	}
 
-	path_put(&nd.path);
-	return vfs_quota_on(sb, type, format_id, path, remount);
+	err = vfs_quota_on_path(sb, type, format_id, &path);
+	path_put(&path);
+	return err;
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -3412,7 +3436,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
-		printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
+		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
 			" cancelled because transaction is not started.\n",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
@@ -3473,18 +3497,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
 
+#ifdef CONFIG_PROC_FS
+static int ext4_ui_proc_show(struct seq_file *m, void *v)
+{
+	unsigned int *p = m->private;
+
+	seq_printf(m, "%u\n", *p);
+	return 0;
+}
+
+static int ext4_ui_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
+}
+
+static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
+			       size_t cnt, loff_t *ppos)
+{
+	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+	char str[32];
+	unsigned long value;
+
+	if (cnt >= sizeof(str))
+		return -EINVAL;
+	if (copy_from_user(str, buf, cnt))
+		return -EFAULT;
+	value = simple_strtol(str, NULL, 0);
+	if (value < 0)
+		return -ERANGE;
+	*p = value;
+	return cnt;
+}
+
+const struct file_operations ext4_ui_proc_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ext4_ui_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= ext4_ui_proc_write,
+};
+#endif
+
+static struct file_system_type ext4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+#ifdef CONFIG_EXT4DEV_COMPAT
+static int ext4dev_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
+	       "to mount using ext4\n");
+	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
+	       "will go away by 2.6.31\n");
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
 static struct file_system_type ext4dev_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4dev",
-	.get_sb		= ext4_get_sb,
+	.get_sb		= ext4dev_get_sb,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+MODULE_ALIAS("ext4dev");
+#endif
 
 static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
 		return err;
@@ -3495,9 +3583,16 @@ static int __init init_ext4_fs(void)
 	err = init_inodecache();
 	if (err)
 		goto out1;
-	err = register_filesystem(&ext4dev_fs_type);
+	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
+#ifdef CONFIG_EXT4DEV_COMPAT
+	err = register_filesystem(&ext4dev_fs_type);
+	if (err) {
+		unregister_filesystem(&ext4_fs_type);
+		goto out;
+	}
+#endif
 	return 0;
 out:
 	destroy_inodecache();
@@ -3510,10 +3605,14 @@ out2:
 
 static void __exit exit_ext4_fs(void)
 {
+	unregister_filesystem(&ext4_fs_type);
+#ifdef CONFIG_EXT4DEV_COMPAT
 	unregister_filesystem(&ext4dev_fs_type);
+#endif
 	destroy_inodecache();
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
+	remove_proc_entry("fs/ext4", NULL);
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc01..00740cb32be3 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
-	nd_set_link(nd, (char*)ei->i_data);
+	nd_set_link(nd, (char *) ei->i_data);
 	return NULL;
 }
 
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93c5fdcdad2e..80626d516fee 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
 
 static struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
 	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
 #endif
 };
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
 struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
 	&ext4_xattr_acl_default_handler,
 #endif
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
 #endif
 	NULL
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	struct ext4_xattr_block_find bs = {
 		.s = { .not_found = -ENODATA, },
 	};
+	unsigned long no_expand;
 	int error;
 
 	if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	if (strlen(name) > 255)
 		return -ERANGE;
 	down_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+	EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+
 	error = ext4_get_inode_loc(inode, &is.iloc);
 	if (error)
 		goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
+	if (no_expand == 0)
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
@@ -1512,7 +1518,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
 	char *name = entry->e_name;
 	int n;
 
-	for (n=0; n < entry->e_name_len; n++) {
+	for (n = 0; n < entry->e_name_len; n++) {
 		hash = (hash << NAME_HASH_SHIFT) ^
 		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
 		       *name++;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb9..8ede88b18c29 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
 	(((name_len) + EXT4_XATTR_ROUND + \
 	sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
 #define EXT4_XATTR_NEXT(entry) \
-	( (struct ext4_xattr_entry *)( \
-	  (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+	((struct ext4_xattr_entry *)( \
+	 (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
 #define EXT4_XATTR_SIZE(size) \
 	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
 
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
 		EXT4_I(inode)->i_extra_isize))
 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
 
-# ifdef CONFIG_EXT4DEV_FS_XATTR
+# ifdef CONFIG_EXT4_FS_XATTR
 
 extern struct xattr_handler ext4_xattr_user_handler;
 extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
 
 extern struct xattr_handler *ext4_xattr_handlers[];
 
-# else  /* CONFIG_EXT4DEV_FS_XATTR */
+# else  /* CONFIG_EXT4_FS_XATTR */
 
 static inline int
 ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 
 #define ext4_xattr_handlers	NULL
 
-# endif  /* CONFIG_EXT4DEV_FS_XATTR */
+# endif  /* CONFIG_EXT4_FS_XATTR */
 
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 				struct inode *dir);
 #else
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 3a9ecac8d61f..3222f51c41cf 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode)
 
 static struct kmem_cache *fat_cache_cachep;
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct fat_cache *cache = (struct fat_cache *)foo;
 
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 34541d06e626..bae1c3292522 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -17,7 +17,6 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
-#include <linux/dirent.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
@@ -124,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
  * but ignore that right now.
  * Ahem... Stack smashing in ring 0 isn't fun. Fixed.
  */
-static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len,
+static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 		       int uni_xlate, struct nls_table *nls)
 {
-	wchar_t *ip, ec;
+	const wchar_t *ip;
+	wchar_t ec;
 	unsigned char *op, nc;
 	int charlen;
 	int k;
@@ -167,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len,
 	return (op - ascii);
 }
 
+static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
+				unsigned char *buf, int size)
+{
+	if (sbi->options.utf8)
+		return utf8_wcstombs(buf, uni, size);
+	else
+		return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
+				   sbi->nls_io);
+}
+
 static inline int
 fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
 {
@@ -227,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
 	return len;
 }
 
+static inline int fat_name_match(struct msdos_sb_info *sbi,
+				 const unsigned char *a, int a_len,
+				 const unsigned char *b, int b_len)
+{
+	if (a_len != b_len)
+		return 0;
+
+	if (sbi->options.name_check != 's')
+		return !nls_strnicmp(sbi->nls_io, a, b, a_len);
+	else
+		return !memcmp(a, b, a_len);
+}
+
 enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
 
 /**
@@ -302,6 +325,19 @@ parse_long:
 }
 
 /*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE	((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS	((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE	(FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
+/*
  * Return values: negative -> error, 0 -> not found, positive -> found,
  * value is the total amount of slots, including the shortname entry.
  */
@@ -312,29 +348,20 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh = NULL;
 	struct msdos_dir_entry *de;
-	struct nls_table *nls_io = sbi->nls_io;
 	struct nls_table *nls_disk = sbi->nls_disk;
-	wchar_t bufuname[14];
 	unsigned char nr_slots;
-	int xlate_len;
+	wchar_t bufuname[14];
 	wchar_t *unicode = NULL;
 	unsigned char work[MSDOS_NAME];
-	unsigned char *bufname = NULL;
-	int uni_xlate = sbi->options.unicode_xlate;
-	int utf8 = sbi->options.utf8;
-	int anycase = (sbi->options.name_check != 's');
+	unsigned char bufname[FAT_MAX_SHORT_SIZE];
 	unsigned short opt_shortname = sbi->options.shortname;
 	loff_t cpos = 0;
-	int chl, i, j, last_u, err;
-
-	bufname = __getname();
-	if (!bufname)
-		return -ENOMEM;
+	int chl, i, j, last_u, err, len;
 
 	err = -ENOENT;
-	while(1) {
+	while (1) {
 		if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
-			goto EODir;
+			goto end_of_dir;
 parse_record:
 		nr_slots = 0;
 		if (de->name[0] == DELETED_FLAG)
@@ -353,7 +380,7 @@ parse_record:
 			else if (status == PARSE_NOT_LONGNAME)
 				goto parse_record;
 			else if (status == PARSE_EOF)
-				goto EODir;
+				goto end_of_dir;
 		}
 
 		memcpy(work, de->name, sizeof(de->name));
@@ -394,30 +421,24 @@ parse_record:
 		if (!last_u)
 			continue;
 
+		/* Compare shortname */
 		bufuname[last_u] = 0x0000;
-		xlate_len = utf8
-			?utf8_wcstombs(bufname, bufuname, PATH_MAX)
-			:uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io);
-		if (xlate_len == name_len)
-			if ((!anycase && !memcmp(name, bufname, xlate_len)) ||
-			    (anycase && !nls_strnicmp(nls_io, name, bufname,
-								xlate_len)))
-				goto Found;
+		len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+		if (fat_name_match(sbi, name, name_len, bufname, len))
+			goto found;
 
 		if (nr_slots) {
-			xlate_len = utf8
-				?utf8_wcstombs(bufname, unicode, PATH_MAX)
-				:uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io);
-			if (xlate_len != name_len)
-				continue;
-			if ((!anycase && !memcmp(name, bufname, xlate_len)) ||
-			    (anycase && !nls_strnicmp(nls_io, name, bufname,
-								xlate_len)))
-				goto Found;
+			void *longname = unicode + FAT_MAX_UNI_CHARS;
+			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+
+			/* Compare longname */
+			len = fat_uni_to_x8(sbi, unicode, longname, size);
+			if (fat_name_match(sbi, name, name_len, longname, len))
+				goto found;
 		}
 	}
 
-Found:
+found:
 	nr_slots++;	/* include the de */
 	sinfo->slot_off = cpos - nr_slots * sizeof(*de);
 	sinfo->nr_slots = nr_slots;
@@ -425,9 +446,7 @@ Found:
 	sinfo->bh = bh;
 	sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
 	err = 0;
-EODir:
-	if (bufname)
-		__putname(bufname);
+end_of_dir:
 	if (unicode)
 		__putname(unicode);
 
@@ -453,23 +472,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh;
 	struct msdos_dir_entry *de;
-	struct nls_table *nls_io = sbi->nls_io;
 	struct nls_table *nls_disk = sbi->nls_disk;
-	unsigned char long_slots;
-	const char *fill_name;
-	int fill_len;
+	unsigned char nr_slots;
 	wchar_t bufuname[14];
 	wchar_t *unicode = NULL;
-	unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
-	unsigned long lpos, dummy, *furrfu = &lpos;
-	int uni_xlate = sbi->options.unicode_xlate;
+	unsigned char c, work[MSDOS_NAME];
+	unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname;
+	unsigned short opt_shortname = sbi->options.shortname;
 	int isvfat = sbi->options.isvfat;
-	int utf8 = sbi->options.utf8;
 	int nocase = sbi->options.nocase;
-	unsigned short opt_shortname = sbi->options.shortname;
+	const char *fill_name = NULL;
 	unsigned long inum;
-	int chi, chl, i, i2, j, last, last_u, dotoffset = 0;
+	unsigned long lpos, dummy, *furrfu = &lpos;
 	loff_t cpos;
+	int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0;
 	int ret = 0;
 
 	lock_super(sb);
@@ -489,43 +505,58 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
 			cpos = 0;
 		}
 	}
-	if (cpos & (sizeof(struct msdos_dir_entry)-1)) {
+	if (cpos & (sizeof(struct msdos_dir_entry) - 1)) {
 		ret = -ENOENT;
 		goto out;
 	}
 
 	bh = NULL;
-GetNew:
+get_new:
 	if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
-		goto EODir;
+		goto end_of_dir;
 parse_record:
-	long_slots = 0;
-	/* Check for long filename entry */
-	if (isvfat) {
+	nr_slots = 0;
+	/*
+	 * Check for long filename entry, but if short_only, we don't
+	 * need to parse long filename.
+	 */
+	if (isvfat && !short_only) {
 		if (de->name[0] == DELETED_FLAG)
-			goto RecEnd;
+			goto record_end;
 		if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
-			goto RecEnd;
+			goto record_end;
 		if (de->attr != ATTR_EXT && IS_FREE(de->name))
-			goto RecEnd;
+			goto record_end;
 	} else {
 		if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name))
-			goto RecEnd;
+			goto record_end;
 	}
 
 	if (isvfat && de->attr == ATTR_EXT) {
 		int status = fat_parse_long(inode, &cpos, &bh, &de,
-					    &unicode, &long_slots);
+					    &unicode, &nr_slots);
 		if (status < 0) {
 			filp->f_pos = cpos;
 			ret = status;
 			goto out;
 		} else if (status == PARSE_INVALID)
-			goto RecEnd;
+			goto record_end;
 		else if (status == PARSE_NOT_LONGNAME)
 			goto parse_record;
 		else if (status == PARSE_EOF)
-			goto EODir;
+			goto end_of_dir;
+
+		if (nr_slots) {
+			void *longname = unicode + FAT_MAX_UNI_CHARS;
+			int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+			int len = fat_uni_to_x8(sbi, unicode, longname, size);
+
+			fill_name = longname;
+			fill_len = len;
+			/* !both && !short_only, so we don't need shortname. */
+			if (!both)
+				goto start_filldir;
+		}
 	}
 
 	if (sbi->options.dotsOK) {
@@ -587,12 +618,32 @@ parse_record:
 		}
 	}
 	if (!last)
-		goto RecEnd;
+		goto record_end;
 
 	i = last + dotoffset;
 	j = last_u;
 
-	lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry);
+	if (isvfat) {
+		bufuname[j] = 0x0000;
+		i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+	}
+	if (nr_slots) {
+		/* hack for fat_ioctl_filldir() */
+		struct fat_ioctl_filldir_callback *p = dirent;
+
+		p->longname = fill_name;
+		p->long_len = fill_len;
+		p->shortname = bufname;
+		p->short_len = i;
+		fill_name = NULL;
+		fill_len = 0;
+	} else {
+		fill_name = bufname;
+		fill_len = i;
+	}
+
+start_filldir:
+	lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
 	if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
 		inum = inode->i_ino;
 	else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
@@ -607,49 +658,17 @@ parse_record:
 			inum = iunique(sb, MSDOS_ROOT_INO);
 	}
 
-	if (isvfat) {
-		bufuname[j] = 0x0000;
-		i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname))
-			 : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io);
-	}
-
-	fill_name = bufname;
-	fill_len = i;
-	if (!short_only && long_slots) {
-		/* convert the unicode long name. 261 is maximum size
-		 * of unicode buffer. (13 * slots + nul) */
-		void *longname = unicode + 261;
-		int buf_size = PATH_MAX - (261 * sizeof(unicode[0]));
-		int long_len = utf8
-			? utf8_wcstombs(longname, unicode, buf_size)
-			: uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io);
-
-		if (!both) {
-			fill_name = longname;
-			fill_len = long_len;
-		} else {
-			/* hack for fat_ioctl_filldir() */
-			struct fat_ioctl_filldir_callback *p = dirent;
-
-			p->longname = longname;
-			p->long_len = long_len;
-			p->shortname = bufname;
-			p->short_len = i;
-			fill_name = NULL;
-			fill_len = 0;
-		}
-	}
 	if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
 		    (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
-		goto FillFailed;
+		goto fill_failed;
 
-RecEnd:
+record_end:
 	furrfu = &lpos;
 	filp->f_pos = cpos;
-	goto GetNew;
-EODir:
+	goto get_new;
+end_of_dir:
 	filp->f_pos = cpos;
-FillFailed:
+fill_failed:
 	brelse(bh);
 	if (unicode)
 		__putname(unicode);
@@ -715,7 +734,7 @@ efault:									   \
 	return -EFAULT;							   \
 }
 
-FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent)
+FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
 
 static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 			     void __user *dirent, filldir_t filldir,
@@ -741,7 +760,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
 static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 			 unsigned int cmd, unsigned long arg)
 {
-	struct dirent __user *d1 = (struct dirent __user *)arg;
+	struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
 	int short_only, both;
 
 	switch (cmd) {
@@ -757,7 +776,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
 		return fat_generic_ioctl(inode, filp, cmd, arg);
 	}
 
-	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2])))
+	if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
 		return -EFAULT;
 	/*
 	 * Yes, we don't need this put_user() absolutely. However old
@@ -820,6 +839,7 @@ const struct file_operations fat_dir_operations = {
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
 
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -1082,7 +1102,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 		goto error_free;
 	}
 
-	fat_date_unix2dos(ts->tv_sec, &time, &date);
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
 
 	de = (struct msdos_dir_entry *)bhs[0]->b_data;
 	/* filling the new directory slots ("." and ".." entries) */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	struct fat_entry fatent;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	int i, err, nr_bhs;
+	int first_cl = cluster;
 
 	nr_bhs = 0;
 	fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
+		/* 
+		 * Issue discard for the sectors we no longer care about,
+		 * batching contiguous clusters into one request
+		 */
+		if (cluster != fatent.entry + 1) {
+			int nr_clus = fatent.entry - first_cl + 1;
+
+			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+					 nr_clus * sbi->sec_per_clus);
+			first_cl = cluster;
+		}
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c672df4036e9..ddde37025ca6 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -15,6 +15,8 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -64,6 +66,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 
 		/* Equivalent to a chmod() */
 		ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+		ia.ia_ctime = current_fs_time(inode->i_sb);
 		if (is_dir) {
 			ia.ia_mode = MSDOS_MKMODE(attr,
 				S_IRWXUGO & ~sbi->options.fs_dmask)
@@ -90,11 +93,21 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 			}
 		}
 
+		/*
+		 * The security check is questionable...  We single
+		 * out the RO attribute for checking by the security
+		 * module, just because it maps to a file mode.
+		 */
+		err = security_inode_setattr(filp->f_path.dentry, &ia);
+		if (err)
+			goto up;
+
 		/* This MUST be done before doing anything irreversible... */
-		err = notify_change(filp->f_path.dentry, &ia);
+		err = fat_setattr(filp->f_path.dentry, &ia);
 		if (err)
 			goto up;
 
+		fsnotify_change(filp->f_path.dentry, ia.ia_valid);
 		if (sbi->options.sys_immutable) {
 			if (attr & ATTR_SYS)
 				inode->i_flags |= S_IMMUTABLE;
@@ -300,6 +313,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
 	return 0;
 }
 
+#define TIMES_SET_FLAGS	(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+
 int fat_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
@@ -323,9 +338,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
 
 	/* Check for setting the inode time. */
 	ia_valid = attr->ia_valid;
-	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+	if (ia_valid & TIMES_SET_FLAGS) {
 		if (fat_allow_set_time(sbi, inode))
-			attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET);
+			attr->ia_valid &= ~TIMES_SET_FLAGS;
 	}
 
 	error = inode_change_ok(inode, attr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 46a4508ffd2e..19eafbe3c379 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 	inode->i_mtime.tv_sec =
-		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date));
+		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
+			      sbi->options.tz_utc);
 	inode->i_mtime.tv_nsec = 0;
 	if (sbi->options.isvfat) {
 		int secs = de->ctime_cs / 100;
 		int csecs = de->ctime_cs % 100;
 		inode->i_ctime.tv_sec  =
 			date_dos2unix(le16_to_cpu(de->ctime),
-				      le16_to_cpu(de->cdate)) + secs;
+				      le16_to_cpu(de->cdate),
+				      sbi->options.tz_utc) + secs;
 		inode->i_ctime.tv_nsec = csecs * 10000000;
 		inode->i_atime.tv_sec =
-			date_dos2unix(0, le16_to_cpu(de->adate));
+			date_dos2unix(0, le16_to_cpu(de->adate),
+				      sbi->options.tz_utc);
 		inode->i_atime.tv_nsec = 0;
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -495,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode)
 	kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
 
@@ -559,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)
 	struct buffer_head *bh;
 	struct msdos_dir_entry *raw_entry;
 	loff_t i_pos;
-	int err = 0;
+	int err;
 
 retry:
 	i_pos = MSDOS_I(inode)->i_pos;
 	if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
 		return 0;
 
-	lock_super(sb);
 	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
 	if (!bh) {
 		printk(KERN_ERR "FAT: unable to read inode block "
 		       "for updating (i_pos %lld)\n", i_pos);
-		err = -EIO;
-		goto out;
+		return -EIO;
 	}
 	spin_lock(&sbi->inode_hash_lock);
 	if (i_pos != MSDOS_I(inode)->i_pos) {
 		spin_unlock(&sbi->inode_hash_lock);
 		brelse(bh);
-		unlock_super(sb);
 		goto retry;
 	}
 
@@ -591,21 +591,23 @@ retry:
 	raw_entry->attr = fat_attr(inode);
 	raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
-	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date);
+	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
+			  &raw_entry->date, sbi->options.tz_utc);
 	if (sbi->options.isvfat) {
 		__le16 atime;
-		fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate);
-		fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate);
+		fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
+				  &raw_entry->cdate, sbi->options.tz_utc);
+		fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
+				  &raw_entry->adate, sbi->options.tz_utc);
 		raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
 			inode->i_ctime.tv_nsec / 10000000;
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
+	err = 0;
 	if (wait)
 		err = sync_dirty_buffer(bh);
 	brelse(bh);
-out:
-	unlock_super(sb);
 	return err;
 }
 
@@ -679,33 +681,24 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 			inode = NULL;
 		}
 	}
-	if (!inode) {
-		/* For now, do nothing
-		 * What we could do is:
-		 * follow the file starting at fh[4], and record
-		 * the ".." entry, and the name of the fh[2] entry.
-		 * The follow the ".." file finding the next step up.
-		 * This way we build a path to the root of
-		 * the tree. If this works, we lookup the path and so
-		 * get this inode into the cache.
-		 * Finally try the fat_iget lookup again
-		 * If that fails, then weare totally out of luck
-		 * But all that is for another day
-		 */
-	}
-	if (!inode)
-		return ERR_PTR(-ESTALE);
-
 
-	/* now to find a dentry.
-	 * If possible, get a well-connected one
+	/*
+	 * For now, do nothing if the inode is not found.
+	 *
+	 * What we could do is:
+	 *
+	 *	- follow the file starting at fh[4], and record the ".." entry,
+	 *	  and the name of the fh[2] entry.
+	 *	- then follow the ".." file finding the next step up.
+	 *
+	 * This way we build a path to the root of the tree. If this works, we
+	 * lookup the path and so get this inode into the cache.  Finally try
+	 * the fat_iget lookup again.  If that fails, then we are totally out
+	 * of luck.  But all that is for another day
 	 */
-	result = d_alloc_anon(inode);
-	if (result == NULL) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	result->d_op = sb->s_root->d_op;
+	result = d_obtain_alias(inode);
+	if (!IS_ERR(result))
+		result->d_op = sb->s_root->d_op;
 	return result;
 }
 
@@ -752,15 +745,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
 	}
 	inode = fat_build_inode(sb, de, i_pos);
 	brelse(bh);
-	if (IS_ERR(inode)) {
-		parent = ERR_CAST(inode);
-		goto out;
-	}
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
+
+	parent = d_obtain_alias(inode);
 out:
 	unlock_super(sb);
 
@@ -836,6 +822,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 	}
 	if (sbi->options.flush)
 		seq_puts(m, ",flush");
+	if (opts->tz_utc)
+		seq_puts(m, ",tz=UTC");
 
 	return 0;
 }
@@ -848,10 +836,10 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_flush, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
 };
 
-static match_table_t fat_tokens = {
+static const match_table_t fat_tokens = {
 	{Opt_check_r, "check=relaxed"},
 	{Opt_check_s, "check=strict"},
 	{Opt_check_n, "check=normal"},
@@ -883,16 +871,17 @@ static match_table_t fat_tokens = {
 	{Opt_obsolate, "cvf_options=%100s"},
 	{Opt_obsolate, "posix"},
 	{Opt_flush, "flush"},
+	{Opt_tz_utc, "tz=UTC"},
 	{Opt_err, NULL},
 };
-static match_table_t msdos_tokens = {
+static const match_table_t msdos_tokens = {
 	{Opt_nodots, "nodots"},
 	{Opt_nodots, "dotsOK=no"},
 	{Opt_dots, "dots"},
 	{Opt_dots, "dotsOK=yes"},
 	{Opt_err, NULL}
 };
-static match_table_t vfat_tokens = {
+static const match_table_t vfat_tokens = {
 	{Opt_charset, "iocharset=%s"},
 	{Opt_shortname_lower, "shortname=lower"},
 	{Opt_shortname_win95, "shortname=win95"},
@@ -947,10 +936,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->utf8 = opts->unicode_xlate = 0;
 	opts->numtail = 1;
 	opts->usefree = opts->nocase = 0;
+	opts->tz_utc = 0;
 	*debug = 0;
 
 	if (!options)
-		return 0;
+		goto out;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -1036,6 +1026,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 		case Opt_flush:
 			opts->flush = 1;
 			break;
+		case Opt_tz_utc:
+			opts->tz_utc = 1;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
@@ -1104,10 +1097,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 			return -EINVAL;
 		}
 	}
+
+out:
 	/* UTF-8 doesn't provide FAT semantics */
 	if (!strcmp(opts->iocharset, "utf8")) {
 		printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
-		       " for FAT filesystems, filesystem will be case sensitive!\n");
+		       " for FAT filesystems, filesystem will be "
+		       "case sensitive!\n");
 	}
 
 	/* If user doesn't specify allow_utime, it's initialized from dmask. */
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 61f23511eacf..79fb98ad36d4 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -142,7 +142,7 @@ static int day_n[] = {
 };
 
 /* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-int date_dos2unix(unsigned short time, unsigned short date)
+int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
 {
 	int month, year, secs;
 
@@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date)
 	    ((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
 	    month < 2 ? 1 : 0)+3653);
 			/* days since 1.1.70 plus 80's leap day */
-	secs += sys_tz.tz_minuteswest*60;
+	if (!tz_utc)
+		secs += sys_tz.tz_minuteswest*60;
 	return secs;
 }
 
 /* Convert linear UNIX date to a MS-DOS time/date pair. */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
+void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
 {
 	int day, year, nl_day, month;
 
-	unix_date -= sys_tz.tz_minuteswest*60;
+	if (!tz_utc)
+		unix_date -= sys_tz.tz_minuteswest*60;
 
 	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
 	if (unix_date < 315532800)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 330a7d782591..ac4f7db9f134 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -49,145 +49,94 @@ static int get_close_on_exec(unsigned int fd)
 	return res;
 }
 
-/*
- * locate_fd finds a free file descriptor in the open_fds fdset,
- * expanding the fd arrays if necessary.  Must be called with the
- * file_lock held for write.
- */
-
-static int locate_fd(unsigned int orig_start, int cloexec)
-{
-	struct files_struct *files = current->files;
-	unsigned int newfd;
-	unsigned int start;
-	int error;
-	struct fdtable *fdt;
-
-	spin_lock(&files->file_lock);
-
-	error = -EINVAL;
-	if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
-		goto out;
-
-repeat:
-	fdt = files_fdtable(files);
-	/*
-	 * Someone might have closed fd's in the range
-	 * orig_start..fdt->next_fd
-	 */
-	start = orig_start;
-	if (start < files->next_fd)
-		start = files->next_fd;
-
-	newfd = start;
-	if (start < fdt->max_fds)
-		newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
-					   fdt->max_fds, start);
-	
-	error = -EMFILE;
-	if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
-		goto out;
-
-	error = expand_files(files, newfd);
-	if (error < 0)
-		goto out;
-
-	/*
-	 * If we needed to expand the fs array we
-	 * might have blocked - try again.
-	 */
-	if (error)
-		goto repeat;
-
-	if (start <= files->next_fd)
-		files->next_fd = newfd + 1;
-
-	FD_SET(newfd, fdt->open_fds);
-	if (cloexec)
-		FD_SET(newfd, fdt->close_on_exec);
-	else
-		FD_CLR(newfd, fdt->close_on_exec);
-	error = newfd;
-
-out:
-	spin_unlock(&files->file_lock);
-	return error;
-}
-
-static int dupfd(struct file *file, unsigned int start, int cloexec)
-{
-	int fd = locate_fd(start, cloexec);
-	if (fd >= 0)
-		fd_install(fd, file);
-	else
-		fput(file);
-
-	return fd;
-}
-
-asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
 {
 	int err = -EBADF;
 	struct file * file, *tofree;
 	struct files_struct * files = current->files;
 	struct fdtable *fdt;
 
-	spin_lock(&files->file_lock);
-	if (!(file = fcheck(oldfd)))
-		goto out_unlock;
-	err = newfd;
-	if (newfd == oldfd)
-		goto out_unlock;
-	err = -EBADF;
-	if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
-		goto out_unlock;
-	get_file(file);			/* We are now finished with oldfd */
-
-	err = expand_files(files, newfd);
-	if (err < 0)
-		goto out_fput;
+	if ((flags & ~O_CLOEXEC) != 0)
+		return -EINVAL;
 
-	/* To avoid races with open() and dup(), we will mark the fd as
-	 * in-use in the open-file bitmap throughout the entire dup2()
-	 * process.  This is quite safe: do_close() uses the fd array
-	 * entry, not the bitmap, to decide what work needs to be
-	 * done.  --sct */
-	/* Doesn't work. open() might be there first. --AV */
+	if (unlikely(oldfd == newfd))
+		return -EINVAL;
 
-	/* Yes. It's a race. In user space. Nothing sane to do */
+	spin_lock(&files->file_lock);
+	err = expand_files(files, newfd);
+	file = fcheck(oldfd);
+	if (unlikely(!file))
+		goto Ebadf;
+	if (unlikely(err < 0)) {
+		if (err == -EMFILE)
+			goto Ebadf;
+		goto out_unlock;
+	}
+	/*
+	 * We need to detect attempts to do dup2() over allocated but still
+	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
+	 * extra work in their equivalent of fget() - they insert struct
+	 * file immediately after grabbing descriptor, mark it larval if
+	 * more work (e.g. actual opening) is needed and make sure that
+	 * fget() treats larval files as absent.  Potentially interesting,
+	 * but while extra work in fget() is trivial, locking implications
+	 * and amount of surgery on open()-related paths in VFS are not.
+	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
+	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
+	 * scope of POSIX or SUS, since neither considers shared descriptor
+	 * tables and this condition does not arise without those.
+	 */
 	err = -EBUSY;
 	fdt = files_fdtable(files);
 	tofree = fdt->fd[newfd];
 	if (!tofree && FD_ISSET(newfd, fdt->open_fds))
-		goto out_fput;
-
+		goto out_unlock;
+	get_file(file);
 	rcu_assign_pointer(fdt->fd[newfd], file);
 	FD_SET(newfd, fdt->open_fds);
-	FD_CLR(newfd, fdt->close_on_exec);
+	if (flags & O_CLOEXEC)
+		FD_SET(newfd, fdt->close_on_exec);
+	else
+		FD_CLR(newfd, fdt->close_on_exec);
 	spin_unlock(&files->file_lock);
 
 	if (tofree)
 		filp_close(tofree, files);
-	err = newfd;
-out:
-	return err;
+
+	return newfd;
+
+Ebadf:
+	err = -EBADF;
 out_unlock:
 	spin_unlock(&files->file_lock);
-	goto out;
+	return err;
+}
 
-out_fput:
-	spin_unlock(&files->file_lock);
-	fput(file);
-	goto out;
+asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+{
+	if (unlikely(newfd == oldfd)) { /* corner case */
+		struct files_struct *files = current->files;
+		rcu_read_lock();
+		if (!fcheck_files(files, oldfd))
+			oldfd = -EBADF;
+		rcu_read_unlock();
+		return oldfd;
+	}
+	return sys_dup3(oldfd, newfd, 0);
 }
 
 asmlinkage long sys_dup(unsigned int fildes)
 {
 	int ret = -EBADF;
-	struct file * file = fget(fildes);
-
-	if (file)
-		ret = dupfd(file, 0, 0);
+	struct file *file = fget(fildes);
+
+	if (file) {
+		ret = get_unused_fd();
+		if (ret >= 0)
+			fd_install(ret, file);
+		else
+			fput(file);
+	}
 	return ret;
 }
 
@@ -310,8 +259,13 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	switch (cmd) {
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
-		get_file(filp);
-		err = dupfd(filp, arg, cmd == F_DUPFD_CLOEXEC);
+		if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+			break;
+		err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
+		if (err >= 0) {
+			get_file(filp);
+			fd_install(err, filp);
+		}
 		break;
 	case F_GETFD:
 		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
diff --git a/fs/fifo.c b/fs/fifo.c
index 9785e36f81e7..f8f97b8b6d44 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -51,13 +51,13 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
 
 	switch (filp->f_mode) {
-	case 1:
+	case FMODE_READ:
 	/*
 	 *  O_RDONLY
 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
 	 *  opened, even when there is no process writing the FIFO.
 	 */
-		filp->f_op = &read_fifo_fops;
+		filp->f_op = &read_pipefifo_fops;
 		pipe->r_counter++;
 		if (pipe->readers++ == 0)
 			wake_up_partner(inode);
@@ -76,7 +76,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 2:
+	case FMODE_WRITE:
 	/*
 	 *  O_WRONLY
 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
@@ -86,7 +86,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
 			goto err;
 
-		filp->f_op = &write_fifo_fops;
+		filp->f_op = &write_pipefifo_fops;
 		pipe->w_counter++;
 		if (!pipe->writers++)
 			wake_up_partner(inode);
@@ -98,14 +98,14 @@ static int fifo_open(struct inode *inode, struct file *filp)
 		}
 		break;
 	
-	case 3:
+	case FMODE_READ | FMODE_WRITE:
 	/*
 	 *  O_RDWR
 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
 	 *  This implementation will NEVER block on a O_RDWR open, since
 	 *  the process can at least talk to itself.
 	 */
-		filp->f_op = &rdwr_fifo_fops;
+		filp->f_op = &rdwr_pipefifo_fops;
 
 		pipe->readers++;
 		pipe->writers++;
@@ -151,5 +151,5 @@ err_nocleanup:
  * depending on the access mode of the file...
  */
 const struct file_operations def_fifo_fops = {
-	.open		= fifo_open,	/* will set read or write pipe_fops */
+	.open		= fifo_open,	/* will set read_ or write_pipefifo_fops */
 };
diff --git a/fs/file.c b/fs/file.c
index 7b3887e054d0..f313314f996f 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,6 +6,7 @@
  *  Manage the dynamic fd arrays in the process files_struct.
  */
 
+#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/time.h>
@@ -250,9 +251,18 @@ int expand_files(struct files_struct *files, int nr)
 	struct fdtable *fdt;
 
 	fdt = files_fdtable(files);
+
+	/*
+	 * N.B. For clone tasks sharing a files structure, this test
+	 * will limit the total number of files that can be opened.
+	 */
+	if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+		return -EMFILE;
+
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
 		return 0;
+
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
@@ -423,3 +433,63 @@ struct files_struct init_files = {
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
+
+/*
+ * allocate a file descriptor, mark it busy.
+ */
+int alloc_fd(unsigned start, unsigned flags)
+{
+	struct files_struct *files = current->files;
+	unsigned int fd;
+	int error;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+repeat:
+	fdt = files_fdtable(files);
+	fd = start;
+	if (fd < files->next_fd)
+		fd = files->next_fd;
+
+	if (fd < fdt->max_fds)
+		fd = find_next_zero_bit(fdt->open_fds->fds_bits,
+					   fdt->max_fds, fd);
+
+	error = expand_files(files, fd);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * If we needed to expand the fs array we
+	 * might have blocked - try again.
+	 */
+	if (error)
+		goto repeat;
+
+	if (start <= files->next_fd)
+		files->next_fd = fd + 1;
+
+	FD_SET(fd, fdt->open_fds);
+	if (flags & O_CLOEXEC)
+		FD_SET(fd, fdt->close_on_exec);
+	else
+		FD_CLR(fd, fdt->close_on_exec);
+	error = fd;
+#if 1
+	/* Sanity check */
+	if (rcu_dereference(fdt->fd[fd]) != NULL) {
+		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
+		rcu_assign_pointer(fdt->fd[fd], NULL);
+	}
+#endif
+
+out:
+	spin_unlock(&files->file_lock);
+	return error;
+}
+
+int get_unused_fd(void)
+{
+	return alloc_fd(0, 0);
+}
+EXPORT_SYMBOL(get_unused_fd);
diff --git a/fs/file_table.c b/fs/file_table.c
index 83084225b4c3..efc06faede6c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -120,7 +120,7 @@ struct file *get_empty_filp(void)
 
 	tsk = current;
 	INIT_LIST_HEAD(&f->f_u.fu_list);
-	atomic_set(&f->f_count, 1);
+	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_uid = tsk->fsuid;
 	f->f_gid = tsk->fsgid;
@@ -161,7 +161,7 @@ EXPORT_SYMBOL(get_empty_filp);
  * code should be moved into this function.
  */
 struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
-		mode_t mode, const struct file_operations *fop)
+		fmode_t mode, const struct file_operations *fop)
 {
 	struct file *file;
 	struct path;
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(alloc_file);
  * of this should be moving to alloc_file().
  */
 int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
-	   mode_t mode, const struct file_operations *fop)
+	   fmode_t mode, const struct file_operations *fop)
 {
 	int error = 0;
 	file->f_path.dentry = dentry;
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(init_file);
 
 void fput(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count))
+	if (atomic_long_dec_and_test(&file->f_count))
 		__fput(file);
 }
 
@@ -294,7 +294,7 @@ struct file *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!atomic_inc_not_zero(&file->f_count)) {
+		if (!atomic_long_inc_not_zero(&file->f_count)) {
 			/* File object ref couldn't be taken */
 			rcu_read_unlock();
 			return NULL;
@@ -326,7 +326,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
 		if (file) {
-			if (atomic_inc_not_zero(&file->f_count))
+			if (atomic_long_inc_not_zero(&file->f_count))
 				*fput_needed = 1;
 			else
 				/* Didn't get the reference, someone's freed */
@@ -341,7 +341,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 
 void put_filp(struct file *file)
 {
-	if (atomic_dec_and_test(&file->f_count)) {
+	if (atomic_long_dec_and_test(&file->f_count)) {
 		security_file_free(file);
 		file_kill(file);
 		file_free(file);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index f37f87262837..d0e20ced62dd 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -8,6 +8,8 @@
 
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
@@ -214,6 +216,43 @@ int get_filesystem_list(char * buf)
 	return len;
 }
 
+#ifdef CONFIG_PROC_FS
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	struct file_system_type * tmp;
+
+	read_lock(&file_systems_lock);
+	tmp = file_systems;
+	while (tmp) {
+		seq_printf(m, "%s\t%s\n",
+			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+			tmp->name);
+		tmp = tmp->next;
+	}
+	read_unlock(&file_systems_lock);
+	return 0;
+}
+
+static int filesystems_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, filesystems_proc_show, NULL);
+}
+
+static const struct file_operations filesystems_proc_fops = {
+	.open		= filesystems_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_filesystems_init(void)
+{
+	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	return 0;
+}
+module_init(proc_filesystems_init);
+#endif
+
 struct file_system_type *get_fs_type(const char *name)
 {
 	struct file_system_type *fs;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25adfc3c693a..d0ff0b8cf309 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -8,7 +8,7 @@
  * pages against inodes.  ie: data writeback.  Writeout of the
  * inode itself is not handled here.
  *
- * 10Apr2002	akpm@zip.com.au
+ * 10Apr2002	Andrew Morton
  *		Split out of fs/inode.c
  *		Additions for address_space-based writeback
  */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2060bf06b906..fd03330cadeb 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode)
  * timeout is unknown (unlink, rmdir, rename and in some cases
  * lookup)
  */
-static void fuse_invalidate_entry_cache(struct dentry *entry)
+void fuse_invalidate_entry_cache(struct dentry *entry)
 {
 	fuse_dentry_settime(entry, 0);
 }
@@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry)
 	fuse_invalidate_entry_cache(entry);
 }
 
-static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
-			     struct dentry *entry,
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req,
+			     u64 nodeid, struct qstr *name,
 			     struct fuse_entry_out *outarg)
 {
-	struct fuse_conn *fc = get_fuse_conn(dir);
-
 	memset(outarg, 0, sizeof(struct fuse_entry_out));
 	req->in.h.opcode = FUSE_LOOKUP;
-	req->in.h.nodeid = get_node_id(dir);
+	req->in.h.nodeid = nodeid;
 	req->in.numargs = 1;
-	req->in.args[0].size = entry->d_name.len + 1;
-	req->in.args[0].value = entry->d_name.name;
+	req->in.args[0].size = name->len + 1;
+	req->in.args[0].value = name->name;
 	req->out.numargs = 1;
 	if (fc->minor < 9)
 		req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
@@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		attr_version = fuse_get_attr_version(fc);
 
 		parent = dget_parent(entry);
-		fuse_lookup_init(req, parent->d_inode, entry, &outarg);
+		fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
+				 &entry->d_name, &outarg);
 		request_send(fc, req);
 		dput(parent);
 		err = req->out.h.error;
@@ -225,7 +224,7 @@ static int invalid_nodeid(u64 nodeid)
 	return !nodeid || nodeid == FUSE_ROOT_ID;
 }
 
-static struct dentry_operations fuse_dentry_operations = {
+struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
@@ -239,85 +238,127 @@ int fuse_valid_type(int m)
  * Add a directory inode to a dentry, ensuring that no other dentry
  * refers to this inode.  Called with fc->inst_mutex.
  */
-static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
+static struct dentry *fuse_d_add_directory(struct dentry *entry,
+					   struct inode *inode)
 {
 	struct dentry *alias = d_find_alias(inode);
-	if (alias) {
+	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
 		/* This tries to shrink the subtree below alias */
 		fuse_invalidate_entry(alias);
 		dput(alias);
 		if (!list_empty(&inode->i_dentry))
-			return -EBUSY;
+			return ERR_PTR(-EBUSY);
+	} else {
+		dput(alias);
 	}
-	d_add(entry, inode);
-	return 0;
+	return d_splice_alias(inode, entry);
 }
 
-static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
-				  struct nameidata *nd)
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+		     struct fuse_entry_out *outarg, struct inode **inode)
 {
-	int err;
-	struct fuse_entry_out outarg;
-	struct inode *inode = NULL;
-	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct fuse_req *req;
 	struct fuse_req *forget_req;
 	u64 attr_version;
+	int err;
 
-	if (entry->d_name.len > FUSE_NAME_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
+	*inode = NULL;
+	err = -ENAMETOOLONG;
+	if (name->len > FUSE_NAME_MAX)
+		goto out;
 
 	req = fuse_get_req(fc);
+	err = PTR_ERR(req);
 	if (IS_ERR(req))
-		return ERR_CAST(req);
+		goto out;
 
 	forget_req = fuse_get_req(fc);
+	err = PTR_ERR(forget_req);
 	if (IS_ERR(forget_req)) {
 		fuse_put_request(fc, req);
-		return ERR_CAST(forget_req);
+		goto out;
 	}
 
 	attr_version = fuse_get_attr_version(fc);
 
-	fuse_lookup_init(req, dir, entry, &outarg);
+	fuse_lookup_init(fc, req, nodeid, name, outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	/* Zero nodeid is same as -ENOENT, but with valid timeout */
-	if (!err && outarg.nodeid &&
-	    (invalid_nodeid(outarg.nodeid) ||
-	     !fuse_valid_type(outarg.attr.mode)))
-		err = -EIO;
-	if (!err && outarg.nodeid) {
-		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-				  &outarg.attr, entry_attr_timeout(&outarg),
-				  attr_version);
-		if (!inode) {
-			fuse_send_forget(fc, forget_req, outarg.nodeid, 1);
-			return ERR_PTR(-ENOMEM);
-		}
+	if (err || !outarg->nodeid)
+		goto out_put_forget;
+
+	err = -EIO;
+	if (!outarg->nodeid)
+		goto out_put_forget;
+	if (!fuse_valid_type(outarg->attr.mode))
+		goto out_put_forget;
+
+	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
+			   &outarg->attr, entry_attr_timeout(outarg),
+			   attr_version);
+	err = -ENOMEM;
+	if (!*inode) {
+		fuse_send_forget(fc, forget_req, outarg->nodeid, 1);
+		goto out;
 	}
+	err = 0;
+
+ out_put_forget:
 	fuse_put_request(fc, forget_req);
-	if (err && err != -ENOENT)
-		return ERR_PTR(err);
+ out:
+	return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+				  struct nameidata *nd)
+{
+	int err;
+	struct fuse_entry_out outarg;
+	struct inode *inode;
+	struct dentry *newent;
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	bool outarg_valid = true;
+
+	err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
+			       &outarg, &inode);
+	if (err == -ENOENT) {
+		outarg_valid = false;
+		err = 0;
+	}
+	if (err)
+		goto out_err;
+
+	err = -EIO;
+	if (inode && get_node_id(inode) == FUSE_ROOT_ID)
+		goto out_iput;
 
 	if (inode && S_ISDIR(inode->i_mode)) {
 		mutex_lock(&fc->inst_mutex);
-		err = fuse_d_add_directory(entry, inode);
+		newent = fuse_d_add_directory(entry, inode);
 		mutex_unlock(&fc->inst_mutex);
-		if (err) {
-			iput(inode);
-			return ERR_PTR(err);
-		}
-	} else
-		d_add(entry, inode);
+		err = PTR_ERR(newent);
+		if (IS_ERR(newent))
+			goto out_iput;
+	} else {
+		newent = d_splice_alias(inode, entry);
+	}
 
+	entry = newent ? newent : entry;
 	entry->d_op = &fuse_dentry_operations;
-	if (!err)
+	if (outarg_valid)
 		fuse_change_entry_timeout(entry, &outarg);
 	else
 		fuse_invalidate_entry_cache(entry);
-	return NULL;
+
+	return newent;
+
+ out_iput:
+	iput(inode);
+ out_err:
+	return ERR_PTR(err);
 }
 
 /*
@@ -857,7 +898,7 @@ static int fuse_access(struct inode *inode, int mask)
 		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
-	inarg.mask = mask;
+	inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
 	req->in.h.opcode = FUSE_ACCESS;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.numargs = 1;
@@ -886,7 +927,7 @@ static int fuse_access(struct inode *inode, int mask)
  * access request is sent.  Execute permission is still checked
  * locally based on file mode.
  */
-static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int fuse_permission(struct inode *inode, int mask)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	bool refreshed = false;
@@ -921,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 		   exist.  So if permissions are revoked this won't be
 		   noticed immediately, only after the attribute
 		   timeout has expired */
-	} else if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) {
+	} else if (mask & MAY_ACCESS) {
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8092f0d9fd1f..34930a964b82 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
 		invalidate_inode_pages2(inode->i_mapping);
+	if (outarg->open_flags & FOPEN_NONSEEKABLE)
+		nonseekable_open(inode, file);
 	ff->fh = outarg->fh;
 	file->private_data = fuse_file_get(ff);
 }
@@ -893,7 +895,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (count == 0)
 		goto out;
 
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 
@@ -1341,6 +1343,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
 	int err;
 
+	if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+		/* NLM needs asynchronous locks, which we don't support yet */
+		return -ENOLCK;
+	}
+
 	/* Unlock on close is handled by the flush method */
 	if (fl->fl_flags & FL_CLOSE)
 		return 0;
@@ -1365,7 +1372,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
-	if (cmd == F_GETLK) {
+	if (cmd == F_CANCELLK) {
+		err = 0;
+	} else if (cmd == F_GETLK) {
 		if (fc->no_lock) {
 			posix_test_lock(file, fl);
 			err = 0;
@@ -1373,7 +1382,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 			err = fuse_getlk(file, fl);
 	} else {
 		if (fc->no_lock)
-			err = posix_lock_file_wait(file, fl);
+			err = posix_lock_file(file, fl, NULL);
 		else
 			err = fuse_setlk(file, fl, 0);
 	}
@@ -1441,6 +1450,9 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	mutex_lock(&inode->i_mutex);
 	switch (origin) {
 	case SEEK_END:
+		retval = fuse_update_attributes(inode, NULL, file, NULL);
+		if (retval)
+			return retval;
 		offset += i_size_read(inode);
 		break;
 	case SEEK_CUR:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bae948657c4f..35accfdd747f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -6,6 +6,9 @@
   See the file COPYING.
 */
 
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
 #include <linux/fuse.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -363,6 +366,9 @@ struct fuse_conn {
 	/** Do not send separate SETATTR request before open(O_TRUNC)  */
 	unsigned atomic_o_trunc : 1;
 
+	/** Filesystem supports NFS exporting.  Only set in INIT */
+	unsigned export_support : 1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -464,6 +470,8 @@ static inline u64 get_node_id(struct inode *inode)
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
+extern struct dentry_operations fuse_dentry_operations;
+
 /**
  * Get a filled in inode
  */
@@ -471,6 +479,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
 			u64 attr_valid, u64 attr_version);
 
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+		     struct fuse_entry_out *outarg, struct inode **inode);
+
 /**
  * Send FORGET command
  */
@@ -604,6 +615,8 @@ void fuse_abort_conn(struct fuse_conn *fc);
  */
 void fuse_invalidate_attr(struct inode *inode);
 
+void fuse_invalidate_entry_cache(struct dentry *entry);
+
 /**
  * Acquire reference to fuse_conn
  */
@@ -645,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode);
 void fuse_release_nowrite(struct inode *inode);
 
 u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3141690558c8..2e99f34b4435 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -18,6 +18,7 @@
 #include <linux/statfs.h>
 #include <linux/random.h>
 #include <linux/sched.h>
+#include <linux/exportfs.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -353,7 +354,7 @@ enum {
 	OPT_ERR
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{OPT_FD,			"fd=%u"},
 	{OPT_ROOTMODE,			"rootmode=%o"},
 	{OPT_USER_ID,			"user_id=%u"},
@@ -552,6 +553,167 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 	return fuse_iget(sb, 1, 0, &attr, 0, 0);
 }
 
+struct fuse_inode_handle
+{
+	u64 nodeid;
+	u32 generation;
+};
+
+static struct dentry *fuse_get_dentry(struct super_block *sb,
+				      struct fuse_inode_handle *handle)
+{
+	struct fuse_conn *fc = get_fuse_conn_super(sb);
+	struct inode *inode;
+	struct dentry *entry;
+	int err = -ESTALE;
+
+	if (handle->nodeid == 0)
+		goto out_err;
+
+	inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
+	if (!inode) {
+		struct fuse_entry_out outarg;
+		struct qstr name;
+
+		if (!fc->export_support)
+			goto out_err;
+
+		name.len = 1;
+		name.name = ".";
+		err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
+				       &inode);
+		if (err && err != -ENOENT)
+			goto out_err;
+		if (err || !inode) {
+			err = -ESTALE;
+			goto out_err;
+		}
+		err = -EIO;
+		if (get_node_id(inode) != handle->nodeid)
+			goto out_iput;
+	}
+	err = -ESTALE;
+	if (inode->i_generation != handle->generation)
+		goto out_iput;
+
+	entry = d_obtain_alias(inode);
+	if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID) {
+		entry->d_op = &fuse_dentry_operations;
+		fuse_invalidate_entry_cache(entry);
+	}
+
+	return entry;
+
+ out_iput:
+	iput(inode);
+ out_err:
+	return ERR_PTR(err);
+}
+
+static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+			   int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
+	int len = encode_parent ? 6 : 3;
+	u64 nodeid;
+	u32 generation;
+
+	if (*max_len < len)
+		return  255;
+
+	nodeid = get_fuse_inode(inode)->nodeid;
+	generation = inode->i_generation;
+
+	fh[0] = (u32)(nodeid >> 32);
+	fh[1] = (u32)(nodeid & 0xffffffff);
+	fh[2] = generation;
+
+	if (encode_parent) {
+		struct inode *parent;
+
+		spin_lock(&dentry->d_lock);
+		parent = dentry->d_parent->d_inode;
+		nodeid = get_fuse_inode(parent)->nodeid;
+		generation = parent->i_generation;
+		spin_unlock(&dentry->d_lock);
+
+		fh[3] = (u32)(nodeid >> 32);
+		fh[4] = (u32)(nodeid & 0xffffffff);
+		fh[5] = generation;
+	}
+
+	*max_len = len;
+	return encode_parent ? 0x82 : 0x81;
+}
+
+static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct fuse_inode_handle handle;
+
+	if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+		return NULL;
+
+	handle.nodeid = (u64) fid->raw[0] << 32;
+	handle.nodeid |= (u64) fid->raw[1];
+	handle.generation = fid->raw[2];
+	return fuse_get_dentry(sb, &handle);
+}
+
+static struct dentry *fuse_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	struct fuse_inode_handle parent;
+
+	if (fh_type != 0x82 || fh_len < 6)
+		return NULL;
+
+	parent.nodeid = (u64) fid->raw[3] << 32;
+	parent.nodeid |= (u64) fid->raw[4];
+	parent.generation = fid->raw[5];
+	return fuse_get_dentry(sb, &parent);
+}
+
+static struct dentry *fuse_get_parent(struct dentry *child)
+{
+	struct inode *child_inode = child->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(child_inode);
+	struct inode *inode;
+	struct dentry *parent;
+	struct fuse_entry_out outarg;
+	struct qstr name;
+	int err;
+
+	if (!fc->export_support)
+		return ERR_PTR(-ESTALE);
+
+	name.len = 2;
+	name.name = "..";
+	err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
+			       &name, &outarg, &inode);
+	if (err) {
+		if (err == -ENOENT)
+			return ERR_PTR(-ESTALE);
+		return ERR_PTR(err);
+	}
+
+	parent = d_obtain_alias(inode);
+	if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID) {
+		parent->d_op = &fuse_dentry_operations;
+		fuse_invalidate_entry_cache(parent);
+	}
+
+	return parent;
+}
+
+static const struct export_operations fuse_export_operations = {
+	.fh_to_dentry	= fuse_fh_to_dentry,
+	.fh_to_parent	= fuse_fh_to_parent,
+	.encode_fh	= fuse_encode_fh,
+	.get_parent	= fuse_get_parent,
+};
+
 static const struct super_operations fuse_super_operations = {
 	.alloc_inode    = fuse_alloc_inode,
 	.destroy_inode  = fuse_destroy_inode,
@@ -581,6 +743,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->no_lock = 1;
 			if (arg->flags & FUSE_ATOMIC_O_TRUNC)
 				fc->atomic_o_trunc = 1;
+			if (arg->minor >= 9) {
+				/* LOOKUP has dependency on proto version */
+				if (arg->flags & FUSE_EXPORT_SUPPORT)
+					fc->export_support = 1;
+			}
 			if (arg->flags & FUSE_BIG_WRITES)
 				fc->big_writes = 1;
 		} else {
@@ -607,7 +774,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
 	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
-		FUSE_BIG_WRITES;
+		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -652,6 +819,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_export_op = &fuse_export_operations;
 
 	file = fget(d.fd);
 	if (!file)
@@ -690,7 +858,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (is_bdev) {
 		fc->destroy_req = fuse_request_alloc();
 		if (!fc->destroy_req)
-			goto err_put_root;
+			goto err_free_init_req;
 	}
 
 	mutex_lock(&fuse_mutex);
@@ -720,6 +888,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
  err_unlock:
 	mutex_unlock(&fuse_mutex);
+ err_free_init_req:
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
@@ -781,7 +950,7 @@ static inline void unregister_fuseblk(void)
 }
 #endif
 
-static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void fuse_inode_init_once(void *foo)
 {
 	struct inode * inode = foo;
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 13391e546616..c962283d4e7f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
 	if (time_before(now, holdtime))
 		delay = holdtime - now;
+	if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+		delay = gl->gl_ops->go_min_hold_time;
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, 1, delay);
@@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 		*p++ = 'a';
 	if (flags & GL_EXACT)
 		*p++ = 'E';
-	if (flags & GL_ATIME)
-		*p++ = 'a';
 	if (flags & GL_NOCACHE)
 		*p++ = 'c';
 	if (test_bit(HIF_HOLDER, &iflags))
@@ -1816,15 +1816,17 @@ restart:
 	if (gl) {
 		gi->gl = hlist_entry(gl->gl_list.next,
 				     struct gfs2_glock, gl_list);
-		if (gi->gl)
-			gfs2_glock_hold(gi->gl);
+	} else {
+		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+				     struct gfs2_glock, gl_list);
 	}
+	if (gi->gl)
+		gfs2_glock_hold(gi->gl);
 	read_unlock(gl_lock_addr(gi->hash));
 	if (gl)
 		gfs2_glock_put(gl);
-	if (gl && gi->gl == NULL)
-		gi->hash++;
 	while (gi->gl == NULL) {
+		gi->hash++;
 		if (gi->hash >= GFS2_GL_HASH_SIZE)
 			return 1;
 		read_lock(gl_lock_addr(gi->hash));
@@ -1833,7 +1835,6 @@ restart:
 		if (gi->gl)
 			gfs2_glock_hold(gi->gl);
 		read_unlock(gl_lock_addr(gi->hash));
-		gi->hash++;
 	}
 
 	if (gi->sdp != gi->gl->gl_sbd)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 971d92af70fc..695c6b193611 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,7 +24,6 @@
 #define GL_ASYNC		0x00000040
 #define GL_EXACT		0x00000080
 #define GL_SKIP			0x00000100
-#define GL_ATIME		0x00000200
 #define GL_NOCACHE		0x00000400
 
 #define GLR_TRYFAILED		13
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 448697a5c462..f566ec1b4e8e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -386,20 +386,21 @@ struct gfs2_statfs_change_host {
 #define GFS2_DATA_ORDERED	2
 
 struct gfs2_args {
-	char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
-	char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
-	char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
-	int ar_spectator; /* Don't get a journal because we're always RO */
-	int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
-	int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
-	int ar_localcaching; /* Local-style caching (dangerous on multihost) */
-	int ar_debug; /* Oops on errors instead of trying to be graceful */
-	int ar_upgrade; /* Upgrade ondisk/multihost format */
-	unsigned int ar_num_glockd; /* Number of glockd threads */
-	int ar_posix_acl; /* Enable posix acls */
-	int ar_quota; /* off/account/on */
-	int ar_suiddir; /* suiddir support */
-	int ar_data; /* ordered/writeback */
+	char ar_lockproto[GFS2_LOCKNAME_LEN];	/* Name of the Lock Protocol */
+	char ar_locktable[GFS2_LOCKNAME_LEN];	/* Name of the Lock Table */
+	char ar_hostdata[GFS2_LOCKNAME_LEN];	/* Host specific data */
+	unsigned int ar_spectator:1;		/* Don't get a journal */
+	unsigned int ar_ignore_local_fs:1;	/* Ignore optimisations */
+	unsigned int ar_localflocks:1;		/* Let the VFS do flock|fcntl */
+	unsigned int ar_localcaching:1;		/* Local caching */
+	unsigned int ar_debug:1;		/* Oops on errors */
+	unsigned int ar_upgrade:1;		/* Upgrade ondisk format */
+	unsigned int ar_posix_acl:1;		/* Enable posix acls */
+	unsigned int ar_quota:2;		/* off/account/on */
+	unsigned int ar_suiddir:1;		/* suiddir support */
+	unsigned int ar_data:2;			/* ordered/writeback */
+	unsigned int ar_meta:1;			/* mount metafs */
+	unsigned int ar_num_glockd;		/* Number of glockd threads */
 };
 
 struct gfs2_tune {
@@ -419,7 +420,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_scale_den; /* Denominator */
 	unsigned int gt_quota_cache_secs;
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
-	unsigned int gt_atime_quantum; /* Min secs between atime updates */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
 	unsigned int gt_stall_secs; /* Detects trouble! */
@@ -432,7 +432,7 @@ enum {
 	SDF_JOURNAL_CHECKED	= 0,
 	SDF_JOURNAL_LIVE	= 1,
 	SDF_SHUTDOWN		= 2,
-	SDF_NOATIME		= 3,
+	SDF_NOBARRIERS		= 3,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -461,7 +461,6 @@ struct gfs2_sb_host {
 
 struct gfs2_sbd {
 	struct super_block *sd_vfs;
-	struct super_block *sd_vfs_meta;
 	struct kobject sd_kobj;
 	unsigned long sd_flags;	/* SDF_... */
 	struct gfs2_sb_host sd_sb;
@@ -499,7 +498,9 @@ struct gfs2_sbd {
 
 	/* Inode Stuff */
 
-	struct inode *sd_master_dir;
+	struct dentry *sd_master_dir;
+	struct dentry *sd_root_dir;
+
 	struct inode *sd_jindex;
 	struct inode *sd_inum_inode;
 	struct inode *sd_statfs_inode;
@@ -634,7 +635,6 @@ struct gfs2_sbd {
 	/* Debugging crud */
 
 	unsigned long sd_last_warning;
-	struct vfsmount *sd_gfs2mnt;
 	struct dentry *debugfs_dir;    /* debugfs directory */
 	struct dentry *debugfs_dentry_glocks; /* for debugfs */
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6da0ab355b8a..7cee695fa441 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
 #include <linux/security.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
 	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
+	struct timespec atime;
 	u16 height, depth;
 
 	if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	di->di_size = be64_to_cpu(str->di_size);
 	i_size_write(&ip->i_inode, di->di_size);
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
-	ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
-	ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	atime.tv_sec = be64_to_cpu(str->di_atime);
+	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+	if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+		ip->i_inode.i_atime = atime;
 	ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
 	ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
 	ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -448,7 +452,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
 	struct qstr qstr;
 	struct inode *inode;
 	gfs2_str2qstr(&qstr, name);
-	inode = gfs2_lookupi(dip, &qstr, 1, NULL);
+	inode = gfs2_lookupi(dip, &qstr, 1);
 	/* gfs2_lookupi has inconsistent callers: vfs
 	 * related routines expect NULL for no entry found,
 	 * gfs2_lookup_simple callers expect ENOENT
@@ -477,7 +481,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
  */
 
 struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
-			   int is_root, struct nameidata *nd)
+			   int is_root)
 {
 	struct super_block *sb = dir->i_sb;
 	struct gfs2_inode *dip = GFS2_I(dir);
@@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 
 	if (bh)
 		brelse(bh);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
 	return inode;
 
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
-	if (inode)
+	if (inode && !IS_ERR(inode))
 		iput(inode);
 fail_gunlock:
 	gfs2_glock_dq(ghs);
@@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	return 0;
 }
 
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
-	struct inode *dir = &to->i_inode;
-	struct super_block *sb = dir->i_sb;
-	struct inode *tmp;
-	struct qstr dotdot;
-	int error = 0;
-
-	gfs2_str2qstr(&dotdot, "..");
-
-	igrab(dir);
-
-	for (;;) {
-		if (dir == &this->i_inode) {
-			error = -EINVAL;
-			break;
-		}
-		if (dir == sb->s_root->d_inode) {
-			error = 0;
-			break;
-		}
-
-		tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
-		if (IS_ERR(tmp)) {
-			error = PTR_ERR(tmp);
-			break;
-		}
-
-		iput(dir);
-		dir = tmp;
-	}
-
-	iput(dir);
-
-	return error;
-}
-
 /**
  * gfs2_readlinki - return the contents of a symlink
  * @ip: the symlink's inode
@@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 	unsigned int x;
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
-	error = gfs2_glock_nq_atime(&i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
 		return error;
@@ -1243,101 +1197,6 @@ out:
 	return error;
 }
 
-/**
- * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
- *       conditionally update the inode's atime
- * @gh: the holder to acquire
- *
- * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
- * Update if the difference between the current time and the inode's current
- * atime is greater than an interval specified at mount.
- *
- * Returns: errno
- */
-
-int gfs2_glock_nq_atime(struct gfs2_holder *gh)
-{
-	struct gfs2_glock *gl = gh->gh_gl;
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct gfs2_inode *ip = gl->gl_object;
-	s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
-	unsigned int state;
-	int flags;
-	int error;
-	struct timespec tv = CURRENT_TIME;
-
-	if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
-	    gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
-	    gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
-		return -EINVAL;
-
-	state = gh->gh_state;
-	flags = gh->gh_flags;
-
-	error = gfs2_glock_nq(gh);
-	if (error)
-		return error;
-
-	if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
-	    (sdp->sd_vfs->s_flags & MS_RDONLY))
-		return 0;
-
-	if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-		gfs2_glock_dq(gh);
-		gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
-				   gh);
-		error = gfs2_glock_nq(gh);
-		if (error)
-			return error;
-
-		/* Verify that atime hasn't been updated while we were
-		   trying to get exclusive lock. */
-
-		tv = CURRENT_TIME;
-		if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
-			struct buffer_head *dibh;
-			struct gfs2_dinode *di;
-
-			error = gfs2_trans_begin(sdp, RES_DINODE, 0);
-			if (error == -EROFS)
-				return 0;
-			if (error)
-				goto fail;
-
-			error = gfs2_meta_inode_buffer(ip, &dibh);
-			if (error)
-				goto fail_end_trans;
-
-			ip->i_inode.i_atime = tv;
-
-			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-			di = (struct gfs2_dinode *)dibh->b_data;
-			di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
-			di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
-			brelse(dibh);
-
-			gfs2_trans_end(sdp);
-		}
-
-		/* If someone else has asked for the glock,
-		   unlock and let them have it. Then reacquire
-		   in the original state. */
-		if (gfs2_glock_is_blocking(gl)) {
-			gfs2_glock_dq(gh);
-			gfs2_holder_reinit(state, flags, gh);
-			return gfs2_glock_nq(gh);
-		}
-	}
-
-	return 0;
-
-fail_end_trans:
-	gfs2_trans_end(sdp);
-fail:
-	gfs2_glock_dq(gh);
-	return error;
-}
-
 static int
 __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6074c2506f75..2d43f69610a0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -83,7 +83,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip);
 int gfs2_dinode_dealloc(struct gfs2_inode *inode);
 int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
 struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
-			   int is_root, struct nameidata *nd);
+			   int is_root);
 struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 			   unsigned int mode, dev_t dev);
 int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
@@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 		   const struct gfs2_inode *ip);
 int gfs2_permission(struct inode *inode, int mask);
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
 int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_glock_nq_atime(struct gfs2_holder *gh);
 int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 09d78c216f48..0c4cbe6c8285 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
 
 	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
 				  &ls->dlm_lockspace,
-				  DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0),
+				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+				  (nodir ? DLM_LSFL_NODIR : 0),
 				  GDLM_LVB_SIZE);
 	if (error) {
 		log_error("dlm_new_lockspace error %d", error);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 6c6af9f5e3ab..ad305854bdc6 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/bio.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_uptodate(bh);
 	clear_buffer_dirty(bh);
-	unlock_buffer(bh);
 
 	gfs2_ail1_empty(sdp, 0);
 	tail = current_tail(sdp);
@@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
 	lh->lh_hash = cpu_to_be32(hash);
 
-	set_buffer_dirty(bh);
-	if (sync_dirty_buffer(bh))
+	bh->b_end_io = end_buffer_write_sync;
+	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+		goto skip_barrier;
+	get_bh(bh);
+	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	wait_on_buffer(bh);
+	if (buffer_eopnotsupp(bh)) {
+		clear_buffer_eopnotsupp(bh);
+		set_buffer_uptodate(bh);
+		set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+		lock_buffer(bh);
+skip_barrier:
+		get_bh(bh);
+		submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
+		wait_on_buffer(bh);
+	}
+	if (!buffer_uptodate(bh))
 		gfs2_io_error_bh(sdp, bh);
 	brelse(bh);
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bcc668d0fadd..bb2cc303ac29 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,7 +24,7 @@
 #include "util.h"
 #include "glock.h"
 
-static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_inode_once(void *foo)
 {
 	struct gfs2_inode *ip = foo;
 
@@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
 	ip->i_alloc = NULL;
 }
 
-static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_glock_once(void *foo)
 {
 	struct gfs2_glock *gl = foo;
 
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f958..f96eb90a2cfa 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,10 +42,11 @@ enum {
 	Opt_nosuiddir,
 	Opt_data_writeback,
 	Opt_data_ordered,
+	Opt_meta,
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_lockproto, "lockproto=%s"},
 	{Opt_locktable, "locktable=%s"},
 	{Opt_hostdata, "hostdata=%s"},
@@ -66,6 +67,7 @@ static match_table_t tokens = {
 	{Opt_nosuiddir, "nosuiddir"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_ordered, "data=ordered"},
+	{Opt_meta, "meta"},
 	{Opt_err, NULL}
 };
 
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 		case Opt_data_ordered:
 			args->ar_data = GFS2_DATA_ORDERED;
 			break;
+		case Opt_meta:
+			if (remount && args->ar_meta != 1)
+				goto cant_remount;
+			args->ar_meta = 1;
+			break;
 		case Opt_err:
 		default:
 			fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index e64a1b04117a..27563816e1c5 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page)
 	int error;
 
 	unlock_page(page);
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	error = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (unlikely(error))
 		goto out;
 	error = AOP_TRUNCATED_PAGE;
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
 	struct gfs2_holder gh;
 	int ret;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	ret = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
 	if (unlikely(ret))
 		goto out_uninit;
 	if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned to = from + len;
 	struct page *page;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
-	error = gfs2_glock_nq_atime(&ip->i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
 	if (unlikely(error))
 		goto out_uninit;
 
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
 	if (gfs2_is_stuffed(ip))
 		return 0;
 
-	if (offset > i_size_read(&ip->i_inode))
+	if (offset >= i_size_read(&ip->i_inode))
 		return 0;
 	return 1;
 }
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	 * unfortunately have the option of only flushing a range like
 	 * the VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
-	rv = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+	rv = gfs2_glock_nq(&gh);
 	if (rv)
 		return rv;
 	rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 990d9f4bc463..bbb8c36403a9 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -130,28 +130,17 @@ static int gfs2_get_name(struct dentry *parent, char *name,
 static struct dentry *gfs2_get_parent(struct dentry *child)
 {
 	struct qstr dotdot;
-	struct inode *inode;
 	struct dentry *dentry;
 
-	gfs2_str2qstr(&dotdot, "..");
-	inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
-
-	if (!inode)
-		return ERR_PTR(-ENOENT);
 	/*
-	 * In case of an error, @inode carries the error value, and we
-	 * have to return that as a(n invalid) pointer to dentry.
+	 * XXX(hch): it would be a good idea to keep this around as a
+	 *	     static variable.
 	 */
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
+	gfs2_str2qstr(&dotdot, "..");
 
-	dentry->d_op = &gfs2_dops;
+	dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
 	return dentry;
 }
 
@@ -233,13 +222,9 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	gfs2_glock_dq_uninit(&i_gh);
 
 out_inode:
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	dentry->d_op = &gfs2_dops;
+	dentry = d_obtain_alias(inode);
+	if (!IS_ERR(dentry))
+		dentry->d_op = &gfs2_dops;
 	return dentry;
 
 fail_rgd:
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e9a366d4411c..3a747f8e2188 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
 	u64 offset = file->f_pos;
 	int error;
 
-	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
-	error = gfs2_glock_nq_atime(&d_gh);
+	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	error = gfs2_glock_nq(&d_gh);
 	if (error) {
 		gfs2_holder_uninit(&d_gh);
 		return error;
@@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	int error;
 	u32 fsflags;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-	error = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (error)
 		return error;
 
@@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	struct gfs2_alloc *al;
 	int ret;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
-	ret = gfs2_glock_nq_atime(&gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out;
 
@@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	struct gfs2_holder i_gh;
 	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
-	error = gfs2_glock_nq_atime(&i_gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
 	if (error) {
 		gfs2_holder_uninit(&i_gh);
 		return error;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b4d1d6490633..b117fcf2c4f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
 #define DO 0
 #define UNDO 1
 
+static const u32 gfs2_old_fs_formats[] = {
+        0
+};
+
+static const u32 gfs2_old_multihost_formats[] = {
+        0
+};
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+
+static void gfs2_tune_init(struct gfs2_tune *gt)
+{
+	spin_lock_init(&gt->gt_spin);
+
+	gt->gt_demote_secs = 300;
+	gt->gt_incore_log_blocks = 1024;
+	gt->gt_log_flush_secs = 60;
+	gt->gt_recoverd_secs = 60;
+	gt->gt_logd_secs = 1;
+	gt->gt_quotad_secs = 5;
+	gt->gt_quota_simul_sync = 64;
+	gt->gt_quota_warn_period = 10;
+	gt->gt_quota_scale_num = 1;
+	gt->gt_quota_scale_den = 1;
+	gt->gt_quota_cache_secs = 300;
+	gt->gt_quota_quantum = 60;
+	gt->gt_new_files_jdata = 0;
+	gt->gt_max_readahead = 1 << 18;
+	gt->gt_stall_secs = 600;
+	gt->gt_complain_secs = 10;
+	gt->gt_statfs_quantum = 30;
+	gt->gt_statfs_slow = 0;
+}
+
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp;
@@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	return sdp;
 }
 
-static void init_vfs(struct super_block *sb, unsigned noatime)
+
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 {
-	struct gfs2_sbd *sdp = sb->s_fs_info;
+	unsigned int x;
 
-	sb->s_magic = GFS2_MAGIC;
-	sb->s_op = &gfs2_super_ops;
-	sb->s_export_op = &gfs2_export_ops;
-	sb->s_time_gran = 1;
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	if (sb->sb_magic != GFS2_MAGIC ||
+	    sb->sb_type != GFS2_METATYPE_SB) {
+		if (!silent)
+			printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+		return -EINVAL;
+	}
+
+	/*  If format numbers match exactly, we're done.  */
+
+	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+		return 0;
+
+	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+		for (x = 0; gfs2_old_fs_formats[x]; x++)
+			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+				break;
+
+		if (!gfs2_old_fs_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+		for (x = 0; gfs2_old_multihost_formats[x]; x++)
+			if (gfs2_old_multihost_formats[x] ==
+			    sb->sb_multihost_format)
+				break;
+
+		if (!gfs2_old_multihost_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!sdp->sd_args.ar_upgrade) {
+		printk(KERN_WARNING
+		       "GFS2: code version (%u, %u) is incompatible "
+		       "with ondisk format (%u, %u)\n",
+		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+		       sb->sb_fs_format, sb->sb_multihost_format);
+		printk(KERN_INFO
+		       "GFS2: Use the \"upgrade\" mount option to upgrade "
+		       "the FS\n");
+		printk(KERN_INFO "GFS2: See the manual for more details\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void end_bio_io_page(struct bio *bio, int error)
+{
+	struct page *page = bio->bi_private;
 
-	if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
-		set_bit(noatime, &sdp->sd_flags);
+	if (!error)
+		SetPageUptodate(page);
+	else
+		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+	unlock_page(page);
+}
+
+static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
+{
+	const struct gfs2_sb *str = buf;
+
+	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
+	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+	sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
+	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+
+	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block
+ * @error: The error code to return
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read twice only during each GFS2 mount and is
+ * never written to by the filesystem. The first time its read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * Returns: 0 on success or error
+ */
+
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_sb *p;
+	struct page *page;
+	struct bio *bio;
+
+	page = alloc_page(GFP_NOFS);
+	if (unlikely(!page))
+		return -ENOBUFS;
+
+	ClearPageUptodate(page);
+	ClearPageDirty(page);
+	lock_page(page);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (unlikely(!bio)) {
+		__free_page(page);
+		return -ENOBUFS;
+	}
 
-	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
-	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+	bio->bi_sector = sector * (sb->s_blocksize >> 9);
+	bio->bi_bdev = sb->s_bdev;
+	bio_add_page(bio, page, PAGE_SIZE, 0);
+
+	bio->bi_end_io = end_bio_io_page;
+	bio->bi_private = page;
+	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+	wait_on_page_locked(page);
+	bio_put(bio);
+	if (!PageUptodate(page)) {
+		__free_page(page);
+		return -EIO;
+	}
+	p = kmap(page);
+	gfs2_sb_in(&sdp->sd_sb, p);
+	kunmap(page);
+	__free_page(page);
+	return 0;
+}
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held)
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+	u32 hash_blocks, ind_blocks, leaf_blocks;
+	u32 tmp_blocks;
+	unsigned int x;
+	int error;
+
+	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+	if (error) {
+		if (!silent)
+			fs_err(sdp, "can't read superblock\n");
+		return error;
+	}
+
+	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+	if (error)
+		return error;
+
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+			       GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_dinode)) / sizeof(u64);
+	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header)) /
+			        sizeof(struct gfs2_quota_change);
+
+	/* Compute maximum reservation required to add a entry to a directory */
+
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+			     sdp->sd_jbsize);
+
+	ind_blocks = 0;
+	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+		ind_blocks += tmp_blocks;
+	}
+
+	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_dinode);
+	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_heightsize[x - 1] || m)
+			break;
+		sdp->sd_heightsize[x] = space;
+	}
+	sdp->sd_max_height = x;
+	sdp->sd_heightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+				 sizeof(struct gfs2_dinode);
+	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_jheightsize[x - 1] || m)
+			break;
+		sdp->sd_jheightsize[x] = space;
+	}
+	sdp->sd_max_jheight = x;
+	sdp->sd_jheightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+	return 0;
 }
 
 static int init_names(struct gfs2_sbd *sdp, int silent)
@@ -224,51 +512,59 @@ fail:
 	return error;
 }
 
-static inline struct inode *gfs2_lookup_root(struct super_block *sb,
-					     u64 no_addr)
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
+			    u64 no_addr, const char *name)
 {
-	return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+	if (IS_ERR(inode)) {
+		fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
+		return PTR_ERR(inode);
+	}
+	dentry = d_alloc_root(inode);
+	if (!dentry) {
+		fs_err(sdp, "can't alloc %s dentry\n", name);
+		iput(inode);
+		return -ENOMEM;
+	}
+	dentry->d_op = &gfs2_dops;
+	*dptr = dentry;
+	return 0;
 }
 
-static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+static int init_sb(struct gfs2_sbd *sdp, int silent)
 {
 	struct super_block *sb = sdp->sd_vfs;
 	struct gfs2_holder sb_gh;
 	u64 no_addr;
-	struct inode *inode;
-	int error = 0;
+	int ret;
 
-	if (undo) {
-		if (sb->s_root) {
-			dput(sb->s_root);
-			sb->s_root = NULL;
-		}
-		return 0;
+	ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+				LM_ST_SHARED, 0, &sb_gh);
+	if (ret) {
+		fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
+		return ret;
 	}
 
-	error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
-				 LM_ST_SHARED, 0, &sb_gh);
-	if (error) {
-		fs_err(sdp, "can't acquire superblock glock: %d\n", error);
-		return error;
-	}
-
-	error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
-	if (error) {
-		fs_err(sdp, "can't read superblock: %d\n", error);
+	ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+	if (ret) {
+		fs_err(sdp, "can't read superblock: %d\n", ret);
 		goto out;
 	}
 
 	/* Set up the buffer cache and SB for real */
 	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
-		error = -EINVAL;
+		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too small for device "
 		       "block size (%u)\n",
 		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
 		goto out;
 	}
 	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
-		error = -EINVAL;
+		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too big for machine "
 		       "page size (%u)\n",
 		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
 
 	/* Get the root inode */
 	no_addr = sdp->sd_sb.sb_root_dir.no_addr;
-	if (sb->s_type == &gfs2meta_fs_type)
-		no_addr = sdp->sd_sb.sb_master_dir.no_addr;
-	inode = gfs2_lookup_root(sb, no_addr);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		fs_err(sdp, "can't read in root inode: %d\n", error);
+	ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
+	if (ret)
 		goto out;
-	}
 
-	sb->s_root = d_alloc_root(inode);
-	if (!sb->s_root) {
-		fs_err(sdp, "can't get root dentry\n");
-		error = -ENOMEM;
-		iput(inode);
-	} else
-		sb->s_root->d_op = &gfs2_dops;
-	
+	/* Get the master inode */
+	no_addr = sdp->sd_sb.sb_master_dir.no_addr;
+	ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
+	if (ret) {
+		dput(sdp->sd_root_dir);
+		goto out;
+	}
+	sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
 out:
 	gfs2_glock_dq_uninit(&sb_gh);
-	return error;
+	return ret;
 }
 
 /**
@@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
+	struct inode *master = sdp->sd_master_dir->d_inode;
 	struct gfs2_holder ji_gh;
 	struct task_struct *p;
 	struct gfs2_inode *ip;
@@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		goto fail_recoverd;
 	}
 
-	sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
 	if (IS_ERR(sdp->sd_jindex)) {
 		fs_err(sdp, "can't lookup journal index: %d\n", error);
 		return PTR_ERR(sdp->sd_jindex);
@@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 {
 	int error = 0;
 	struct gfs2_inode *ip;
-	struct inode *inode;
+	struct inode *master = sdp->sd_master_dir->d_inode;
 
 	if (undo)
 		goto fail_qinode;
 
-	inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
-	if (IS_ERR(inode)) {
-		error = PTR_ERR(inode);
-		fs_err(sdp, "can't read in master directory: %d\n", error);
-		goto fail;
-	}
-	sdp->sd_master_dir = inode;
-
 	error = init_journal(sdp, undo);
 	if (error)
-		goto fail_master;
+		goto fail;
 
 	/* Read in the master inode number inode */
-	sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+	sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
 	if (IS_ERR(sdp->sd_inum_inode)) {
 		error = PTR_ERR(sdp->sd_inum_inode);
 		fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 
 
 	/* Read in the master statfs inode */
-	sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+	sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
 	if (IS_ERR(sdp->sd_statfs_inode)) {
 		error = PTR_ERR(sdp->sd_statfs_inode);
 		fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	}
 
 	/* Read in the resource index inode */
-	sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+	sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
 	if (IS_ERR(sdp->sd_rindex)) {
 		error = PTR_ERR(sdp->sd_rindex);
 		fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
-	sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+	sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
 	if (IS_ERR(sdp->sd_quota_inode)) {
 		error = PTR_ERR(sdp->sd_quota_inode);
 		fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -571,8 +855,6 @@ fail_inum:
 	iput(sdp->sd_inum_inode);
 fail_journal:
 	init_journal(sdp, UNDO);
-fail_master:
-	iput(sdp->sd_master_dir);
 fail:
 	return error;
 }
@@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	char buf[30];
 	int error = 0;
 	struct gfs2_inode *ip;
+	struct inode *master = sdp->sd_master_dir->d_inode;
 
 	if (sdp->sd_args.ar_spectator)
 		return 0;
@@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
 	if (undo)
 		goto fail_qc_gh;
 
-	pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+	pn = gfs2_lookup_simple(master, "per_node");
 	if (IS_ERR(pn)) {
 		error = PTR_ERR(pn);
 		fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 	}
 
-	init_vfs(sb, SDF_NOATIME);
+	sb->s_magic = GFS2_MAGIC;
+	sb->s_op = &gfs2_super_ops;
+	sb->s_export_op = &gfs2_export_ops;
+	sb->s_time_gran = 1;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
 
 	/* Set up the buffer cache and fill in some fake block size values
 	   to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 	if (error)
 		goto fail_lm;
 
-	error = init_sb(sdp, silent, DO);
+	error = init_sb(sdp, silent);
 	if (error)
 		goto fail_locking;
 
@@ -869,7 +1156,11 @@ fail_per_node:
 fail_inodes:
 	init_inodes(sdp, UNDO);
 fail_sb:
-	init_sb(sdp, 0, UNDO);
+	if (sdp->sd_root_dir)
+		dput(sdp->sd_root_dir);
+	if (sdp->sd_master_dir)
+		dput(sdp->sd_master_dir);
+	sb->s_root = NULL;
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
@@ -887,151 +1178,63 @@ fail:
 }
 
 static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
-		const char *dev_name, void *data, struct vfsmount *mnt)
+		       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *sb;
-	struct gfs2_sbd *sdp;
-	int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
-	if (error)
-		goto out;
-	sb = mnt->mnt_sb;
-	sdp = sb->s_fs_info;
-	sdp->sd_gfs2mnt = mnt;
-out:
-	return error;
+	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
 }
 
-static int fill_super_meta(struct super_block *sb, struct super_block *new,
-			   void *data, int silent)
+static struct super_block *get_gfs2_sb(const char *dev_name)
 {
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct inode *inode;
-	int error = 0;
-
-	new->s_fs_info = sdp;
-	sdp->sd_vfs_meta = sb;
-
-	init_vfs(new, SDF_NOATIME);
-
-        /* Get the master inode */
-	inode = igrab(sdp->sd_master_dir);
-
-	new->s_root = d_alloc_root(inode);
-	if (!new->s_root) {
-		fs_err(sdp, "can't get root dentry\n");
-		error = -ENOMEM;
-		iput(inode);
-	} else
-		new->s_root->d_op = &gfs2_dops;
-
-	return error;
-}
-
-static int set_bdev_super(struct super_block *s, void *data)
-{
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	return 0;
-}
-
-static int test_bdev_super(struct super_block *s, void *data)
-{
-	return s->s_bdev == data;
-}
-
-static struct super_block* get_gfs2_sb(const char *dev_name)
-{
-	struct kstat stat;
+	struct super_block *sb;
 	struct nameidata nd;
-	struct super_block *sb = NULL, *s;
 	int error;
 
 	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
 	if (error) {
-		printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
-		       dev_name);
-		goto out;
-	}
-	error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
-
-	list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
-		if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
-		    (S_ISDIR(stat.mode) &&
-		     s == nd.path.dentry->d_inode->i_sb)) {
-			sb = s;
-			goto free_nd;
-		}
+		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
+		       dev_name, error);
+		return NULL;
 	}
-
-	printk(KERN_WARNING "GFS2: Unrecognized block device or "
-	       "mount point %s\n", dev_name);
-
-free_nd:
+	sb = nd.path.dentry->d_inode->i_sb;
+	if (sb && (sb->s_type == &gfs2_fs_type))
+		atomic_inc(&sb->s_active);
+	else
+		sb = NULL;
 	path_put(&nd.path);
-out:
 	return sb;
 }
 
 static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	int error = 0;
-	struct super_block *sb = NULL, *new;
+	struct super_block *sb = NULL;
 	struct gfs2_sbd *sdp;
 
 	sb = get_gfs2_sb(dev_name);
 	if (!sb) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		error = -ENOENT;
-		goto error;
+		return -ENOENT;
 	}
 	sdp = sb->s_fs_info;
-	if (sdp->sd_vfs_meta) {
-		printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
-		error = -EBUSY;
-		goto error;
-	}
-	down(&sb->s_bdev->bd_mount_sem);
-	new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
-	up(&sb->s_bdev->bd_mount_sem);
-	if (IS_ERR(new)) {
-		error = PTR_ERR(new);
-		goto error;
-	}
-	new->s_flags = flags;
-	strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
-	sb_set_blocksize(new, sb->s_blocksize);
-	error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&new->s_umount);
-		deactivate_super(new);
-		goto error;
-	}
-
-	new->s_flags |= MS_ACTIVE;
-
-	/* Grab a reference to the gfs2 mount point */
-	atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
-	return simple_set_mnt(mnt, new);
-error:
-	return error;
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sdp->sd_master_dir);
+	return 0;
 }
 
 static void gfs2_kill_sb(struct super_block *sb)
 {
-	if (sb->s_fs_info) {
-		gfs2_delete_debugfs_file(sb->s_fs_info);
-		gfs2_meta_syncfs(sb->s_fs_info);
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	if (sdp) {
+		gfs2_meta_syncfs(sdp);
+		dput(sdp->sd_root_dir);
+		dput(sdp->sd_master_dir);
+		sdp->sd_root_dir = NULL;
+		sdp->sd_master_dir = NULL;
 	}
+	shrink_dcache_sb(sb);
 	kill_block_super(sb);
-}
-
-static void gfs2_kill_sb_meta(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	generic_shutdown_super(sb);
-	sdp->sd_vfs_meta = NULL;
-	atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+	if (sdp)
+		gfs2_delete_debugfs_file(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
 	.name = "gfs2meta",
 	.fs_flags = FS_REQUIRES_DEV,
 	.get_sb = gfs2_get_sb_meta,
-	.kill_sb = gfs2_kill_sb_meta,
 	.owner = THIS_MODULE,
 };
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1e252dfc5294..d232991b9046 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,12 +69,12 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 			mark_inode_dirty(inode);
 			break;
 		} else if (PTR_ERR(inode) != -EEXIST ||
-			   (nd && (nd->intent.open.flags & O_EXCL))) {
+			   (nd && nd->flags & LOOKUP_EXCL)) {
 			gfs2_holder_uninit(ghs);
 			return PTR_ERR(inode);
 		}
 
-		inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+		inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 		if (inode) {
 			if (!IS_ERR(inode)) {
 				gfs2_holder_uninit(ghs);
@@ -109,7 +109,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
 
 	dentry->d_op = &gfs2_dops;
 
-	inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 	if (inode && IS_ERR(inode))
 		return ERR_CAST(inode);
 
@@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
 
-	error = gfs2_glock_nq_m(2, ghs);
+	error = gfs2_glock_nq(ghs); /* parent */
 	if (error)
-		goto out;
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
 
 	error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
@@ -245,8 +249,10 @@ out_alloc:
 	if (alloc_required)
 		gfs2_alloc_put(dip);
 out_gunlock:
-	gfs2_glock_dq_m(2, ghs);
-out:
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
 	gfs2_holder_uninit(ghs);
 	gfs2_holder_uninit(ghs + 1);
 	if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
 	if (error)
-		goto out_rgrp;
+		goto out_gunlock;
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 out_end_trans:
 	gfs2_trans_end(sdp);
+out_gunlock:
 	gfs2_glock_dq(ghs + 2);
 out_rgrp:
 	gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	struct gfs2_holder ri_gh;
 	int error;
 
-
 	error = gfs2_rindex_hold(sdp, &ri_gh);
 	if (error)
 		return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
 	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
 
-	error = gfs2_glock_nq_m(3, ghs);
+	error = gfs2_glock_nq(ghs); /* parent */
 	if (error)
-		goto out;
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
 
 	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
 	if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	gfs2_trans_end(sdp);
 
 out_gunlock:
-	gfs2_glock_dq_m(3, ghs);
-out:
-	gfs2_holder_uninit(ghs);
-	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
 	gfs2_holder_uninit(ghs + 2);
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
 	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	return 0;
 }
 
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *tmp;
+	struct qstr dotdot;
+	int error = 0;
+
+	gfs2_str2qstr(&dotdot, "..");
+
+	igrab(dir);
+
+	for (;;) {
+		if (dir == &this->i_inode) {
+			error = -EINVAL;
+			break;
+		}
+		if (dir == sb->s_root->d_inode) {
+			error = 0;
+			break;
+		}
+
+		tmp = gfs2_lookupi(dir, &dotdot, 1);
+		if (IS_ERR(tmp)) {
+			error = PTR_ERR(tmp);
+			break;
+		}
+
+		iput(dir);
+		dir = tmp;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
 /**
  * gfs2_rename - Rename a file
  * @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
 	struct gfs2_inode *nip = NULL;
 	struct gfs2_sbd *sdp = GFS2_SB(odir);
-	struct gfs2_holder ghs[5], r_gh;
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
-	/* Make sure we aren't trying to move a dirctory into it's subdir */
-
-	if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
-		dir_rename = 1;
 
-		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0,
-					   &r_gh);
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
 		if (error)
 			goto out;
 
-		error = gfs2_ok_to_move(ip, ndip);
-		if (error)
-			goto out_gunlock_r;
+		if (S_ISDIR(ip->i_inode.i_mode)) {
+			dir_rename = 1;
+			/* don't move a dirctory into it's subdir */
+			error = gfs2_ok_to_move(ip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
 	}
 
 	num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
 	}
 
-	error = gfs2_glock_nq_m(num_gh, ghs);
-	if (error)
-		goto out_uninit;
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
 
 	/* Check out the old directory */
 
@@ -804,12 +873,12 @@ out_alloc:
 	if (alloc_required)
 		gfs2_alloc_put(ndip);
 out_gunlock:
-	gfs2_glock_dq_m(num_gh, ghs);
-out_uninit:
-	for (x = 0; x < num_gh; x++)
+	while (x--) {
+		gfs2_glock_dq(ghs + x);
 		gfs2_holder_uninit(ghs + x);
+	}
 out_gunlock_r:
-	if (dir_rename)
+	if (r_gh.gh_gl)
 		gfs2_glock_dq_uninit(&r_gh);
 out:
 	return error;
@@ -915,12 +984,6 @@ int gfs2_permission(struct inode *inode, int mask)
 	return error;
 }
 
-static int gfs2_iop_permission(struct inode *inode, int mask,
-			       struct nameidata *nd)
-{
-	return gfs2_permission(inode, mask);
-}
-
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1150,7 +1213,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 }
 
 const struct inode_operations gfs2_file_iops = {
-	.permission = gfs2_iop_permission,
+	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
 	.setxattr = gfs2_setxattr,
@@ -1169,7 +1232,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.rmdir = gfs2_rmdir,
 	.mknod = gfs2_mknod,
 	.rename = gfs2_rename,
-	.permission = gfs2_iop_permission,
+	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
 	.setxattr = gfs2_setxattr,
@@ -1181,7 +1244,7 @@ const struct inode_operations gfs2_dir_iops = {
 const struct inode_operations gfs2_symlink_iops = {
 	.readlink = gfs2_readlink,
 	.follow_link = gfs2_follow_link,
-	.permission = gfs2_iop_permission,
+	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
 	.setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index f66ea0f7a356..d5355d9b5926 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -38,6 +39,7 @@
 #include "dir.h"
 #include "eattr.h"
 #include "bmap.h"
+#include "meta_io.h"
 
 /**
  * gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
 static int gfs2_write_inode(struct inode *inode, int sync)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-
-	/* Check this is a "normal" inode */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		if (current->flags & PF_MEMALLOC)
-			return 0;
-		if (sync)
-			gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	struct buffer_head *bh;
+	struct timespec atime;
+	struct gfs2_dinode *di;
+	int ret = 0;
+
+	/* Check this is a "normal" inode, etc */
+	if (!test_bit(GIF_USER, &ip->i_flags) ||
+	    (current->flags & PF_MEMALLOC))
+		return 0;
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (ret)
+		goto do_flush;
+	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (ret)
+		goto do_unlock;
+	ret = gfs2_meta_inode_buffer(ip, &bh);
+	if (ret == 0) {
+		di = (struct gfs2_dinode *)bh->b_data;
+		atime.tv_sec = be64_to_cpu(di->di_atime);
+		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+		if (timespec_compare(&inode->i_atime, &atime) > 0) {
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_dinode_out(ip, bh->b_data);
+		}
+		brelse(bh);
 	}
+	gfs2_trans_end(sdp);
+do_unlock:
+	gfs2_glock_dq_uninit(&gh);
+do_flush:
+	if (sync != 0)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	return ret;
+}
 
-	return 0;
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder t_gh;
+	int error;
+
+	gfs2_quota_sync(sdp);
+	gfs2_statfs_sync(sdp);
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &t_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	gfs2_meta_syncfs(sdp);
+	gfs2_log_shutdown(sdp);
+
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	if (t_gh.gh_gl)
+		gfs2_glock_dq_uninit(&t_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
 }
 
 /**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
-	if (!sdp)
-		return;
-
-	if (!strncmp(sb->s_type->name, "gfs2meta", 8))
-		return; /* Nothing to do */
-
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
 
 	/*  Release stuff  */
 
-	iput(sdp->sd_master_dir);
 	iput(sdp->sd_jindex);
 	iput(sdp->sd_inum_inode);
 	iput(sdp->sd_statfs_inode);
@@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb)
  *
  * Flushes the log to disk.
  */
+
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
 	sb->s_dirt = 0;
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		}
 	}
 
-	if (*flags & (MS_NOATIME | MS_NODIRATIME))
-		set_bit(SDF_NOATIME, &sdp->sd_flags);
-	else
-		clear_bit(SDF_NOATIME, &sdp->sd_flags);
-
-	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
-	*flags |= MS_NOATIME | MS_NODIRATIME;
-
 	return error;
 }
 
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  * inode's blocks, or alternatively pass the baton on to another
  * node for later deallocation.
  */
+
 static void gfs2_drop_inode(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
 	}
 }
 
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+	do {
+		if (d1 == d2)
+			return 1;
+		d1 = d1->d_parent;
+	} while (!IS_ROOT(d1));
+	return 0;
+}
+
 /**
  * gfs2_show_options - Show mount options for /proc/mounts
  * @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
 
+	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+		seq_printf(s, ",meta");
 	if (args->ar_lockproto[0])
 		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
 	if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
  * conversion on the iopen lock, but we can change that later. This
  * is safe, just less efficient.
  */
+
 static void gfs2_delete_inode(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
 	clear_inode(inode);
 }
 
-
-
 static struct inode *gfs2_alloc_inode(struct super_block *sb)
 {
 	struct gfs2_inode *ip;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 63a8a902d9db..c3ba3d9d0aac 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,313 +33,6 @@
 #include "trans.h"
 #include "util.h"
 
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
-/**
- * gfs2_tune_init - Fill a gfs2_tune structure with default values
- * @gt: tune
- *
- */
-
-void gfs2_tune_init(struct gfs2_tune *gt)
-{
-	spin_lock_init(&gt->gt_spin);
-
-	gt->gt_demote_secs = 300;
-	gt->gt_incore_log_blocks = 1024;
-	gt->gt_log_flush_secs = 60;
-	gt->gt_recoverd_secs = 60;
-	gt->gt_logd_secs = 1;
-	gt->gt_quotad_secs = 5;
-	gt->gt_quota_simul_sync = 64;
-	gt->gt_quota_warn_period = 10;
-	gt->gt_quota_scale_num = 1;
-	gt->gt_quota_scale_den = 1;
-	gt->gt_quota_cache_secs = 300;
-	gt->gt_quota_quantum = 60;
-	gt->gt_atime_quantum = 3600;
-	gt->gt_new_files_jdata = 0;
-	gt->gt_max_readahead = 1 << 18;
-	gt->gt_stall_secs = 600;
-	gt->gt_complain_secs = 10;
-	gt->gt_statfs_quantum = 30;
-	gt->gt_statfs_slow = 0;
-}
-
-/**
- * gfs2_check_sb - Check superblock
- * @sdp: the filesystem
- * @sb: The superblock
- * @silent: Don't print a message if the check fails
- *
- * Checks the version code of the FS is one that we understand how to
- * read and that the sizes of the various on-disk structures have not
- * changed.
- */
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
-{
-	unsigned int x;
-
-	if (sb->sb_magic != GFS2_MAGIC ||
-	    sb->sb_type != GFS2_METATYPE_SB) {
-		if (!silent)
-			printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
-		return -EINVAL;
-	}
-
-	/*  If format numbers match exactly, we're done.  */
-
-	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
-	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
-		return 0;
-
-	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-		for (x = 0; gfs2_old_fs_formats[x]; x++)
-			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-				break;
-
-		if (!gfs2_old_fs_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-		for (x = 0; gfs2_old_multihost_formats[x]; x++)
-			if (gfs2_old_multihost_formats[x] ==
-			    sb->sb_multihost_format)
-				break;
-
-		if (!gfs2_old_multihost_formats[x]) {
-			printk(KERN_WARNING
-			       "GFS2: code version (%u, %u) is incompatible "
-			       "with ondisk format (%u, %u)\n",
-			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-			       sb->sb_fs_format, sb->sb_multihost_format);
-			printk(KERN_WARNING
-			       "GFS2: I don't know how to upgrade this FS\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!sdp->sd_args.ar_upgrade) {
-		printk(KERN_WARNING
-		       "GFS2: code version (%u, %u) is incompatible "
-		       "with ondisk format (%u, %u)\n",
-		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-		       sb->sb_fs_format, sb->sb_multihost_format);
-		printk(KERN_INFO
-		       "GFS2: Use the \"upgrade\" mount option to upgrade "
-		       "the FS\n");
-		printk(KERN_INFO "GFS2: See the manual for more details\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-
-static void end_bio_io_page(struct bio *bio, int error)
-{
-	struct page *page = bio->bi_private;
-
-	if (!error)
-		SetPageUptodate(page);
-	else
-		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
-	unlock_page(page);
-}
-
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
-{
-	const struct gfs2_sb *str = buf;
-
-	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
-	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
-	sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
-	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
-	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
-	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
-	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
-	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
-	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
-	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
-	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
-
-	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
-	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-}
-
-/**
- * gfs2_read_super - Read the gfs2 super block from disk
- * @sdp: The GFS2 super block
- * @sector: The location of the super block
- * @error: The error code to return
- *
- * This uses the bio functions to read the super block from disk
- * because we want to be 100% sure that we never read cached data.
- * A super block is read twice only during each GFS2 mount and is
- * never written to by the filesystem. The first time its read no
- * locks are held, and the only details which are looked at are those
- * relating to the locking protocol. Once locking is up and working,
- * the sb is read again under the lock to establish the location of
- * the master directory (contains pointers to journals etc) and the
- * root directory.
- *
- * Returns: 0 on success or error
- */
-
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
-{
-	struct super_block *sb = sdp->sd_vfs;
-	struct gfs2_sb *p;
-	struct page *page;
-	struct bio *bio;
-
-	page = alloc_page(GFP_NOFS);
-	if (unlikely(!page))
-		return -ENOBUFS;
-
-	ClearPageUptodate(page);
-	ClearPageDirty(page);
-	lock_page(page);
-
-	bio = bio_alloc(GFP_NOFS, 1);
-	if (unlikely(!bio)) {
-		__free_page(page);
-		return -ENOBUFS;
-	}
-
-	bio->bi_sector = sector * (sb->s_blocksize >> 9);
-	bio->bi_bdev = sb->s_bdev;
-	bio_add_page(bio, page, PAGE_SIZE, 0);
-
-	bio->bi_end_io = end_bio_io_page;
-	bio->bi_private = page;
-	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
-	wait_on_page_locked(page);
-	bio_put(bio);
-	if (!PageUptodate(page)) {
-		__free_page(page);
-		return -EIO;
-	}
-	p = kmap(page);
-	gfs2_sb_in(&sdp->sd_sb, p);
-	kunmap(page);
-	__free_page(page);
-	return 0;
-}
-
-/**
- * gfs2_read_sb - Read super block
- * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
- * @silent: Don't print message if mount fails
- *
- */
-
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
-{
-	u32 hash_blocks, ind_blocks, leaf_blocks;
-	u32 tmp_blocks;
-	unsigned int x;
-	int error;
-
-	error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
-	if (error) {
-		if (!silent)
-			fs_err(sdp, "can't read superblock\n");
-		return error;
-	}
-
-	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
-	if (error)
-		return error;
-
-	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
-			       GFS2_BASIC_BLOCK_SHIFT;
-	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
-	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
-			  sizeof(struct gfs2_dinode)) / sizeof(u64);
-	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
-			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
-	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
-	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
-	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
-	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
-	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
-				sizeof(struct gfs2_meta_header)) /
-			        sizeof(struct gfs2_quota_change);
-
-	/* Compute maximum reservation required to add a entry to a directory */
-
-	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
-			     sdp->sd_jbsize);
-
-	ind_blocks = 0;
-	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
-		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
-		ind_blocks += tmp_blocks;
-	}
-
-	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
-
-	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
-
-	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
-				sizeof(struct gfs2_dinode);
-	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
-	for (x = 2;; x++) {
-		u64 space, d;
-		u32 m;
-
-		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
-		d = space;
-		m = do_div(d, sdp->sd_inptrs);
-
-		if (d != sdp->sd_heightsize[x - 1] || m)
-			break;
-		sdp->sd_heightsize[x] = space;
-	}
-	sdp->sd_max_height = x;
-	sdp->sd_heightsize[x] = ~0;
-	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
-
-	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
-				 sizeof(struct gfs2_dinode);
-	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
-	for (x = 2;; x++) {
-		u64 space, d;
-		u32 m;
-
-		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
-		d = space;
-		m = do_div(d, sdp->sd_inptrs);
-
-		if (d != sdp->sd_jheightsize[x - 1] || m)
-			break;
-		sdp->sd_jheightsize[x] = space;
-	}
-	sdp->sd_max_jheight = x;
-	sdp->sd_jheightsize[x] = ~0;
-	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
-	return 0;
-}
-
 /**
  * gfs2_jindex_hold - Grab a lock on the jindex
  * @sdp: The GFS2 superblock
@@ -389,7 +82,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
-		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
+		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
 				error = -ENOENT;
@@ -581,39 +274,6 @@ fail:
 	return error;
 }
 
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-	struct gfs2_holder t_gh;
-	int error;
-
-	gfs2_quota_sync(sdp);
-	gfs2_statfs_sync(sdp);
-
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-				   &t_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return error;
-
-	gfs2_meta_syncfs(sdp);
-	gfs2_log_shutdown(sdp);
-
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
-	if (t_gh.gh_gl)
-		gfs2_glock_dq_uninit(&t_gh);
-
-	gfs2_quota_cleanup(sdp);
-
-	return error;
-}
-
 static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
 {
 	const struct gfs2_statfs_change *str = buf;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f7..50a4c9b1215e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
 
 #include "incore.h"
 
-void gfs2_tune_init(struct gfs2_tune *gt);
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 			      struct gfs2_inode **ipp);
 
 int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 
 int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 74846559fc3f..7e1879f1a02c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -269,14 +269,6 @@ ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
 ARGS_ATTR(data,            "%d\n");
 
-/* one oddball doesn't fit the macro mold */
-static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%d\n",
-			!!test_bit(SDF_NOATIME, &sdp->sd_flags));
-}
-static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
-
 static struct attribute *args_attrs[] = {
 	&args_attr_lockproto.attr,
 	&args_attr_locktable.attr,
@@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = {
 	&args_attr_quota.attr,
 	&args_attr_suiddir.attr,
 	&args_attr_data.attr,
-	&args_attr_noatime.attr,
 	NULL,
 };
 
@@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
 TUNE_ATTR(quota_quantum, 0);
-TUNE_ATTR(atime_quantum, 0);
 TUNE_ATTR(max_readahead, 0);
 TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
@@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
 	&tune_attr_quota_quantum.attr,
-	&tune_attr_atime_quantum.attr,
 	&tune_attr_max_readahead.attr,
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index 24e75798ddf0..c6e97366e8ac 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
 	if (!*num_bits)
 		return 0;
 
-	down(&HFS_SB(sb)->bitmap_lock);
+	mutex_lock(&HFS_SB(sb)->bitmap_lock);
 	bitmap = HFS_SB(sb)->bitmap;
 
 	pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits);
@@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
 	HFS_SB(sb)->free_ablocks -= *num_bits;
 	hfs_bitmap_dirty(sb);
 out:
-	up(&HFS_SB(sb)->bitmap_lock);
+	mutex_unlock(&HFS_SB(sb)->bitmap_lock);
 	return pos;
 }
 
@@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
 	if ((start + count) > HFS_SB(sb)->fs_ablocks)
 		return -2;
 
-	down(&HFS_SB(sb)->bitmap_lock);
+	mutex_lock(&HFS_SB(sb)->bitmap_lock);
 	/* bitmap is always on a 32-bit boundary */
 	curr = HFS_SB(sb)->bitmap + (start / 32);
 	len = count;
@@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
 	}
 out:
 	HFS_SB(sb)->free_ablocks += len;
-	up(&HFS_SB(sb)->bitmap_lock);
+	mutex_unlock(&HFS_SB(sb)->bitmap_lock);
 	hfs_bitmap_dirty(sb);
 
 	return 0;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index f6621a785202..9b9d6395bad3 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	{
 	struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
 	HFS_I(tree->inode)->flags = 0;
-	init_MUTEX(&HFS_I(tree->inode)->extents_lock);
+	mutex_init(&HFS_I(tree->inode)->extents_lock);
 	switch (id) {
 	case HFS_EXT_CNID:
 		hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index ba851576ebb1..6d98f116ca03 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
 
 	fd->search_key->cat.ParID = rec.thread.ParID;
 	len = fd->search_key->cat.CName.len = rec.thread.CName.len;
+	if (len > HFS_NAMELEN) {
+		printk(KERN_ERR "hfs: bad catalog namelength\n");
+		return -EIO;
+	}
 	memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len);
 	return hfs_brec_find(fd);
 }
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index c176f67ba0a5..2c16316d2917 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block,
 		goto done;
 	}
 
-	down(&HFS_I(inode)->extents_lock);
+	mutex_lock(&HFS_I(inode)->extents_lock);
 	res = hfs_ext_read_extent(inode, ablock);
 	if (!res)
 		dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents,
 					    ablock - HFS_I(inode)->cached_start);
 	else {
-		up(&HFS_I(inode)->extents_lock);
+		mutex_unlock(&HFS_I(inode)->extents_lock);
 		return -EIO;
 	}
-	up(&HFS_I(inode)->extents_lock);
+	mutex_unlock(&HFS_I(inode)->extents_lock);
 
 done:
 	map_bh(bh_result, sb, HFS_SB(sb)->fs_start +
@@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode)
 	u32 start, len, goal;
 	int res;
 
-	down(&HFS_I(inode)->extents_lock);
+	mutex_lock(&HFS_I(inode)->extents_lock);
 	if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks)
 		goal = hfs_ext_lastblock(HFS_I(inode)->first_extents);
 	else {
@@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode)
 			goto insert_extent;
 	}
 out:
-	up(&HFS_I(inode)->extents_lock);
+	mutex_unlock(&HFS_I(inode)->extents_lock);
 	if (!res) {
 		HFS_I(inode)->alloc_blocks += len;
 		mark_inode_dirty(inode);
@@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode)
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
-	down(&HFS_I(inode)->extents_lock);
+	mutex_lock(&HFS_I(inode)->extents_lock);
 	hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
 	while (1) {
 		if (alloc_cnt == HFS_I(inode)->first_blocks) {
@@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode)
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
-	up(&HFS_I(inode)->extents_lock);
+	mutex_unlock(&HFS_I(inode)->extents_lock);
 
 	HFS_I(inode)->alloc_blocks = blk_cnt;
 out:
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 147374b6f675..9955232fdf8c 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -11,6 +11,7 @@
 
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/mutex.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 
@@ -53,7 +54,7 @@ struct hfs_inode_info {
 	struct list_head open_dir_list;
 	struct inode *rsrc_inode;
 
-	struct semaphore extents_lock;
+	struct mutex extents_lock;
 
 	u16 alloc_blocks, clump_blocks;
 	sector_t fs_blocks;
@@ -139,7 +140,7 @@ struct hfs_sb_info {
 
 	struct nls_table *nls_io, *nls_disk;
 
-	struct semaphore bitmap_lock;
+	struct mutex bitmap_lock;
 
 	unsigned long flags;
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 97f8446c4ff4..c69b7ac75bf7 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
 	if (!inode)
 		return NULL;
 
-	init_MUTEX(&HFS_I(inode)->extents_lock);
+	mutex_init(&HFS_I(inode)->extents_lock);
 	INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
 	hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
 	inode->i_ino = HFS_SB(sb)->next_id++;
@@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
 
 	HFS_I(inode)->flags = 0;
 	HFS_I(inode)->rsrc_inode = NULL;
-	init_MUTEX(&HFS_I(inode)->extents_lock);
+	mutex_init(&HFS_I(inode)->extents_lock);
 	INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
 
 	/* Initialize the inode */
@@ -511,20 +511,10 @@ void hfs_clear_inode(struct inode *inode)
 	}
 }
 
-static int hfs_permission(struct inode *inode, int mask,
-			  struct nameidata *nd)
-{
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
 static int hfs_file_open(struct inode *inode, struct file *file)
 {
 	if (HFS_IS_RSRC(inode))
 		inode = HFS_I(inode)->rsrc_inode;
-	if (atomic_read(&file->f_count) != 1)
-		return 0;
 	atomic_inc(&HFS_I(inode)->opencnt);
 	return 0;
 }
@@ -535,8 +525,6 @@ static int hfs_file_release(struct inode *inode, struct file *file)
 
 	if (HFS_IS_RSRC(inode))
 		inode = HFS_I(inode)->rsrc_inode;
-	if (atomic_read(&file->f_count) != 0)
-		return 0;
 	if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
 		mutex_lock(&inode->i_mutex);
 		hfs_file_truncate(inode);
@@ -621,7 +609,6 @@ static const struct inode_operations hfs_file_inode_operations = {
 	.lookup		= hfs_file_lookup,
 	.truncate	= hfs_file_truncate,
 	.setattr	= hfs_inode_setattr,
-	.permission	= hfs_permission,
 	.setxattr	= hfs_setxattr,
 	.getxattr	= hfs_getxattr,
 	.listxattr	= hfs_listxattr,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 8cf67974adf6..3c7c7637719c 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -173,7 +173,7 @@ enum {
 	opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{ opt_uid, "uid=%u" },
 	{ opt_gid, "gid=%u" },
 	{ opt_umask, "umask=%o" },
@@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_op = &hfs_super_operations;
 	sb->s_flags |= MS_NODIRATIME;
-	init_MUTEX(&sbi->bitmap_lock);
+	mutex_init(&sbi->bitmap_lock);
 
 	res = hfs_mdb_get(sb);
 	if (res) {
@@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfs_init_once(struct kmem_cache *cachep, void *p)
+static void hfs_init_once(void *p)
 {
 	struct hfs_inode_info *i = p;
 
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d128a25b74d2..ea30afc2a03c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
+	if (IS_ERR(page)) {
+		start = size;
+		goto out;
+	}
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	i = offset % 32;
@@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 			break;
 		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
 					 NULL);
+		if (IS_ERR(page)) {
+			start = size;
+			goto out;
+		}
 		curr = pptr = kmap(page);
 		if ((size ^ offset) / PAGE_CACHE_BITS)
 			end = pptr + PAGE_CACHE_BITS / 32;
@@ -120,6 +128,10 @@ found:
 		offset += PAGE_CACHE_BITS;
 		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
 					 NULL);
+		if (IS_ERR(page)) {
+			start = size;
+			goto out;
+		}
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index ba117c445e78..f6874acb2cf2 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
 		return -EIO;
 	}
 
+	if (be16_to_cpu(tmp.thread.nodeName.length) > 255) {
+		printk(KERN_ERR "hfs: catalog name length corrupted\n");
+		return -EIO;
+	}
+
 	hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID),
 				 &tmp.thread.nodeName);
 	return hfs_brec_find(fd);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 12e899cd7886..0022eec63cda 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -199,16 +199,19 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		goto done;
 	}
 
-	down(&HFSPLUS_I(inode).extents_lock);
+	if (inode->i_ino == HFSPLUS_EXT_CNID)
+		return -EIO;
+
+	mutex_lock(&HFSPLUS_I(inode).extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
 		dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
 					     HFSPLUS_I(inode).cached_start);
 	} else {
-		up(&HFSPLUS_I(inode).extents_lock);
+		mutex_unlock(&HFSPLUS_I(inode).extents_lock);
 		return -EIO;
 	}
-	up(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
 
 done:
 	dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
@@ -355,7 +358,7 @@ int hfsplus_file_extend(struct inode *inode)
 		return -ENOSPC;
 	}
 
-	down(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&HFSPLUS_I(inode).extents_lock);
 	if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
 		goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
 	else {
@@ -408,7 +411,7 @@ int hfsplus_file_extend(struct inode *inode)
 			goto insert_extent;
 	}
 out:
-	up(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
 	if (!res) {
 		HFSPLUS_I(inode).alloc_blocks += len;
 		mark_inode_dirty(inode);
@@ -465,7 +468,7 @@ void hfsplus_file_truncate(struct inode *inode)
 	if (blk_cnt == alloc_cnt)
 		goto out;
 
-	down(&HFSPLUS_I(inode).extents_lock);
+	mutex_lock(&HFSPLUS_I(inode).extents_lock);
 	hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
 	while (1) {
 		if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
@@ -492,7 +495,7 @@ void hfsplus_file_truncate(struct inode *inode)
 		hfs_brec_remove(&fd);
 	}
 	hfs_find_exit(&fd);
-	up(&HFSPLUS_I(inode).extents_lock);
+	mutex_unlock(&HFSPLUS_I(inode).extents_lock);
 
 	HFSPLUS_I(inode).alloc_blocks = blk_cnt;
 out:
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 9e59537b43d5..f027a905225f 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -11,6 +11,7 @@
 #define _LINUX_HFSPLUS_FS_H
 
 #include <linux/fs.h>
+#include <linux/mutex.h>
 #include <linux/buffer_head.h>
 #include "hfsplus_raw.h"
 
@@ -154,7 +155,7 @@ struct hfsplus_sb_info {
 
 
 struct hfsplus_inode_info {
-	struct semaphore extents_lock;
+	struct mutex extents_lock;
 	u32 clump_blocks, alloc_blocks;
 	sector_t fs_blocks;
 	/* Allocation extents from catalog record or volume header */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 67e1c8b467c4..b207f0e6fc22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
 
 	inode->i_ino = dir->i_ino;
 	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+	mutex_init(&HFSPLUS_I(inode).extents_lock);
 	HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
 
 	hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
@@ -238,24 +238,12 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
 	perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
 }
 
-static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	/* MAY_EXEC is also used for lookup, if no x bit is set allow lookup,
-	 * open_exec has the same test, so it's still not executable, if a x bit
-	 * is set fall back to standard permission check.
-	 */
-	if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111))
-		return 0;
-	return generic_permission(inode, mask, NULL);
-}
-
-
 static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
 		inode = HFSPLUS_I(inode).rsrc_inode;
-	if (atomic_read(&file->f_count) != 1)
-		return 0;
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EOVERFLOW;
 	atomic_inc(&HFSPLUS_I(inode).opencnt);
 	return 0;
 }
@@ -266,8 +254,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 
 	if (HFSPLUS_IS_RSRC(inode))
 		inode = HFSPLUS_I(inode).rsrc_inode;
-	if (atomic_read(&file->f_count) != 0)
-		return 0;
 	if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
 		mutex_lock(&inode->i_mutex);
 		hfsplus_file_truncate(inode);
@@ -283,7 +269,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 static const struct inode_operations hfsplus_file_inode_operations = {
 	.lookup		= hfsplus_file_lookup,
 	.truncate	= hfsplus_file_truncate,
-	.permission	= hfsplus_permission,
 	.setxattr	= hfsplus_setxattr,
 	.getxattr	= hfsplus_getxattr,
 	.listxattr	= hfsplus_listxattr,
@@ -316,7 +301,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+	mutex_init(&HFSPLUS_I(inode).extents_lock);
 	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
 	HFSPLUS_I(inode).flags = 0;
 	memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9997cbf8beb5..9699c56d323f 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -25,7 +25,7 @@ enum {
 	opt_force, opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{ opt_creator, "creator=%s" },
 	{ opt_type, "type=%s" },
 	{ opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index ce97a54518d8..eb74531a0a8e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-	init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+	mutex_init(&HFSPLUS_I(inode).extents_lock);
 	HFSPLUS_I(inode).flags = 0;
 	HFSPLUS_I(inode).rsrc_inode = NULL;
 	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
@@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
 		printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
-	} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
+	} else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) {
 		printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, "
 		       "use the force option at your own risk, mounting read-only.\n");
 		sb->s_flags |= MS_RDONLY;
@@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void hfsplus_init_once(struct kmem_cache *cachep, void *p)
+static void hfsplus_init_once(void *p)
 {
 	struct hfsplus_inode_info *i = p;
 
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5222345ddccf..7f34f4385de0 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -20,7 +20,7 @@
 struct hostfs_inode_info {
 	char *host_filename;
 	int fd;
-	int mode;
+	fmode_t mode;
 	struct inode vfs_inode;
 };
 
@@ -373,7 +373,8 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
 int hostfs_file_open(struct inode *ino, struct file *file)
 {
 	char *name;
-	int mode = 0, r = 0, w = 0, fd;
+	fmode_t mode = 0;
+	int r = 0, w = 0, fd;
 
 	mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
 	if ((mode & HOSTFS_I(ino)->mode) == mode)
@@ -822,7 +823,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	return err;
 }
 
-int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
+int hostfs_permission(struct inode *ino, int desired)
 {
 	char *name;
 	int r = 0, w = 0, x = 0, err;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index be8be5040e07..64ab52259204 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -143,5 +143,5 @@ const struct file_operations hpfs_file_ops =
 const struct inode_operations hpfs_file_iops =
 {
 	.truncate	= hpfs_truncate,
-	.setattr	= hpfs_notify_change,
+	.setattr	= hpfs_setattr,
 };
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 42ff60ccf2a9..c2ea31bae313 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -275,7 +275,7 @@ void hpfs_init_inode(struct inode *);
 void hpfs_read_inode(struct inode *);
 void hpfs_write_inode(struct inode *);
 void hpfs_write_inode_nolock(struct inode *);
-int hpfs_notify_change(struct dentry *, struct iattr *);
+int hpfs_setattr(struct dentry *, struct iattr *);
 void hpfs_write_if_changed(struct inode *);
 void hpfs_delete_inode(struct inode *);
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 85d3e1d9ac00..39a1bfbea312 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -260,19 +260,28 @@ void hpfs_write_inode_nolock(struct inode *i)
 	brelse(bh);
 }
 
-int hpfs_notify_change(struct dentry *dentry, struct iattr *attr)
+int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
-	int error=0;
+	int error = -EINVAL;
+
 	lock_kernel();
-	if ( ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) ||
-	     (hpfs_sb(inode->i_sb)->sb_root == inode->i_ino) ) {
-		error = -EINVAL;
-	} else if ((error = inode_change_ok(inode, attr))) {
-	} else if ((error = inode_setattr(inode, attr))) {
-	} else {
-		hpfs_write_inode(inode);
-	}
+	if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
+		goto out_unlock;
+	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
+		goto out_unlock;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out_unlock;
+
+	error = inode_setattr(inode, attr);
+	if (error)
+		goto out_unlock;
+
+	hpfs_write_inode(inode);
+
+ out_unlock:
 	unlock_kernel();
 	return error;
 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d256559b4104..10783f3d265a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -415,7 +415,7 @@ again:
 		d_drop(dentry);
 		spin_lock(&dentry->d_lock);
 		if (atomic_read(&dentry->d_count) > 1 ||
-		    permission(inode, MAY_WRITE, NULL) ||
+		    generic_permission(inode, MAY_WRITE, NULL) ||
 		    !S_ISREG(inode->i_mode) ||
 		    get_write_access(inode)) {
 			spin_unlock(&dentry->d_lock);
@@ -669,5 +669,5 @@ const struct inode_operations hpfs_dir_iops =
 	.rmdir		= hpfs_rmdir,
 	.mknod		= hpfs_mknod,
 	.rename		= hpfs_rename,
-	.setattr	= hpfs_notify_change,
+	.setattr	= hpfs_setattr,
 };
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f63a699ec659..29ad461d568f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
 
@@ -215,7 +215,7 @@ enum {
 	Opt_timeshift, Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_help, "help"},
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 65077aa90f0a..2b3d1828db99 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -655,20 +655,13 @@ static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
 }
 
-int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	return generic_permission(inode, mask, NULL);
-}
-
 static const struct inode_operations hppfs_dir_iops = {
 	.lookup		= hppfs_lookup,
-	.permission	= hppfs_permission,
 };
 
 static const struct inode_operations hppfs_link_iops = {
 	.readlink	= hppfs_readlink,
 	.follow_link	= hppfs_follow_link,
-	.permission	= hppfs_permission,
 };
 
 static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aeabf80f81a5..61edc701b0e6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -53,15 +53,17 @@ int sysctl_hugetlb_shm_group;
 enum {
 	Opt_size, Opt_nr_inodes,
 	Opt_mode, Opt_uid, Opt_gid,
+	Opt_pagesize,
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_size,	"size=%s"},
 	{Opt_nr_inodes,	"nr_inodes=%s"},
 	{Opt_mode,	"mode=%o"},
 	{Opt_uid,	"uid=%u"},
 	{Opt_gid,	"gid=%u"},
+	{Opt_pagesize,	"pagesize=%s"},
 	{Opt_err,	NULL},
 };
 
@@ -80,6 +82,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct inode *inode = file->f_path.dentry->d_inode;
 	loff_t len, vma_len;
 	int ret;
+	struct hstate *h = hstate_file(file);
 
 	/*
 	 * vma address alignment (but not the pgoff alignment) has
@@ -92,7 +95,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
 	vma->vm_ops = &hugetlb_vm_ops;
 
-	if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT))
+	if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
 		return -EINVAL;
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -103,9 +106,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	ret = -ENOMEM;
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	if (vma->vm_flags & VM_MAYSHARE &&
-	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-				  len >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode,
+				vma->vm_pgoff >> huge_page_order(h),
+				len >> huge_page_shift(h), vma))
 		goto out;
 
 	ret = 0;
@@ -130,20 +133,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
+	struct hstate *h = hstate_file(file);
 
-	if (len & ~HPAGE_MASK)
+	if (len & ~huge_page_mask(h))
 		return -EINVAL;
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
 	if (flags & MAP_FIXED) {
-		if (prepare_hugepage_range(addr, len))
+		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
 	}
 
 	if (addr) {
-		addr = ALIGN(addr, HPAGE_SIZE);
+		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
@@ -156,7 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		start_addr = TASK_UNMAPPED_BASE;
 
 full_search:
-	addr = ALIGN(start_addr, HPAGE_SIZE);
+	addr = ALIGN(start_addr, huge_page_size(h));
 
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
@@ -174,7 +178,7 @@ full_search:
 
 		if (!vma || addr + len <= vma->vm_start)
 			return addr;
-		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		addr = ALIGN(vma->vm_end, huge_page_size(h));
 	}
 }
 #endif
@@ -225,10 +229,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			      size_t len, loff_t *ppos)
 {
+	struct hstate *h = hstate_file(filp);
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	unsigned long index = *ppos >> HPAGE_SHIFT;
-	unsigned long offset = *ppos & ~HPAGE_MASK;
+	unsigned long index = *ppos >> huge_page_shift(h);
+	unsigned long offset = *ppos & ~huge_page_mask(h);
 	unsigned long end_index;
 	loff_t isize;
 	ssize_t retval = 0;
@@ -243,17 +248,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 	if (!isize)
 		goto out;
 
-	end_index = (isize - 1) >> HPAGE_SHIFT;
+	end_index = (isize - 1) >> huge_page_shift(h);
 	for (;;) {
 		struct page *page;
-		int nr, ret;
+		unsigned long nr, ret;
 
 		/* nr is the maximum number of bytes to copy from this page */
-		nr = HPAGE_SIZE;
+		nr = huge_page_size(h);
 		if (index >= end_index) {
 			if (index > end_index)
 				goto out;
-			nr = ((isize - 1) & ~HPAGE_MASK) + 1;
+			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
 			if (nr <= offset) {
 				goto out;
 			}
@@ -287,8 +292,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 		offset += ret;
 		retval += ret;
 		len -= ret;
-		index += offset >> HPAGE_SHIFT;
-		offset &= ~HPAGE_MASK;
+		index += offset >> huge_page_shift(h);
+		offset &= ~huge_page_mask(h);
 
 		if (page)
 			page_cache_release(page);
@@ -298,7 +303,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			break;
 	}
 out:
-	*ppos = ((loff_t)index << HPAGE_SHIFT) + offset;
+	*ppos = ((loff_t)index << huge_page_shift(h)) + offset;
 	mutex_unlock(&inode->i_mutex);
 	return retval;
 }
@@ -339,8 +344,9 @@ static void truncate_huge_page(struct page *page)
 
 static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct hstate *h = hstate_inode(inode);
 	struct address_space *mapping = &inode->i_data;
-	const pgoff_t start = lstart >> HPAGE_SHIFT;
+	const pgoff_t start = lstart >> huge_page_shift(h);
 	struct pagevec pvec;
 	pgoff_t next;
 	int i, freed = 0;
@@ -441,7 +447,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
 			v_offset = 0;
 
 		__unmap_hugepage_range(vma,
-				vma->vm_start + v_offset, vma->vm_end);
+				vma->vm_start + v_offset, vma->vm_end, NULL);
 	}
 }
 
@@ -449,8 +455,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
 	struct address_space *mapping = inode->i_mapping;
+	struct hstate *h = hstate_inode(inode);
 
-	BUG_ON(offset & ~HPAGE_MASK);
+	BUG_ON(offset & ~huge_page_mask(h));
 	pgoff = offset >> PAGE_SHIFT;
 
 	i_size_write(inode, offset);
@@ -465,6 +472,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
+	struct hstate *h = hstate_inode(inode);
 	int error;
 	unsigned int ia_valid = attr->ia_valid;
 
@@ -476,7 +484,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (ia_valid & ATTR_SIZE) {
 		error = -EINVAL;
-		if (!(attr->ia_size & ~HPAGE_MASK))
+		if (!(attr->ia_size & ~huge_page_mask(h)))
 			error = hugetlb_vmtruncate(inode, attr->ia_size);
 		if (error)
 			goto out;
@@ -610,9 +618,10 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
+	struct hstate *h = hstate_inode(dentry->d_inode);
 
 	buf->f_type = HUGETLBFS_MAGIC;
-	buf->f_bsize = HPAGE_SIZE;
+	buf->f_bsize = huge_page_size(h);
 	if (sbinfo) {
 		spin_lock(&sbinfo->stat_lock);
 		/* If no limits set, just report 0 for max/free/used
@@ -696,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 };
 
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
 
@@ -743,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 	char *p, *rest;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
+	unsigned long long size = 0;
+	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
 
 	if (!options)
 		return 0;
@@ -773,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			break;
 
 		case Opt_size: {
- 			unsigned long long size;
 			/* memparse() will accept a K/M/G without a digit */
 			if (!isdigit(*args[0].from))
 				goto bad_val;
 			size = memparse(args[0].from, &rest);
-			if (*rest == '%') {
-				size <<= HPAGE_SHIFT;
-				size *= max_huge_pages;
-				do_div(size, 100);
-			}
-			pconfig->nr_blocks = (size >> HPAGE_SHIFT);
+			setsize = SIZE_STD;
+			if (*rest == '%')
+				setsize = SIZE_PERCENT;
 			break;
 		}
 
@@ -794,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			pconfig->nr_inodes = memparse(args[0].from, &rest);
 			break;
 
+		case Opt_pagesize: {
+			unsigned long ps;
+			ps = memparse(args[0].from, &rest);
+			pconfig->hstate = size_to_hstate(ps);
+			if (!pconfig->hstate) {
+				printk(KERN_ERR
+				"hugetlbfs: Unsupported page size %lu MB\n",
+					ps >> 20);
+				return -EINVAL;
+			}
+			break;
+		}
+
 		default:
 			printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
 				 p);
@@ -801,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			break;
 		}
 	}
+
+	/* Do size after hstate is set up */
+	if (setsize > NO_SIZE) {
+		struct hstate *h = pconfig->hstate;
+		if (setsize == SIZE_PERCENT) {
+			size <<= huge_page_shift(h);
+			size *= h->max_huge_pages;
+			do_div(size, 100);
+		}
+		pconfig->nr_blocks = (size >> huge_page_shift(h));
+	}
+
 	return 0;
 
 bad_val:
@@ -825,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	config.uid = current->fsuid;
 	config.gid = current->fsgid;
 	config.mode = 0755;
+	config.hstate = &default_hstate;
 	ret = hugetlbfs_parse_options(data, &config);
 	if (ret)
 		return ret;
@@ -833,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbinfo)
 		return -ENOMEM;
 	sb->s_fs_info = sbinfo;
+	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
 	sbinfo->max_blocks = config.nr_blocks;
 	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = HPAGE_SIZE;
-	sb->s_blocksize_bits = HPAGE_SHIFT;
+	sb->s_blocksize = huge_page_size(config.hstate);
+	sb->s_blocksize_bits = huge_page_shift(config.hstate);
 	sb->s_magic = HUGETLBFS_MAGIC;
 	sb->s_op = &hugetlbfs_ops;
 	sb->s_time_gran = 1;
@@ -942,7 +976,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 		goto out_dentry;
 
 	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode, 0,
+			size >> huge_page_shift(hstate_inode(inode)), NULL))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index c36d9480335c..0487ddba1397 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+		mapping->writeback_index = 0;
 
 		/*
 		 * If the block_device provides a backing_dev_info for client
@@ -209,7 +210,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	rwlock_init(&inode->i_data.tree_lock);
+	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
@@ -224,7 +225,7 @@ void inode_init_once(struct inode *inode)
 
 EXPORT_SYMBOL(inode_init_once);
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct inode * inode = (struct inode *) foo;
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 6676c06bb7c1..d85c7d931cdf 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
 }
 
 /*
- * remove_kevent - cleans up and ultimately frees the given kevent
+ * remove_kevent - cleans up the given kevent
  *
  * Caller must hold dev->ev_mutex.
  */
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
 
 	dev->event_count--;
 	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
 
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
 	kfree(kevent->name);
 	kmem_cache_free(event_cachep, kevent);
 }
@@ -350,24 +356,25 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
 		struct inotify_kernel_event *kevent;
 		kevent = inotify_dev_get_event(dev);
 		remove_kevent(dev, kevent);
+		free_kevent(kevent);
 	}
 }
 
 /*
- * find_inode - resolve a user-given path to a specific inode and return a nd
+ * find_inode - resolve a user-given path to a specific inode
  */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
+static int find_inode(const char __user *dirname, struct path *path,
 		      unsigned flags)
 {
 	int error;
 
-	error = __user_walk(dirname, flags, nd);
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
 	if (error)
 		return error;
 	/* you can only watch an inode if you have read permissions on it */
-	error = vfs_permission(nd, MAY_READ);
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
 	if (error)
-		path_put(&nd->path);
+		path_put(path);
 	return error;
 }
 
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	dev = file->private_data;
 
 	while (1) {
-		int events;
 
 		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
 
 		mutex_lock(&dev->ev_mutex);
-		events = !list_empty(&dev->events);
-		mutex_unlock(&dev->ev_mutex);
-		if (events) {
+		if (!list_empty(&dev->events)) {
 			ret = 0;
 			break;
 		}
+		mutex_unlock(&dev->ev_mutex);
 
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 
-	mutex_lock(&dev->ev_mutex);
 	while (1) {
 		struct inotify_kernel_event *kevent;
 
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 			}
 			break;
 		}
+		remove_kevent(dev, kevent);
+
+		/*
+		 * Must perform the copy_to_user outside the mutex in order
+		 * to avoid a lock order reversal with mmap_sem.
+		 */
+		mutex_unlock(&dev->ev_mutex);
 
 		if (copy_to_user(buf, &kevent->event, event_size)) {
 			ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 			count -= kevent->event.len;
 		}
 
-		remove_kevent(dev, kevent);
+		free_kevent(kevent);
+
+		mutex_lock(&dev->ev_mutex);
 	}
 	mutex_unlock(&dev->ev_mutex);
 
@@ -566,7 +579,7 @@ static const struct inotify_operations inotify_user_ops = {
 	.destroy_watch	= free_inotify_user_watch,
 };
 
-asmlinkage long sys_inotify_init(void)
+asmlinkage long sys_inotify_init1(int flags)
 {
 	struct inotify_device *dev;
 	struct inotify_handle *ih;
@@ -574,7 +587,14 @@ asmlinkage long sys_inotify_init(void)
 	struct file *filp;
 	int fd, ret;
 
-	fd = get_unused_fd();
+	/* Check the IN_* constants for consistency.  */
+	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
 	if (fd < 0)
 		return fd;
 
@@ -610,7 +630,7 @@ asmlinkage long sys_inotify_init(void)
 	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
 	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
 	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY;
+	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
 	filp->private_data = dev;
 
 	INIT_LIST_HEAD(&dev->events);
@@ -638,11 +658,16 @@ out_put_fd:
 	return ret;
 }
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+asmlinkage long sys_inotify_init(void)
+{
+	return sys_inotify_init1(0);
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
 {
 	struct inode *inode;
 	struct inotify_device *dev;
-	struct nameidata nd;
+	struct path path;
 	struct file *filp;
 	int ret, fput_needed;
 	unsigned flags = 0;
@@ -662,12 +687,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 	if (mask & IN_ONLYDIR)
 		flags |= LOOKUP_DIRECTORY;
 
-	ret = find_inode(path, &nd, flags);
+	ret = find_inode(pathname, &path, flags);
 	if (unlikely(ret))
 		goto fput_and_out;
 
-	/* inode held in place by reference to nd; dev by fget on fd */
-	inode = nd.path.dentry->d_inode;
+	/* inode held in place by reference to path; dev by fget on fd */
+	inode = path.dentry->d_inode;
 	dev = filp->private_data;
 
 	mutex_lock(&dev->up_mutex);
@@ -676,7 +701,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
 		ret = create_watch(dev, inode, mask);
 	mutex_unlock(&dev->up_mutex);
 
-	path_put(&nd.path);
+	path_put(&path);
 fput_and_out:
 	fput_light(filp, fput_needed);
 	return ret;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d3..d152856c371b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/ioctls.h>
 
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS	(UINT_MAX / sizeof(struct fiemap_extent))
+
 /**
  * vfs_ioctl - call filesystem specific ioctl methods
  * @filp:	open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
 	return put_user(res, p);
 }
 
+/**
+ * fiemap_fill_next_extent - Fiemap helper function
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @logical:	Extent logical start offset, in bytes
+ * @phys:	Extent physical start offset, in bytes
+ * @len:	Extent length, in bytes
+ * @flags:	FIEMAP_EXTENT flags that describe this extent
+ *
+ * Called from file system ->fiemap callback. Will populate extent
+ * info as passed in via arguments and copy to user memory. On
+ * success, extent count on fieinfo is incremented.
+ *
+ * Returns 0 on success, -errno on error, 1 if this was the last
+ * extent that will fit in user array.
+ */
+#define SET_UNKNOWN_FLAGS	(FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS	(FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS	(FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
+			    u64 phys, u64 len, u32 flags)
+{
+	struct fiemap_extent extent;
+	struct fiemap_extent *dest = fieinfo->fi_extents_start;
+
+	/* only count the extents */
+	if (fieinfo->fi_extents_max == 0) {
+		fieinfo->fi_extents_mapped++;
+		return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+	}
+
+	if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
+		return 1;
+
+	if (flags & SET_UNKNOWN_FLAGS)
+		flags |= FIEMAP_EXTENT_UNKNOWN;
+	if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
+		flags |= FIEMAP_EXTENT_ENCODED;
+	if (flags & SET_NOT_ALIGNED_FLAGS)
+		flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+	memset(&extent, 0, sizeof(extent));
+	extent.fe_logical = logical;
+	extent.fe_physical = phys;
+	extent.fe_length = len;
+	extent.fe_flags = flags;
+
+	dest += fieinfo->fi_extents_mapped;
+	if (copy_to_user(dest, &extent, sizeof(extent)))
+		return -EFAULT;
+
+	fieinfo->fi_extents_mapped++;
+	if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
+		return 1;
+	return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+}
+EXPORT_SYMBOL(fiemap_fill_next_extent);
+
+/**
+ * fiemap_check_flags - check validity of requested flags for fiemap
+ * @fieinfo:	Fiemap context passed into ->fiemap
+ * @fs_flags:	Set of fiemap flags that the file system understands
+ *
+ * Called from file system ->fiemap callback. This will compute the
+ * intersection of valid fiemap flags and those that the fs supports. That
+ * value is then compared against the user supplied flags. In case of bad user
+ * flags, the invalid values will be written into the fieinfo structure, and
+ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
+ * userspace. For this reason, a return code of -EBADR should be preserved.
+ *
+ * Returns 0 on success, -EBADR on bad flags.
+ */
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
+{
+	u32 incompat_flags;
+
+	incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
+	if (incompat_flags) {
+		fieinfo->fi_flags = incompat_flags;
+		return -EBADR;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(fiemap_check_flags);
+
+static int fiemap_check_ranges(struct super_block *sb,
+			       u64 start, u64 len, u64 *new_len)
+{
+	*new_len = len;
+
+	if (len == 0)
+		return -EINVAL;
+
+	if (start > sb->s_maxbytes)
+		return -EFBIG;
+
+	/*
+	 * Shrink request scope to what the fs can actually handle.
+	 */
+	if ((len > sb->s_maxbytes) ||
+	    (sb->s_maxbytes - len) < start)
+		*new_len = sb->s_maxbytes - start;
+
+	return 0;
+}
+
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+	struct fiemap fiemap;
+	struct fiemap_extent_info fieinfo = { 0, };
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	u64 len;
+	int error;
+
+	if (!inode->i_op->fiemap)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
+			   sizeof(struct fiemap)))
+		return -EFAULT;
+
+	if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+		return -EINVAL;
+
+	error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+				    &len);
+	if (error)
+		return error;
+
+	fieinfo.fi_flags = fiemap.fm_flags;
+	fieinfo.fi_extents_max = fiemap.fm_extent_count;
+	fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+
+	if (fiemap.fm_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+		       fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+		return -EFAULT;
+
+	if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+		filemap_write_and_wait(inode->i_mapping);
+
+	error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
+	fiemap.fm_flags = fieinfo.fi_flags;
+	fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+	if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+		error = -EFAULT;
+
+	return error;
+}
+
+#ifdef CONFIG_BLOCK
+
+#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
+#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
+
+/*
+ * @inode - the inode to map
+ * @arg - the pointer to userspace where we copy everything to
+ * @get_block - the fs's get_block function
+ *
+ * This does FIEMAP for block based inodes.  Basically it will just loop
+ * through get_block until we hit the number of extents we want to map, or we
+ * go past the end of the file and hit a hole.
+ *
+ * If it is possible to have data blocks beyond a hole past @inode->i_size, then
+ * please do not use this function, it will stop at the first unmapped block
+ * beyond i_size
+ */
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	unsigned int start_blk;
+	long long length = 0, map_len = 0;
+	u64 logical = 0, phys = 0, size = 0;
+	u32 flags = FIEMAP_EXTENT_MERGED;
+	int ret = 0;
+
+	if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
+		return ret;
+
+	start_blk = logical_to_blk(inode, start);
+
+	/* guard against change */
+	mutex_lock(&inode->i_mutex);
+
+	length = (long long)min_t(u64, len, i_size_read(inode));
+	map_len = length;
+
+	do {
+		/*
+		 * we set b_size to the total size we want so it will map as
+		 * many contiguous blocks as possible at once
+		 */
+		memset(&tmp, 0, sizeof(struct buffer_head));
+		tmp.b_size = map_len;
+
+		ret = get_block(inode, start_blk, &tmp, 0);
+		if (ret)
+			break;
+
+		/* HOLE */
+		if (!buffer_mapped(&tmp)) {
+			/*
+			 * first hole after going past the EOF, this is our
+			 * last extent
+			 */
+			if (length <= 0) {
+				flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				break;
+			}
+
+			length -= blk_to_logical(inode, 1);
+
+			/* if we have holes up to/past EOF then we're done */
+			if (length <= 0)
+				break;
+
+			start_blk++;
+		} else {
+			if (length <= 0 && size) {
+				ret = fiemap_fill_next_extent(fieinfo, logical,
+							      phys, size,
+							      flags);
+				if (ret)
+					break;
+			}
+
+			logical = blk_to_logical(inode, start_blk);
+			phys = blk_to_logical(inode, tmp.b_blocknr);
+			size = tmp.b_size;
+			flags = FIEMAP_EXTENT_MERGED;
+
+			length -= tmp.b_size;
+			start_blk += logical_to_blk(inode, size);
+
+			/*
+			 * if we are past the EOF we need to loop again to see
+			 * if there is a hole so we can mark this extent as the
+			 * last one, and if not keep mapping things until we
+			 * find a hole, or we run out of slots in the extent
+			 * array
+			 */
+			if (length <= 0)
+				continue;
+
+			ret = fiemap_fill_next_extent(fieinfo, logical, phys,
+						      size, flags);
+			if (ret)
+				break;
+		}
+		cond_resched();
+	} while (1);
+
+	mutex_unlock(&inode->i_mutex);
+
+	/* if ret is 1 then we just hit the end of the extent array */
+	if (ret == 1)
+		ret = 0;
+
+	return ret;
+}
+EXPORT_SYMBOL(generic_block_fiemap);
+
+#endif  /*  CONFIG_BLOCK  */
+
 static int file_ioctl(struct file *filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD:
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c4a1c3c65aac..da3cc460d4df 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 				pgrp = task_pgrp(current);
 			else
 				pgrp = find_vpid(who);
-			do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 				ret = set_task_ioprio(p, ioprio);
 				if (ret)
 					break;
-			} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
@@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 				pgrp = task_pgrp(current);
 			else
 				pgrp = find_vpid(who);
-			do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 				tmpio = get_task_ioprio(p);
 				if (tmpio < 0)
 					continue;
@@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 					ret = tmpio;
 				else
 					ret = ioprio_best(ret, tmpio);
-			} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case IOPRIO_WHO_USER:
 			if (!who)
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index bb219138331a..e81a30593ba9 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -22,7 +22,7 @@ isofs_export_iget(struct super_block *sb,
 		  __u32 generation)
 {
 	struct inode *inode;
-	struct dentry *result;
+
 	if (block == 0)
 		return ERR_PTR(-ESTALE);
 	inode = isofs_iget(sb, block, offset);
@@ -32,12 +32,7 @@ isofs_export_iget(struct super_block *sb,
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 /* This function is surprisingly simple.  The trick is understanding
@@ -51,7 +46,6 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
 	unsigned long parent_offset = 0;
 	struct inode *child_inode = child->d_inode;
 	struct iso_inode_info *e_child_inode = ISOFS_I(child_inode);
-	struct inode *parent_inode = NULL;
 	struct iso_directory_record *de = NULL;
 	struct buffer_head * bh = NULL;
 	struct dentry *rv = NULL;
@@ -104,28 +98,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
 	/* Normalize */
 	isofs_normalize_block_and_offset(de, &parent_block, &parent_offset);
 
-	/* Get the inode. */
-	parent_inode = isofs_iget(child_inode->i_sb,
-				  parent_block,
-				  parent_offset);
-	if (IS_ERR(parent_inode)) {
-		rv = ERR_CAST(parent_inode);
-		if (rv != ERR_PTR(-ENOMEM))
-			rv = ERR_PTR(-EACCES);
-		goto out;
-	}
-
-	/* Allocate the dentry. */
-	rv = d_alloc_anon(parent_inode);
-	if (rv == NULL) {
-		rv = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
+	rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block,
+				     parent_offset));
  out:
-	if (bh) {
+	if (bh)
 		brelse(bh);
-	}
 	return rv;
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 044a254d526b..3f8af0f1505b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode)
 	kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct iso_inode_info *ei = foo;
 
@@ -310,7 +310,7 @@ enum {
 	Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_norock, "norock"},
 	{Opt_nojoliet, "nojoliet"},
 	{Opt_unhide, "unhide"},
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 6bd48f0a7047..c2fb2dd0131f 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -209,6 +209,11 @@ repeat:
 
 	while (rs.len > 2) { /* There may be one byte for padding somewhere */
 		rr = (struct rock_ridge *)rs.chr;
+		/*
+		 * Ignore rock ridge info if rr->len is out of range, but
+		 * don't return -EIO because that would make the file
+		 * invisible.
+		 */
 		if (rr->len < 3)
 			goto out;	/* Something got screwed up here */
 		sig = isonum_721(rs.chr);
@@ -216,8 +221,12 @@ repeat:
 			goto eio;
 		rs.chr += rr->len;
 		rs.len -= rr->len;
+		/*
+		 * As above, just ignore the rock ridge info if rr->len
+		 * is bogus.
+		 */
 		if (rs.len < 0)
-			goto eio;	/* corrupted isofs */
+			goto out;	/* Something got screwed up here */
 
 		switch (sig) {
 		case SIG('R', 'R'):
@@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
 repeat:
 	while (rs.len > 2) { /* There may be one byte for padding somewhere */
 		rr = (struct rock_ridge *)rs.chr;
+		/*
+		 * Ignore rock ridge info if rr->len is out of range, but
+		 * don't return -EIO because that would make the file
+		 * invisible.
+		 */
 		if (rr->len < 3)
 			goto out;	/* Something got screwed up here */
 		sig = isonum_721(rs.chr);
@@ -314,8 +328,12 @@ repeat:
 			goto eio;
 		rs.chr += rr->len;
 		rs.len -= rr->len;
+		/*
+		 * As above, just ignore the rock ridge info if rr->len
+		 * is bogus.
+		 */
 		if (rs.len < 0)
-			goto eio;	/* corrupted isofs */
+			goto out;	/* Something got screwed up here */
 
 		switch (sig) {
 #ifndef CONFIG_ZISOFS		/* No flag for SF or ZF */
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 000000000000..4e28beeed157
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+	tristate
+	help
+	  This is a generic journalling layer for block devices.  It is
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
+	  devices such as RAID or LVM.
+
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here: the module will be
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
+
+config JBD_DEBUG
+	bool "JBD (ext3) debugging support"
+	depends on JBD && DEBUG_FS
+	help
+	  If you are using the ext3 journaled file system (or potentially any
+	  other file system/device using JBD), this option allows you to
+	  enable debugging output while the system is running, in order to
+	  help track down any problems you are having.  By default the
+	  debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+	  number between 1 and 5, the higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index a5432bbbfb88..1bd8d4acc6f2 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -93,7 +93,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -126,14 +127,29 @@ void __log_wait_for_space(journal_t *journal)
 
 		/*
 		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock
+		 * were waiting for the checkpoint lock. If there are no
+		 * outstanding transactions there is nothing to checkpoint and
+		 * we can't make progress. Abort the journal in this case.
 		 */
 		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		if (__log_space_left(journal) < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+
+			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
-			log_do_checkpoint(journal);
+			if (chkpt) {
+				log_do_checkpoint(journal);
+			} else {
+				printk(KERN_ERR "%s: no transactions\n",
+				       __func__);
+				journal_abort(journal, 0);
+			}
+
 			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
 		}
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
@@ -160,21 +176,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +214,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +226,8 @@ restart:
 		journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +251,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -256,6 +282,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__journal_remove_checkpoint(jh);
@@ -263,7 +292,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -295,6 +323,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int log_do_checkpoint(journal_t *journal)
 {
@@ -318,6 +347,7 @@ int log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -334,7 +364,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -347,6 +377,8 @@ restart:
 				break;
 			}
 			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -371,14 +403,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		journal_abort(journal, result);
+	else
+		result = cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -394,8 +430,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int cleanup_journal_tail(journal_t *journal)
@@ -404,6 +441,9 @@ int cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 5a8ca61498ca..25719d902c51 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 
 /*
  * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
  * After the transaction commits, these pages are left on the LRU, with no
  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  * So here, we have a buffer which has just come off the forget list.  Look to
  * see if we can strip all buffers from the backing page.
  *
- * Called under lock_journal(), and possibly under journal_datalist_lock.  The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock.  The caller provided us with a ref
+ * against the buffer, and we drop that here.
  */
 static void release_buffer_page(struct buffer_head *bh)
 {
@@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
@@ -78,6 +78,19 @@ nope:
 }
 
 /*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+	if (buffer_freed(bh)) {
+		clear_buffer_freed(bh);
+		release_buffer_page(bh);
+	} else
+		put_bh(bh);
+}
+
+/*
  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  * return 0.  j_list_lock is dropped in this case.
@@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
 /*
  *  Submit all the data buffers to disk
  */
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
 				transaction_t *commit_transaction)
 {
 	struct journal_head *jh;
@@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal,
 	int locked;
 	int bufs = 0;
 	struct buffer_head **wbuf = journal->j_wbuf;
+	int err = 0;
 
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
@@ -207,7 +221,7 @@ write_out_data:
 		 * blocking lock_buffer().
 		 */
 		if (buffer_dirty(bh)) {
-			if (test_set_buffer_locked(bh)) {
+			if (!trylock_buffer(bh)) {
 				BUFFER_TRACE(bh, "needs blocking lock");
 				spin_unlock(&journal->j_list_lock);
 				/* Write out all data to prevent deadlocks */
@@ -231,7 +245,7 @@ write_out_data:
 			if (locked)
 				unlock_buffer(bh);
 			BUFFER_TRACE(bh, "already cleaned up");
-			put_bh(bh);
+			release_data_buffer(bh);
 			continue;
 		}
 		if (locked && test_clear_buffer_dirty(bh)) {
@@ -253,15 +267,17 @@ write_out_data:
 			put_bh(bh);
 		} else {
 			BUFFER_TRACE(bh, "writeout complete: unfile");
+			if (unlikely(!buffer_uptodate(bh)))
+				err = -EIO;
 			__journal_unfile_buffer(jh);
 			jbd_unlock_bh_state(bh);
 			if (locked)
 				unlock_buffer(bh);
 			journal_remove_journal_head(bh);
-			/* Once for our safety reference, once for
+			/* One for our safety reference, other for
 			 * journal_remove_journal_head() */
 			put_bh(bh);
-			put_bh(bh);
+			release_data_buffer(bh);
 		}
 
 		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
@@ -271,6 +287,8 @@ write_out_data:
 	}
 	spin_unlock(&journal->j_list_lock);
 	journal_do_submit_data(wbuf, bufs);
+
+	return err;
 }
 
 /*
@@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
-	err = 0;
-	journal_submit_data_buffers(journal, commit_transaction);
+	err = journal_submit_data_buffers(journal, commit_transaction);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal)
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
-			if (unlikely(!buffer_uptodate(bh)))
-				err = -EIO;
 			spin_lock(&journal->j_list_lock);
 		}
+		if (unlikely(!buffer_uptodate(bh))) {
+			if (!trylock_page(bh->b_page)) {
+				spin_unlock(&journal->j_list_lock);
+				lock_page(bh->b_page);
+				spin_lock(&journal->j_list_lock);
+			}
+			if (bh->b_page->mapping)
+				set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+			unlock_page(bh->b_page);
+			SetPageError(bh->b_page);
+			err = -EIO;
+		}
 		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
 			spin_lock(&journal->j_list_lock);
@@ -443,17 +471,23 @@ void journal_commit_transaction(journal_t *journal)
 		} else {
 			jbd_unlock_bh_state(bh);
 		}
-		put_bh(bh);
+		release_data_buffer(bh);
 		cond_resched_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
 
-	if (err)
-		journal_abort(journal, err);
+	if (err) {
+		char b[BDEVNAME_SIZE];
 
-	journal_write_revoke_records(journal, commit_transaction);
+		printk(KERN_WARNING
+			"JBD: Detected IO errors while flushing file data "
+			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
+		err = 0;
+	}
 
-	jbd_debug(3, "JBD: commit phase 2\n");
+	journal_write_revoke_records(journal, commit_transaction);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
@@ -486,9 +520,10 @@ void journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -730,6 +765,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 6\n");
 
 	if (journal_write_commit_record(journal, commit_transaction))
@@ -820,6 +858,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b99c3b3654c4..9e4fa52d7dc8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
 EXPORT_SYMBOL(journal_ack_err);
@@ -1122,9 +1121,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void journal_destroy(journal_t *journal)
+int journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1147,11 +1149,16 @@ void journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1161,6 +1168,8 @@ void journal_destroy(journal_t *journal)
 		journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1360,10 +1369,16 @@ int journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1385,7 +1400,7 @@ int journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
@@ -1636,9 +1651,10 @@ static int journal_init_journal_head_cache(void)
 
 static void journal_destroy_journal_head_cache(void)
 {
-	J_ASSERT(journal_head_cache != NULL);
-	kmem_cache_destroy(journal_head_cache);
-	journal_head_cache = NULL;
+	if (journal_head_cache) {
+		kmem_cache_destroy(journal_head_cache);
+		journal_head_cache = NULL;
+	}
 }
 
 /*
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 43bc5e5ed064..db5e982c5ddf 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -223,7 +223,7 @@ do {									\
  */
 int journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -261,7 +261,10 @@ int journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 1bb43e987f4b..c7bd649bbbdc 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
 	return NULL;
 }
 
+void journal_destroy_revoke_caches(void)
+{
+	if (revoke_record_cache) {
+		kmem_cache_destroy(revoke_record_cache);
+		revoke_record_cache = NULL;
+	}
+	if (revoke_table_cache) {
+		kmem_cache_destroy(revoke_table_cache);
+		revoke_table_cache = NULL;
+	}
+}
+
 int __init journal_init_revoke_caches(void)
 {
+	J_ASSERT(!revoke_record_cache);
+	J_ASSERT(!revoke_table_cache);
+
 	revoke_record_cache = kmem_cache_create("revoke_record",
 					   sizeof(struct jbd_revoke_record_s),
 					   0,
 					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
 					   NULL);
 	if (!revoke_record_cache)
-		return -ENOMEM;
+		goto record_cache_failure;
 
 	revoke_table_cache = kmem_cache_create("revoke_table",
 					   sizeof(struct jbd_revoke_table_s),
 					   0, SLAB_TEMPORARY, NULL);
-	if (!revoke_table_cache) {
-		kmem_cache_destroy(revoke_record_cache);
-		revoke_record_cache = NULL;
-		return -ENOMEM;
-	}
+	if (!revoke_table_cache)
+		goto table_cache_failure;
+
 	return 0;
-}
 
-void journal_destroy_revoke_caches(void)
-{
-	kmem_cache_destroy(revoke_record_cache);
-	revoke_record_cache = NULL;
-	kmem_cache_destroy(revoke_table_cache);
-	revoke_table_cache = NULL;
+table_cache_failure:
+	journal_destroy_revoke_caches();
+record_cache_failure:
+	return -ENOMEM;
 }
 
-/* Initialise the revoke table for a given journal to a given size. */
-
-int journal_init_revoke(journal_t *journal, int hash_size)
+static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
 {
-	int shift, tmp;
+	int shift = 0;
+	int tmp = hash_size;
+	struct jbd_revoke_table_s *table;
 
-	J_ASSERT (journal->j_revoke_table[0] == NULL);
+	table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+	if (!table)
+		goto out;
 
-	shift = 0;
-	tmp = hash_size;
 	while((tmp >>= 1UL) != 0UL)
 		shift++;
 
-	journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[0])
-		return -ENOMEM;
-	journal->j_revoke = journal->j_revoke_table[0];
-
-	/* Check that the hash_size is a power of two */
-	J_ASSERT(is_power_of_2(hash_size));
-
-	journal->j_revoke->hash_size = hash_size;
-
-	journal->j_revoke->hash_shift = shift;
-
-	journal->j_revoke->hash_table =
+	table->hash_size = hash_size;
+	table->hash_shift = shift;
+	table->hash_table =
 		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
+	if (!table->hash_table) {
+		kmem_cache_free(revoke_table_cache, table);
+		table = NULL;
+		goto out;
 	}
 
 	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+		INIT_LIST_HEAD(&table->hash_table[tmp]);
 
-	journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
-	if (!journal->j_revoke_table[1]) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		return -ENOMEM;
+out:
+	return table;
+}
+
+static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
+{
+	int i;
+	struct list_head *hash_list;
+
+	for (i = 0; i < table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT(list_empty(hash_list));
 	}
 
-	journal->j_revoke = journal->j_revoke_table[1];
+	kfree(table->hash_table);
+	kmem_cache_free(revoke_table_cache, table);
+}
 
-	/* Check that the hash_size is a power of two */
+/* Initialise the revoke table for a given journal to a given size. */
+int journal_init_revoke(journal_t *journal, int hash_size)
+{
+	J_ASSERT(journal->j_revoke_table[0] == NULL);
 	J_ASSERT(is_power_of_2(hash_size));
 
-	journal->j_revoke->hash_size = hash_size;
+	journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[0])
+		goto fail0;
 
-	journal->j_revoke->hash_shift = shift;
+	journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
+	if (!journal->j_revoke_table[1])
+		goto fail1;
 
-	journal->j_revoke->hash_table =
-		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
-	if (!journal->j_revoke->hash_table) {
-		kfree(journal->j_revoke_table[0]->hash_table);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
-		kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
-		journal->j_revoke = NULL;
-		return -ENOMEM;
-	}
-
-	for (tmp = 0; tmp < hash_size; tmp++)
-		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+	journal->j_revoke = journal->j_revoke_table[1];
 
 	spin_lock_init(&journal->j_revoke_lock);
 
 	return 0;
-}
 
-/* Destoy a journal's revoke table.  The table must already be empty! */
+fail1:
+	journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+	return -ENOMEM;
+}
 
+/* Destroy a journal's revoke table.  The table must already be empty! */
 void journal_destroy_revoke(journal_t *journal)
 {
-	struct jbd_revoke_table_s *table;
-	struct list_head *hash_list;
-	int i;
-
-	table = journal->j_revoke_table[0];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
-	journal->j_revoke = NULL;
-
-	table = journal->j_revoke_table[1];
-	if (!table)
-		return;
-
-	for (i=0; i<table->hash_size; i++) {
-		hash_list = &table->hash_table[i];
-		J_ASSERT (list_empty(hash_list));
-	}
-
-	kfree(table->hash_table);
-	kmem_cache_free(revoke_table_cache, table);
 	journal->j_revoke = NULL;
+	if (journal->j_revoke_table[0])
+		journal_destroy_revoke_table(journal->j_revoke_table[0]);
+	if (journal->j_revoke_table[1])
+		journal_destroy_revoke_table(journal->j_revoke_table[1]);
 }
 
 
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 67ff2024c23c..d15cd6e7251e 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		goto out;
 	}
 
-	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+	lock_map_acquire(&handle->h_lockdep_map);
 
 out:
 	return handle;
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int ret = 0;
 
 	if (is_handle_aborted(handle))
-		return 0;
+		return ret;
 
 	jh = journal_add_journal_head(bh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 				   time if it is redirtied */
 			}
 
-			/* journal_clean_data_list() may have got there first */
+			/*
+			 * We cannot remove the buffer with io error from the
+			 * committing transaction, because otherwise it would
+			 * miss the error and the commit would not abort.
+			 */
+			if (unlikely(!buffer_uptodate(bh))) {
+				ret = -EIO;
+				goto no_journal;
+			}
+
 			if (jh->b_transaction != NULL) {
 				JBUFFER_TRACE(jh, "unfile from commit");
 				__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
 	}
 	JBUFFER_TRACE(jh, "exit");
 	journal_put_journal_head(jh);
-	return 0;
+	return ret;
 }
 
 /**
@@ -1448,7 +1458,7 @@ int journal_stop(handle_t *handle)
 		spin_unlock(&journal->j_state_lock);
 	}
 
-	lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+	lock_map_release(&handle->h_lockdep_map);
 
 	jbd_free_handle(handle);
 	return err;
@@ -1648,12 +1658,42 @@ out:
 	return;
 }
 
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction()
+ * The latter might still hold the a count on buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * tryinf to free that buffer.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return;
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	log_wait_commit(journal, tid);
+}
 
 /**
  * int journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
  * @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
  *
  *
  * For all the buffers on this page,
@@ -1682,9 +1722,11 @@ out:
  * journal_try_to_free_buffer() is changing its state.  But that
  * cannot happen because we never reallocate freed data as metadata
  * while the data is part of a transaction.  Yes?
+ *
+ * Return 0 on failure, 1 on success
  */
 int journal_try_to_free_buffers(journal_t *journal,
-				struct page *page, gfp_t unused_gfp_mask)
+				struct page *page, gfp_t gfp_mask)
 {
 	struct buffer_head *head;
 	struct buffer_head *bh;
@@ -1713,7 +1755,28 @@ int journal_try_to_free_buffers(journal_t *journal,
 		if (buffer_jbd(bh))
 			goto busy;
 	} while ((bh = bh->b_this_page) != head);
+
 	ret = try_to_free_buffers(page);
+
+	/*
+	 * There are a number of places where journal_try_to_free_buffers()
+	 * could race with journal_commit_transaction(), the later still
+	 * holds the reference to the buffers to free while processing them.
+	 * try_to_free_buffers() failed to free those buffers. Some of the
+	 * caller of releasepage() request page buffers to be dropped, otherwise
+	 * treat the fail-to-free as errors (such as generic_file_direct_IO())
+	 *
+	 * So, if the caller of try_to_release_page() wants the synchronous
+	 * behaviour(i.e make sure buffers are dropped upon return),
+	 * let's wait for the current transaction to finish flush of
+	 * dirty data buffers, then try to free those buffers again,
+	 * with the journal locked.
+	 */
+	if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+		journal_wait_for_transaction_sync_data(journal);
+		ret = try_to_free_buffers(page);
+	}
+
 busy:
 	return ret;
 }
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
new file mode 100644
index 000000000000..f32f346f4b0a
--- /dev/null
+++ b/fs/jbd2/Kconfig
@@ -0,0 +1,33 @@
+config JBD2
+	tristate
+	select CRC32
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers.  It is currently used by
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2.  If you are compiling ext4 or OCFS2 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4) debugging support"
+	depends on JBD2 && DEBUG_FS
+	help
+	  If you are using the ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+	  number between 1 and 5. The higher the number, the more debugging
+	  output is generated.  To turn debugging off again, do
+	  "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 91389c8aee8a..9203c3332f17 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 
 		/*
 		 * Test again, another process may have checkpointed while we
-		 * were waiting for the checkpoint lock
+		 * were waiting for the checkpoint lock. If there are no
+		 * outstanding transactions there is nothing to checkpoint and
+		 * we can't make progress. Abort the journal in this case.
 		 */
 		spin_lock(&journal->j_state_lock);
+		spin_lock(&journal->j_list_lock);
 		nblocks = jbd_space_needed(journal);
 		if (__jbd2_log_space_left(journal) < nblocks) {
+			int chkpt = journal->j_checkpoint_transactions != NULL;
+
+			spin_unlock(&journal->j_list_lock);
 			spin_unlock(&journal->j_state_lock);
-			jbd2_log_do_checkpoint(journal);
+			if (chkpt) {
+				jbd2_log_do_checkpoint(journal);
+			} else {
+				printk(KERN_ERR "%s: no transactions\n",
+				       __func__);
+				jbd2_journal_abort(journal, 0);
+			}
+
 			spin_lock(&journal->j_state_lock);
+		} else {
+			spin_unlock(&journal->j_list_lock);
 		}
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
 	struct journal_head *jh;
 	struct buffer_head *bh;
 	tid_t this_tid;
 	int released = 0;
+	int ret = 0;
 
 	this_tid = transaction->t_tid;
 restart:
 	/* Did somebody clean up the transaction in the meanwhile? */
 	if (journal->j_checkpoint_transactions != transaction ||
 			transaction->t_tid != this_tid)
-		return;
+		return ret;
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
 			spin_lock(&journal->j_list_lock);
 			goto restart;
 		}
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
+
 		/*
 		 * Now in whatever state the buffer currently is, we know that
 		 * it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
 		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
 	}
+
+	return ret;
 }
 
 #define NR_BATCH	64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd2_log_wait_commit(journal, tid);
 		ret = 1;
 	} else if (!buffer_dirty(bh)) {
+		ret = 1;
+		if (unlikely(buffer_write_io_error(bh)))
+			ret = -EIO;
 		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		jbd_unlock_bh_state(bh);
 		jbd2_journal_remove_journal_head(bh);
 		__brelse(bh);
-		ret = 1;
 	} else {
 		/*
 		 * Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int jbd2_log_do_checkpoint(journal_t *journal)
 {
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * journal straight away.
 	 */
 	result = jbd2_cleanup_journal_tail(journal);
+	trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+		   journal->j_devname, result);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
 	if (result <= 0)
 		return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
 	 * OK, we need to start writing disk blocks.  Take one transaction
 	 * and write it.
 	 */
+	result = 0;
 	spin_lock(&journal->j_list_lock);
 	if (!journal->j_checkpoint_transactions)
 		goto out;
@@ -339,7 +372,7 @@ restart:
 		int batch_count = 0;
 		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
-		int retry = 0;
+		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
 			}
 			retry = __process_buffer(journal, jh, bhs, &batch_count,
 						 transaction);
+			if (retry < 0 && !result)
+				result = retry;
 			if (!retry && (need_resched() ||
 				spin_needbreak(&journal->j_list_lock))) {
 				spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
 		 * Now we have cleaned up the first transaction's checkpoint
 		 * list. Let's clean up the second one
 		 */
-		__wait_cp_io(journal, transaction);
+		err = __wait_cp_io(journal, transaction);
+		if (!result)
+			result = err;
 	}
 out:
 	spin_unlock(&journal->j_list_lock);
-	result = jbd2_cleanup_journal_tail(journal);
 	if (result < 0)
-		return result;
-	return 0;
+		jbd2_journal_abort(journal, result);
+	else
+		result = jbd2_cleanup_journal_tail(journal);
+
+	return (result < 0) ? result : 0;
 }
 
 /*
@@ -400,8 +439,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	tid_t		first_tid;
 	unsigned long	blocknr, freed;
 
+	if (is_journal_aborted(journal))
+		return 1;
+
 	/* OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f8b3be873226..8b119e16aa36 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,6 +16,7 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/jbd2.h>
+#include <linux/marker.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
@@ -67,7 +68,7 @@ static void release_buffer_page(struct buffer_head *bh)
 		goto nope;
 
 	/* OK, it's a truncated page */
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto nope;
 
 	page_cache_get(page);
@@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "submit commit block");
 	lock_buffer(bh);
-	get_bh(bh);
-	set_buffer_dirty(bh);
+	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
@@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
 	 * to remember if we sent a barrier request
 	 */
 	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
+		       "JBD: barrier-based sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
 		spin_lock(&journal->j_state_lock);
 		journal->j_flags &= ~JBD2_BARRIER;
 		spin_unlock(&journal->j_state_lock);
@@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		/* And try again, without the barrier */
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+		clear_buffer_dirty(bh);
 		ret = submit_bh(WRITE, bh);
 	}
 	*cbh = bh;
@@ -262,8 +259,18 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 		jinode->i_flags |= JI_COMMIT_RUNNING;
 		spin_unlock(&journal->j_list_lock);
 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
-		if (!ret)
-			ret = err;
+		if (err) {
+			/*
+			 * Because AS_EIO is cleared by
+			 * wait_on_page_writeback_range(), set it again so
+			 * that user process can get -EIO from fsync().
+			 */
+			set_bit(AS_EIO,
+				&jinode->i_vfs_inode->i_mapping->flags);
+
+			if (!ret)
+				ret = err;
+		}
 		spin_lock(&journal->j_list_lock);
 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -361,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction = journal->j_running_transaction;
 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
 
+	trace_mark(jbd2_start_commit, "dev %s transaction %d",
+		   journal->j_devname, commit_transaction->t_tid);
 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
 			commit_transaction->t_tid);
 
@@ -495,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -670,8 +680,14 @@ start_journal_io:
 	 * commit block, which happens below in such setting.
 	 */
 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
-	if (err)
-		jbd2_journal_abort(journal, err);
+	if (err) {
+		printk(KERN_WARNING
+			"JBD2: Detected IO errors while flushing file data "
+		       "on %s\n", journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+			jbd2_journal_abort(journal, err);
+		err = 0;
+	}
 
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
@@ -770,6 +786,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		jbd2_journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -868,6 +887,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__jbd2_journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
@@ -974,6 +995,12 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
+	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+		   journal->j_devname, commit_transaction->t_tid,
+		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b26c6d9fe6ae..783de118de92 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
 EXPORT_SYMBOL(jbd2_journal_create);
 EXPORT_SYMBOL(jbd2_journal_load);
 EXPORT_SYMBOL(jbd2_journal_destroy);
-EXPORT_SYMBOL(jbd2_journal_update_superblock);
 EXPORT_SYMBOL(jbd2_journal_abort);
 EXPORT_SYMBOL(jbd2_journal_errno);
 EXPORT_SYMBOL(jbd2_journal_ack_err);
@@ -598,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 		if (ret)
 			*retp = ret;
 		else {
-			char b[BDEVNAME_SIZE];
-
 			printk(KERN_ALERT "%s: journal block not found "
 					"at offset %lu on %s\n",
-				__func__,
-				blocknr,
-				bdevname(journal->j_dev, b));
+			       __func__, blocknr, journal->j_devname);
 			err = -EIO;
 			__journal_abort_soft(journal, err);
 		}
@@ -902,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
 
 static void jbd2_stats_proc_init(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
-	journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
 	if (journal->j_proc_entry) {
 		proc_create_data("history", S_IRUGO, journal->j_proc_entry,
 				 &jbd2_seq_history_fops, journal);
@@ -916,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
 
 static void jbd2_stats_proc_exit(journal_t *journal)
 {
-	char name[BDEVNAME_SIZE];
-
-	bdevname(journal->j_dev, name);
 	remove_proc_entry("info", journal->j_proc_entry);
 	remove_proc_entry("history", journal->j_proc_entry);
-	remove_proc_entry(name, proc_jbd2_stats);
+	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
 static void journal_init_stats(journal_t *journal)
@@ -1019,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 {
 	journal_t *journal = journal_init_common();
 	struct buffer_head *bh;
+	char *p;
 	int n;
 
 	if (!journal)
@@ -1040,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	journal->j_fs_dev = fs_dev;
 	journal->j_blk_offset = start;
 	journal->j_maxlen = len;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
 	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1062,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 {
 	struct buffer_head *bh;
 	journal_t *journal = journal_init_common();
+	char *p;
 	int err;
 	int n;
 	unsigned long long blocknr;
@@ -1071,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 
 	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
 	journal->j_inode = inode;
+	bdevname(journal->j_dev, journal->j_devname);
+	p = journal->j_devname;
+	while ((p = strchr(p, '/')))
+		*p = '!';
+	p = journal->j_devname + strlen(journal->j_devname);
+	sprintf(p, ":%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino,
@@ -1254,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 		goto out;
 	}
 
+	if (buffer_write_io_error(bh)) {
+		/*
+		 * Oh, dear.  A previous attempt to write the journal
+		 * superblock failed.  This could happen because the
+		 * USB device was yanked out.  Or it could happen to
+		 * be a transient write error and maybe the block will
+		 * be remapped.  Nothing we can do but to retry the
+		 * write and hope for the best.
+		 */
+		printk(KERN_ERR "JBD2: previous I/O error detected "
+		       "for journal superblock update for %s.\n",
+		       journal->j_devname);
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+	}
+
 	spin_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1265,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
-	if (wait)
+	if (wait) {
 		sync_dirty_buffer(bh);
-	else
+		if (buffer_write_io_error(bh)) {
+			printk(KERN_ERR "JBD2: I/O error detected "
+			       "when updating journal superblock for %s.\n",
+			       journal->j_devname);
+			clear_buffer_write_io_error(bh);
+			set_buffer_uptodate(bh);
+		}
+	} else
 		ll_rw_block(SWRITE, 1, &bh);
 
 out:
@@ -1427,9 +1451,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void jbd2_journal_destroy(journal_t *journal)
+int jbd2_journal_destroy(journal_t *journal)
 {
+	int err = 0;
+
 	/* Wait for the commit thread to wake up and die. */
 	journal_kill_thread(journal);
 
@@ -1452,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
 	spin_unlock(&journal->j_list_lock);
 
-	/* We can now mark the journal as empty. */
-	journal->j_tail = 0;
-	journal->j_tail_sequence = ++journal->j_transaction_sequence;
 	if (journal->j_sb_buffer) {
-		jbd2_journal_update_superblock(journal, 1);
+		if (!is_journal_aborted(journal)) {
+			/* We can now mark the journal as empty. */
+			journal->j_tail = 0;
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			jbd2_journal_update_superblock(journal, 1);
+		} else {
+			err = -EIO;
+		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1468,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
 		jbd2_journal_destroy_revoke(journal);
 	kfree(journal->j_wbuf);
 	kfree(journal);
+
+	return err;
 }
 
 
@@ -1693,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		err = jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
+
+	if (is_journal_aborted(journal))
+		return -EIO;
+
 	jbd2_cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1718,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
 	spin_unlock(&journal->j_state_lock);
-	return err;
+	return 0;
 }
 
 /**
@@ -1762,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 }
 
 /*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
-	struct block_device *bdev;
-
-	if (journal->j_inode)
-		bdev = journal->j_inode->i_sb->s_bdev;
-	else
-		bdev = journal->j_dev;
-
-	return bdevname(bdev, buffer);
-}
-
-/*
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
  *
@@ -1794,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
 void __jbd2_journal_abort_hard(journal_t *journal)
 {
 	transaction_t *transaction;
-	char b[BDEVNAME_SIZE];
 
 	if (journal->j_flags & JBD2_ABORT)
 		return;
 
 	printk(KERN_ERR "Aborting journal on device %s.\n",
-		journal_dev_name(journal, b));
+	       journal->j_devname);
 
 	spin_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b76..73063285b13f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do {									\
  */
 int jbd2_journal_recover(journal_t *journal)
 {
-	int			err;
+	int			err, err2;
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
 	journal->j_transaction_sequence = ++info.end_transaction;
 
 	jbd2_journal_clear_revoke(journal);
-	sync_blockdev(journal->j_fs_dev);
+	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+
 	return err;
 }
 
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4f7cadbb19fa..39b7805a599a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
+	INIT_LIST_HEAD(&transaction->t_private_list);
 
 	/* Set up the commit timer for the new transaction. */
 	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -301,7 +302,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		goto out;
 	}
 
-	lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	return handle;
 }
@@ -1279,7 +1280,7 @@ int jbd2_journal_stop(handle_t *handle)
 		spin_unlock(&journal->j_state_lock);
 	}
 
-	lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+	lock_map_release(&handle->h_lockdep_map);
 
 	jbd2_free_handle(handle);
 	return err;
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
new file mode 100644
index 000000000000..6ae169cd8faa
--- /dev/null
+++ b/fs/jffs2/Kconfig
@@ -0,0 +1,188 @@
+config JFFS2_FS
+	tristate "Journalling Flash File System v2 (JFFS2) support"
+	select CRC32
+	depends on MTD
+	help
+	  JFFS2 is the second generation of the Journalling Flash File System
+	  for use on diskless embedded devices. It provides improved wear
+	  levelling, compression and support for hard links. You cannot use
+	  this on normal block devices, only on 'MTD' devices.
+
+	  Further information on the design and implementation of JFFS2 is
+	  available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+	depends on JFFS2_FS
+	default "0"
+	help
+	  This controls the amount of debugging messages produced by the JFFS2
+	  code. Set it to zero for use in production systems. For evaluation,
+	  testing and debugging, it's advisable to set it to one. This will
+	  enable a few assertions and will print debugging messages at the
+	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+	  is unlikely to be useful - it enables extra debugging in certain
+	  areas which at one point needed debugging, but when the bugs were
+	  located and fixed, the detailed messages were relegated to level 2.
+
+	  If reporting bugs, please try to have available a full dump of the
+	  messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
+	depends on JFFS2_FS
+	default y
+	help
+	  This enables the write-buffering support in JFFS2.
+
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
+
+config JFFS2_FS_WBUF_VERIFY
+	bool "Verify JFFS2 write-buffer reads"
+	depends on JFFS2_FS_WRITEBUFFER
+	default n
+	help
+	  This causes JFFS2 to read back every page written through the
+	  write-buffer, and check for errors.
+
+config JFFS2_SUMMARY
+	bool "JFFS2 summary support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  This feature makes it possible to use summary information
+	  for faster filesystem mount.
+
+	  The summary information can be inserted into a filesystem image
+	  by the utility 'sumtool'.
+
+	  If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFFS2_COMPRESSION_OPTIONS
+	bool "Advanced compression options for JFFS2"
+	depends on JFFS2_FS
+	default n
+	help
+	  Enabling this option allows you to explicitly choose which
+	  compression modules, if any, are enabled in JFFS2. Removing
+	  compressors can mean you cannot read existing file systems,
+	  and enabling experimental compressors can mean that you
+	  write a file system which cannot be read by a standard kernel.
+
+	  If unsure, you should _definitely_ say 'N'.
+
+config JFFS2_ZLIB
+	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	depends on JFFS2_FS
+	default y
+	help
+	  Zlib is designed to be a free, general-purpose, legally unencumbered,
+	  lossless data-compression library for use on virtually any computer
+	  hardware and operating system. See <http://www.gzip.org/zlib/> for
+	  further information.
+
+	  Say 'Y' if unsure.
+
+config JFFS2_LZO
+	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	depends on JFFS2_FS
+	default n
+	help
+	  minilzo-based compression. Generally works better than Zlib.
+
+	  This feature was added in July, 2007. Say 'N' if you need
+	  compatibility with older bootloaders or kernels.
+
+config JFFS2_RTIME
+	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default y
+	help
+	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
+
+config JFFS2_RUBIN
+	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default n
+	help
+	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
+
+choice
+	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
+	default JFFS2_CMODE_PRIORITY
+	depends on JFFS2_FS
+	help
+	  You can set here the default compression mode of JFFS2 from
+	  the available compression modes. Don't touch if unsure.
+
+config JFFS2_CMODE_NONE
+	bool "no compression"
+	help
+	  Uses no compression.
+
+config JFFS2_CMODE_PRIORITY
+	bool "priority"
+	help
+	  Tries the compressors in a predefined order and chooses the first
+	  successful one.
+
+config JFFS2_CMODE_SIZE
+	bool "size (EXPERIMENTAL)"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result.
+
+config JFFS2_CMODE_FAVOURLZO
+	bool "Favour LZO"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result but gives some preference to LZO (which has faster
+	  decompression) at the expense of size.
+
+endchoice
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 4c80404a9aba..d98713777a1b 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -314,7 +314,7 @@ static int jffs2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int jffs2_permission(struct inode *inode, int mask)
 {
 	return generic_permission(inode, mask, jffs2_check_acl);
 }
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 0bb7f003fd80..8ca058aed384 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
 
 #define JFFS2_ACL_NOT_CACHED ((void *)-1)
 
-extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_permission(struct inode *, int);
 extern int jffs2_acl_chmod(struct inode *);
 extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
 extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 86739ee53b37..f25e70c1b51c 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
 }
 
 /* jffs2_compress:
- * @data: Pointer to uncompressed data
- * @cdata: Pointer to returned pointer to buffer for compressed data
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
  * @datalen: On entry, holds the amount of data available for compression.
  *	On exit, expected to hold the amount of data actually compressed.
  * @cdatalen: On entry, holds the amount of space available for compressed
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c0c141f6fde1..6f60cc910f4c 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -38,8 +38,9 @@ const struct file_operations jffs2_dir_operations =
 {
 	.read =		generic_read_dir,
 	.readdir =	jffs2_readdir,
-	.ioctl =	jffs2_ioctl,
-	.fsync =	jffs2_fsync
+	.unlocked_ioctl=jffs2_ioctl,
+	.fsync =	jffs2_fsync,
+	.llseek =	generic_file_llseek,
 };
 
 
@@ -108,9 +109,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 		}
 	}
 
-	d_add(target, inode);
-
-	return NULL;
+	return d_splice_alias(inode, target);
 }
 
 /***********************************************************************/
@@ -311,7 +310,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	/* FIXME: If you care. We'd need to use frags for the target
 	   if it grows much more than this */
 	if (targetlen > 254)
-		return -EINVAL;
+		return -ENAMETOOLONG;
 
 	ri = jffs2_alloc_raw_inode();
 
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dddb2a6c9e2c..259461b910af 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	instr->len = c->sector_size;
 	instr->callback = jffs2_erase_callback;
 	instr->priv = (unsigned long)(&instr[1]);
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
 	/* For NAND, if the failure did not occur at the device level for a
 	   specific physical page, don't bother updating the bad block table. */
-	if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) {
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
 		/* We had a device-level failure to erase.  Let's see if we've
 		   failed too many times. */
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5e920343b2c5..5a98aa87c853 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -46,7 +46,7 @@ const struct file_operations jffs2_file_operations =
  	.aio_read =	generic_file_aio_read,
  	.write =	do_sync_write,
  	.aio_write =	generic_file_aio_write,
-	.ioctl =	jffs2_ioctl,
+	.unlocked_ioctl=jffs2_ioctl,
 	.mmap =		generic_file_readonly_mmap,
 	.fsync =	jffs2_fsync,
 	.splice_read =	generic_file_splice_read,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 086c43830221..249305d65d5b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = JFFS2_MAX_NAME_LEN;
+	buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC;
+	buf->f_fsid.val[1] = c->mtd->index;
 
 	spin_lock(&c->erase_completion_lock);
 	avail = c->dirty_size + c->free_size;
@@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 
 	memset(ri, 0, sizeof(*ri));
 	/* Set OS-specific defaults for new inodes */
-	ri->uid = cpu_to_je16(current->fsuid);
+	ri->uid = cpu_to_je16(current_fsuid());
 
 	if (dir_i->i_mode & S_ISGID) {
 		ri->gid = cpu_to_je16(dir_i->i_gid);
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 	} else {
-		ri->gid = cpu_to_je16(current->fsgid);
+		ri->gid = cpu_to_je16(current_fsgid());
 	}
 
 	/* POSIX ACLs have to be processed now, at least partly.
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index e2177210f621..9d41f43e47bb 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -12,8 +12,7 @@
 #include <linux/fs.h>
 #include "nodelist.h"
 
-int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		unsigned long arg)
+long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	/* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which
 	   will include compression support etc. */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdde..4c41db91eaa4 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
 #ifndef _JFFS2_FS_I
 #define _JFFS2_FS_I
 
-#include <linux/version.h>
 #include <linux/rbtree.h>
 #include <linux/posix_acl.h>
 #include <linux/mutex.h>
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index a9bf9603c1ba..0875b60b4bf7 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+	/* adjust write buffer offset, else we get a non contiguous write bug */
+	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
+		c->wbuf_ofs = 0xffffffff;
+
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
 	return 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 2cc866cf134f..5e194a5c8e29 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -167,7 +167,7 @@ int jffs2_fsync(struct file *, struct dentry *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
 /* ioctl.c */
-int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+long jffs2_ioctl(struct file *, unsigned int, unsigned long);
 
 /* symlink.c */
 extern const struct inode_operations jffs2_symlink_inode_operations;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 629af01e5ade..6caf1e1ee26d 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,6 +23,8 @@
 
 int jffs2_sum_init(struct jffs2_sb_info *c)
 {
+	uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
+
 	c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
 
 	if (!c->summary) {
@@ -30,7 +32,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
 		return -ENOMEM;
 	}
 
-	c->summary->sum_buf = vmalloc(c->sector_size);
+	c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL);
 
 	if (!c->summary->sum_buf) {
 		JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n");
@@ -49,7 +51,7 @@ void jffs2_sum_exit(struct jffs2_sb_info *c)
 
 	jffs2_sum_disable_collecting(c->summary);
 
-	vfree(c->summary->sum_buf);
+	kfree(c->summary->sum_buf);
 	c->summary->sum_buf = NULL;
 
 	kfree(c->summary);
@@ -665,7 +667,7 @@ crc_err:
 /* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */
 
 static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-					uint32_t infosize, uint32_t datasize, int padsize)
+				uint32_t infosize, uint32_t datasize, int padsize)
 {
 	struct jffs2_raw_summary isum;
 	union jffs2_sum_mem *temp;
@@ -676,6 +678,26 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	int ret;
 	size_t retlen;
 
+	if (padsize + datasize > MAX_SUMMARY_SIZE) {
+		/* It won't fit in the buffer. Abort summary for this jeb */
+		jffs2_sum_disable_collecting(c->summary);
+
+		JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n",
+			      datasize, padsize, jeb->offset);
+		/* Non-fatal */
+		return 0;
+	}
+	/* Is there enough space for summary? */
+	if (padsize < 0) {
+		/* don't try to write out summary for this jeb */
+		jffs2_sum_disable_collecting(c->summary);
+
+		JFFS2_WARNING("Not enough space for summary, padsize = %d\n",
+			      padsize);
+		/* Non-fatal */
+		return 0;
+	}
+
 	memset(c->summary->sum_buf, 0xff, datasize);
 	memset(&isum, 0, sizeof(isum));
 
@@ -821,7 +843,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 {
 	int datasize, infosize, padsize;
 	struct jffs2_eraseblock *jeb;
-	int ret;
+	int ret = 0;
 
 	dbg_summary("called\n");
 
@@ -841,16 +863,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	infosize += padsize;
 	datasize += padsize;
 
-	/* Is there enough space for summary? */
-	if (padsize < 0) {
-		/* don't try to write out summary for this jeb */
-		jffs2_sum_disable_collecting(c->summary);
-
-		JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize);
-		spin_lock(&c->erase_completion_lock);
-		return 0;
-	}
-
 	ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
 	spin_lock(&c->erase_completion_lock);
 	return ret;
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 8bf34f2fa5ce..60207a2ae952 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -13,6 +13,12 @@
 #ifndef JFFS2_SUMMARY_H
 #define JFFS2_SUMMARY_H
 
+/* Limit summary size to 64KiB so that we can kmalloc it. If the summary
+   is larger than that, we have to just ditch it and avoid using summary
+   for the eraseblock in question... and it probably doesn't hurt us much
+   anyway. */
+#define MAX_SUMMARY_SIZE 65536
+
 #include <linux/uio.h>
 #include <linux/jffs2.h>
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7da69eae49e4..4c4e18c54a51 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -22,6 +22,7 @@
 #include <linux/mtd/super.h>
 #include <linux/ctype.h>
 #include <linux/namei.h>
+#include <linux/exportfs.h>
 #include "compr.h"
 #include "nodelist.h"
 
@@ -44,7 +45,7 @@ static void jffs2_destroy_inode(struct inode *inode)
 	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
 }
 
-static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo)
+static void jffs2_i_init_once(void *foo)
 {
 	struct jffs2_inode_info *f = foo;
 
@@ -62,6 +63,52 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static struct inode *jffs2_nfs_get_inode(struct super_block *sb, uint64_t ino,
+					 uint32_t generation)
+{
+	/* We don't care about i_generation. We'll destroy the flash
+	   before we start re-using inode numbers anyway. And even
+	   if that wasn't true, we'd have other problems...*/
+	return jffs2_iget(sb, ino);
+}
+
+static struct dentry *jffs2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_fh_to_parent(struct super_block *sb, struct fid *fid,
+					 int fh_len, int fh_type)
+{
+        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+                                    jffs2_nfs_get_inode);
+}
+
+static struct dentry *jffs2_get_parent(struct dentry *child)
+{
+	struct jffs2_inode_info *f;
+	uint32_t pino;
+
+	BUG_ON(!S_ISDIR(child->d_inode->i_mode));
+
+	f = JFFS2_INODE_INFO(child->d_inode);
+
+	pino = f->inocache->pino_nlink;
+
+	JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
+		    f->inocache->ino, pino);
+
+	return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
+}
+
+static struct export_operations jffs2_export_ops = {
+	.get_parent = jffs2_get_parent,
+	.fh_to_dentry = jffs2_fh_to_dentry,
+	.fh_to_parent = jffs2_fh_to_parent,
+};
+
 static const struct super_operations jffs2_super_operations =
 {
 	.alloc_inode =	jffs2_alloc_inode,
@@ -104,6 +151,7 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
 	spin_lock_init(&c->inocache_lock);
 
 	sb->s_op = &jffs2_super_operations;
+	sb->s_export_op = &jffs2_export_ops;
 	sb->s_flags = sb->s_flags | MS_NOATIME;
 	sb->s_xattr = jffs2_xattr_handlers;
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 0e78b00035e4..d9a721e6db70 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 
 	memset(c->wbuf,0xff,c->wbuf_pagesize);
 	/* adjust write buffer offset, else we get a non contiguous write bug */
-	if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize))
-		c->wbuf_ofs += c->wbuf_pagesize;
-	else
-		c->wbuf_ofs = 0xffffffff;
+	c->wbuf_ofs += c->wbuf_pagesize;
 	c->wbuf_len = 0;
 	return 0;
 }
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 4d84bdc88299..d3e5c33665de 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -140,7 +140,7 @@ static int jfs_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
-int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int jfs_permission(struct inode *inode, int mask)
 {
 	return generic_permission(inode, mask, jfs_check_acl);
 }
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 455fa4292045..88475f10a389 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-int jfs_permission(struct inode *, int, struct nameidata *);
+int jfs_permission(struct inode *, int);
 int jfs_init_acl(tid_t, struct inode *, struct inode *);
 int jfs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cd2ec2988b59..335c4de6552d 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1168,7 +1168,7 @@ journal_found:
 	bd_release(bdev);
 
       close:		/* close external log device */
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1514,7 +1514,7 @@ int lmLogClose(struct super_block *sb)
 	rc = lmLogShutdown(log);
 
 	bd_release(bdev);
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 
 	kfree(log);
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 854ff0ec574f..c350057087dd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 
 #endif
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct metapage *mp = (struct metapage *)foo;
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 2aba82386810..cc3cedffbfa1 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1511,25 +1511,12 @@ struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
 
 struct dentry *jfs_get_parent(struct dentry *dentry)
 {
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct dentry *parent = ERR_PTR(-ENOENT);
-	struct inode *inode;
 	unsigned long parent_ino;
 
 	parent_ino =
 		le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot);
-	inode = jfs_iget(sb, parent_ino);
-	if (IS_ERR(inode)) {
-		parent = ERR_CAST(inode);
-	} else {
-		parent = d_alloc_anon(inode);
-		if (!parent) {
-			parent = ERR_PTR(-ENOMEM);
-			iput(inode);
-		}
-	}
 
-	return parent;
+	return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino));
 }
 
 const struct inode_operations jfs_dir_inode_operations = {
@@ -1560,6 +1547,7 @@ const struct file_operations jfs_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= jfs_compat_ioctl,
 #endif
+	.llseek		= generic_file_llseek,
 };
 
 static int jfs_ci_hash(struct dentry *dir, struct qstr *this)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0288e6d7936a..0dae345e481b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -22,6 +22,7 @@
 #include <linux/parser.h>
 #include <linux/completion.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mount.h>
 #include <linux/moduleparam.h>
 #include <linux/kthread.h>
@@ -198,7 +199,7 @@ enum {
 	Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_integrity, "integrity"},
 	{Opt_nointegrity, "nointegrity"},
 	{Opt_iocharset, "iocharset=%s"},
@@ -759,7 +760,7 @@ static struct file_system_type jfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index baeb71ee1cde..74688598bcf7 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -216,8 +216,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = ~0ULL;
-	s->s_blocksize = 1024;
-	s->s_blocksize_bits = 10;
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = magic;
 	s->s_op = ops ? ops : &simple_super_operations;
 	s->s_time_gran = 1;
@@ -732,28 +732,6 @@ out:
 	return ret;
 }
 
-/*
- * This is what d_alloc_anon should have been.  Once the exportfs
- * argument transition has been finished I will update d_alloc_anon
- * to this prototype and this wrapper will go away.   --hch
- */
-static struct dentry *exportfs_d_alloc(struct inode *inode)
-{
-	struct dentry *dentry;
-
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_PTR(PTR_ERR(inode));
-
-	dentry = d_alloc_anon(inode);
-	if (!dentry) {
-		iput(inode);
-		dentry = ERR_PTR(-ENOMEM);
-	}
-	return dentry;
-}
-
 /**
  * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
  * @sb:		filesystem to do the file handle conversion on
@@ -782,7 +760,7 @@ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_dentry);
 
@@ -815,7 +793,7 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	return exportfs_d_alloc(inode);
+	return d_obtain_alias(inode);
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a555..97f6073ab339 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
 obj-$(CONFIG_LOCKD) += lockd.o
 
 lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
-	        svcproc.o svcsubs.o mon.o xdr.o
+	        svcproc.o svcsubs.o mon.o xdr.o grace.o
 lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
 lockd-objs		      := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bfd..8307dd64bf46 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 	u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
 	int status;
 
-	status = lockd_up(nlm_init->protocol);
+	status = lockd_up();
 	if (status < 0)
 		return ERR_PTR(status);
 
-	host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
 				   nlm_init->protocol, nlm_version,
-				   nlm_init->hostname,
-				   strlen(nlm_init->hostname));
+				   nlm_init->hostname);
 	if (host == NULL) {
 		lockd_down();
 		return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 {
 	const struct file_lock *fl = &lock->fl;
 	const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
 		 */
 		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
-		if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+		if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
 		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 			continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
 	/* This one ensures that our parent doesn't terminate while the
 	 * reclaim is in progress */
 	lock_kernel();
-	lockd_up(0); /* note: this cannot fail as lockd is already running */
+	lockd_up();	/* note: this cannot fail as lockd is already running */
 
 	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 1f6dc518505c..31668b690e03 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -582,7 +582,15 @@ again:
 	}
 	if (status < 0)
 		goto out_unlock;
-	status = nlm_stat_to_errno(resp->status);
+	/*
+	 * EAGAIN doesn't make sense for sleeping locks, and in some
+	 * cases NLM_LCK_DENIED is returned for a permanent error.  So
+	 * turn it into an ENOLCK.
+	 */
+	if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+		status = -ENOLCK;
+	else
+		status = nlm_stat_to_errno(resp->status);
 out_unblock:
 	nlmclnt_finish_block(block);
 out:
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 000000000000..183cc1f0af1c
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ */
+
+#include <linux/module.h>
+#include <linux/lockd/bind.h>
+
+static LIST_HEAD(grace_list);
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out.  Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void locks_start_grace(struct lock_manager *lm)
+{
+	spin_lock(&grace_lock);
+	list_add(&lm->list, &grace_list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking.  The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void locks_end_grace(struct lock_manager *lm)
+{
+	spin_lock(&grace_lock);
+	list_del_init(&lm->list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int locks_in_grace(void)
+{
+	return !list_empty(&grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eacc..9fd8889097b7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/sm_inter.h>
 #include <linux/mutex.h>
 
+#include <net/ipv6.h>
 
 #define NLMDBG_FACILITY		NLMDBG_HOSTCACHE
 #define NLM_HOST_NRHASH		32
-#define NLM_ADDRHASH(addr)	(ntohl(addr) & (NLM_HOST_NRHASH-1))
 #define NLM_HOST_REBIND		(60 * HZ)
 #define NLM_HOST_EXPIRE		(300 * HZ)
 #define NLM_HOST_COLLECT	(120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
-
 static void			nlm_gc_hosts(void);
-static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
-					const char *, unsigned int, int);
-static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
-					 const char *hostname,
-					 unsigned int hostname_len);
+static struct nsm_handle	*nsm_find(const struct sockaddr *sap,
+						const size_t salen,
+						const char *hostname,
+						const size_t hostname_len,
+						const int create);
+
+struct nlm_lookup_host_info {
+	const int		server;		/* search for server|client */
+	const struct sockaddr	*sap;		/* address to search for */
+	const size_t		salen;		/* it's length */
+	const unsigned short	protocol;	/* transport to search for*/
+	const u32		version;	/* NLM version to search for */
+	const char		*hostname;	/* remote's hostname */
+	const size_t		hostname_len;	/* it's length */
+	const struct sockaddr	*src_sap;	/* our address (optional) */
+	const size_t		src_len;	/* it's length */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+	unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+	return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr addr = sin6->sin6_addr;
+	return __nlm_hash32(addr.s6_addr32[0]) ^
+	       __nlm_hash32(addr.s6_addr32[1]) ^
+	       __nlm_hash32(addr.s6_addr32[2]) ^
+	       __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+	unsigned int hash;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		hash = __nlm_hash_addr4(sap);
+		break;
+	case AF_INET6:
+		hash = __nlm_hash_addr6(sap);
+		break;
+	default:
+		hash = 0;
+	}
+	return hash & (NLM_HOST_NRHASH - 1);
+}
+
+static void nlm_clear_port(struct sockaddr *sap)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)sap)->sin_port = 0;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)sap)->sin6_port = 0;
+		break;
+	}
+}
+
+static void nlm_display_address(const struct sockaddr *sap,
+				char *buf, const size_t len)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+	switch (sap->sa_family) {
+	case AF_UNSPEC:
+		snprintf(buf, len, "unspecified");
+		break;
+	case AF_INET:
+		snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
+		break;
+	case AF_INET6:
+		if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+			snprintf(buf, len, NIPQUAD_FMT,
+				 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
+		else
+			snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
+		break;
+	default:
+		snprintf(buf, len, "unsupported address family");
+		break;
+	}
+}
 
 /*
  * Common host lookup routine for server & client
  */
-static struct nlm_host *nlm_lookup_host(int server,
-					const struct sockaddr_in *sin,
-					int proto, u32 version,
-					const char *hostname,
-					unsigned int hostname_len,
-					const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nlm_host	*host;
 	struct nsm_handle *nsm = NULL;
-	int		hash;
-
-	dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
-			", p=%d, v=%u, my role=%s, name=%.*s)\n",
-			NIPQUAD(ssin->sin_addr.s_addr),
-			NIPQUAD(sin->sin_addr.s_addr), proto, version,
-			server? "server" : "client",
-			hostname_len,
-			hostname? hostname : "<none>");
 
-
-	hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
-
-	/* Lock hash table */
 	mutex_lock(&nlm_host_mutex);
 
 	if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
 	 * different NLM rpc_clients into one single nlm_host object.
 	 * This would allow us to have one nlm_host per address.
 	 */
-	chain = &nlm_hosts[hash];
+	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
 	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!nlm_cmp_addr(&host->h_addr, sin))
+		if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
 			continue;
 
 		/* See if we have an NSM handle for this client */
 		if (!nsm)
 			nsm = host->h_nsmhandle;
 
-		if (host->h_proto != proto)
+		if (host->h_proto != ni->protocol)
 			continue;
-		if (host->h_version != version)
+		if (host->h_version != ni->version)
 			continue;
-		if (host->h_server != server)
+		if (host->h_server != ni->server)
 			continue;
-		if (!nlm_cmp_addr(&host->h_saddr, ssin))
+		if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
 		hlist_add_head(&host->h_hash, chain);
 
 		nlm_get_host(host);
+		dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
+				host->h_name, host->h_addrbuf);
 		goto out;
 	}
-	if (nsm)
-		atomic_inc(&nsm->sm_count);
-
-	host = NULL;
 
-	/* Sadly, the host isn't in our hash table yet. See if
-	 * we have an NSM handle for it. If not, create one.
+	/*
+	 * The host wasn't in our hash table.  If we don't
+	 * have an NSM handle for it yet, create one.
 	 */
-	if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
-		goto out;
+	if (nsm)
+		atomic_inc(&nsm->sm_count);
+	else {
+		host = NULL;
+		nsm = nsm_find(ni->sap, ni->salen,
+				ni->hostname, ni->hostname_len, 1);
+		if (!nsm) {
+			dprintk("lockd: nlm_lookup_host failed; "
+				"no nsm handle\n");
+			goto out;
+		}
+	}
 
 	host = kzalloc(sizeof(*host), GFP_KERNEL);
 	if (!host) {
 		nsm_release(nsm);
+		dprintk("lockd: nlm_lookup_host failed; no memory\n");
 		goto out;
 	}
 	host->h_name	   = nsm->sm_name;
-	host->h_addr       = *sin;
-	host->h_addr.sin_port = 0;	/* ouch! */
-	host->h_saddr	   = *ssin;
-	host->h_version    = version;
-	host->h_proto      = proto;
+	memcpy(nlm_addr(host), ni->sap, ni->salen);
+	host->h_addrlen = ni->salen;
+	nlm_clear_port(nlm_addr(host));
+	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+	host->h_version    = ni->version;
+	host->h_proto      = ni->protocol;
 	host->h_rpcclnt    = NULL;
 	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_nsmhandle  = nsm;
-	host->h_server	   = server;
+	host->h_server	   = ni->server;
 	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
 	INIT_LIST_HEAD(&host->h_reclaim);
 
 	nrhosts++;
+
+	nlm_display_address((struct sockaddr *)&host->h_addr,
+				host->h_addrbuf, sizeof(host->h_addrbuf));
+	nlm_display_address((struct sockaddr *)&host->h_srcaddr,
+				host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
+
+	dprintk("lockd: nlm_lookup_host created host %s\n",
+			host->h_name);
+
 out:
 	mutex_unlock(&nlm_host_mutex);
 	return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
 	kfree(host);
 }
 
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
  */
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
-				     int proto, u32 version,
-				     const char *hostname,
-				     unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+				     const size_t salen,
+				     const unsigned short protocol,
+				     const u32 version, const char *hostname)
 {
-	struct sockaddr_in ssin = {0};
-
-	return nlm_lookup_host(0, sin, proto, version,
-			       hostname, hostname_len, &ssin);
+	const struct sockaddr source = {
+		.sa_family	= AF_UNSPEC,
+	};
+	struct nlm_lookup_host_info ni = {
+		.server		= 0,
+		.sap		= sap,
+		.salen		= salen,
+		.protocol	= protocol,
+		.version	= version,
+		.hostname	= hostname,
+		.hostname_len	= strlen(hostname),
+		.src_sap	= &source,
+		.src_len	= sizeof(source),
+	};
+
+	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+			(hostname ? hostname : "<none>"), version,
+			(protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+	return nlm_lookup_host(&ni);
 }
 
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request.  If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses.  The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
  */
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
-			const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+				    const char *hostname,
+				    const size_t hostname_len)
 {
-	struct sockaddr_in ssin = {0};
+	struct sockaddr_in sin = {
+		.sin_family	= AF_INET,
+	};
+	struct sockaddr_in6 sin6 = {
+		.sin6_family	= AF_INET6,
+	};
+	struct nlm_lookup_host_info ni = {
+		.server		= 1,
+		.sap		= svc_addr(rqstp),
+		.salen		= rqstp->rq_addrlen,
+		.protocol	= rqstp->rq_prot,
+		.version	= rqstp->rq_vers,
+		.hostname	= hostname,
+		.hostname_len	= hostname_len,
+		.src_len	= rqstp->rq_addrlen,
+	};
+
+	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+			(int)hostname_len, hostname, rqstp->rq_vers,
+			(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+	switch (ni.sap->sa_family) {
+	case AF_INET:
+		sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
+		ni.src_sap = (struct sockaddr *)&sin;
+		break;
+	case AF_INET6:
+		ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
+		ni.src_sap = (struct sockaddr *)&sin6;
+		break;
+	default:
+		return NULL;
+	}
 
-	ssin.sin_addr = rqstp->rq_daddr.addr;
-	return nlm_lookup_host(1, svc_addr_in(rqstp),
-			       rqstp->rq_prot, rqstp->rq_vers,
-			       hostname, hostname_len, &ssin);
+	return nlm_lookup_host(&ni);
 }
 
 /*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
-	dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
-			NIPQUAD(host->h_saddr.sin_addr),
-			NIPQUAD(host->h_addr.sin_addr));
+	dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+			host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
 
 	/* Lock host handle */
 	mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
 		if (time_after_eq(jiffies, host->h_nextrebind)) {
 			rpc_force_rebind(clnt);
 			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
-			dprintk("lockd: next rebind in %ld jiffies\n",
+			dprintk("lockd: next rebind in %lu jiffies\n",
 					host->h_nextrebind - jiffies);
 		}
 	} else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
 		};
 		struct rpc_create_args args = {
 			.protocol	= host->h_proto,
-			.address	= (struct sockaddr *)&host->h_addr,
-			.addrsize	= sizeof(host->h_addr),
-			.saddress	= (struct sockaddr *)&host->h_saddr,
+			.address	= nlm_addr(host),
+			.addrsize	= host->h_addrlen,
+			.saddress	= nlm_srcaddr(host),
 			.timeout	= &timeparms,
 			.servername	= host->h_name,
 			.program	= &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
-			hostname, NIPQUAD(sin->sin_addr));
-
-	/* Find the NSM handle for this peer */
-	if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+	nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+			hostname, hostname_len, 0);
+	if (nsm == NULL) {
+		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+				hostname_len, hostname);
 		return;
+	}
+
+	dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
+			hostname_len, hostname, nsm->sm_addrbuf);
 
 	/* When reclaiming locks on this peer, make sure that
 	 * we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
 static LIST_HEAD(nsm_handles);
 static DEFINE_SPINLOCK(nsm_lock);
 
-static struct nsm_handle *
-__nsm_find(const struct sockaddr_in *sin,
-		const char *hostname, unsigned int hostname_len,
-		int create)
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+				   const size_t salen,
+				   const char *hostname,
+				   const size_t hostname_len,
+				   const int create)
 {
 	struct nsm_handle *nsm = NULL;
 	struct nsm_handle *pos;
 
-	if (!sin)
+	if (!sap)
 		return NULL;
 
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
 			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
 					    "in NFS lock request\n",
-				hostname_len, hostname);
+				(int)hostname_len, hostname);
 		}
 		return NULL;
 	}
@@ -489,7 +657,7 @@ retry:
 			if (strlen(pos->sm_name) != hostname_len
 			 || memcmp(pos->sm_name, hostname, hostname_len))
 				continue;
-		} else if (!nlm_cmp_addr(&pos->sm_addr, sin))
+		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
 			continue;
 		atomic_inc(&pos->sm_count);
 		kfree(nsm);
@@ -509,10 +677,13 @@ retry:
 	if (nsm == NULL)
 		return NULL;
 
-	nsm->sm_addr = *sin;
+	memcpy(nsm_addr(nsm), sap, salen);
+	nsm->sm_addrlen = salen;
 	nsm->sm_name = (char *) (nsm + 1);
 	memcpy(nsm->sm_name, hostname, hostname_len);
 	nsm->sm_name[hostname_len] = '\0';
+	nlm_display_address((struct sockaddr *)&nsm->sm_addr,
+				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
 	atomic_set(&nsm->sm_count, 1);
 	goto retry;
 
@@ -521,13 +692,6 @@ found:
 	return nsm;
 }
 
-static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname,
-	 unsigned int hostname_len)
-{
-	return __nsm_find(sin, hostname, hostname_len, 1);
-}
-
 /*
  * Release an NSM handle
  */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b11..4e7e958e8f67 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 
 	memset(&args, 0, sizeof(args));
 	args.mon_name = nsm->sm_name;
-	args.addr = nsm->sm_addr.sin_addr.s_addr;
+	args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
 	args.prog = NLM_PROGRAM;
 	args.vers = 3;
 	args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 2169af4d5455..c631a83931ce 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -50,8 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops);
 static DEFINE_MUTEX(nlmsvc_mutex);
 static unsigned int		nlmsvc_users;
 static struct task_struct	*nlmsvc_task;
-static struct svc_serv		*nlmsvc_serv;
-int				nlmsvc_grace_period;
+static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
 /*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
 		return nlm_timeout * 5 * HZ;
 }
 
-unsigned long get_nfs_grace_period(void)
-{
-	unsigned long lockdgrace = get_lockd_grace_period();
-	unsigned long nfsdgrace = 0;
-
-	if (nlmsvc_ops)
-		nfsdgrace = nlmsvc_ops->get_grace_period();
-
-	return max(lockdgrace, nfsdgrace);
-}
-EXPORT_SYMBOL(get_nfs_grace_period);
+static struct lock_manager lockd_manager = {
+};
 
-static unsigned long set_grace_period(void)
+static void grace_ender(struct work_struct *not_used)
 {
-	nlmsvc_grace_period = 1;
-	return get_nfs_grace_period() + jiffies;
+	locks_end_grace(&lockd_manager);
 }
 
-static inline void clear_grace_period(void)
+static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+
+static void set_grace_period(void)
 {
-	nlmsvc_grace_period = 0;
+	unsigned long grace_period = get_lockd_grace_period();
+
+	locks_start_grace(&lockd_manager);
+	cancel_delayed_work_sync(&grace_period_end);
+	schedule_delayed_work(&grace_period_end, grace_period);
 }
 
 /*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
 {
 	int		err = 0, preverr = 0;
 	struct svc_rqst *rqstp = vrqstp;
-	unsigned long grace_period_expire;
 
 	/* try_to_freeze() is called from svc_recv() */
 	set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
 		nlm_timeout = LOCKD_DFLT_TIMEO;
 	nlmsvc_timeout = nlm_timeout * HZ;
 
-	grace_period_expire = set_grace_period();
+	set_grace_period();
 
 	/*
 	 * The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
 			flush_signals(current);
 			if (nlmsvc_ops) {
 				nlmsvc_invalidate_all();
-				grace_period_expire = set_grace_period();
+				set_grace_period();
 			}
 			continue;
 		}
 
-		/*
-		 * Retry any blocked locks that have been notified by
-		 * the VFS. Don't do this during grace period.
-		 * (Theoretically, there shouldn't even be blocked locks
-		 * during grace period).
-		 */
-		if (!nlmsvc_grace_period) {
-			timeout = nlmsvc_retry_blocked();
-		} else if (time_before(grace_period_expire, jiffies))
-			clear_grace_period();
+		timeout = nlmsvc_retry_blocked();
 
 		/*
 		 * Find a socket with data available and call its
@@ -194,43 +179,38 @@ lockd(void *vrqstp)
 
 		svc_process(rqstp);
 	}
-
 	flush_signals(current);
+	cancel_delayed_work_sync(&grace_period_end);
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
-
 	unlock_kernel();
-
-	nlmsvc_task = NULL;
-	nlmsvc_serv = NULL;
-
-	/* Exit the RPC thread */
-	svc_exit_thread(rqstp);
-
 	return 0;
 }
 
 /*
- * Make any sockets that are needed but not present.
- * If nlm_udpport or nlm_tcpport were set as module
- * options, make those sockets unconditionally
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
  */
-static int make_socks(struct svc_serv *serv, int proto)
+static int make_socks(struct svc_serv *serv)
 {
 	static int warned;
 	struct svc_xprt *xprt;
 	int err = 0;
 
-	if (proto == IPPROTO_UDP || nlm_udpport) {
-		xprt = svc_find_xprt(serv, "udp", 0, 0);
-		if (!xprt)
-			err = svc_create_xprt(serv, "udp", nlm_udpport,
-					      SVC_SOCK_DEFAULTS);
-		else
-			svc_xprt_put(xprt);
-	}
-	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+	xprt = svc_find_xprt(serv, "udp", 0, 0);
+	if (!xprt)
+		err = svc_create_xprt(serv, "udp", nlm_udpport,
+				      SVC_SOCK_DEFAULTS);
+	else
+		svc_xprt_put(xprt);
+	if (err >= 0) {
 		xprt = svc_find_xprt(serv, "tcp", 0, 0);
 		if (!xprt)
 			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -250,22 +230,17 @@ static int make_socks(struct svc_serv *serv, int proto)
 /*
  * Bring up the lockd process if it's not already up.
  */
-int
-lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
+int lockd_up(void)
 {
 	struct svc_serv *serv;
-	struct svc_rqst *rqstp;
 	int		error = 0;
 
 	mutex_lock(&nlmsvc_mutex);
 	/*
 	 * Check whether we're already up and running.
 	 */
-	if (nlmsvc_serv) {
-		if (proto)
-			error = make_socks(nlmsvc_serv, proto);
+	if (nlmsvc_rqst)
 		goto out;
-	}
 
 	/*
 	 * Sanity check: if there's no pid,
@@ -276,21 +251,23 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
 	}
 
-	if ((error = make_socks(serv, proto)) < 0)
+	error = make_socks(serv);
+	if (error < 0)
 		goto destroy_and_out;
 
 	/*
 	 * Create the kernel thread and wait for it to start.
 	 */
-	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
-	if (IS_ERR(rqstp)) {
-		error = PTR_ERR(rqstp);
+	nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+	if (IS_ERR(nlmsvc_rqst)) {
+		error = PTR_ERR(nlmsvc_rqst);
+		nlmsvc_rqst = NULL;
 		printk(KERN_WARNING
 			"lockd_up: svc_rqst allocation failed, error=%d\n",
 			error);
@@ -298,16 +275,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 	}
 
 	svc_sock_update_bufs(serv);
-	nlmsvc_serv = rqstp->rq_server;
 
-	nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
+		svc_exit_thread(nlmsvc_rqst);
 		nlmsvc_task = NULL;
-		nlmsvc_serv = NULL;
+		nlmsvc_rqst = NULL;
 		printk(KERN_WARNING
 			"lockd_up: kthread_run failed, error=%d\n", error);
-		svc_exit_thread(rqstp);
 		goto destroy_and_out;
 	}
 
@@ -346,6 +322,9 @@ lockd_down(void)
 		BUG();
 	}
 	kthread_stop(nlmsvc_task);
+	svc_exit_thread(nlmsvc_rqst);
+	nlmsvc_task = NULL;
+	nlmsvc_rqst = NULL;
 out:
 	mutex_unlock(&nlmsvc_mutex);
 }
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 2e27176ff42f..014f6ce48172 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return 0;
 
 no_locks:
-	if (host)
-		nlm_release_host(host);
+	nlm_release_host(host);
  	if (error)
 		return error;	
 	return nlm_lck_denied_nolocks;
@@ -84,23 +83,17 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
-	int rc = rpc_success;
+	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST4        called\n");
 	resp->cookie = argp->cookie;
 
-	/* Don't accept test requests during grace period */
-	if (nlmsvc_grace_period) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
+	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -117,18 +110,12 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
-	int rc = rpc_success;
+	__be32 rc = rpc_success;
 
 	dprintk("lockd: LOCK          called\n");
 
 	resp->cookie = argp->cookie;
 
-	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,8 +133,9 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 #endif
 
 	/* Now try to lock the file */
-	resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
-					argp->block, &argp->cookie);
+	resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
+					argp->block, &argp->cookie,
+					argp->reclaim);
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -170,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -203,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -232,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
@@ -342,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
+	if (locks_in_grace() && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -375,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -433,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 {
 	struct sockaddr_in	saddr;
 
-	memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
 	dprintk("lockd: SM_NOTIFY     called\n");
-	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
-	 || ntohs(saddr.sin_port) >= 1024) {
+
+	if (!nlm_privileged_requester(rqstp)) {
 		char buf[RPC_MAX_ADDRBUFLEN];
 		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
 				svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56a08ab9a4cb..6063a8e4b9f3 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
 
 static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
 {
-	if(a->len != b->len)
+	if (a->len != b->len)
 		return 0;
-	if(memcmp(a->data,b->data,a->len))
+	if (memcmp(a->data, b->data, a->len))
 		return 0;
 	return 1;
 }
@@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 	struct nlm_block	*block;
 	struct nlm_rqst		*call = NULL;
 
+	nlm_get_host(host);
 	call = nlm_alloc_call(host);
 	if (call == NULL)
 		return NULL;
@@ -358,10 +359,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
  */
 __be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
-			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
+	    struct nlm_host *host, struct nlm_lock *lock, int wait,
+	    struct nlm_cookie *cookie, int reclaim)
 {
 	struct nlm_block	*block = NULL;
-	struct nlm_host		*host;
 	int			error;
 	__be32			ret;
 
@@ -373,11 +374,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 				(long long)lock->fl.fl_end,
 				wait);
 
-	/* Create host handle for callback */
-	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-	if (host == NULL)
-		return nlm_lck_denied_nolocks;
-
 	/* Lock file against concurrent access */
 	mutex_lock(&file->f_mutex);
 	/* Get existing block (in case client is busy-waiting)
@@ -385,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	 */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
-		block = nlmsvc_create_block(rqstp, nlm_get_host(host), file,
-				lock, cookie);
+		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
 		ret = nlm_lck_denied_nolocks;
 		if (block == NULL)
 			goto out;
@@ -411,20 +406,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	if (locks_in_grace() && !reclaim) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+	if (reclaim && !locks_in_grace()) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+
 	if (!wait)
 		lock->fl.fl_flags &= ~FL_SLEEP;
 	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 	lock->fl.fl_flags &= ~FL_SLEEP;
 
 	dprintk("lockd: vfs_lock_file returned %d\n", error);
-	switch(error) {
+	switch (error) {
 		case 0:
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
 			ret = nlm_lck_denied;
-			break;
-		case -EINPROGRESS:
+			goto out;
+		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
 			/* Filesystem lock operation is in progress
@@ -439,10 +443,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
@@ -450,7 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 out:
 	mutex_unlock(&file->f_mutex);
 	nlmsvc_release_block(block);
-	nlm_release_host(host);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
 	return ret;
 }
@@ -460,8 +459,8 @@ out:
  */
 __be32
 nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
-		struct nlm_lock *lock, struct nlm_lock *conflock,
-		struct nlm_cookie *cookie)
+		struct nlm_host *host, struct nlm_lock *lock,
+		struct nlm_lock *conflock, struct nlm_cookie *cookie)
 {
 	struct nlm_block 	*block = NULL;
 	int			error;
@@ -479,16 +478,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 
 	if (block == NULL) {
 		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-		struct nlm_host	*host;
 
 		if (conf == NULL)
 			return nlm_granted;
-		/* Create host handle for callback */
-		host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
-		if (host == NULL) {
-			kfree(conf);
-			return nlm_lck_denied_nolocks;
-		}
 		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
 		if (block == NULL) {
 			kfree(conf);
@@ -519,8 +511,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	if (locks_in_grace()) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
 	error = vfs_test_lock(file->f_file, &lock->fl);
-	if (error == -EINPROGRESS) {
+	if (error == FILE_LOCK_DEFERRED) {
 		ret = nlmsvc_defer_lock_rqst(rqstp, block);
 		goto out;
 	}
@@ -599,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
+	if (locks_in_grace())
+		return nlm_lck_denied_grace_period;
+
 	mutex_lock(&file->f_mutex);
 	block = nlmsvc_lookup_block(file, lock);
 	mutex_unlock(&file->f_mutex);
@@ -744,8 +743,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	switch (error) {
 	case 0:
 		break;
-	case -EAGAIN:
-	case -EINPROGRESS:
+	case FILE_LOCK_DEFERRED:
 		dprintk("lockd: lock still blocked error %d\n", error);
 		nlmsvc_insert_block(block, NLM_NEVER);
 		nlmsvc_release_block(block);
@@ -897,7 +895,7 @@ nlmsvc_retry_blocked(void)
 
 		if (block->b_when == NLM_NEVER)
 			break;
-	        if (time_after(block->b_when,jiffies)) {
+		if (time_after(block->b_when, jiffies)) {
 			timeout = block->b_when - jiffies;
 			break;
 		}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index ce6952b50a75..548b0bb2b84d 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return 0;
 
 no_locks:
-	if (host)
-		nlm_release_host(host);
+	nlm_release_host(host);
 	if (error)
 		return error;
 	return nlm_lck_denied_nolocks;
@@ -113,23 +112,17 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
-	int rc = rpc_success;
+	__be32 rc = rpc_success;
 
 	dprintk("lockd: TEST          called\n");
 	resp->cookie = argp->cookie;
 
-	/* Don't accept test requests during grace period */
-	if (nlmsvc_grace_period) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
-	resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
+	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -147,18 +140,12 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 {
 	struct nlm_host	*host;
 	struct nlm_file	*file;
-	int rc = rpc_success;
+	__be32 rc = rpc_success;
 
 	dprintk("lockd: LOCK          called\n");
 
 	resp->cookie = argp->cookie;
 
-	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,8 +163,9 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 #endif
 
 	/* Now try to lock the file */
-	resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
-					       argp->block, &argp->cookie));
+	resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
+					       argp->block, &argp->cookie,
+					       argp->reclaim));
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -200,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -233,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -262,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
@@ -374,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
+	if (locks_in_grace() && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -407,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -465,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 {
 	struct sockaddr_in	saddr;
 
-	memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
 	dprintk("lockd: SM_NOTIFY     called\n");
-	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
-	 || ntohs(saddr.sin_port) >= 1024) {
+
+	if (!nlm_privileged_requester(rqstp)) {
 		char buf[RPC_MAX_ADDRBUFLEN];
 		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
 				svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d1c48b539df8..34c2766e27c7 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host)
 	}
 }
 
-/*
- * Remove all locks held for clients
+/**
+ * nlmsvc_invalidate_all - remove all locks held for clients
+ *
+ * Release all locks held by NFS clients.
+ *
  */
 void
 nlmsvc_invalidate_all(void)
 {
-	/* Release all locks held by NFS clients.
+	/*
 	 * Previously, the code would call
 	 * nlmsvc_free_host_resources for each client in
 	 * turn, which is about as inefficient as it gets.
@@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
 	return sb == file->f_file->f_path.mnt->mnt_sb;
 }
 
+/**
+ * nlmsvc_unlock_all_by_sb - release locks held on this file system
+ * @sb: super block
+ *
+ * Release all locks held by clients accessing this file system.
+ */
 int
 nlmsvc_unlock_all_by_sb(struct super_block *sb)
 {
@@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
 static int
 nlmsvc_match_ip(void *datap, struct nlm_host *host)
 {
-	__be32 *server_addr = datap;
-
-	return host->h_saddr.sin_addr.s_addr == *server_addr;
+	return nlm_cmp_addr(nlm_srcaddr(host), datap);
 }
 
+/**
+ * nlmsvc_unlock_all_by_ip - release local locks by IP address
+ * @server_addr: server's IP address as seen by clients
+ *
+ * Release all locks held by clients accessing this host
+ * via the passed in IP address.
+ */
 int
-nlmsvc_unlock_all_by_ip(__be32 server_addr)
+nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr)
 {
 	int ret;
-	ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL);
-	return ret ? -EIO : 0;
 
+	ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL);
+	return ret ? -EIO : 0;
 }
 EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc31..1f226290c67c 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 	argp->state = ntohl(*p++);
 	/* Preserve the address in network byte order */
 	argp->addr = *p++;
-	argp->vers = *p++;
-	argp->proto = *p++;
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c6..50c493a8ad8e 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
 	argp->state = ntohl(*p++);
 	/* Preserve the address in network byte order */
 	argp->addr  = *p++;
-	argp->vers  = *p++;
-	argp->proto = *p++;
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/locks.c b/fs/locks.c
index dce8c747371c..09062e3ff104 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock);
  * Initialises the fields of the file lock which are invariant for
  * free file_locks.
  */
-static void init_once(struct kmem_cache *cache, void *foo)
+static void init_once(void *foo)
 {
 	struct file_lock *lock = (struct file_lock *) foo;
 
@@ -779,8 +779,10 @@ find_conflict:
 		if (!flock_locks_conflict(request, fl))
 			continue;
 		error = -EAGAIN;
-		if (request->fl_flags & FL_SLEEP)
-			locks_insert_block(fl, request);
+		if (!(request->fl_flags & FL_SLEEP))
+			goto out;
+		error = FILE_LOCK_DEFERRED;
+		locks_insert_block(fl, request);
 		goto out;
 	}
 	if (request->fl_flags & FL_ACCESS)
@@ -836,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EDEADLK;
 			if (posix_locks_deadlock(request, fl))
 				goto out;
-			error = -EAGAIN;
+			error = FILE_LOCK_DEFERRED;
 			locks_insert_block(fl, request);
 			goto out;
   		}
@@ -1035,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	might_sleep ();
 	for (;;) {
 		error = posix_lock_file(filp, fl, NULL);
-		if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
+		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
 		if (!error)
@@ -1107,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 
 	for (;;) {
 		error = __posix_lock_file(inode, &fl, NULL);
-		if (error != -EAGAIN)
-			break;
-		if (!(fl.fl_flags & FL_SLEEP))
+		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
 		if (!error) {
@@ -1531,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
 	might_sleep();
 	for (;;) {
 		error = flock_lock_file(filp, fl);
-		if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
+		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
 		if (!error)
@@ -1580,7 +1580,8 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
 	cmd &= ~LOCK_NB;
 	unlock = (cmd == LOCK_UN);
 
-	if (!unlock && !(cmd & LOCK_MAND) && !(filp->f_mode & 3))
+	if (!unlock && !(cmd & LOCK_MAND) &&
+	    !(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
 	error = flock_make_lock(filp, &lock, cmd);
@@ -1716,17 +1717,17 @@ out:
  * fl_grant is set. Callers expecting ->lock() to return asynchronously
  * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
  * the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return -EINPROGRESS, and call ->fl_grant() when the lock
+ * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock
  * request completes.
  * If the request is for non-blocking lock the file system should return
- * -EINPROGRESS then try to get the lock and call the callback routine with
- * the result. If the request timed out the callback routine will return a
+ * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
+ * with the result. If the request timed out the callback routine will return a
  * nonzero return code and the file system should release the lock. The file
  * system is also responsible to keep a corresponding posix lock when it
  * grants a lock so the VFS can find out which locks are locally held and do
  * the correct lock cleanup when required.
  * The underlying filesystem must not drop the kernel lock or call
- * ->fl_grant() before returning to the caller with a -EINPROGRESS
+ * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED
  * return code.
  */
 int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
@@ -1738,6 +1739,30 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str
 }
 EXPORT_SYMBOL_GPL(vfs_lock_file);
 
+static int do_lock_file_wait(struct file *filp, unsigned int cmd,
+			     struct file_lock *fl)
+{
+	int error;
+
+	error = security_file_lock(filp, fl->fl_type);
+	if (error)
+		return error;
+
+	for (;;) {
+		error = vfs_lock_file(filp, cmd, fl, NULL);
+		if (error != FILE_LOCK_DEFERRED)
+			break;
+		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
+		if (!error)
+			continue;
+
+		locks_delete_block(fl);
+		break;
+	}
+
+	return error;
+}
+
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
@@ -1795,26 +1820,7 @@ again:
 		goto out;
 	}
 
-	error = security_file_lock(filp, file_lock->fl_type);
-	if (error)
-		goto out;
-
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock, NULL);
-			if (error != -EAGAIN || cmd == F_SETLK)
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
-			break;
-		}
-	}
+	error = do_lock_file_wait(filp, cmd, file_lock);
 
 	/*
 	 * Attempt to detect a close/fcntl race and recover by
@@ -1932,26 +1938,7 @@ again:
 		goto out;
 	}
 
-	error = security_file_lock(filp, file_lock->fl_type);
-	if (error)
-		goto out;
-
-	if (filp->f_op && filp->f_op->lock != NULL)
-		error = filp->f_op->lock(filp, cmd, file_lock);
-	else {
-		for (;;) {
-			error = posix_lock_file(filp, file_lock, NULL);
-			if (error != -EAGAIN || cmd == F_SETLK64)
-				break;
-			error = wait_event_interruptible(file_lock->fl_wait,
-					!file_lock->fl_next);
-			if (!error)
-				continue;
-
-			locks_delete_block(file_lock);
-			break;
-		}
-	}
+	error = do_lock_file_wait(filp, cmd, file_lock);
 
 	/*
 	 * Attempt to detect a close/fcntl race and recover by
@@ -2092,6 +2079,7 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 
 #ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
@@ -2197,12 +2185,31 @@ static void locks_stop(struct seq_file *f, void *v)
 	unlock_kernel();
 }
 
-struct seq_operations locks_seq_operations = {
+static const struct seq_operations locks_seq_operations = {
 	.start	= locks_start,
 	.next	= locks_next,
 	.stop	= locks_stop,
 	.show	= locks_show,
 };
+
+static int locks_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &locks_seq_operations);
+}
+
+static const struct file_operations proc_locks_operations = {
+	.open		= locks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_locks_init(void)
+{
+	proc_create("locks", 0, NULL, &proc_locks_operations);
+	return 0;
+}
+module_init(proc_locks_init);
 #endif
 
 /**
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 84f6242ba6fc..d1d1eb84679d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode)
 	kmem_cache_free(minix_inode_cachep, minix_i(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct minix_inode_info *ei = (struct minix_inode_info *) foo;
 
@@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
 	if (!s->s_root)
 		goto out_iput;
 
-	if (!NO_TRUNCATE)
-		s->s_root->d_op = &minix_dentry_operations;
-
 	if (!(s->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
 			ms->s_state &= ~MINIX_VALID_FS;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 326edfe96108..e6a0b193bea4 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -2,11 +2,6 @@
 #include <linux/pagemap.h>
 #include <linux/minix_fs.h>
 
-/*
- * change the define below to 0 if you want names > info->s_namelen chars to be
- * truncated. Else they will be disallowed (ENAMETOOLONG).
- */
-#define NO_TRUNCATE 1
 #define INODE_VERSION(inode)	minix_sb(inode->i_sb)->s_version
 #define MINIX_V1		0x0001		/* original minix fs */
 #define MINIX_V2		0x0002		/* minix V2 fs */
@@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
 extern const struct file_operations minix_file_operations;
 extern const struct file_operations minix_dir_operations;
-extern struct dentry_operations minix_dentry_operations;
 
 static inline struct minix_sb_info *minix_sb(struct super_block *sb)
 {
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 102241bc9c79..32b131cd6121 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int minix_hash(struct dentry *dentry, struct qstr *qstr)
-{
-	unsigned long hash;
-	int i;
-	const unsigned char *name;
-
-	i = minix_sb(dentry->d_inode->i_sb)->s_namelen;
-	if (i >= qstr->len)
-		return 0;
-	/* Truncate the name in place, avoids having to define a compare
-	   function. */
-	qstr->len = i;
-	name = qstr->name;
-	hash = init_name_hash();
-	while (i--)
-		hash = partial_name_hash(*name++, hash);
-	qstr->hash = end_name_hash(hash);
-	return 0;
-}
-
-struct dentry_operations minix_dentry_operations = {
-	.d_hash		= minix_hash,
-};
-
 static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode * inode = NULL;
diff --git a/fs/mpage.c b/fs/mpage.c
index dbcc7af76a15..552b80b3facc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -6,7 +6,7 @@
  * Contains functions related to preparing and submitting BIOs which contain
  * multiple pagecache pages.
  *
- * 15May2002	akpm@zip.com.au
+ * 15May2002	Andrew Morton
  *		Initial version
  * 27Jun2002	axboe@suse.de
  *		use bio_add_page() to build bio's just the right size
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 1f7f2956412a..e844b9809d27 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -14,12 +14,7 @@
 
 /* Characters that are undesirable in an MS-DOS file name */
 static unsigned char bad_chars[] = "*?<>|\"";
-static unsigned char bad_if_strict_pc[] = "+=,; ";
-/* GEMDOS is less restrictive */
-static unsigned char bad_if_strict_atari[] = " ";
-
-#define bad_if_strict(opts) \
-	((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc)
+static unsigned char bad_if_strict[] = "+=,; ";
 
 /***** Formats an MS-DOS file name. Rejects invalid names. */
 static int msdos_format_name(const unsigned char *name, int len,
@@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len,
 			/* Get rid of dot - test for it elsewhere */
 			name++;
 			len--;
-		} else if (!opts->atari)
+		} else
 			return -EINVAL;
 	}
 	/*
-	 * disallow names that _really_ start with a dot for MS-DOS,
-	 * GEMDOS does not care
+	 * disallow names that _really_ start with a dot
 	 */
-	space = !opts->atari;
+	space = 1;
 	c = 0;
 	for (walk = res; len && walk - res < 8; walk++) {
 		c = *name++;
 		len--;
 		if (opts->name_check != 'r' && strchr(bad_chars, c))
 			return -EINVAL;
-		if (opts->name_check == 's' && strchr(bad_if_strict(opts), c))
+		if (opts->name_check == 's' && strchr(bad_if_strict, c))
 			return -EINVAL;
 		if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
 			return -EINVAL;
@@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len,
 			if (opts->name_check != 'r' && strchr(bad_chars, c))
 				return -EINVAL;
 			if (opts->name_check == 's' &&
-			    strchr(bad_if_strict(opts), c))
+			    strchr(bad_if_strict, c))
 				return -EINVAL;
 			if (c < ' ' || c == ':' || c == '\\')
 				return -EINVAL;
@@ -243,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 			   int is_dir, int is_hid, int cluster,
 			   struct timespec *ts, struct fat_slot_info *sinfo)
 {
+	struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
 	struct msdos_dir_entry de;
 	__le16 time, date;
 	int err;
@@ -252,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
 	if (is_hid)
 		de.attr |= ATTR_HIDDEN;
 	de.lcase = 0;
-	fat_date_unix2dos(ts->tv_sec, &time, &date);
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
 	de.cdate = de.adate = 0;
 	de.ctime = 0;
 	de.ctime_cs = 0;
diff --git a/fs/namei.c b/fs/namei.c
index 01e67dddcc3d..09ce58e49e72 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -31,7 +31,6 @@
 #include <linux/file.h>
 #include <linux/fcntl.h>
 #include <linux/device_cgroup.h>
-#include <asm/namei.h>
 #include <asm/uaccess.h>
 
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -185,6 +184,8 @@ int generic_permission(struct inode *inode, int mask,
 {
 	umode_t			mode = inode->i_mode;
 
+	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
+
 	if (current->fsuid == inode->i_uid)
 		mode >>= 6;
 	else {
@@ -203,7 +204,7 @@ int generic_permission(struct inode *inode, int mask,
 	/*
 	 * If the DACs are ok we don't need any capability check.
 	 */
-	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
+	if ((mask & ~mode) == 0)
 		return 0;
 
  check_capabilities:
@@ -211,8 +212,7 @@ int generic_permission(struct inode *inode, int mask,
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
 	 */
-	if (!(mask & MAY_EXEC) ||
-	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
+	if (!(mask & MAY_EXEC) || execute_ok(inode))
 		if (capable(CAP_DAC_OVERRIDE))
 			return 0;
 
@@ -226,13 +226,9 @@ int generic_permission(struct inode *inode, int mask,
 	return -EACCES;
 }
 
-int permission(struct inode *inode, int mask, struct nameidata *nd)
+int inode_permission(struct inode *inode, int mask)
 {
-	int retval, submask;
-	struct vfsmount *mnt = NULL;
-
-	if (nd)
-		mnt = nd->path.mnt;
+	int retval;
 
 	if (mask & MAY_WRITE) {
 		umode_t mode = inode->i_mode;
@@ -251,34 +247,12 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
 			return -EACCES;
 	}
 
-	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
-		/*
-		 * MAY_EXEC on regular files is denied if the fs is mounted
-		 * with the "noexec" flag.
-		 */
-		if (mnt && (mnt->mnt_flags & MNT_NOEXEC))
-			return -EACCES;
-	}
-
 	/* Ordinary permission routines do not understand MAY_APPEND. */
-	submask = mask & ~MAY_APPEND;
-	if (inode->i_op && inode->i_op->permission) {
-		retval = inode->i_op->permission(inode, submask, nd);
-		if (!retval) {
-			/*
-			 * Exec permission on a regular file is denied if none
-			 * of the execute bits are set.
-			 *
-			 * This check should be done by the ->permission()
-			 * method.
-			 */
-			if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode) &&
-			    !(inode->i_mode & S_IXUGO))
-				return -EACCES;
-		}
-	} else {
-		retval = generic_permission(inode, submask, NULL);
-	}
+	if (inode->i_op && inode->i_op->permission)
+		retval = inode->i_op->permission(inode, mask);
+	else
+		retval = generic_permission(inode, mask, NULL);
+
 	if (retval)
 		return retval;
 
@@ -286,7 +260,8 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
 	if (retval)
 		return retval;
 
-	return security_inode_permission(inode, mask, nd);
+	return security_inode_permission(inode,
+			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
 }
 
 /**
@@ -301,7 +276,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
  */
 int vfs_permission(struct nameidata *nd, int mask)
 {
-	return permission(nd->path.dentry->d_inode, mask, nd);
+	return inode_permission(nd->path.dentry->d_inode, mask);
 }
 
 /**
@@ -318,7 +293,7 @@ int vfs_permission(struct nameidata *nd, int mask)
  */
 int file_permission(struct file *file, int mask)
 {
-	return permission(file->f_path.dentry->d_inode, mask, NULL);
+	return inode_permission(file->f_path.dentry->d_inode, mask);
 }
 
 /*
@@ -459,8 +434,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
  * short-cut DAC fails, then call permission() to do more
  * complete permission check.
  */
-static int exec_permission_lite(struct inode *inode,
-				       struct nameidata *nd)
+static int exec_permission_lite(struct inode *inode)
 {
 	umode_t	mode = inode->i_mode;
 
@@ -486,7 +460,7 @@ static int exec_permission_lite(struct inode *inode,
 
 	return -EACCES;
 ok:
-	return security_inode_permission(inode, MAY_EXEC, nd);
+	return security_inode_permission(inode, MAY_EXEC);
 }
 
 /*
@@ -519,7 +493,14 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 	 */
 	result = d_lookup(parent, name);
 	if (!result) {
-		struct dentry * dentry = d_alloc(parent, name);
+		struct dentry *dentry;
+
+		/* Don't create child dentry for a dead directory. */
+		result = ERR_PTR(-ENOENT);
+		if (IS_DEADDIR(dir))
+			goto out_unlock;
+
+		dentry = d_alloc(parent, name);
 		result = ERR_PTR(-ENOMEM);
 		if (dentry) {
 			result = dir->i_op->lookup(dir, dentry, nd);
@@ -528,6 +509,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 			else
 				result = dentry;
 		}
+out_unlock:
 		mutex_unlock(&dir->i_mutex);
 		return result;
 	}
@@ -545,27 +527,16 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 	return result;
 }
 
-static int __emul_lookup_dentry(const char *, struct nameidata *);
-
 /* SMP-safe */
-static __always_inline int
+static __always_inline void
 walk_init_root(const char *name, struct nameidata *nd)
 {
 	struct fs_struct *fs = current->fs;
 
 	read_lock(&fs->lock);
-	if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
-		nd->path = fs->altroot;
-		path_get(&fs->altroot);
-		read_unlock(&fs->lock);
-		if (__emul_lookup_dentry(name,nd))
-			return 0;
-		read_lock(&fs->lock);
-	}
 	nd->path = fs->root;
 	path_get(&fs->root);
 	read_unlock(&fs->lock);
-	return 1;
 }
 
 /*
@@ -606,12 +577,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 
 	if (*link == '/') {
 		path_put(&nd->path);
-		if (!walk_init_root(link, nd))
-			/* weird __emul_prefix() stuff did it */
-			goto out;
+		walk_init_root(link, nd);
 	}
 	res = link_path_walk(link, nd);
-out:
 	if (nd->depth || res || nd->last_type!=LAST_NORM)
 		return res;
 	/*
@@ -889,7 +857,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		unsigned int c;
 
 		nd->flags |= LOOKUP_CONTINUE;
-		err = exec_permission_lite(inode, nd);
+		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
 			err = vfs_permission(nd, MAY_EXEC);
  		if (err)
@@ -1060,67 +1028,6 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return link_path_walk(name, nd);
 }
 
-/* 
- * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if
- * everything is done. Returns 0 and drops input nd, if lookup failed;
- */
-static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
-{
-	if (path_walk(name, nd))
-		return 0;		/* something went wrong... */
-
-	if (!nd->path.dentry->d_inode ||
-	    S_ISDIR(nd->path.dentry->d_inode->i_mode)) {
-		struct path old_path = nd->path;
-		struct qstr last = nd->last;
-		int last_type = nd->last_type;
-		struct fs_struct *fs = current->fs;
-
-		/*
-		 * NAME was not found in alternate root or it's a directory.
-		 * Try to find it in the normal root:
-		 */
-		nd->last_type = LAST_ROOT;
-		read_lock(&fs->lock);
-		nd->path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
-		if (path_walk(name, nd) == 0) {
-			if (nd->path.dentry->d_inode) {
-				path_put(&old_path);
-				return 1;
-			}
-			path_put(&nd->path);
-		}
-		nd->path = old_path;
-		nd->last = last;
-		nd->last_type = last_type;
-	}
-	return 1;
-}
-
-void set_fs_altroot(void)
-{
-	char *emul = __emul_prefix();
-	struct nameidata nd;
-	struct path path = {}, old_path;
-	int err;
-	struct fs_struct *fs = current->fs;
-
-	if (!emul)
-		goto set_it;
-	err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
-	if (!err)
-		path = nd.path;
-set_it:
-	write_lock(&fs->lock);
-	old_path = fs->altroot;
-	fs->altroot = path;
-	write_unlock(&fs->lock);
-	if (old_path.dentry)
-		path_put(&old_path);
-}
-
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int do_path_lookup(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
@@ -1136,14 +1043,6 @@ static int do_path_lookup(int dfd, const char *name,
 
 	if (*name=='/') {
 		read_lock(&fs->lock);
-		if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
-			nd->path = fs->altroot;
-			path_get(&fs->altroot);
-			read_unlock(&fs->lock);
-			if (__emul_lookup_dentry(name,nd))
-				goto out; /* found in altroot */
-			read_lock(&fs->lock);
-		}
 		nd->path = fs->root;
 		path_get(&fs->root);
 		read_unlock(&fs->lock);
@@ -1177,7 +1076,6 @@ static int do_path_lookup(int dfd, const char *name,
 	}
 
 	retval = path_walk(name, nd);
-out:
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
@@ -1195,6 +1093,15 @@ int path_lookup(const char *name, unsigned int flags,
 	return do_path_lookup(AT_FDCWD, name, flags, nd);
 }
 
+int kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	struct nameidata nd;
+	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
+	if (!res)
+		*path = nd.path;
+	return res;
+}
+
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
  * @dentry:  pointer to dentry of the base directory
@@ -1227,9 +1134,16 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 
 }
 
-static int __path_lookup_intent_open(int dfd, const char *name,
-		unsigned int lookup_flags, struct nameidata *nd,
-		int open_flags, int create_mode)
+/**
+ * path_lookup_open - lookup a file path with open intent
+ * @dfd: the directory to use as base, or AT_FDCWD
+ * @name: pointer to file name
+ * @lookup_flags: lookup intent flags
+ * @nd: pointer to nameidata
+ * @open_flags: open intent flags
+ */
+int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
+		struct nameidata *nd, int open_flags)
 {
 	struct file *filp = get_empty_filp();
 	int err;
@@ -1238,7 +1152,7 @@ static int __path_lookup_intent_open(int dfd, const char *name,
 		return -ENFILE;
 	nd->intent.open.file = filp;
 	nd->intent.open.flags = open_flags;
-	nd->intent.open.create_mode = create_mode;
+	nd->intent.open.create_mode = 0;
 	err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
 	if (IS_ERR(nd->intent.open.file)) {
 		if (err == 0) {
@@ -1250,51 +1164,6 @@ static int __path_lookup_intent_open(int dfd, const char *name,
 	return err;
 }
 
-/**
- * path_lookup_open - lookup a file path with open intent
- * @dfd: the directory to use as base, or AT_FDCWD
- * @name: pointer to file name
- * @lookup_flags: lookup intent flags
- * @nd: pointer to nameidata
- * @open_flags: open intent flags
- */
-int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags,
-		struct nameidata *nd, int open_flags)
-{
-	return __path_lookup_intent_open(dfd, name, lookup_flags, nd,
-			open_flags, 0);
-}
-
-/**
- * path_lookup_create - lookup a file path with open + create intent
- * @dfd: the directory to use as base, or AT_FDCWD
- * @name: pointer to file name
- * @lookup_flags: lookup intent flags
- * @nd: pointer to nameidata
- * @open_flags: open intent flags
- * @create_mode: create intent flags
- */
-static int path_lookup_create(int dfd, const char *name,
-			      unsigned int lookup_flags, struct nameidata *nd,
-			      int open_flags, int create_mode)
-{
-	return __path_lookup_intent_open(dfd, name, lookup_flags|LOOKUP_CREATE,
-			nd, open_flags, create_mode);
-}
-
-int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
-		struct nameidata *nd, int open_flags)
-{
-	char *tmp = getname(name);
-	int err = PTR_ERR(tmp);
-
-	if (!IS_ERR(tmp)) {
-		err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
-		putname(tmp);
-	}
-	return err;
-}
-
 static struct dentry *__lookup_hash(struct qstr *name,
 		struct dentry *base, struct nameidata *nd)
 {
@@ -1317,7 +1186,14 @@ static struct dentry *__lookup_hash(struct qstr *name,
 
 	dentry = cached_lookup(base, name, nd);
 	if (!dentry) {
-		struct dentry *new = d_alloc(base, name);
+		struct dentry *new;
+
+		/* Don't create child dentry for a dead directory. */
+		dentry = ERR_PTR(-ENOENT);
+		if (IS_DEADDIR(inode))
+			goto out;
+
+		new = d_alloc(base, name);
 		dentry = ERR_PTR(-ENOMEM);
 		if (!new)
 			goto out;
@@ -1340,7 +1216,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 {
 	int err;
 
-	err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd);
+	err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
 	if (err)
 		return ERR_PTR(err);
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
@@ -1388,7 +1264,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	if (err)
 		return ERR_PTR(err);
 
-	err = permission(base->d_inode, MAY_EXEC, NULL);
+	err = inode_permission(base->d_inode, MAY_EXEC);
 	if (err)
 		return ERR_PTR(err);
 	return __lookup_hash(&this, base, NULL);
@@ -1416,22 +1292,40 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
 	return __lookup_hash(&this, base, NULL);
 }
 
-int __user_walk_fd(int dfd, const char __user *name, unsigned flags,
-			    struct nameidata *nd)
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+		 struct path *path)
 {
+	struct nameidata nd;
 	char *tmp = getname(name);
 	int err = PTR_ERR(tmp);
-
 	if (!IS_ERR(tmp)) {
-		err = do_path_lookup(dfd, tmp, flags, nd);
+
+		BUG_ON(flags & LOOKUP_PARENT);
+
+		err = do_path_lookup(dfd, tmp, flags, &nd);
 		putname(tmp);
+		if (!err)
+			*path = nd.path;
 	}
 	return err;
 }
 
-int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+static int user_path_parent(int dfd, const char __user *path,
+			struct nameidata *nd, char **name)
 {
-	return __user_walk_fd(AT_FDCWD, name, flags, nd);
+	char *s = getname(path);
+	int error;
+
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
+	if (error)
+		putname(s);
+	else
+		*name = s;
+
+	return error;
 }
 
 /*
@@ -1478,7 +1372,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
 	BUG_ON(victim->d_parent->d_inode != dir);
 	audit_inode_child(victim->d_name.name, victim, dir);
 
-	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
 	if (IS_APPEND(dir))
@@ -1508,14 +1402,13 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
  *  3. We should have write and exec permissions on dir
  *  4. We can't do it if dir is immutable (done in permission())
  */
-static inline int may_create(struct inode *dir, struct dentry *child,
-			     struct nameidata *nd)
+static inline int may_create(struct inode *dir, struct dentry *child)
 {
 	if (child->d_inode)
 		return -EEXIST;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
-	return permission(dir,MAY_WRITE | MAY_EXEC, nd);
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
 }
 
 /* 
@@ -1548,20 +1441,18 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 
-	for (p = p1; p->d_parent != p; p = p->d_parent) {
-		if (p->d_parent == p2) {
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p2, p1);
+	if (p) {
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
-	for (p = p2; p->d_parent != p; p = p->d_parent) {
-		if (p->d_parent == p1) {
-			mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
-			return p;
-		}
+	p = d_ancestor(p1, p2);
+	if (p) {
+		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+		return p;
 	}
 
 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
@@ -1581,7 +1472,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		struct nameidata *nd)
 {
-	int error = may_create(dir, dentry, nd);
+	int error = may_create(dir, dentry);
 
 	if (error)
 		return error;
@@ -1755,7 +1646,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int will_write;
 	int flag = open_to_namei_flags(open_flag);
 
-	acc_mode = ACC_MODE(flag);
+	acc_mode = MAY_OPEN | ACC_MODE(flag);
 
 	/* O_TRUNC implies we need access checks for write permissions */
 	if (flag & O_TRUNC)
@@ -1780,8 +1671,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
-				   &nd, flag, mode);
+	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
 
@@ -1792,10 +1682,20 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	 */
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
-		goto exit;
+		goto exit_parent;
 
+	error = -ENFILE;
+	filp = get_empty_filp();
+	if (filp == NULL)
+		goto exit_parent;
+	nd.intent.open.file = filp;
+	nd.intent.open.flags = flag;
+	nd.intent.open.create_mode = mode;
 	dir = nd.path.dentry;
 	nd.flags &= ~LOOKUP_PARENT;
+	nd.flags |= LOOKUP_CREATE | LOOKUP_OPEN;
+	if (flag & O_EXCL)
+		nd.flags |= LOOKUP_EXCL;
 	mutex_lock(&dir->d_inode->i_mutex);
 	path.dentry = lookup_hash(&nd);
 	path.mnt = nd.path.mnt;
@@ -1900,6 +1800,7 @@ exit_dput:
 exit:
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
+exit_parent:
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
@@ -1992,7 +1893,7 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
 	if (nd->last_type != LAST_NORM)
 		goto fail;
 	nd->flags &= ~LOOKUP_PARENT;
-	nd->flags |= LOOKUP_CREATE;
+	nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
 	nd->intent.open.flags = O_EXCL;
 
 	/*
@@ -2025,7 +1926,7 @@ EXPORT_SYMBOL_GPL(lookup_create);
 
 int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 {
-	int error = may_create(dir, dentry, NULL);
+	int error = may_create(dir, dentry);
 
 	if (error)
 		return error;
@@ -2071,20 +1972,18 @@ static int may_mknod(mode_t mode)
 asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 				unsigned dev)
 {
-	int error = 0;
-	char * tmp;
-	struct dentry * dentry;
+	int error;
+	char *tmp;
+	struct dentry *dentry;
 	struct nameidata nd;
 
 	if (S_ISDIR(mode))
 		return -EPERM;
-	tmp = getname(filename);
-	if (IS_ERR(tmp))
-		return PTR_ERR(tmp);
 
-	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
+	error = user_path_parent(dfd, filename, &nd, &tmp);
 	if (error)
-		goto out;
+		return error;
+
 	dentry = lookup_create(&nd, 0);
 	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
@@ -2116,7 +2015,6 @@ out_dput:
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	path_put(&nd.path);
-out:
 	putname(tmp);
 
 	return error;
@@ -2129,7 +2027,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
 
 int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	int error = may_create(dir, dentry, NULL);
+	int error = may_create(dir, dentry);
 
 	if (error)
 		return error;
@@ -2156,14 +2054,10 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 	struct dentry *dentry;
 	struct nameidata nd;
 
-	tmp = getname(pathname);
-	error = PTR_ERR(tmp);
-	if (IS_ERR(tmp))
+	error = user_path_parent(dfd, pathname, &nd, &tmp);
+	if (error)
 		goto out_err;
 
-	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
-	if (error)
-		goto out;
 	dentry = lookup_create(&nd, 1);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
@@ -2181,7 +2075,6 @@ out_dput:
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	path_put(&nd.path);
-out:
 	putname(tmp);
 out_err:
 	return error;
@@ -2259,25 +2152,24 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	struct dentry *dentry;
 	struct nameidata nd;
 
-	name = getname(pathname);
-	if(IS_ERR(name))
-		return PTR_ERR(name);
-
-	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
+	error = user_path_parent(dfd, pathname, &nd, &name);
 	if (error)
-		goto exit;
+		return error;
 
 	switch(nd.last_type) {
-		case LAST_DOTDOT:
-			error = -ENOTEMPTY;
-			goto exit1;
-		case LAST_DOT:
-			error = -EINVAL;
-			goto exit1;
-		case LAST_ROOT:
-			error = -EBUSY;
-			goto exit1;
+	case LAST_DOTDOT:
+		error = -ENOTEMPTY;
+		goto exit1;
+	case LAST_DOT:
+		error = -EINVAL;
+		goto exit1;
+	case LAST_ROOT:
+		error = -EBUSY;
+		goto exit1;
 	}
+
+	nd.flags &= ~LOOKUP_PARENT;
+
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
@@ -2294,7 +2186,6 @@ exit2:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 exit1:
 	path_put(&nd.path);
-exit:
 	putname(name);
 	return error;
 }
@@ -2343,22 +2234,22 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
  */
 static long do_unlinkat(int dfd, const char __user *pathname)
 {
-	int error = 0;
-	char * name;
+	int error;
+	char *name;
 	struct dentry *dentry;
 	struct nameidata nd;
 	struct inode *inode = NULL;
 
-	name = getname(pathname);
-	if(IS_ERR(name))
-		return PTR_ERR(name);
-
-	error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
+	error = user_path_parent(dfd, pathname, &nd, &name);
 	if (error)
-		goto exit;
+		return error;
+
 	error = -EISDIR;
 	if (nd.last_type != LAST_NORM)
 		goto exit1;
+
+	nd.flags &= ~LOOKUP_PARENT;
+
 	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
@@ -2382,7 +2273,6 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		iput(inode);	/* truncate the inode here */
 exit1:
 	path_put(&nd.path);
-exit:
 	putname(name);
 	return error;
 
@@ -2408,9 +2298,9 @@ asmlinkage long sys_unlink(const char __user *pathname)
 	return do_unlinkat(AT_FDCWD, pathname);
 }
 
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 {
-	int error = may_create(dir, dentry, NULL);
+	int error = may_create(dir, dentry);
 
 	if (error)
 		return error;
@@ -2432,23 +2322,20 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 asmlinkage long sys_symlinkat(const char __user *oldname,
 			      int newdfd, const char __user *newname)
 {
-	int error = 0;
-	char * from;
-	char * to;
+	int error;
+	char *from;
+	char *to;
 	struct dentry *dentry;
 	struct nameidata nd;
 
 	from = getname(oldname);
-	if(IS_ERR(from))
+	if (IS_ERR(from))
 		return PTR_ERR(from);
-	to = getname(newname);
-	error = PTR_ERR(to);
-	if (IS_ERR(to))
-		goto out_putname;
 
-	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
+	error = user_path_parent(newdfd, newname, &nd, &to);
 	if (error)
-		goto out;
+		goto out_putname;
+
 	dentry = lookup_create(&nd, 0);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
@@ -2457,14 +2344,13 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
-	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
 out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 	path_put(&nd.path);
-out:
 	putname(to);
 out_putname:
 	putname(from);
@@ -2484,7 +2370,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(dir, new_dentry, NULL);
+	error = may_create(dir, new_dentry);
 	if (error)
 		return error;
 
@@ -2498,19 +2384,19 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return -EPERM;
 	if (!dir->i_op || !dir->i_op->link)
 		return -EPERM;
-	if (S_ISDIR(old_dentry->d_inode->i_mode))
+	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
 	error = security_inode_link(old_dentry, dir, new_dentry);
 	if (error)
 		return error;
 
-	mutex_lock(&old_dentry->d_inode->i_mutex);
+	mutex_lock(&inode->i_mutex);
 	DQUOT_INIT(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
-	mutex_unlock(&old_dentry->d_inode->i_mutex);
+	mutex_unlock(&inode->i_mutex);
 	if (!error)
-		fsnotify_link(dir, old_dentry->d_inode, new_dentry);
+		fsnotify_link(dir, inode, new_dentry);
 	return error;
 }
 
@@ -2528,27 +2414,25 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 			   int flags)
 {
 	struct dentry *new_dentry;
-	struct nameidata nd, old_nd;
+	struct nameidata nd;
+	struct path old_path;
 	int error;
-	char * to;
+	char *to;
 
 	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
-	to = getname(newname);
-	if (IS_ERR(to))
-		return PTR_ERR(to);
-
-	error = __user_walk_fd(olddfd, oldname,
-			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			       &old_nd);
+	error = user_path_at(olddfd, oldname,
+			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+			     &old_path);
 	if (error)
-		goto exit;
-	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
+		return error;
+
+	error = user_path_parent(newdfd, newname, &nd, &to);
 	if (error)
 		goto out;
 	error = -EXDEV;
-	if (old_nd.path.mnt != nd.path.mnt)
+	if (old_path.mnt != nd.path.mnt)
 		goto out_release;
 	new_dentry = lookup_create(&nd, 0);
 	error = PTR_ERR(new_dentry);
@@ -2557,7 +2441,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
-	error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(new_dentry);
@@ -2565,10 +2449,9 @@ out_unlock:
 	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
 out_release:
 	path_put(&nd.path);
-out:
-	path_put(&old_nd.path);
-exit:
 	putname(to);
+out:
+	path_put(&old_path);
 
 	return error;
 }
@@ -2621,7 +2504,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 	 * we'll need to flip '..'.
 	 */
 	if (new_dir != old_dir) {
-		error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
+		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
 		if (error)
 			return error;
 	}
@@ -2696,7 +2579,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		return error;
 
 	if (!new_dentry->d_inode)
-		error = may_create(new_dir, new_dentry, NULL);
+		error = may_create(new_dir, new_dentry);
 	else
 		error = may_delete(new_dir, new_dentry, is_dir);
 	if (error)
@@ -2724,20 +2607,22 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-static int do_rename(int olddfd, const char *oldname,
-			int newdfd, const char *newname)
+asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+			     int newdfd, const char __user *newname)
 {
-	int error = 0;
-	struct dentry * old_dir, * new_dir;
-	struct dentry * old_dentry, *new_dentry;
-	struct dentry * trap;
+	struct dentry *old_dir, *new_dir;
+	struct dentry *old_dentry, *new_dentry;
+	struct dentry *trap;
 	struct nameidata oldnd, newnd;
+	char *from;
+	char *to;
+	int error;
 
-	error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
+	error = user_path_parent(olddfd, oldname, &oldnd, &from);
 	if (error)
 		goto exit;
 
-	error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
+	error = user_path_parent(newdfd, newname, &newnd, &to);
 	if (error)
 		goto exit1;
 
@@ -2754,6 +2639,10 @@ static int do_rename(int olddfd, const char *oldname,
 	if (newnd.last_type != LAST_NORM)
 		goto exit2;
 
+	oldnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags &= ~LOOKUP_PARENT;
+	newnd.flags |= LOOKUP_RENAME_TARGET;
+
 	trap = lock_rename(new_dir, old_dir);
 
 	old_dentry = lookup_hash(&oldnd);
@@ -2799,29 +2688,11 @@ exit3:
 	unlock_rename(new_dir, old_dir);
 exit2:
 	path_put(&newnd.path);
+	putname(to);
 exit1:
 	path_put(&oldnd.path);
-exit:
-	return error;
-}
-
-asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
-			     int newdfd, const char __user *newname)
-{
-	int error;
-	char * from;
-	char * to;
-
-	from = getname(oldname);
-	if(IS_ERR(from))
-		return PTR_ERR(from);
-	to = getname(newname);
-	error = PTR_ERR(to);
-	if (!IS_ERR(to)) {
-		error = do_rename(olddfd, from, newdfd, to);
-		putname(to);
-	}
 	putname(from);
+exit:
 	return error;
 }
 
@@ -2959,8 +2830,7 @@ const struct inode_operations page_symlink_inode_operations = {
 	.put_link	= page_put_link,
 };
 
-EXPORT_SYMBOL(__user_walk);
-EXPORT_SYMBOL(__user_walk_fd);
+EXPORT_SYMBOL(user_path_at);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
 EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
@@ -2974,8 +2844,9 @@ EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
 EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(permission);
+EXPORT_SYMBOL(inode_permission);
 EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
diff --git a/fs/namespace.c b/fs/namespace.c
index 4f6f7635b59c..cce46702d33c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -112,9 +112,13 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		int err;
 
 		err = mnt_alloc_id(mnt);
-		if (err) {
-			kmem_cache_free(mnt_cache, mnt);
-			return NULL;
+		if (err)
+			goto out_free_cache;
+
+		if (name) {
+			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+			if (!mnt->mnt_devname)
+				goto out_free_id;
 		}
 
 		atomic_set(&mnt->mnt_count, 1);
@@ -127,16 +131,14 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		atomic_set(&mnt->__mnt_writers, 0);
-		if (name) {
-			int size = strlen(name) + 1;
-			char *newname = kmalloc(size, GFP_KERNEL);
-			if (newname) {
-				memcpy(newname, name, size);
-				mnt->mnt_devname = newname;
-			}
-		}
 	}
 	return mnt;
+
+out_free_id:
+	mnt_free_id(mnt);
+out_free_cache:
+	kmem_cache_free(mnt_cache, mnt);
+	return NULL;
 }
 
 /*
@@ -309,10 +311,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
 	 */
 	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
 	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		printk(KERN_DEBUG "leak detected on mount(%p) writers "
+		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
 				"count: %d\n",
 			mnt, atomic_read(&mnt->__mnt_writers));
-		WARN_ON(1);
 		/* use the flag to keep the dmesg spam down */
 		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
 	}
@@ -1129,27 +1130,27 @@ static int do_umount(struct vfsmount *mnt, int flags)
 
 asmlinkage long sys_umount(char __user * name, int flags)
 {
-	struct nameidata nd;
+	struct path path;
 	int retval;
 
-	retval = __user_walk(name, LOOKUP_FOLLOW, &nd);
+	retval = user_path(name, &path);
 	if (retval)
 		goto out;
 	retval = -EINVAL;
-	if (nd.path.dentry != nd.path.mnt->mnt_root)
+	if (path.dentry != path.mnt->mnt_root)
 		goto dput_and_out;
-	if (!check_mnt(nd.path.mnt))
+	if (!check_mnt(path.mnt))
 		goto dput_and_out;
 
 	retval = -EPERM;
 	if (!capable(CAP_SYS_ADMIN))
 		goto dput_and_out;
 
-	retval = do_umount(nd.path.mnt, flags);
+	retval = do_umount(path.mnt, flags);
 dput_and_out:
 	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
-	dput(nd.path.dentry);
-	mntput_no_expire(nd.path.mnt);
+	dput(path.dentry);
+	mntput_no_expire(path.mnt);
 out:
 	return retval;
 }
@@ -1166,19 +1167,19 @@ asmlinkage long sys_oldumount(char __user * name)
 
 #endif
 
-static int mount_is_safe(struct nameidata *nd)
+static int mount_is_safe(struct path *path)
 {
 	if (capable(CAP_SYS_ADMIN))
 		return 0;
 	return -EPERM;
 #ifdef notyet
-	if (S_ISLNK(nd->path.dentry->d_inode->i_mode))
+	if (S_ISLNK(path->dentry->d_inode->i_mode))
 		return -EPERM;
-	if (nd->path.dentry->d_inode->i_mode & S_ISVTX) {
-		if (current->uid != nd->path.dentry->d_inode->i_uid)
+	if (path->dentry->d_inode->i_mode & S_ISVTX) {
+		if (current->uid != path->dentry->d_inode->i_uid)
 			return -EPERM;
 	}
-	if (vfs_permission(nd, MAY_WRITE))
+	if (inode_permission(path->dentry->d_inode, MAY_WRITE))
 		return -EPERM;
 	return 0;
 #endif
@@ -1424,11 +1425,10 @@ out_unlock:
 
 /*
  * recursively change the type of the mountpoint.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_change_type(struct nameidata *nd, int flag)
+static int do_change_type(struct path *path, int flag)
 {
-	struct vfsmount *m, *mnt = nd->path.mnt;
+	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
 	int type = flag & ~MS_REC;
 	int err = 0;
@@ -1436,7 +1436,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (nd->path.dentry != nd->path.mnt->mnt_root)
+	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
 	down_write(&namespace_sem);
@@ -1458,40 +1458,39 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
 
 /*
  * do loopback mount.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_loopback(struct nameidata *nd, char *old_name,
+static int do_loopback(struct path *path, char *old_name,
 				int recurse)
 {
-	struct nameidata old_nd;
+	struct path old_path;
 	struct vfsmount *mnt = NULL;
-	int err = mount_is_safe(nd);
+	int err = mount_is_safe(path);
 	if (err)
 		return err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
 	down_write(&namespace_sem);
 	err = -EINVAL;
-	if (IS_MNT_UNBINDABLE(old_nd.path.mnt))
+	if (IS_MNT_UNBINDABLE(old_path.mnt))
 		goto out;
 
-	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
+	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
 		goto out;
 
 	err = -ENOMEM;
 	if (recurse)
-		mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0);
+		mnt = copy_tree(old_path.mnt, old_path.dentry, 0);
 	else
-		mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0);
+		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
 
 	if (!mnt)
 		goto out;
 
-	err = graft_tree(mnt, &nd->path);
+	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
 		spin_lock(&vfsmount_lock);
@@ -1502,7 +1501,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
 
 out:
 	up_write(&namespace_sem);
-	path_put(&old_nd.path);
+	path_put(&old_path);
 	return err;
 }
 
@@ -1527,33 +1526,37 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
  * on it - tough luck.
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
+static int do_remount(struct path *path, int flags, int mnt_flags,
 		      void *data)
 {
 	int err;
-	struct super_block *sb = nd->path.mnt->mnt_sb;
+	struct super_block *sb = path->mnt->mnt_sb;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!check_mnt(nd->path.mnt))
+	if (!check_mnt(path->mnt))
 		return -EINVAL;
 
-	if (nd->path.dentry != nd->path.mnt->mnt_root)
+	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
-		err = change_mount_flags(nd->path.mnt, flags);
+		err = change_mount_flags(path->mnt, flags);
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err)
-		nd->path.mnt->mnt_flags = mnt_flags;
+		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
-	if (!err)
-		security_sb_post_remount(nd->path.mnt, flags, data);
+	if (!err) {
+		security_sb_post_remount(path->mnt, flags, data);
+
+		spin_lock(&vfsmount_lock);
+		touch_mnt_namespace(path->mnt->mnt_ns);
+		spin_unlock(&vfsmount_lock);
+	}
 	return err;
 }
 
@@ -1567,90 +1570,85 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
 	return 0;
 }
 
-/*
- * noinline this do_mount helper to save do_mount stack space.
- */
-static noinline int do_move_mount(struct nameidata *nd, char *old_name)
+static int do_move_mount(struct path *path, char *old_name)
 {
-	struct nameidata old_nd;
-	struct path parent_path;
+	struct path old_path, parent_path;
 	struct vfsmount *p;
 	int err = 0;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
 	down_write(&namespace_sem);
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path.mnt, &nd->path.dentry))
+	while (d_mountpoint(path->dentry) &&
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
+	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
 		goto out;
 
 	err = -ENOENT;
-	mutex_lock(&nd->path.dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(nd->path.dentry->d_inode))
+	mutex_lock(&path->dentry->d_inode->i_mutex);
+	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry))
+	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
-	if (old_nd.path.dentry != old_nd.path.mnt->mnt_root)
+	if (old_path.dentry != old_path.mnt->mnt_root)
 		goto out1;
 
-	if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent)
+	if (old_path.mnt == old_path.mnt->mnt_parent)
 		goto out1;
 
-	if (S_ISDIR(nd->path.dentry->d_inode->i_mode) !=
-	      S_ISDIR(old_nd.path.dentry->d_inode->i_mode))
+	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
+	      S_ISDIR(old_path.dentry->d_inode->i_mode))
 		goto out1;
 	/*
 	 * Don't move a mount residing in a shared parent.
 	 */
-	if (old_nd.path.mnt->mnt_parent &&
-	    IS_MNT_SHARED(old_nd.path.mnt->mnt_parent))
+	if (old_path.mnt->mnt_parent &&
+	    IS_MNT_SHARED(old_path.mnt->mnt_parent))
 		goto out1;
 	/*
 	 * Don't move a mount tree containing unbindable mounts to a destination
 	 * mount which is shared.
 	 */
-	if (IS_MNT_SHARED(nd->path.mnt) &&
-	    tree_contains_unbindable(old_nd.path.mnt))
+	if (IS_MNT_SHARED(path->mnt) &&
+	    tree_contains_unbindable(old_path.mnt))
 		goto out1;
 	err = -ELOOP;
-	for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent)
-		if (p == old_nd.path.mnt)
+	for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent)
+		if (p == old_path.mnt)
 			goto out1;
 
-	err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path);
+	err = attach_recursive_mnt(old_path.mnt, path, &parent_path);
 	if (err)
 		goto out1;
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
-	list_del_init(&old_nd.path.mnt->mnt_expire);
+	list_del_init(&old_path.mnt->mnt_expire);
 out1:
-	mutex_unlock(&nd->path.dentry->d_inode->i_mutex);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
 out:
 	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
-	path_put(&old_nd.path);
+	path_put(&old_path);
 	return err;
 }
 
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
- * noinline this do_mount helper to save do_mount stack space.
  */
-static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
+static int do_new_mount(struct path *path, char *type, int flags,
 			int mnt_flags, char *name, void *data)
 {
 	struct vfsmount *mnt;
@@ -1666,31 +1664,31 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
-	return do_add_mount(mnt, nd, mnt_flags, NULL);
+	return do_add_mount(mnt, path, mnt_flags, NULL);
 }
 
 /*
  * add a mount into a namespace's mount tree
  * - provide the option of adding the new mount to an expiration list
  */
-int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+int do_add_mount(struct vfsmount *newmnt, struct path *path,
 		 int mnt_flags, struct list_head *fslist)
 {
 	int err;
 
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
-	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path.mnt, &nd->path.dentry))
+	while (d_mountpoint(path->dentry) &&
+	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!check_mnt(nd->path.mnt))
+	if (!check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;
-	if (nd->path.mnt->mnt_sb == newmnt->mnt_sb &&
-	    nd->path.mnt->mnt_root == nd->path.dentry)
+	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+	    path->mnt->mnt_root == path->dentry)
 		goto unlock;
 
 	err = -EINVAL;
@@ -1698,7 +1696,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 		goto unlock;
 
 	newmnt->mnt_flags = mnt_flags;
-	if ((err = graft_tree(newmnt, &nd->path)))
+	if ((err = graft_tree(newmnt, path)))
 		goto unlock;
 
 	if (fslist) /* add to the specified expiration list */
@@ -1901,7 +1899,7 @@ int copy_mount_options(const void __user * data, unsigned long *where)
 long do_mount(char *dev_name, char *dir_name, char *type_page,
 		  unsigned long flags, void *data_page)
 {
-	struct nameidata nd;
+	struct path path;
 	int retval = 0;
 	int mnt_flags = 0;
 
@@ -1939,29 +1937,29 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
 
 	/* ... and get the mountpoint */
-	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
+	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
 	if (retval)
 		return retval;
 
-	retval = security_sb_mount(dev_name, &nd.path,
+	retval = security_sb_mount(dev_name, &path,
 				   type_page, flags, data_page);
 	if (retval)
 		goto dput_out;
 
 	if (flags & MS_REMOUNT)
-		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+		retval = do_loopback(&path, dev_name, flags & MS_REC);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
-		retval = do_change_type(&nd, flags);
+		retval = do_change_type(&path, flags);
 	else if (flags & MS_MOVE)
-		retval = do_move_mount(&nd, dev_name);
+		retval = do_move_mount(&path, dev_name);
 	else
-		retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
 dput_out:
-	path_put(&nd.path);
+	path_put(&path);
 	return retval;
 }
 
@@ -1973,7 +1971,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		struct fs_struct *fs)
 {
 	struct mnt_namespace *new_ns;
-	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
+	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
 	struct vfsmount *p, *q;
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
@@ -2016,10 +2014,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 				pwdmnt = p;
 				fs->pwd.mnt = mntget(q);
 			}
-			if (p == fs->altroot.mnt) {
-				altrootmnt = p;
-				fs->altroot.mnt = mntget(q);
-			}
 		}
 		p = next_mnt(p, mnt_ns->root);
 		q = next_mnt(q, new_ns->root);
@@ -2030,8 +2024,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		mntput(rootmnt);
 	if (pwdmnt)
 		mntput(pwdmnt);
-	if (altrootmnt)
-		mntput(altrootmnt);
 
 	return new_ns;
 }
@@ -2184,28 +2176,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 			       const char __user * put_old)
 {
 	struct vfsmount *tmp;
-	struct nameidata new_nd, old_nd;
-	struct path parent_path, root_parent, root;
+	struct path new, old, parent_path, root_parent, root;
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
-			    &new_nd);
+	error = user_path_dir(new_root, &new);
 	if (error)
 		goto out0;
 	error = -EINVAL;
-	if (!check_mnt(new_nd.path.mnt))
+	if (!check_mnt(new.mnt))
 		goto out1;
 
-	error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
+	error = user_path_dir(put_old, &old);
 	if (error)
 		goto out1;
 
-	error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
+	error = security_sb_pivotroot(&old, &new);
 	if (error) {
-		path_put(&old_nd.path);
+		path_put(&old);
 		goto out1;
 	}
 
@@ -2214,69 +2204,69 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	path_get(&current->fs->root);
 	read_unlock(&current->fs->lock);
 	down_write(&namespace_sem);
-	mutex_lock(&old_nd.path.dentry->d_inode->i_mutex);
+	mutex_lock(&old.dentry->d_inode->i_mutex);
 	error = -EINVAL;
-	if (IS_MNT_SHARED(old_nd.path.mnt) ||
-		IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
+	if (IS_MNT_SHARED(old.mnt) ||
+		IS_MNT_SHARED(new.mnt->mnt_parent) ||
 		IS_MNT_SHARED(root.mnt->mnt_parent))
 		goto out2;
 	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
-	if (IS_DEADDIR(new_nd.path.dentry->d_inode))
+	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry))
+	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
 		goto out2;
-	if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
+	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
 		goto out2;
 	error = -EBUSY;
-	if (new_nd.path.mnt == root.mnt ||
-	    old_nd.path.mnt == root.mnt)
+	if (new.mnt == root.mnt ||
+	    old.mnt == root.mnt)
 		goto out2; /* loop, on the same file system  */
 	error = -EINVAL;
 	if (root.mnt->mnt_root != root.dentry)
 		goto out2; /* not a mountpoint */
 	if (root.mnt->mnt_parent == root.mnt)
 		goto out2; /* not attached */
-	if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
+	if (new.mnt->mnt_root != new.dentry)
 		goto out2; /* not a mountpoint */
-	if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt)
+	if (new.mnt->mnt_parent == new.mnt)
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
-	tmp = old_nd.path.mnt;
+	tmp = old.mnt;
 	spin_lock(&vfsmount_lock);
-	if (tmp != new_nd.path.mnt) {
+	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
 				goto out3; /* already mounted on put_old */
-			if (tmp->mnt_parent == new_nd.path.mnt)
+			if (tmp->mnt_parent == new.mnt)
 				break;
 			tmp = tmp->mnt_parent;
 		}
-		if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry))
+		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
 			goto out3;
-	} else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
+	} else if (!is_subdir(old.dentry, new.dentry))
 		goto out3;
-	detach_mnt(new_nd.path.mnt, &parent_path);
+	detach_mnt(new.mnt, &parent_path);
 	detach_mnt(root.mnt, &root_parent);
 	/* mount old root on put_old */
-	attach_mnt(root.mnt, &old_nd.path);
+	attach_mnt(root.mnt, &old);
 	/* mount new_root on / */
-	attach_mnt(new_nd.path.mnt, &root_parent);
+	attach_mnt(new.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	spin_unlock(&vfsmount_lock);
-	chroot_fs_refs(&root, &new_nd.path);
-	security_sb_post_pivotroot(&root, &new_nd.path);
+	chroot_fs_refs(&root, &new);
+	security_sb_post_pivotroot(&root, &new);
 	error = 0;
 	path_put(&root_parent);
 	path_put(&parent_path);
 out2:
-	mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
+	mutex_unlock(&old.dentry->d_inode->i_mutex);
 	up_write(&namespace_sem);
 	path_put(&root);
-	path_put(&old_nd.path);
+	path_put(&old);
 out1:
-	path_put(&new_nd.path);
+	path_put(&new);
 out0:
 	return error;
 out3:
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 011ef0b6d2d4..07e9715b8658 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -266,7 +266,7 @@ leave_me:;
 
 
 static int
-__ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
+__ncp_lookup_validate(struct dentry *dentry)
 {
 	struct ncp_server *server;
 	struct dentry *parent;
@@ -340,7 +340,7 @@ ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
 {
 	int res;
 	lock_kernel();
-	res = __ncp_lookup_validate(dentry, nd);
+	res = __ncp_lookup_validate(dentry);
 	unlock_kernel();
 	return res;
 }
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2e5ab1204dec..d642f0e5b365 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode)
 	kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476c..c2e9cfd9e5a4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -40,6 +40,16 @@ unsigned short nfs_callback_tcpport;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
+/*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const sa_family_t	nfs_callback_family = AF_INET6;
+#else
+static const sa_family_t	nfs_callback_family = AF_INET;
+#endif
+
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -105,7 +115,8 @@ int nfs_callback_up(void)
 	mutex_lock(&nfs_callback_mutex);
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+				nfs_callback_family, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
@@ -115,7 +126,8 @@ int nfs_callback_up(void)
 	if (ret <= 0)
 		goto out_err;
 	nfs_callback_tcpport = ret;
-	dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
+	dprintk("NFS: Callback listener port = %u (af %u)\n",
+			nfs_callback_tcpport, nfs_callback_family);
 
 	nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
 	if (IS_ERR(nfs_callback_info.rqst)) {
@@ -148,8 +160,8 @@ out:
 	mutex_unlock(&nfs_callback_mutex);
 	return ret;
 out_err:
-	dprintk("Couldn't create callback socket or server thread; err = %d\n",
-		ret);
+	dprintk("NFS: Couldn't create callback socket or server thread; "
+		"err = %d\n", ret);
 	nfs_callback_info.users--;
 	goto out;
 }
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5ee23e7058b3..7547600b6174 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
 	server->nfs_client = clp;
 
 	/* Initialise the client representation from the mount data */
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->flags = data->flags;
 
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->client_link);
 	INIT_LIST_HEAD(&server->master_link);
 
-	init_waitqueue_head(&server->active_wq);
 	atomic_set(&server->active, 0);
 
 	server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
 		goto error;
 
 	/* Initialise the client representation from the mount data */
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->flags = data->flags;
 	server->caps |= NFS_CAP_ATOMIC_OPEN;
 
 	if (data->rsize)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 28a238dab23a..3e64b98f3a93 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -156,6 +156,7 @@ typedef struct {
 	decode_dirent_t	decode;
 	int		plus;
 	unsigned long	timestamp;
+	unsigned long	gencount;
 	int		timestamp_valid;
 } nfs_readdir_descriptor_t;
 
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	struct file	*file = desc->file;
 	struct inode	*inode = file->f_path.dentry->d_inode;
 	struct rpc_cred	*cred = nfs_file_cred(file);
-	unsigned long	timestamp;
+	unsigned long	timestamp, gencount;
 	int		error;
 
 	dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 
  again:
 	timestamp = jiffies;
+	gencount = nfs_inc_attr_generation_counter();
 	error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
 					  NFS_SERVER(inode)->dtsize, desc->plus);
 	if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		goto error;
 	}
 	desc->timestamp = timestamp;
+	desc->gencount = gencount;
 	desc->timestamp_valid = 1;
 	SetPageUptodate(page);
 	/* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
 	if (IS_ERR(p))
 		return PTR_ERR(p);
 	desc->ptr = p;
-	if (desc->timestamp_valid)
+	if (desc->timestamp_valid) {
 		desc->entry->fattr->time_start = desc->timestamp;
-	else
+		desc->entry->fattr->gencount = desc->gencount;
+	} else
 		desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
 	return 0;
 }
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct rpc_cred	*cred = nfs_file_cred(file);
 	struct page	*page = NULL;
 	int		status;
-	unsigned long	timestamp;
+	unsigned long	timestamp, gencount;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 		goto out;
 	}
 	timestamp = jiffies;
+	gencount = nfs_inc_attr_generation_counter();
 	status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
 						*desc->dir_cookie, page,
 						NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (status >= 0) {
 		desc->timestamp = timestamp;
+		desc->gencount = gencount;
 		desc->timestamp_valid = 1;
 		if ((status = dir_decode(desc)) == 0)
 			desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
  */
 void nfs_force_lookup_revalidate(struct inode *dir)
 {
-	NFS_I(dir)->cache_change_attribute = jiffies;
+	NFS_I(dir)->cache_change_attribute++;
 }
 
 /*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
 	if (IS_ROOT(dentry))
 		return 1;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+		return 0;
 	if (!nfs_verify_change_attribute(dir, dentry->d_time))
 		return 0;
 	/* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -699,9 +707,7 @@ static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 {
 	if (NFS_PROTO(dir)->version == 2)
 		return 0;
-	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0)
-		return 0;
-	return (nd->intent.open.flags & O_EXCL) != 0;
+	return nd && nfs_lookup_check_intent(nd, LOOKUP_EXCL);
 }
 
 /*
@@ -750,6 +756,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
 	/* Don't revalidate a negative dentry if we're creating a new file */
 	if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
 		return 0;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+		return 1;
 	return !nfs_check_verifier(dir, dentry);
 }
 
@@ -999,7 +1007,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 
 	/* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash
 	 * the dentry. */
-	if (nd->intent.open.flags & O_EXCL) {
+	if (nd->flags & LOOKUP_EXCL) {
 		d_instantiate(dentry, NULL);
 		goto out;
 	}
@@ -1507,7 +1515,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
 		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add(&lru_pvec);
+		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
@@ -1884,7 +1892,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 		return status;
 	nfs_access_add_cache(inode, &cache);
 out:
-	if ((cache.mask & mask) == mask)
+	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 		return 0;
 	return -EACCES;
 }
@@ -1907,17 +1915,17 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
 	return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
 }
 
-int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int nfs_permission(struct inode *inode, int mask)
 {
 	struct rpc_cred *cred;
 	int res = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSACCESS);
 
-	if (mask == 0)
+	if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 		goto out;
 	/* Is this sys_access() ? */
-	if (nd != NULL && (nd->flags & LOOKUP_ACCESS))
+	if (mask & MAY_ACCESS)
 		goto force_lookup;
 
 	switch (inode->i_mode & S_IFMT) {
@@ -1926,8 +1934,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 		case S_IFREG:
 			/* NFSv4 has atomic_open... */
 			if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
-					&& nd != NULL
-					&& (nd->flags & LOOKUP_OPEN))
+					&& (mask & MAY_OPEN))
 				goto out;
 			break;
 		case S_IFDIR:
@@ -1950,6 +1957,9 @@ force_lookup:
 	} else
 		res = PTR_ERR(cred);
 out:
+	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
+		res = -EACCES;
+
 	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 78460657f5cb..d319b49f8f06 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 	/* origin == SEEK_END => we must revalidate the cached file length */
 	if (origin == SEEK_END) {
 		struct inode *inode = filp->f_mapping->host;
+
 		int retval = nfs_revalidate_file_size(inode, filp);
 		if (retval < 0)
 			return (loff_t)retval;
-	}
-	lock_kernel();	/* BKL needed? */
-	loff = generic_file_llseek_unlocked(filp, offset, origin);
-	unlock_kernel();
+
+		spin_lock(&inode->i_lock);
+		loff = generic_file_llseek_unlocked(filp, offset, origin);
+		spin_unlock(&inode->i_lock);
+	} else
+		loff = generic_file_llseek_unlocked(filp, offset, origin);
 	return loff;
 }
 
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 			filp->f_path.dentry->d_name.name,
 			fl->fl_type, fl->fl_flags);
 
-	/*
-	 * No BSD flocks over NFS allowed.
-	 * Note: we could try to fake a POSIX lock request here by
-	 * using ((u32) filp | 0x80000000) or some such as the pid.
-	 * Not sure whether that would be unique, though, or whether
-	 * that would break in other places.
-	 */
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index fae97196daad..b7c9b2df1f29 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -107,11 +107,10 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_alloc_anon(inode);
-	if (!mntroot) {
-		iput(inode);
+	mntroot = d_obtain_alias(inode);
+	if (IS_ERR(mntroot)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return ERR_PTR(-ENOMEM);
+		return mntroot;
 	}
 
 	security_d_instantiate(mntroot, inode);
@@ -277,11 +276,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
 	 * if the dentry tree reaches them; however if the dentry already
 	 * exists, we'll pick it up at this point and use it as the root
 	 */
-	mntroot = d_alloc_anon(inode);
-	if (!mntroot) {
-		iput(inode);
+	mntroot = d_obtain_alias(inode);
+	if (IS_ERR(mntroot)) {
 		dprintk("nfs_get_root: get root dentry failed\n");
-		return ERR_PTR(-ENOMEM);
+		return mntroot;
 	}
 
 	security_d_instantiate(mntroot, inode);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index df23f987da6b..b9195c02a863 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
 		nfsi->read_cache_jiffies = fattr->time_start;
-		nfsi->last_updated = now;
-		nfsi->cache_change_attribute = now;
+		nfsi->attr_gencount = fattr->gencount;
 		inode->i_atime = fattr->atime;
 		inode->i_mtime = fattr->mtime;
 		inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
 void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 {
 	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+		spin_lock(&inode->i_lock);
 		if ((attr->ia_valid & ATTR_MODE) != 0) {
 			int mode = attr->ia_mode & S_IALLUGO;
 			mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 			inode->i_uid = attr->ia_uid;
 		if ((attr->ia_valid & ATTR_GID) != 0)
 			inode->i_gid = attr->ia_gid;
-		spin_lock(&inode->i_lock);
 		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 		spin_unlock(&inode->i_lock);
 	}
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 	}
 }
 
-static int nfs_wait_schedule(void *word)
-{
-	if (signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
-/*
- * Wait for the inode to get unlocked.
- */
-static int nfs_wait_on_inode(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int error;
-
-	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
-					nfs_wait_schedule, TASK_KILLABLE);
-
-	return error;
-}
-
-static void nfs_wake_up_inode(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
-}
-
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 
-	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	if (is_bad_inode(inode))
- 		goto out_nowait;
+		goto out;
 	if (NFS_STALE(inode))
- 		goto out_nowait;
-
-	status = nfs_wait_on_inode(inode);
-	if (status < 0)
 		goto out;
 
-	status = -ESTALE;
 	if (NFS_STALE(inode))
 		goto out;
 
+	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
 	if (status != 0) {
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		goto out;
 	}
 
-	spin_lock(&inode->i_lock);
-	status = nfs_update_inode(inode, &fattr);
+	status = nfs_refresh_inode(inode, &fattr);
 	if (status) {
-		spin_unlock(&inode->i_lock);
 		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
 			 inode->i_sb->s_id,
 			 (long long)NFS_FILEID(inode), status);
 		goto out;
 	}
-	spin_unlock(&inode->i_lock);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		(long long)NFS_FILEID(inode));
 
  out:
-	nfs_wake_up_inode(inode);
-
- out_nowait:
 	return status;
 }
 
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 		return -EIO;
 	}
 
-	/* Do atomic weak cache consistency updates */
-	nfs_wcc_update_inode(inode, fattr);
-
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
 			nfsi->change_attr != fattr->change_attr)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 
 	if (invalid != 0)
 		nfsi->cache_validity |= invalid;
-	else
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
-				| NFS_INO_INVALID_ATIME
-				| NFS_INO_REVAL_PAGECACHE);
 
 	nfsi->read_cache_jiffies = fattr->time_start;
 	return 0;
 }
 
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
+}
+
+static unsigned long nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+	smp_rmb();
+	return nfs_attr_generation_counter;
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+	unsigned long ret;
+	smp_rmb();
+	ret = ++nfs_attr_generation_counter;
+	smp_wmb();
+	return ret;
+}
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+	fattr->valid = 0;
+	fattr->time_start = jiffies;
+	fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode - pointer to inode
+ * @fattr - attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempt to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+	const struct nfs_inode *nfsi = NFS_I(inode);
+
+	return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
+		nfs_ctime_need_update(inode, fattr) ||
+		nfs_size_need_update(inode, fattr) ||
+		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	if (nfs_inode_attrs_need_update(inode, fattr))
+		return nfs_update_inode(inode, fattr);
+	return nfs_check_inode_attributes(inode, fattr);
+}
+
 /**
  * nfs_refresh_inode - try to update the inode attribute cache
  * @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
  */
 int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	int status;
 
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
 		return 0;
 	spin_lock(&inode->i_lock);
-	if (time_after(fattr->time_start, nfsi->last_updated))
-		status = nfs_update_inode(inode, fattr);
-	else
-		status = nfs_check_inode_attributes(inode, fattr);
-
+	status = nfs_refresh_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
 	return status;
 }
 
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+	if (S_ISDIR(inode->i_mode))
+		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+		return 0;
+	return nfs_refresh_inode_locked(inode, fattr);
+}
+
 /**
  * nfs_post_op_update_inode - try to update the inode attribute cache
  * @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	int status;
 
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-	if (S_ISDIR(inode->i_mode))
-		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+	status = nfs_post_op_update_inode_locked(inode, fattr);
 	spin_unlock(&inode->i_lock);
-	return nfs_refresh_inode(inode, fattr);
+	return status;
 }
 
 /**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  */
 int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
 {
+	int status;
+
+	spin_lock(&inode->i_lock);
+	/* Don't do a WCC update if these attributes are already stale */
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+			!nfs_inode_attrs_need_update(inode, fattr)) {
+		fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+		goto out_noforce;
+	}
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
 			(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
 		fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
 		fattr->pre_size = i_size_read(inode);
 		fattr->valid |= NFS_ATTR_WCC;
 	}
-	return nfs_post_op_update_inode(inode, fattr);
+out_noforce:
+	status = nfs_post_op_update_inode_locked(inode, fattr);
+	spin_unlock(&inode->i_lock);
+	return status;
 }
 
 /*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 		/* If ctime has changed we should definitely clear access+acl caches */
 		if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
-			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 	} else if (nfsi->change_attr != fattr->change_attr) {
 		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 				inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	    inode->i_gid != fattr->gid)
 		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 
+	if (inode->i_nlink != fattr->nlink)
+		invalid |= NFS_INO_INVALID_ATTR;
+
 	inode->i_mode = fattr->mode;
 	inode->i_nlink = fattr->nlink;
 	inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
-		nfsi->last_updated = now;
+		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	} else {
 		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
 			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
 				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 			nfsi->attrtimeo_timestamp = now;
 		}
-		/*
-		 * Avoid jiffy wraparound issues with nfsi->last_updated
-		 */
-		if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
-			nfsi->last_updated = nfsi->read_cache_jiffies;
 	}
 	invalid &= ~NFS_INO_INVALID_ATTR;
 	/* Don't invalidate the data if we were to blame */
@@ -1242,7 +1277,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #endif
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 24241fcbb98d..d212ee41caf2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 
 /* super.c */
+void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
 extern struct file_system_type nfs_xdev_fs_type;
 #ifdef CONFIG_NFS_V4
 extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct nfs_server *server);
-extern void nfs_sb_deactive(struct nfs_server *server);
+extern void nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
 extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
 		PAGE_SIZE - 1) >> PAGE_SHIFT;
 }
 
+#define IPV6_SCOPE_DELIMITER	'%'
+
+/*
+ * Set the port number in an address.  Be agnostic about the address
+ * family.
+ */
+static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+	struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+	struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		ap->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		ap6->sin6_port = htons(port);
+		break;
+	}
+}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 779d2eb649c5..086a6830d785 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/sched.h>
 #include <linux/nfs_fs.h>
+#include "internal.h"
 
 #ifdef RPC_DEBUG
 # define NFSDBG_FACILITY	NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
 
 out_mnt_err:
 	dprintk("NFS: MNT server returned result %d\n", result.status);
-	status = -EACCES;
+	status = nfs_stat_to_errno(result.status);
 	goto out;
 }
 
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2f285ef76399..64a288ee046d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 
 	dprintk("--> nfs_follow_mountpoint()\n");
 
-	BUG_ON(IS_ROOT(dentry));
+	err = -ESTALE;
+	if (IS_ROOT(dentry))
+		goto out_err;
+
 	dprintk("%s: enter\n", __func__);
 	dput(nd->path.dentry);
 	nd->path.dentry = dget(dentry);
@@ -129,7 +132,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 
 	mntget(mnt);
-	err = do_add_mount(mnt, nd, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
+	err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
 			   &nfs_automount_list);
 	if (err < 0) {
 		mntput(mnt);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 					   struct nfs_clone_mount *mountdata)
 {
 #ifdef CONFIG_NFS_V4
-	struct vfsmount *mnt = NULL;
+	struct vfsmount *mnt = ERR_PTR(-EINVAL);
 	switch (server->nfs_client->rpc_ops->version) {
 		case 2:
 		case 3:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 423842f51ac9..cef62557c87d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 
 	dprintk("NFS call getacl\n");
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	dprintk("NFS reply getacl: %d\n", status);
 
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 
 	dprintk("NFS call setacl\n");
 	msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+	nfs_fattr_init(&fattr);
 	status = rpc_call_sync(server->client_acl, &msg, 0);
 	nfs_access_zap_cache(inode);
 	nfs_zap_acl_cache(inode);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1e750e4574a9..c55be7a7679e 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
 }
 
 static int
-nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
 		 struct nfs_fsinfo *info)
 {
 	struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("NFS call  fsinfo\n");
 	nfs_fattr_init(info->fattr);
-	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+	status = rpc_call_sync(client, &msg, 0);
 	dprintk("NFS reply fsinfo: %d\n", status);
 	return status;
 }
 
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+		   struct nfs_fsinfo *info)
+{
+	int	status;
+
+	status = do_proc_fsinfo(server->client, fhandle, info);
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+	return status;
+}
+
 static int
 nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 		   struct nfs_pathconf *info)
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b112857301f7..30befc39b3c6 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
 	return 0;
 }
 
-/*
- * Check if the string represents a "valid" IPv4 address
- */
-static inline int valid_ipaddr4(const char *buf)
+static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
+				     char *page, char *page2,
+				     const struct nfs4_fs_location *location)
 {
-	int rc, count, in[4];
-
-	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
-	if (rc != 4)
-		return -EINVAL;
-	for (count = 0; count < 4; count++) {
-		if (in[count] > 255)
-			return -EINVAL;
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	char *mnt_path;
+	int page2len;
+	unsigned int s;
+
+	mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+	if (IS_ERR(mnt_path))
+		return mnt;
+	mountdata->mnt_path = mnt_path;
+	page2 += strlen(mnt_path) + 1;
+	page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+
+	for (s = 0; s < location->nservers; s++) {
+		const struct nfs4_string *buf = &location->servers[s];
+		struct sockaddr_storage addr;
+
+		if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+			continue;
+
+		mountdata->addr = (struct sockaddr *)&addr;
+
+		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+			continue;
+		nfs_parse_ip_address(buf->data, buf->len,
+				mountdata->addr, &mountdata->addrlen);
+		if (mountdata->addr->sa_family == AF_UNSPEC)
+			continue;
+		nfs_set_port(mountdata->addr, NFS_PORT);
+
+		strncpy(page2, buf->data, page2len);
+		page2[page2len] = '\0';
+		mountdata->hostname = page2;
+
+		snprintf(page, PAGE_SIZE, "%s:%s",
+				mountdata->hostname,
+				mountdata->mnt_path);
+
+		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		if (!IS_ERR(mnt))
+			break;
 	}
-	return 0;
+	return mnt;
 }
 
 /**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
 	};
 	char *page = NULL, *page2 = NULL;
-	unsigned int s;
 	int loc, error;
 
 	if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		goto out;
 	}
 
-	loc = 0;
-	while (loc < locations->nlocations && IS_ERR(mnt)) {
+	for (loc = 0; loc < locations->nlocations; loc++) {
 		const struct nfs4_fs_location *location = &locations->locations[loc];
-		char *mnt_path;
 
 		if (location == NULL || location->nservers <= 0 ||
-		    location->rootpath.ncomponents == 0) {
-			loc++;
+		    location->rootpath.ncomponents == 0)
 			continue;
-		}
 
-		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
-		if (IS_ERR(mnt_path)) {
-			loc++;
-			continue;
-		}
-		mountdata.mnt_path = mnt_path;
-
-		s = 0;
-		while (s < location->nservers) {
-			struct sockaddr_in addr = {
-				.sin_family	= AF_INET,
-				.sin_port	= htons(NFS_PORT),
-			};
-
-			if (location->servers[s].len <= 0 ||
-			    valid_ipaddr4(location->servers[s].data) < 0) {
-				s++;
-				continue;
-			}
-
-			mountdata.hostname = location->servers[s].data;
-			addr.sin_addr.s_addr = in_aton(mountdata.hostname),
-			mountdata.addr = (struct sockaddr *)&addr;
-			mountdata.addrlen = sizeof(addr);
-
-			snprintf(page, PAGE_SIZE, "%s:%s",
-					mountdata.hostname,
-					mountdata.mnt_path);
-
-			mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
-			if (!IS_ERR(mnt)) {
-				break;
-			}
-			s++;
-		}
-		loc++;
+		mnt = try_location(&mountdata, page, page2, location);
+		if (!IS_ERR(mnt))
+			break;
 	}
 
 out:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c910413eaeca..83e700a2b0c0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		struct nfs_open_context *ctx;
 
 		ctx = nfs_file_open_context(sattr->ia_file);
-		cred = ctx->cred;
-		state = ctx->state;
+		if (ctx) {
+			cred = ctx->cred;
+			state = ctx->state;
+		}
 	}
 
 	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 46763d1cd397..8478fc25daee 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -127,7 +127,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t __initdata tokens = {
+static match_table_t __initconst tokens = {
 	{Opt_port, "port=%u"},
 	{Opt_rsize, "rsize=%u"},
 	{Opt_wsize, "wsize=%u"},
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4dbb84df1b68..193465210d7c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 
 	dprintk("%s: call getattr\n", __func__);
 	nfs_fattr_init(fattr);
-	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
+	/* Retry with default authentication if different */
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
 	dprintk("%s: reply getattr: %d\n", __func__, status);
 	if (status)
 		return status;
 	dprintk("%s: call statfs\n", __func__);
 	msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
 	msg.rpc_resp = &fsinfo;
-	status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
+	/* Retry with default authentication if different */
+	if (status && server->nfs_client->cl_rpcclient != server->client)
+		status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
 	dprintk("%s: reply statfs: %d\n", __func__, status);
 	if (status)
 		return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1b94e3650f5c..a3b0061dfd45 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -91,6 +91,7 @@ enum {
 	/* Mount options that take string arguments */
 	Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
 	Opt_addr, Opt_mountaddr, Opt_clientaddr,
+	Opt_lookupcache,
 
 	/* Special mount options */
 	Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -98,7 +99,7 @@ enum {
 	Opt_err
 };
 
-static match_table_t nfs_mount_option_tokens = {
+static const match_table_t nfs_mount_option_tokens = {
 	{ Opt_userspace, "bg" },
 	{ Opt_userspace, "fg" },
 	{ Opt_userspace, "retry=%s" },
@@ -154,6 +155,8 @@ static match_table_t nfs_mount_option_tokens = {
 	{ Opt_mounthost, "mounthost=%s" },
 	{ Opt_mountaddr, "mountaddr=%s" },
 
+	{ Opt_lookupcache, "lookupcache=%s" },
+
 	{ Opt_err, NULL }
 };
 
@@ -163,7 +166,7 @@ enum {
 	Opt_xprt_err
 };
 
-static match_table_t nfs_xprt_protocol_tokens = {
+static const match_table_t nfs_xprt_protocol_tokens = {
 	{ Opt_xprt_udp, "udp" },
 	{ Opt_xprt_tcp, "tcp" },
 	{ Opt_xprt_rdma, "rdma" },
@@ -180,7 +183,7 @@ enum {
 	Opt_sec_err
 };
 
-static match_table_t nfs_secflavor_tokens = {
+static const match_table_t nfs_secflavor_tokens = {
 	{ Opt_sec_none, "none" },
 	{ Opt_sec_none, "null" },
 	{ Opt_sec_sys, "sys" },
@@ -200,6 +203,22 @@ static match_table_t nfs_secflavor_tokens = {
 	{ Opt_sec_err, NULL }
 };
 
+enum {
+	Opt_lookupcache_all, Opt_lookupcache_positive,
+	Opt_lookupcache_none,
+
+	Opt_lookupcache_err
+};
+
+static match_table_t nfs_lookupcache_tokens = {
+	{ Opt_lookupcache_all, "all" },
+	{ Opt_lookupcache_positive, "pos" },
+	{ Opt_lookupcache_positive, "positive" },
+	{ Opt_lookupcache_none, "none" },
+
+	{ Opt_lookupcache_err, NULL }
+};
+
 
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
 static int nfs_xdev_get_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
 static void nfs_kill_super(struct super_block *);
-static void nfs_put_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
-	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
 	unregister_filesystem(&nfs_fs_type);
 }
 
-void nfs_sb_active(struct nfs_server *server)
+void nfs_sb_active(struct super_block *sb)
 {
-	atomic_inc(&server->active);
-}
+	struct nfs_server *server = NFS_SB(sb);
 
-void nfs_sb_deactive(struct nfs_server *server)
-{
-	if (atomic_dec_and_test(&server->active))
-		wake_up(&server->active_wq);
+	if (atomic_inc_return(&server->active) == 1)
+		atomic_inc(&sb->s_active);
 }
 
-static void nfs_put_super(struct super_block *sb)
+void nfs_sb_deactive(struct super_block *sb)
 {
 	struct nfs_server *server = NFS_SB(sb);
-	/*
-	 * Make sure there are no outstanding ops to this server.
-	 * If so, wait for them to finish before allowing the
-	 * unmount to continue.
-	 */
-	wait_event(server->active_wq, atomic_read(&server->active) == 0);
+
+	if (atomic_dec_and_test(&server->active))
+		deactivate_super(sb);
 }
 
 /*
@@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb)
 }
 
 /*
- * Set the port number in an address.  Be agnostic about the address family.
- */
-static void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
-	switch (sap->sa_family) {
-	case AF_INET: {
-		struct sockaddr_in *ap = (struct sockaddr_in *)sap;
-		ap->sin_port = htons(port);
-		break;
-	}
-	case AF_INET6: {
-		struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
-		ap->sin6_port = htons(port);
-		break;
-	}
-	}
-}
-
-/*
  * Sanity-check a server address provided by the mount command.
  *
  * Address family must be initialized, and address must not be
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
 	*addr_len = 0;
 }
 
-#define IPV6_SCOPE_DELIMITER	'%'
-
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
-				    const char *delim,
-				    struct sockaddr_in6 *sin6)
+static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+				   const char *delim,
+				   struct sockaddr_in6 *sin6)
 {
 	char *p;
 	size_t len;
 
-	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
-		return ;
+	if ((string + str_len) == delim)
+		return 1;
+
 	if (*delim != IPV6_SCOPE_DELIMITER)
-		return;
+		return 0;
+
+	if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return 0;
 
 	len = (string + str_len) - delim - 1;
 	p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
 			scope_id = dev->ifindex;
 			dev_put(dev);
 		} else {
-			/* scope_id is set to zero on error */
-			strict_strtoul(p, 10, &scope_id);
+			if (strict_strtoul(p, 10, &scope_id) == 0) {
+				kfree(p);
+				return 0;
+			}
 		}
 
 		kfree(p);
+
 		sin6->sin6_scope_id = scope_id;
 		dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+		return 1;
 	}
+
+	return 0;
 }
 
 static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
 
 		sin6->sin6_family = AF_INET6;
 		*addr_len = sizeof(*sin6);
-		if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
-			nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
-			return;
+		if (in6_pton(string, str_len, addr,
+					IPV6_SCOPE_DELIMITER, &delim) != 0) {
+			if (nfs_parse_ipv6_scope_id(string, str_len,
+							delim, sin6) != 0)
+				return;
 		}
 	}
 
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
  * If there is a problem constructing the new sockaddr, set the address
  * family to AF_UNSPEC.
  */
-static void nfs_parse_ip_address(char *string, size_t str_len,
+void nfs_parse_ip_address(char *string, size_t str_len,
 				 struct sockaddr *sap, size_t *addr_len)
 {
 	unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
 					     &mnt->mount_server.addrlen);
 			kfree(string);
 			break;
+		case Opt_lookupcache:
+			string = match_strdup(args);
+			if (string == NULL)
+				goto out_nomem;
+			token = match_token(string,
+					nfs_lookupcache_tokens, args);
+			kfree(string);
+			switch (token) {
+				case Opt_lookupcache_all:
+					mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+					break;
+				case Opt_lookupcache_positive:
+					mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+					mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+					break;
+				case Opt_lookupcache_none:
+					mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+					break;
+				default:
+					errors++;
+					dfprintk(MOUNT, "NFS:   invalid "
+							"lookupcache argument\n");
+			};
+			break;
 
 		/*
 		 * Special options
@@ -1279,6 +1305,12 @@ static int nfs_parse_mount_options(char *raw,
 		}
 	}
 
+	if (errors > 0) {
+		dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
+				errors, (errors == 1 ? "" : "s"));
+		if (!sloppy)
+			return 0;
+	}
 	return 1;
 
 out_nomem:
@@ -1552,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
 		 * Translate to nfs_parsed_mount_data, which nfs_fill_super
 		 * can deal with.
 		 */
-		args->flags		= data->flags;
+		args->flags		= data->flags & NFS_MOUNT_FLAGMASK;
 		args->rsize		= data->rsize;
 		args->wsize		= data->wsize;
 		args->timeo		= data->timeo;
@@ -1718,9 +1750,9 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	 * ones were explicitly specified. Fall back to legacy behavior and
 	 * just return success.
 	 */
-	if ((nfsvers == 4 && options4->version == 1) ||
-	    (nfsvers <= 3 && options->version >= 1 &&
-	     options->version <= 6))
+	if ((nfsvers == 4 && (!options4 || options4->version == 1)) ||
+	    (nfsvers <= 3 && (!options || (options->version >= 1 &&
+					   options->version <= 6))))
 		return 0;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2427,7 +2459,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
@@ -2512,7 +2544,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
 		compare_super = NULL;
 
 	/* Get a superblock - note that we may end up sharing one that already exists */
-	s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+	s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto out_err_nosb;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 3adf8b266461..ecc295347775 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,10 +95,11 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
 static void nfs_async_unlink_release(void *calldata)
 {
 	struct nfs_unlinkdata	*data = calldata;
+	struct super_block *sb = data->dir->i_sb;
 
 	nfs_dec_sillycount(data->dir);
-	nfs_sb_deactive(NFS_SERVER(data->dir));
 	nfs_free_unlinkdata(data);
+	nfs_sb_deactive(sb);
 }
 
 static const struct rpc_call_ops nfs_unlink_ops = {
@@ -117,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		.rpc_message = &msg,
 		.callback_ops = &nfs_unlink_ops,
 		.callback_data = data,
+		.workqueue = nfsiod_workqueue,
 		.flags = RPC_TASK_ASYNC,
 	};
 	struct rpc_task *task;
@@ -148,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		nfs_dec_sillycount(dir);
 		return 0;
 	}
-	nfs_sb_active(NFS_SERVER(dir));
+	nfs_sb_active(dir->i_sb);
 	data->args.fh = NFS_FH(dir);
 	nfs_fattr_init(&data->res.dir_attr);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3229e217c773..9f9845859fc1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 		.bdi = mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_NONE,
 		.nr_to_write = LONG_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
 		.for_writepages = 1,
-		.range_cyclic = 1,
 	};
 	int ret;
 
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 33bfcf09db46..5839b229cd0e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -99,7 +99,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int fsidtype;
 	char *ep;
 	struct svc_expkey key;
-	struct svc_expkey *ek;
+	struct svc_expkey *ek = NULL;
 
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
@@ -107,7 +107,8 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	err = -ENOMEM;
-	if (!buf) goto out;
+	if (!buf)
+		goto out;
 
 	err = -EINVAL;
 	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
@@ -151,34 +152,32 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* now we want a pathname, or empty meaning NEGATIVE  */
 	err = -EINVAL;
-	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) < 0)
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len < 0)
 		goto out;
 	dprintk("Path seems to be <%s>\n", buf);
 	err = 0;
 	if (len == 0) {
 		set_bit(CACHE_NEGATIVE, &key.h.flags);
 		ek = svc_expkey_update(&key, ek);
-		if (ek)
-			cache_put(&ek->h, &svc_expkey_cache);
-		else err = -ENOMEM;
+		if (!ek)
+			err = -ENOMEM;
 	} else {
-		struct nameidata nd;
-		err = path_lookup(buf, 0, &nd);
+		err = kern_path(buf, 0, &key.ek_path);
 		if (err)
 			goto out;
 
 		dprintk("Found the path %s\n", buf);
-		key.ek_path = nd.path;
 
 		ek = svc_expkey_update(&key, ek);
-		if (ek)
-			cache_put(&ek->h, &svc_expkey_cache);
-		else
+		if (!ek)
 			err = -ENOMEM;
-		path_put(&nd.path);
+		path_put(&key.ek_path);
 	}
 	cache_flush();
  out:
+	if (ek)
+		cache_put(&ek->h, &svc_expkey_cache);
 	if (dom)
 		auth_domain_put(dom);
 	kfree(buf);
@@ -500,35 +499,22 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int len;
 	int err;
 	struct auth_domain *dom = NULL;
-	struct nameidata nd;
-	struct svc_export exp, *expp;
+	struct svc_export exp = {}, *expp;
 	int an_int;
 
-	nd.path.dentry = NULL;
-	exp.ex_pathname = NULL;
-
-	/* fs locations */
-	exp.ex_fslocs.locations = NULL;
-	exp.ex_fslocs.locations_count = 0;
-	exp.ex_fslocs.migrated = 0;
-
-	exp.ex_uuid = NULL;
-
-	/* secinfo */
-	exp.ex_nflavors = 0;
-
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
 	mesg[mlen-1] = 0;
 
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!buf) goto out;
+	if (!buf)
+		return -ENOMEM;
 
 	/* client */
-	len = qword_get(&mesg, buf, PAGE_SIZE);
 	err = -EINVAL;
-	if (len <= 0) goto out;
+	len = qword_get(&mesg, buf, PAGE_SIZE);
+	if (len <= 0)
+		goto out;
 
 	err = -ENOENT;
 	dom = auth_domain_find(buf);
@@ -537,25 +523,25 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 	/* path */
 	err = -EINVAL;
-	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
-		goto out;
-	err = path_lookup(buf, 0, &nd);
-	if (err) goto out_no_path;
+	if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
+		goto out1;
+
+	err = kern_path(buf, 0, &exp.ex_path);
+	if (err)
+		goto out1;
 
-	exp.h.flags = 0;
 	exp.ex_client = dom;
-	exp.ex_path.mnt = nd.path.mnt;
-	exp.ex_path.dentry = nd.path.dentry;
-	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
+
 	err = -ENOMEM;
+	exp.ex_pathname = kstrdup(buf, GFP_KERNEL);
 	if (!exp.ex_pathname)
-		goto out;
+		goto out2;
 
 	/* expiry */
 	err = -EINVAL;
 	exp.h.expiry_time = get_expiry(&mesg);
 	if (exp.h.expiry_time == 0)
-		goto out;
+		goto out3;
 
 	/* flags */
 	err = get_int(&mesg, &an_int);
@@ -563,22 +549,26 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = 0;
 		set_bit(CACHE_NEGATIVE, &exp.h.flags);
 	} else {
-		if (err || an_int < 0) goto out;	
+		if (err || an_int < 0)
+			goto out3;
 		exp.ex_flags= an_int;
 	
 		/* anon uid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_anon_uid= an_int;
 
 		/* anon gid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_anon_gid= an_int;
 
 		/* fsid */
 		err = get_int(&mesg, &an_int);
-		if (err) goto out;
+		if (err)
+			goto out3;
 		exp.ex_fsid = an_int;
 
 		while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
@@ -604,12 +594,13 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 				 */
 				break;
 			if (err)
-				goto out;
+				goto out4;
 		}
 
-		err = check_export(nd.path.dentry->d_inode, exp.ex_flags,
+		err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags,
 				   exp.ex_uuid);
-		if (err) goto out;
+		if (err)
+			goto out4;
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -622,15 +613,16 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 		err = -ENOMEM;
 	else
 		exp_put(expp);
- out:
+out4:
 	nfsd4_fslocs_free(&exp.ex_fslocs);
 	kfree(exp.ex_uuid);
+out3:
 	kfree(exp.ex_pathname);
-	if (nd.path.dentry)
-		path_put(&nd.path);
- out_no_path:
-	if (dom)
-		auth_domain_put(dom);
+out2:
+	path_put(&exp.ex_path);
+out1:
+	auth_domain_put(dom);
+out:
 	kfree(buf);
 	return err;
 }
@@ -998,7 +990,7 @@ exp_export(struct nfsctl_export *nxp)
 	struct svc_export	*exp = NULL;
 	struct svc_export	new;
 	struct svc_expkey	*fsid_key = NULL;
-	struct nameidata nd;
+	struct path path;
 	int		err;
 
 	/* Consistency check */
@@ -1021,12 +1013,12 @@ exp_export(struct nfsctl_export *nxp)
 
 
 	/* Look up the dentry */
-	err = path_lookup(nxp->ex_path, 0, &nd);
+	err = kern_path(nxp->ex_path, 0, &path);
 	if (err)
-		goto out_unlock;
+		goto out_put_clp;
 	err = -EINVAL;
 
-	exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL);
+	exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL);
 
 	memset(&new, 0, sizeof(new));
 
@@ -1034,8 +1026,8 @@ exp_export(struct nfsctl_export *nxp)
 	if ((nxp->ex_flags & NFSEXP_FSID) &&
 	    (!IS_ERR(fsid_key = exp_get_fsid_key(clp, nxp->ex_dev))) &&
 	    fsid_key->ek_path.mnt &&
-	    (fsid_key->ek_path.mnt != nd.path.mnt ||
-	     fsid_key->ek_path.dentry != nd.path.dentry))
+	    (fsid_key->ek_path.mnt != path.mnt ||
+	     fsid_key->ek_path.dentry != path.dentry))
 		goto finish;
 
 	if (!IS_ERR(exp)) {
@@ -1051,7 +1043,7 @@ exp_export(struct nfsctl_export *nxp)
 		goto finish;
 	}
 
-	err = check_export(nd.path.dentry->d_inode, nxp->ex_flags, NULL);
+	err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL);
 	if (err) goto finish;
 
 	err = -ENOMEM;
@@ -1064,7 +1056,7 @@ exp_export(struct nfsctl_export *nxp)
 	if (!new.ex_pathname)
 		goto finish;
 	new.ex_client = clp;
-	new.ex_path = nd.path;
+	new.ex_path = path;
 	new.ex_flags = nxp->ex_flags;
 	new.ex_anon_uid = nxp->ex_anon_uid;
 	new.ex_anon_gid = nxp->ex_anon_gid;
@@ -1090,9 +1082,9 @@ finish:
 		exp_put(exp);
 	if (fsid_key && !IS_ERR(fsid_key))
 		cache_put(&fsid_key->h, &svc_expkey_cache);
-	if (clp)
-		auth_domain_put(clp);
-	path_put(&nd.path);
+	path_put(&path);
+out_put_clp:
+	auth_domain_put(clp);
 out_unlock:
 	exp_writeunlock();
 out:
@@ -1121,7 +1113,7 @@ exp_unexport(struct nfsctl_export *nxp)
 {
 	struct auth_domain *dom;
 	svc_export *exp;
-	struct nameidata nd;
+	struct path path;
 	int		err;
 
 	/* Consistency check */
@@ -1138,13 +1130,13 @@ exp_unexport(struct nfsctl_export *nxp)
 		goto out_unlock;
 	}
 
-	err = path_lookup(nxp->ex_path, 0, &nd);
+	err = kern_path(nxp->ex_path, 0, &path);
 	if (err)
 		goto out_domain;
 
 	err = -EINVAL;
-	exp = exp_get_by_name(dom, nd.path.mnt, nd.path.dentry, NULL);
-	path_put(&nd.path);
+	exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL);
+	path_put(&path);
 	if (IS_ERR(exp))
 		goto out_domain;
 
@@ -1166,26 +1158,26 @@ out_unlock:
  * since its harder to fool a kernel module than a user space program.
  */
 int
-exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
+exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
 {
 	struct svc_export	*exp;
-	struct nameidata	nd;
+	struct path		path;
 	struct inode		*inode;
 	struct svc_fh		fh;
 	int			err;
 
 	err = -EPERM;
 	/* NB: we probably ought to check that it's NUL-terminated */
-	if (path_lookup(path, 0, &nd)) {
-		printk("nfsd: exp_rootfh path not found %s", path);
+	if (kern_path(name, 0, &path)) {
+		printk("nfsd: exp_rootfh path not found %s", name);
 		return err;
 	}
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 
 	dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
-		 path, nd.path.dentry, clp->name,
+		 name, path.dentry, clp->name,
 		 inode->i_sb->s_id, inode->i_ino);
-	exp = exp_parent(clp, nd.path.mnt, nd.path.dentry, NULL);
+	exp = exp_parent(clp, path.mnt, path.dentry, NULL);
 	if (IS_ERR(exp)) {
 		err = PTR_ERR(exp);
 		goto out;
@@ -1195,7 +1187,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 	 * fh must be initialized before calling fh_compose
 	 */
 	fh_init(&fh, maxsize);
-	if (fh_compose(&fh, exp, nd.path.dentry, NULL))
+	if (fh_compose(&fh, exp, path.dentry, NULL))
 		err = -EINVAL;
 	else
 		err = 0;
@@ -1203,7 +1195,7 @@ exp_rootfh(svc_client *clp, char *path, struct knfsd_fh *f, int maxsize)
 	fh_put(&fh);
 	exp_put(exp);
 out:
-	path_put(&nd.path);
+	path_put(&path);
 	return err;
 }
 
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 9e4a568a5013..b2786a5f9afe 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -19,6 +19,13 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_LOCKD
 
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh	nlm4_stale_fh
+#define nlm_failed	nlm4_failed
+#else
+#define nlm_stale_fh	nlm_lck_denied_nolocks
+#define nlm_failed	nlm_lck_denied_nolocks
+#endif
 /*
  * Note: we hold the dentry use count while the file is open.
  */
@@ -35,7 +42,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 	fh.fh_export = NULL;
 
 	exp_readlock();
-	nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp);
+	nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
 	fh_put(&fh);
 	rqstp->rq_client = NULL;
 	exp_readunlock();
@@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 		return 0;
 	case nfserr_dropit:
 		return nlm_drop_reply;
-#ifdef CONFIG_LOCKD_V4
 	case nfserr_stale:
-		return nlm4_stale_fh;
-#endif
+		return nlm_stale_fh;
 	default:
-		return nlm_lck_denied;
+		return nlm_failed;
 	}
 }
 
@@ -65,7 +70,6 @@ nlm_fclose(struct file *filp)
 static struct nlmsvc_binding	nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
-	.get_grace_period = get_nfs4_grace_period,
 };
 
 void
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 1c3b7654e966..4e3219e84116 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
 	fh = fh_copy(&resp->fh, &argp->fh);
-	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
 		RETURN_STATUS(nfserr);
 
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
 	fh = fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
 
 	if (!nfserr) {
 		nfserr = nfserrno( nfsd_set_posix_acl(
@@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	return fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
 }
 
 /*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index b647f2f872dc..9981dbb377a3 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
-	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	if (nfserr)
 		RETURN_STATUS(nfserr);
 
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
 
 	if (!nfserr) {
 		nfserr = nfserrno( nfsd_set_posix_acl(
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index c721a1e6e9dd..9dbd2eb91281 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 		SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
@@ -242,7 +243,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	attr   = &argp->attrs;
 
 	/* Get the directory inode */
-	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE);
+	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 	dprintk("nfsd: FSSTAT(3)   %s\n",
 				SVCFH_fmt(&argp->fh));
 
-	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
 	fh_put(&argp->fh);
 	RETURN_STATUS(nfserr);
 }
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 	resp->f_maxfilesize = ~(u32) 0;
 	resp->f_properties = NFS3_FSF_DEFAULT;
 
-	nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+	nfserr = fh_verify(rqstp, &argp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 
 	/* Check special features of the file system. May request
 	 * different read/write sizes for file systems known to have
@@ -597,7 +599,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 	resp->p_case_insensitive = 0;
 	resp->p_case_preserving = 1;
 
-	nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+	nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
 
 	if (nfserr == 0) {
 		struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab8..54b8b4140c8f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
 	 * enough space for either:
 	 */
 	alloc = sizeof(struct posix_ace_state_array)
-		+ cnt*sizeof(struct posix_ace_state);
+		+ cnt*sizeof(struct posix_user_ace_state);
 	state->users = kzalloc(alloc, GFP_KERNEL);
 	if (!state->users)
 		return -ENOMEM;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6e..094747a1227c 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
-	WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
+	WRITE32(cb_rec->cbr_stateid.si_generation);
+	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
 	WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
 		.addrsize	= sizeof(addr),
 		.timeout	= &timeparms,
 		.program	= &cb_program,
+		.prognumber	= cb->cb_prog,
 		.version	= nfs_cb_version[1]->number,
 		.authflavor	= RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
 	addr.sin_port = htons(cb->cb_port);
 	addr.sin_addr.s_addr = htonl(cb->cb_addr);
 
-	/* Initialize rpc_stat */
-	memset(args.program->stats, 0, sizeof(struct rpc_stat));
-
 	/* Create RPC client */
 	client = rpc_create(&args);
 	if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c309c881bd4e..669461e291ae 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 		return nfserr_inval;
 
 	if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-		accmode |= MAY_READ;
+		accmode |= NFSD_MAY_READ;
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-		accmode |= (MAY_WRITE | MAY_TRUNC);
+		accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
 	if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
-		accmode |= MAY_WRITE;
+		accmode |= NFSD_MAY_WRITE;
 
 	status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
 
@@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 			&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
 	if (!created)
-		status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
+		status = do_open_permission(rqstp, current_fh, open,
+					    NFSD_MAY_NOP);
 
 out:
 	fh_put(&resfh);
@@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
 
-	status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
+	status = do_open_permission(rqstp, current_fh, open,
+				    NFSD_MAY_OWNER_OVERRIDE);
 
 	return status;
 }
@@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
 		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
 				rp->rp_openfh_len);
-		status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
 				" restoring previous filehandle\n");
@@ -199,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* Openowner is now set, so sequence id will get bumped.  Now we need
 	 * these checks before we do any creates: */
 	status = nfserr_grace;
-	if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+	if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 	status = nfserr_no_grace;
-	if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+	if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 
 	switch (open->op_claim_type) {
@@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
 	memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
 	       putfh->pf_fhlen);
-	return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+	return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 }
 
 static __be32
@@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	fh_init(&resfh, NFS4_FHSIZE);
 
-	status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE);
+	status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
+			   NFSD_MAY_CREATE);
 	if (status == nfserr_symlink)
 		status = nfserr_notdir;
 	if (status)
@@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	__be32 status;
 
-	status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+	status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 	if (status)
 		return status;
 
@@ -572,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	__be32 status;
 
-	if (nfs4_in_grace())
+	if (locks_in_grace())
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
@@ -593,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (!cstate->save_fh.fh_dentry)
 		return status;
-	if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
+	if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
 					& NFSEXP_NOSUBTREECHECK))
 		return nfserr_grace;
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	int count;
 	__be32 status;
 
-	status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+	status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 	if (status)
 		return status;
 
@@ -843,10 +846,13 @@ struct nfsd4_operation {
 #define ALLOWED_WITHOUT_FH 1
 /* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
 #define ALLOWED_ON_ABSENT_FS 2
+	char *op_name;
 };
 
 static struct nfsd4_operation nfsd4_ops[];
 
+static const char *nfsd4_op_name(unsigned opnum);
+
 /*
  * COMPOUND call.
  */
@@ -861,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	int		slack_bytes;
 	__be32		status;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
-
 	resp->xbuf = &rqstp->rq_res;
 	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
@@ -884,11 +885,18 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
 		goto out;
 
+	status = nfserr_resource;
+	cstate = cstate_alloc();
+	if (cstate == NULL)
+		goto out;
+
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
-		dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum);
+		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
+			resp->opcnt, args->opcnt, op->opnum,
+			nfsd4_op_name(op->opnum));
 
 		/*
 		 * The XDR decode routines may have pre-set op->status;
@@ -949,129 +957,172 @@ encode_op:
 		nfsd4_increment_op_stats(op->opnum);
 	}
 
+	cstate_free(cstate);
 out:
 	nfsd4_release_compoundargs(args);
-	cstate_free(cstate);
+	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
 static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
+		.op_name = "OP_ACCESS",
 	},
 	[OP_CLOSE] = {
 		.op_func = (nfsd4op_func)nfsd4_close,
+		.op_name = "OP_CLOSE",
 	},
 	[OP_COMMIT] = {
 		.op_func = (nfsd4op_func)nfsd4_commit,
+		.op_name = "OP_COMMIT",
 	},
 	[OP_CREATE] = {
 		.op_func = (nfsd4op_func)nfsd4_create,
+		.op_name = "OP_CREATE",
 	},
 	[OP_DELEGRETURN] = {
 		.op_func = (nfsd4op_func)nfsd4_delegreturn,
+		.op_name = "OP_DELEGRETURN",
 	},
 	[OP_GETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_getattr,
 		.op_flags = ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_GETATTR",
 	},
 	[OP_GETFH] = {
 		.op_func = (nfsd4op_func)nfsd4_getfh,
+		.op_name = "OP_GETFH",
 	},
 	[OP_LINK] = {
 		.op_func = (nfsd4op_func)nfsd4_link,
+		.op_name = "OP_LINK",
 	},
 	[OP_LOCK] = {
 		.op_func = (nfsd4op_func)nfsd4_lock,
+		.op_name = "OP_LOCK",
 	},
 	[OP_LOCKT] = {
 		.op_func = (nfsd4op_func)nfsd4_lockt,
+		.op_name = "OP_LOCKT",
 	},
 	[OP_LOCKU] = {
 		.op_func = (nfsd4op_func)nfsd4_locku,
+		.op_name = "OP_LOCKU",
 	},
 	[OP_LOOKUP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookup,
+		.op_name = "OP_LOOKUP",
 	},
 	[OP_LOOKUPP] = {
 		.op_func = (nfsd4op_func)nfsd4_lookupp,
+		.op_name = "OP_LOOKUPP",
 	},
 	[OP_NVERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_nverify,
+		.op_name = "OP_NVERIFY",
 	},
 	[OP_OPEN] = {
 		.op_func = (nfsd4op_func)nfsd4_open,
+		.op_name = "OP_OPEN",
 	},
 	[OP_OPEN_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_open_confirm,
+		.op_name = "OP_OPEN_CONFIRM",
 	},
 	[OP_OPEN_DOWNGRADE] = {
 		.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+		.op_name = "OP_OPEN_DOWNGRADE",
 	},
 	[OP_PUTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
-		/* unsupported; just for future reference: */
+		/* unsupported, just for future reference: */
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_PUTPUBFH",
 	},
 	[OP_PUTROOTFH] = {
 		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_PUTROOTFH",
 	},
 	[OP_READ] = {
 		.op_func = (nfsd4op_func)nfsd4_read,
+		.op_name = "OP_READ",
 	},
 	[OP_READDIR] = {
 		.op_func = (nfsd4op_func)nfsd4_readdir,
+		.op_name = "OP_READDIR",
 	},
 	[OP_READLINK] = {
 		.op_func = (nfsd4op_func)nfsd4_readlink,
+		.op_name = "OP_READLINK",
 	},
 	[OP_REMOVE] = {
 		.op_func = (nfsd4op_func)nfsd4_remove,
+		.op_name = "OP_REMOVE",
 	},
 	[OP_RENAME] = {
+		.op_name = "OP_RENAME",
 		.op_func = (nfsd4op_func)nfsd4_rename,
 	},
 	[OP_RENEW] = {
 		.op_func = (nfsd4op_func)nfsd4_renew,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_RENEW",
 	},
 	[OP_RESTOREFH] = {
 		.op_func = (nfsd4op_func)nfsd4_restorefh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_RESTOREFH",
 	},
 	[OP_SAVEFH] = {
 		.op_func = (nfsd4op_func)nfsd4_savefh,
+		.op_name = "OP_SAVEFH",
 	},
 	[OP_SECINFO] = {
 		.op_func = (nfsd4op_func)nfsd4_secinfo,
+		.op_name = "OP_SECINFO",
 	},
 	[OP_SETATTR] = {
 		.op_func = (nfsd4op_func)nfsd4_setattr,
+		.op_name = "OP_SETATTR",
 	},
 	[OP_SETCLIENTID] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_SETCLIENTID",
 	},
 	[OP_SETCLIENTID_CONFIRM] = {
 		.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_SETCLIENTID_CONFIRM",
 	},
 	[OP_VERIFY] = {
 		.op_func = (nfsd4op_func)nfsd4_verify,
+		.op_name = "OP_VERIFY",
 	},
 	[OP_WRITE] = {
 		.op_func = (nfsd4op_func)nfsd4_write,
+		.op_name = "OP_WRITE",
 	},
 	[OP_RELEASE_LOCKOWNER] = {
 		.op_func = (nfsd4op_func)nfsd4_release_lockowner,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+		.op_name = "OP_RELEASE_LOCKOWNER",
 	},
 };
 
+static const char *nfsd4_op_name(unsigned opnum)
+{
+	if (opnum < ARRAY_SIZE(nfsd4_ops))
+		return nfsd4_ops[opnum].op_name;
+	return "unknown_operation";
+}
+
 #define nfs4svc_decode_voidargs		NULL
 #define nfs4svc_release_void		NULL
 #define nfsd4_voidres			nfsd4_voidargs
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 145b3c877a27..bb93946ace22 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -51,7 +51,7 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-static struct nameidata rec_dir;
+static struct path rec_dir;
 static int rec_dir_init = 0;
 
 static void
@@ -121,9 +121,9 @@ out_no_tfm:
 static void
 nfsd4_sync_rec_dir(void)
 {
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
-	nfsd_sync_dir(rec_dir.path.dentry);
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	nfsd_sync_dir(rec_dir.dentry);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 }
 
 int
@@ -143,9 +143,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 	nfs4_save_user(&uid, &gid);
 
 	/* lock the parent */
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
 
-	dentry = lookup_one_len(dname, rec_dir.path.dentry, HEXDIR_LEN-1);
+	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		goto out_unlock;
@@ -155,15 +155,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
 		goto out_put;
 	}
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out_put;
-	status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
-	mnt_drop_write(rec_dir.path.mnt);
+	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+	mnt_drop_write(rec_dir.mnt);
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (status == 0) {
 		clp->cl_firststate = 1;
 		nfsd4_sync_rec_dir();
@@ -226,7 +226,7 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 
 	nfs4_save_user(&uid, &gid);
 
-	filp = dentry_open(dget(dir), mntget(rec_dir.path.mnt), O_RDONLY);
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY);
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
@@ -291,9 +291,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
-	mutex_lock(&rec_dir.path.dentry->d_inode->i_mutex);
-	dentry = lookup_one_len(name, rec_dir.path.dentry, namlen);
-	mutex_unlock(&rec_dir.path.dentry->d_inode->i_mutex);
+	mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
 		return status;
@@ -302,7 +302,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
 	if (!dentry->d_inode)
 		goto out;
 
-	status = nfsd4_clear_clid_dir(rec_dir.path.dentry, dentry);
+	status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
 out:
 	dput(dentry);
 	return status;
@@ -318,7 +318,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	if (!rec_dir_init || !clp->cl_firststate)
 		return;
 
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out;
 	clp->cl_firststate = 0;
@@ -327,7 +327,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	nfs4_reset_user(uid, gid);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.path.mnt);
+	mnt_drop_write(rec_dir.mnt);
 out:
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
@@ -357,17 +357,17 @@ nfsd4_recdir_purge_old(void) {
 
 	if (!rec_dir_init)
 		return;
-	status = mnt_want_write(rec_dir.path.mnt);
+	status = mnt_want_write(rec_dir.mnt);
 	if (status)
 		goto out;
-	status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
+	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
 	if (status == 0)
 		nfsd4_sync_rec_dir();
-	mnt_drop_write(rec_dir.path.mnt);
+	mnt_drop_write(rec_dir.mnt);
 out:
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
-			" directory %s\n", rec_dir.path.dentry->d_name.name);
+			" directory %s\n", rec_dir.dentry->d_name.name);
 }
 
 static int
@@ -387,10 +387,10 @@ int
 nfsd4_recdir_load(void) {
 	int status;
 
-	status = nfsd4_list_rec_dir(rec_dir.path.dentry, load_recdir);
+	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_dir.path.dentry->d_name.name);
+			" directory %s\n", rec_dir.dentry->d_name.name);
 	return status;
 }
 
@@ -412,7 +412,7 @@ nfsd4_init_recdir(char *rec_dirname)
 
 	nfs4_save_user(&uid, &gid);
 
-	status = path_lookup(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+	status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
 			&rec_dir);
 	if (status)
 		printk("NFSD: unable to find recovery directory %s\n",
@@ -429,5 +429,5 @@ nfsd4_shutdown_recdir(void)
 	if (!rec_dir_init)
 		return;
 	rec_dir_init = 0;
-	path_put(&rec_dir.path);
+	path_put(&rec_dir);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8799b8708188..b0bebc552a11 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
 static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
-static int in_grace = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -1173,6 +1172,24 @@ static inline int deny_valid(u32 x)
 	return x <= NFS4_SHARE_DENY_BOTH;
 }
 
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used.  This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempt to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ *	OPEN allow read, deny write
+ *	OPEN allow both, deny none
+ *	DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
 static void
 set_access(unsigned int *access, unsigned long bmap) {
 	int i;
@@ -1570,6 +1587,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
 		int err = get_write_access(inode);
 		if (err)
 			return nfserrno(err);
+		err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
+		if (err)
+			return nfserrno(err);
+		file_take_write(filp);
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
@@ -1579,8 +1600,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
 	}
 	/* remember the open */
 	filp->f_mode |= open->op_share_access;
-	set_bit(open->op_share_access, &stp->st_access_bmap);
-	set_bit(open->op_share_deny, &stp->st_deny_bmap);
+	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
 	return nfs_ok;
 }
@@ -1618,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		case NFS4_OPEN_CLAIM_NULL:
 			/* Let's not give out any delegations till everyone's
 			 * had the chance to reclaim theirs.... */
-			if (nfs4_in_grace())
+			if (locks_in_grace())
 				goto out;
 			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
 				goto out;
@@ -1722,9 +1743,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		/* Stateid was not found, this is a new OPEN */
 		int flags = 0;
 		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-			flags |= MAY_READ;
+			flags |= NFSD_MAY_READ;
 		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags |= MAY_WRITE;
+			flags |= NFSD_MAY_WRITE;
 		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
 		if (status)
 			goto out;
@@ -1794,12 +1815,15 @@ out:
 	return status;
 }
 
+struct lock_manager nfsd4_manager = {
+};
+
 static void
-end_grace(void)
+nfsd4_end_grace(void)
 {
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
-	in_grace = 0;
+	locks_end_grace(&nfsd4_manager);
 }
 
 static time_t
@@ -1816,8 +1840,8 @@ nfs4_laundromat(void)
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
-	if (in_grace)
-		end_grace();
+	if (locks_in_grace())
+		nfsd4_end_grace();
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1952,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 		return nfserr_bad_stateid;
 	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
-	else if (nfs4_in_grace()) {
+	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existance of
 		 * conflicting state; so we must wait out the grace period. */
 		return nfserr_grace;
@@ -1971,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 static inline int
 io_during_grace_disallowed(struct inode *inode, int flags)
 {
-	return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
+	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
 		&& mandatory_lock(inode);
 }
 
@@ -2610,7 +2634,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 return nfserr_inval;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh,
-				S_IFREG, MAY_LOCK))) {
+				S_IFREG, NFSD_MAY_LOCK))) {
 		dprintk("NFSD: nfsd4_lock: permission denied!\n");
 		return status;
 	}
@@ -2671,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
-	if (nfs4_in_grace() && !lock->lk_reclaim)
+	if (locks_in_grace() && !lock->lk_reclaim)
 		goto out;
 	status = nfserr_no_grace;
-	if (!nfs4_in_grace() && lock->lk_reclaim)
+	if (!locks_in_grace() && lock->lk_reclaim)
 		goto out;
 
 	locks_init_lock(&file_lock);
@@ -2757,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	int error;
 	__be32 status;
 
-	if (nfs4_in_grace())
+	if (locks_in_grace())
 		return nfserr_grace;
 
 	if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3170,9 +3194,9 @@ __nfs4_state_start(void)
 	unsigned long grace_time;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs_grace_period();
+	grace_time = get_nfs4_grace_period();
 	lease_time = user_lease_time;
-	in_grace = 1;
+	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
 	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3191,12 +3215,6 @@ nfs4_state_start(void)
 	return;
 }
 
-int
-nfs4_in_grace(void)
-{
-	return in_grace;
-}
-
 time_t
 nfs4_lease_time(void)
 {
@@ -3249,12 +3267,14 @@ nfs4_state_shutdown(void)
 	nfs4_unlock_state();
 }
 
+/*
+ * user_recovery_dirname is protected by the nfsd_mutex since it's only
+ * accessed when nfsd is starting.
+ */
 static void
 nfs4_set_recdir(char *recdir)
 {
-	nfs4_lock_state();
 	strcpy(user_recovery_dirname, recdir);
-	nfs4_unlock_state();
 }
 
 /*
@@ -3264,20 +3284,26 @@ int
 nfs4_reset_recoverydir(char *recdir)
 {
 	int status;
-	struct nameidata nd;
+	struct path path;
 
-	status = path_lookup(recdir, LOOKUP_FOLLOW, &nd);
+	status = kern_path(recdir, LOOKUP_FOLLOW, &path);
 	if (status)
 		return status;
 	status = -ENOTDIR;
-	if (S_ISDIR(nd.path.dentry->d_inode->i_mode)) {
+	if (S_ISDIR(path.dentry->d_inode->i_mode)) {
 		nfs4_set_recdir(recdir);
 		status = 0;
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return status;
 }
 
+char *
+nfs4_recoverydir(void)
+{
+	return user_recovery_dirname;
+}
+
 /*
  * Called when leasetime is changed.
  *
@@ -3286,11 +3312,12 @@ nfs4_reset_recoverydir(char *recdir)
  * we start to register any changes in lease time.  If the administrator
  * really wants to change the lease time *now*, they can go ahead and bring
  * nfsd down and then back up again after changing the lease time.
+ *
+ * user_lease_time is protected by nfsd_mutex since it's only really accessed
+ * when nfsd is starting
  */
 void
 nfs4_reset_lease(time_t leasetime)
 {
-	lock_kernel();
 	user_lease_time = leasetime;
-	unlock_kernel();
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c513bbdf2d36..afcdf4b76843 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
 }
 
 static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+	DECODE_HEAD;
+
+	READ_BUF(sizeof(stateid_t));
+	READ32(sid->si_generation);
+	COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
 {
 	DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 	DECODE_HEAD;
 
 	close->cl_stateowner = NULL;
-	READ_BUF(4 + sizeof(stateid_t));
+	READ_BUF(4);
 	READ32(close->cl_seqid);
-	READ32(close->cl_stateid.si_generation);
-	COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
+	return nfsd4_decode_stateid(argp, &close->cl_stateid);
 
 	DECODE_TAIL;
 }
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 static inline __be32
 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
 {
-	DECODE_HEAD;
-
-	READ_BUF(sizeof(stateid_t));
-	READ32(dr->dr_stateid.si_generation);
-	COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
-	DECODE_TAIL;
+	return nfsd4_decode_stateid(argp, &dr->dr_stateid);
 }
 
 static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 	READ32(lock->lk_is_new);
 
 	if (lock->lk_is_new) {
-		READ_BUF(36);
+		READ_BUF(4);
 		READ32(lock->lk_new_open_seqid);
-		READ32(lock->lk_new_open_stateid.si_generation);
-
-		COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
+		status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+		if (status)
+			return status;
+		READ_BUF(8 + sizeof(clientid_t));
 		READ32(lock->lk_new_lock_seqid);
 		COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
 		READ32(lock->lk_new_owner.len);
 		READ_BUF(lock->lk_new_owner.len);
 		READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
 	} else {
-		READ_BUF(20);
-		READ32(lock->lk_old_lock_stateid.si_generation);
-		COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
+		status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
 		READ32(lock->lk_old_lock_seqid);
 	}
 
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
 	DECODE_HEAD;
 
 	locku->lu_stateowner = NULL;
-	READ_BUF(24 + sizeof(stateid_t));
+	READ_BUF(8);
 	READ32(locku->lu_type);
 	if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
 		goto xdr_error;
 	READ32(locku->lu_seqid);
-	READ32(locku->lu_stateid.si_generation);
-	COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
 	READ64(locku->lu_offset);
 	READ64(locku->lu_length);
 
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		READ32(open->op_delegate_type);
 		break;
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-		READ_BUF(sizeof(stateid_t) + 4);
-		COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
 		READ32(open->op_fname.len);
 		READ_BUF(open->op_fname.len);
 		SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
 	DECODE_HEAD;
 		    
 	open_conf->oc_stateowner = NULL;
-	READ_BUF(4 + sizeof(stateid_t));
-	READ32(open_conf->oc_req_stateid.si_generation);
-	COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+	if (status)
+		return status;
+	READ_BUF(4);
 	READ32(open_conf->oc_seqid);
 						        
 	DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
 	DECODE_HEAD;
 		    
 	open_down->od_stateowner = NULL;
-	READ_BUF(12 + sizeof(stateid_t));
-	READ32(open_down->od_stateid.si_generation);
-	COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+	if (status)
+		return status;
+	READ_BUF(12);
 	READ32(open_down->od_seqid);
 	READ32(open_down->od_share_access);
 	READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 {
 	DECODE_HEAD;
 
-	READ_BUF(sizeof(stateid_t) + 12);
-	READ32(read->rd_stateid.si_generation);
-	COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+	if (status)
+		return status;
+	READ_BUF(12);
 	READ64(read->rd_offset);
 	READ32(read->rd_length);
 
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
 static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
-	DECODE_HEAD;
-
-	READ_BUF(sizeof(stateid_t));
-	READ32(setattr->sa_stateid.si_generation);
-	COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
-	if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
-		goto out;
+	__be32 status;
 
-	DECODE_TAIL;
+	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+	if (status)
+		return status;
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+				  &setattr->sa_iattr, &setattr->sa_acl);
 }
 
 static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 	int len;
 	DECODE_HEAD;
 
-	READ_BUF(sizeof(stateid_opaque_t) + 20);
-	READ32(write->wr_stateid.si_generation);
-	COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
 	READ64(write->wr_offset);
 	READ32(write->wr_stable_how);
 	if (write->wr_stable_how > 2)
@@ -986,10 +999,74 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 }
 
 static __be32
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+{
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+{
+	return nfserr_opnotsupp;
+}
+
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+
+static nfsd4_dec nfsd4_dec_ops[] = {
+	[OP_ACCESS]		= (nfsd4_dec)nfsd4_decode_access,
+	[OP_CLOSE]		= (nfsd4_dec)nfsd4_decode_close,
+	[OP_COMMIT]		= (nfsd4_dec)nfsd4_decode_commit,
+	[OP_CREATE]		= (nfsd4_dec)nfsd4_decode_create,
+	[OP_DELEGPURGE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DELEGRETURN]	= (nfsd4_dec)nfsd4_decode_delegreturn,
+	[OP_GETATTR]		= (nfsd4_dec)nfsd4_decode_getattr,
+	[OP_GETFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_LINK]		= (nfsd4_dec)nfsd4_decode_link,
+	[OP_LOCK]		= (nfsd4_dec)nfsd4_decode_lock,
+	[OP_LOCKT]		= (nfsd4_dec)nfsd4_decode_lockt,
+	[OP_LOCKU]		= (nfsd4_dec)nfsd4_decode_locku,
+	[OP_LOOKUP]		= (nfsd4_dec)nfsd4_decode_lookup,
+	[OP_LOOKUPP]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_NVERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_OPEN]		= (nfsd4_dec)nfsd4_decode_open,
+	[OP_OPENATTR]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_open_confirm,
+	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
+	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
+	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
+	[OP_READLINK]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_REMOVE]		= (nfsd4_dec)nfsd4_decode_remove,
+	[OP_RENAME]		= (nfsd4_dec)nfsd4_decode_rename,
+	[OP_RENEW]		= (nfsd4_dec)nfsd4_decode_renew,
+	[OP_RESTOREFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SAVEFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SECINFO]		= (nfsd4_dec)nfsd4_decode_secinfo,
+	[OP_SETATTR]		= (nfsd4_dec)nfsd4_decode_setattr,
+	[OP_SETCLIENTID]	= (nfsd4_dec)nfsd4_decode_setclientid,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+	[OP_VERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_WRITE]		= (nfsd4_dec)nfsd4_decode_write,
+	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_release_lockowner,
+};
+
+struct nfsd4_minorversion_ops {
+	nfsd4_dec *decoders;
+	int nops;
+};
+
+static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
+	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+};
+
+static __be32
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
 	DECODE_HEAD;
 	struct nfsd4_op *op;
+	struct nfsd4_minorversion_ops *ops;
 	int i;
 
 	/*
@@ -1019,6 +1096,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		}
 	}
 
+	if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
+		argp->opcnt = 0;
+
+	ops = &nfsd4_minorversion[argp->minorversion];
 	for (i = 0; i < argp->opcnt; i++) {
 		op = &argp->ops[i];
 		op->replay = NULL;
@@ -1056,120 +1137,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		}
 		op->opnum = ntohl(*argp->p++);
 
-		switch (op->opnum) {
-		case 2: /* Reserved operation */
-			op->opnum = OP_ILLEGAL;
-			if (argp->minorversion == 0)
-				op->status = nfserr_op_illegal;
-			else
-				op->status = nfserr_minor_vers_mismatch;
-			break;
-		case OP_ACCESS:
-			op->status = nfsd4_decode_access(argp, &op->u.access);
-			break;
-		case OP_CLOSE:
-			op->status = nfsd4_decode_close(argp, &op->u.close);
-			break;
-		case OP_COMMIT:
-			op->status = nfsd4_decode_commit(argp, &op->u.commit);
-			break;
-		case OP_CREATE:
-			op->status = nfsd4_decode_create(argp, &op->u.create);
-			break;
-		case OP_DELEGRETURN:
-			op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn);
-			break;
-		case OP_GETATTR:
-			op->status = nfsd4_decode_getattr(argp, &op->u.getattr);
-			break;
-		case OP_GETFH:
-			op->status = nfs_ok;
-			break;
-		case OP_LINK:
-			op->status = nfsd4_decode_link(argp, &op->u.link);
-			break;
-		case OP_LOCK:
-			op->status = nfsd4_decode_lock(argp, &op->u.lock);
-			break;
-		case OP_LOCKT:
-			op->status = nfsd4_decode_lockt(argp, &op->u.lockt);
-			break;
-		case OP_LOCKU:
-			op->status = nfsd4_decode_locku(argp, &op->u.locku);
-			break;
-		case OP_LOOKUP:
-			op->status = nfsd4_decode_lookup(argp, &op->u.lookup);
-			break;
-		case OP_LOOKUPP:
-			op->status = nfs_ok;
-			break;
-		case OP_NVERIFY:
-			op->status = nfsd4_decode_verify(argp, &op->u.nverify);
-			break;
-		case OP_OPEN:
-			op->status = nfsd4_decode_open(argp, &op->u.open);
-			break;
-		case OP_OPEN_CONFIRM:
-			op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm);
-			break;
-		case OP_OPEN_DOWNGRADE:
-			op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade);
-			break;
-		case OP_PUTFH:
-			op->status = nfsd4_decode_putfh(argp, &op->u.putfh);
-			break;
-		case OP_PUTROOTFH:
-			op->status = nfs_ok;
-			break;
-		case OP_READ:
-			op->status = nfsd4_decode_read(argp, &op->u.read);
-			break;
-		case OP_READDIR:
-			op->status = nfsd4_decode_readdir(argp, &op->u.readdir);
-			break;
-		case OP_READLINK:
-			op->status = nfs_ok;
-			break;
-		case OP_REMOVE:
-			op->status = nfsd4_decode_remove(argp, &op->u.remove);
-			break;
-		case OP_RENAME:
-			op->status = nfsd4_decode_rename(argp, &op->u.rename);
-			break;
-		case OP_RESTOREFH:
-			op->status = nfs_ok;
-			break;
-		case OP_RENEW:
-			op->status = nfsd4_decode_renew(argp, &op->u.renew);
-			break;
-		case OP_SAVEFH:
-			op->status = nfs_ok;
-			break;
-		case OP_SECINFO:
-			op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
-			break;
-		case OP_SETATTR:
-			op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
-			break;
-		case OP_SETCLIENTID:
-			op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid);
-			break;
-		case OP_SETCLIENTID_CONFIRM:
-			op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm);
-			break;
-		case OP_VERIFY:
-			op->status = nfsd4_decode_verify(argp, &op->u.verify);
-			break;
-		case OP_WRITE:
-			op->status = nfsd4_decode_write(argp, &op->u.write);
-			break;
-		case OP_RELEASE_LOCKOWNER:
-			op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner);
-			break;
-		default:
+		if (op->opnum >= OP_ACCESS && op->opnum < ops->nops)
+			op->status = ops->decoders[op->opnum](argp, &op->u);
+		else {
 			op->opnum = OP_ILLEGAL;
 			op->status = nfserr_op_illegal;
-			break;
 		}
 
 		if (op->status) {
@@ -1201,11 +1173,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	*p++ = htonl((u32)((n) >> 32));				\
 	*p++ = htonl((u32)(n));					\
 } while (0)
-#define WRITEMEM(ptr,nbytes)     do {				\
+#define WRITEMEM(ptr,nbytes)     do { if (nbytes > 0) {		\
 	*(p + XDR_QUADLEN(nbytes) -1) = 0;                      \
 	memcpy(p, ptr, nbytes);					\
 	p += XDR_QUADLEN(nbytes);				\
-} while (0)
+}} while (0)
 #define WRITECINFO(c)		do {				\
 	*p++ = htonl(c.atomic);					\
 	*p++ = htonl(c.before_ctime_sec);				\
@@ -1224,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  * Header routine to setup seqid operation replay cache
  */
 #define ENCODE_SEQID_OP_HEAD					\
-	__be32 *p;						\
 	__be32 *save;						\
 								\
 	save = resp->p;
@@ -1992,6 +1963,17 @@ fail:
 }
 
 static void
+nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+{
+	ENCODE_HEAD;
+
+	RESERVE_SPACE(sizeof(stateid_t));
+	WRITE32(sid->si_generation);
+	WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+	ADJUST_ARGS();
+}
+
+static __be32
 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
 	ENCODE_HEAD;
@@ -2002,24 +1984,23 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 		WRITE32(access->ac_resp_access);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(close->cl_stateid.si_generation);
-		WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &close->cl_stateid);
+
 	ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+	return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
 {
 	ENCODE_HEAD;
@@ -2029,9 +2010,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 		WRITEMEM(commit->co_verf.data, 8);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
 {
 	ENCODE_HEAD;
@@ -2044,6 +2026,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 		WRITE32(create->cr_bmval[1]);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
 static __be32
@@ -2064,9 +2047,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	return nfserr;
 }
 
-static void
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
+static __be32
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
 {
+	struct svc_fh *fhp = *fhpp;
 	unsigned int len;
 	ENCODE_HEAD;
 
@@ -2077,6 +2061,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
 		WRITEMEM(&fhp->fh_handle.fh_base, len);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
 /*
@@ -2104,46 +2089,42 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 	ADJUST_ARGS();
 }
 
-static void
+static __be32
 nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(4 + sizeof(stateid_t));
-		WRITE32(lock->lk_resp_stateid.si_generation);
-		WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	} else if (nfserr == nfserr_denied)
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+	else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
 	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
 {
 	if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(locku->lu_stateid.si_generation);
-		WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
-				        
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &locku->lu_stateid);
+
 	ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+	return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
 {
 	ENCODE_HEAD;
@@ -2153,20 +2134,21 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
 		WRITECINFO(link->li_cinfo);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
 
-static void
+static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
+	ENCODE_HEAD;
 	ENCODE_SEQID_OP_HEAD;
 
 	if (nfserr)
 		goto out;
 
-	RESERVE_SPACE(36 + sizeof(stateid_t));
-	WRITE32(open->op_stateid.si_generation);
-	WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
+	nfsd4_encode_stateid(resp, &open->op_stateid);
+	RESERVE_SPACE(40);
 	WRITECINFO(open->op_cinfo);
 	WRITE32(open->op_rflags);
 	WRITE32(2);
@@ -2179,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	case NFS4_OPEN_DELEGATE_NONE:
 		break;
 	case NFS4_OPEN_DELEGATE_READ:
-		RESERVE_SPACE(20 + sizeof(stateid_t));
-		WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+		RESERVE_SPACE(20);
 		WRITE32(open->op_recall);
 
 		/*
@@ -2193,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 		ADJUST_ARGS();
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
-		RESERVE_SPACE(32 + sizeof(stateid_t));
-		WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+		RESERVE_SPACE(32);
 		WRITE32(0);
 
 		/*
@@ -2219,36 +2201,31 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	/* XXX save filehandle here */
 out:
 	ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
 	ENCODE_SEQID_OP_HEAD;
-				        
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(oc->oc_resp_stateid.si_generation);
-		WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
 	ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
 	ENCODE_SEQID_OP_HEAD;
-				        
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(od->od_stateid.si_generation);
-		WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &od->od_stateid);
 
 	ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+	return nfserr;
 }
 
 static __be32
@@ -2443,7 +2420,7 @@ err_no_verf:
 	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
 	ENCODE_HEAD;
@@ -2453,9 +2430,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 		WRITECINFO(remove->rm_cinfo);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
 	ENCODE_HEAD;
@@ -2466,9 +2444,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
 		WRITECINFO(rename->rn_tinfo);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 		     struct nfsd4_secinfo *secinfo)
 {
@@ -2532,13 +2511,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 out:
 	if (exp)
 		exp_put(exp);
+	return nfserr;
 }
 
 /*
  * The SETATTR encode routine is special -- it always encodes a bitmap,
  * regardless of the error status.
  */
-static void
+static __be32
 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
 	ENCODE_HEAD;
@@ -2555,9 +2535,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 		WRITE32(setattr->sa_bmval[1]);
 	}
 	ADJUST_ARGS();
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
 	ENCODE_HEAD;
@@ -2574,9 +2555,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
 		WRITE32(0);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
-static void
+static __be32
 nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
 	ENCODE_HEAD;
@@ -2588,8 +2570,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 		WRITEMEM(write->wr_verifier.data, 8);
 		ADJUST_ARGS();
 	}
+	return nfserr;
 }
 
+static __be32
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+{
+	return nfserr;
+}
+
+typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+
+static nfsd4_enc nfsd4_enc_ops[] = {
+	[OP_ACCESS]		= (nfsd4_enc)nfsd4_encode_access,
+	[OP_CLOSE]		= (nfsd4_enc)nfsd4_encode_close,
+	[OP_COMMIT]		= (nfsd4_enc)nfsd4_encode_commit,
+	[OP_CREATE]		= (nfsd4_enc)nfsd4_encode_create,
+	[OP_DELEGPURGE]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DELEGRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETATTR]		= (nfsd4_enc)nfsd4_encode_getattr,
+	[OP_GETFH]		= (nfsd4_enc)nfsd4_encode_getfh,
+	[OP_LINK]		= (nfsd4_enc)nfsd4_encode_link,
+	[OP_LOCK]		= (nfsd4_enc)nfsd4_encode_lock,
+	[OP_LOCKT]		= (nfsd4_enc)nfsd4_encode_lockt,
+	[OP_LOCKU]		= (nfsd4_enc)nfsd4_encode_locku,
+	[OP_LOOKUP]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LOOKUPP]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_NVERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_OPEN]		= (nfsd4_enc)nfsd4_encode_open,
+	[OP_OPEN_CONFIRM]	= (nfsd4_enc)nfsd4_encode_open_confirm,
+	[OP_OPEN_DOWNGRADE]	= (nfsd4_enc)nfsd4_encode_open_downgrade,
+	[OP_PUTFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_PUTPUBFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_PUTROOTFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_READ]		= (nfsd4_enc)nfsd4_encode_read,
+	[OP_READDIR]		= (nfsd4_enc)nfsd4_encode_readdir,
+	[OP_READLINK]		= (nfsd4_enc)nfsd4_encode_readlink,
+	[OP_REMOVE]		= (nfsd4_enc)nfsd4_encode_remove,
+	[OP_RENAME]		= (nfsd4_enc)nfsd4_encode_rename,
+	[OP_RENEW]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_RESTOREFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SAVEFH]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SECINFO]		= (nfsd4_enc)nfsd4_encode_secinfo,
+	[OP_SETATTR]		= (nfsd4_enc)nfsd4_encode_setattr,
+	[OP_SETCLIENTID]	= (nfsd4_enc)nfsd4_encode_setclientid,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+	[OP_VERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_WRITE]		= (nfsd4_enc)nfsd4_encode_write,
+	[OP_RELEASE_LOCKOWNER]	= (nfsd4_enc)nfsd4_encode_noop,
+};
+
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
@@ -2601,101 +2631,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	statp = p++;	/* to be backfilled at the end */
 	ADJUST_ARGS();
 
-	switch (op->opnum) {
-	case OP_ACCESS:
-		nfsd4_encode_access(resp, op->status, &op->u.access);
-		break;
-	case OP_CLOSE:
-		nfsd4_encode_close(resp, op->status, &op->u.close);
-		break;
-	case OP_COMMIT:
-		nfsd4_encode_commit(resp, op->status, &op->u.commit);
-		break;
-	case OP_CREATE:
-		nfsd4_encode_create(resp, op->status, &op->u.create);
-		break;
-	case OP_DELEGRETURN:
-		break;
-	case OP_GETATTR:
-		op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr);
-		break;
-	case OP_GETFH:
-		nfsd4_encode_getfh(resp, op->status, op->u.getfh);
-		break;
-	case OP_LINK:
-		nfsd4_encode_link(resp, op->status, &op->u.link);
-		break;
-	case OP_LOCK:
-		nfsd4_encode_lock(resp, op->status, &op->u.lock);
-		break;
-	case OP_LOCKT:
-		nfsd4_encode_lockt(resp, op->status, &op->u.lockt);
-		break;
-	case OP_LOCKU:
-		nfsd4_encode_locku(resp, op->status, &op->u.locku);
-		break;
-	case OP_LOOKUP:
-		break;
-	case OP_LOOKUPP:
-		break;
-	case OP_NVERIFY:
-		break;
-	case OP_OPEN:
-		nfsd4_encode_open(resp, op->status, &op->u.open);
-		break;
-	case OP_OPEN_CONFIRM:
-		nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm);
-		break;
-	case OP_OPEN_DOWNGRADE:
-		nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade);
-		break;
-	case OP_PUTFH:
-		break;
-	case OP_PUTROOTFH:
-		break;
-	case OP_READ:
-		op->status = nfsd4_encode_read(resp, op->status, &op->u.read);
-		break;
-	case OP_READDIR:
-		op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir);
-		break;
-	case OP_READLINK:
-		op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink);
-		break;
-	case OP_REMOVE:
-		nfsd4_encode_remove(resp, op->status, &op->u.remove);
-		break;
-	case OP_RENAME:
-		nfsd4_encode_rename(resp, op->status, &op->u.rename);
-		break;
-	case OP_RENEW:
-		break;
-	case OP_RESTOREFH:
-		break;
-	case OP_SAVEFH:
-		break;
-	case OP_SECINFO:
-		nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
-		break;
-	case OP_SETATTR:
-		nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
-		break;
-	case OP_SETCLIENTID:
-		nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid);
-		break;
-	case OP_SETCLIENTID_CONFIRM:
-		break;
-	case OP_VERIFY:
-		break;
-	case OP_WRITE:
-		nfsd4_encode_write(resp, op->status, &op->u.write);
-		break;
-	case OP_RELEASE_LOCKOWNER:
-		break;
-	default:
-		break;
-	}
-
+	if (op->opnum == OP_ILLEGAL)
+		goto status;
+	BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+	       !nfsd4_enc_ops[op->opnum]);
+	op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
 	 * since it is already in network byte order.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5ac00c4fee91..e3f9783fdcf7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,6 +12,7 @@
 #include <linux/time.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/fcntl.h>
 #include <linux/net.h>
 #include <linux/in.h>
@@ -310,9 +311,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 
 static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 {
-	__be32 server_ip;
-	char *fo_path, c;
+	struct sockaddr_in sin = {
+		.sin_family	= AF_INET,
+	};
 	int b1, b2, b3, b4;
+	char c;
+	char *fo_path;
 
 	/* sanity check */
 	if (size == 0)
@@ -326,16 +330,18 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 		return -EINVAL;
 
 	/* get ipv4 address */
-	if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
+	if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4)
 		return -EINVAL;
-	server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4);
+	if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+		return -EINVAL;
+	sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
 
-	return nlmsvc_unlock_all_by_ip(server_ip);
+	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
 static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	char *fo_path;
 	int error;
 
@@ -350,13 +356,13 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 	if (qword_get(&buf, fo_path, size) < 0)
 		return -EINVAL;
 
-	error = path_lookup(fo_path, 0, &nd);
+	error = kern_path(fo_path, 0, &path);
 	if (error)
 		return error;
 
-	error = nlmsvc_unlock_all_by_sb(nd.path.mnt->mnt_sb);
+	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
@@ -450,22 +456,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 	int i;
 	int rv;
 	int len;
-    	int npools = nfsd_nrpools();
+	int npools;
 	int *nthreads;
 
+	mutex_lock(&nfsd_mutex);
+	npools = nfsd_nrpools();
 	if (npools == 0) {
 		/*
 		 * NFS is shut down.  The admin can start it by
 		 * writing to the threads file but NOT the pool_threads
 		 * file, sorry.  Report zero threads.
 		 */
+		mutex_unlock(&nfsd_mutex);
 		strcpy(buf, "0\n");
 		return strlen(buf);
 	}
 
 	nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+	rv = -ENOMEM;
 	if (nthreads == NULL)
-		return -ENOMEM;
+		goto out_free;
 
 	if (size > 0) {
 		for (i = 0; i < npools; i++) {
@@ -496,14 +506,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 		mesg += len;
 	}
 
+	mutex_unlock(&nfsd_mutex);
 	return (mesg-buf);
 
 out_free:
 	kfree(nthreads);
+	mutex_unlock(&nfsd_mutex);
 	return rv;
 }
 
-static ssize_t write_versions(struct file *file, char *buf, size_t size)
+static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
 	/*
 	 * Format:
@@ -566,14 +578,23 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
-static ssize_t write_ports(struct file *file, char *buf, size_t size)
+static ssize_t write_versions(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_versions(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 {
 	if (size == 0) {
 		int len = 0;
-		lock_kernel();
+
 		if (nfsd_serv)
 			len = svc_xprt_names(nfsd_serv, buf, 0);
-		unlock_kernel();
 		return len;
 	}
 	/* Either a single 'fd' number is written, in which
@@ -593,19 +614,16 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 			return -EINVAL;
 		err = nfsd_create_serv();
 		if (!err) {
-			int proto = 0;
-			err = svc_addsock(nfsd_serv, fd, buf, &proto);
+			err = svc_addsock(nfsd_serv, fd, buf);
 			if (err >= 0) {
-				err = lockd_up(proto);
+				err = lockd_up();
 				if (err < 0)
 					svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
 			}
 			/* Decrease the count, but don't shutdown the
 			 * the service
 			 */
-			lock_kernel();
 			nfsd_serv->sv_nrthreads--;
-			unlock_kernel();
 		}
 		return err < 0 ? err : 0;
 	}
@@ -614,10 +632,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		int len = 0;
 		if (!toclose)
 			return -ENOMEM;
-		lock_kernel();
 		if (nfsd_serv)
 			len = svc_sock_names(buf, nfsd_serv, toclose);
-		unlock_kernel();
 		if (len >= 0)
 			lockd_down();
 		kfree(toclose);
@@ -655,7 +671,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 		if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
 			if (port == 0)
 				return -EINVAL;
-			lock_kernel();
 			if (nfsd_serv) {
 				xprt = svc_find_xprt(nfsd_serv, transport,
 						     AF_UNSPEC, port);
@@ -666,13 +681,23 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 				} else
 					err = -ENOTCONN;
 			}
-			unlock_kernel();
 			return err < 0 ? err : 0;
 		}
 	}
 	return -EINVAL;
 }
 
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_ports(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+
 int nfsd_max_blksize;
 
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
@@ -691,13 +716,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 		if (bsize > NFSSVC_MAXBLKSIZE)
 			bsize = NFSSVC_MAXBLKSIZE;
 		bsize &= ~(1024-1);
-		lock_kernel();
+		mutex_lock(&nfsd_mutex);
 		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
-			unlock_kernel();
+			mutex_unlock(&nfsd_mutex);
 			return -EBUSY;
 		}
 		nfsd_max_blksize = bsize;
-		unlock_kernel();
+		mutex_unlock(&nfsd_mutex);
 	}
 	return sprintf(buf, "%d\n", nfsd_max_blksize);
 }
@@ -705,16 +730,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
-static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 {
 	/* if size > 10 seconds, call
 	 * nfs4_reset_lease() then write out the new lease (seconds) as reply
 	 */
 	char *mesg = buf;
-	int rv;
+	int rv, lease;
 
 	if (size > 0) {
-		int lease;
+		if (nfsd_serv)
+			return -EBUSY;
 		rv = get_int(&mesg, &lease);
 		if (rv)
 			return rv;
@@ -726,24 +752,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
-static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_leasetime(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
+extern char *nfs4_recoverydir(void);
+
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
 	char *recdir;
 	int len, status;
 
-	if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
-		return -EINVAL;
-	buf[size-1] = 0;
+	if (size > 0) {
+		if (nfsd_serv)
+			return -EBUSY;
+		if (size > PATH_MAX || buf[size-1] != '\n')
+			return -EINVAL;
+		buf[size-1] = 0;
 
-	recdir = mesg;
-	len = qword_get(&mesg, recdir, size);
-	if (len <= 0)
-		return -EINVAL;
+		recdir = mesg;
+		len = qword_get(&mesg, recdir, size);
+		if (len <= 0)
+			return -EINVAL;
 
-	status = nfs4_reset_recoverydir(recdir);
+		status = nfs4_reset_recoverydir(recdir);
+	}
+	sprintf(buf, "%s\n", nfs4_recoverydir());
 	return strlen(buf);
 }
+
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+	ssize_t rv;
+
+	mutex_lock(&nfsd_mutex);
+	rv = __write_recoverydir(file, buf, size);
+	mutex_unlock(&nfsd_mutex);
+	return rv;
+}
+
 #endif
 
 /*----------------------------------------------------------------------------*/
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 100ae5641162..cd25d91895a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
 		/* make sure parents give x permission to user */
 		int err;
 		parent = dget_parent(tdentry);
-		err = permission(parent->d_inode, MAY_EXEC, NULL);
+		err = inode_permission(parent->d_inode, MAY_EXEC);
 		if (err < 0) {
 			dput(parent);
 			break;
@@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	if (IS_ERR(exp))
 		return nfserrno(PTR_ERR(exp));
 
-	error = nfsd_setuser_and_check_port(rqstp, exp);
-	if (error)
-		goto out;
+	if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+		/* Elevate privileges so that the lack of 'r' or 'x'
+		 * permission on some parent directory will
+		 * not stop exportfs_decode_fh from being able
+		 * to reconnect a directory into the dentry cache.
+		 * The same problem can affect "SUBTREECHECK" exports,
+		 * but as nfsd_acceptable depends on correct
+		 * access control settings being in effect, we cannot
+		 * fix that case easily.
+		 */
+		current->cap_effective =
+			cap_raise_nfsd_set(current->cap_effective,
+					   current->cap_permitted);
+	} else {
+		error = nfsd_setuser_and_check_port(rqstp, exp);
+		if (error)
+			goto out;
+	}
 
 	/*
 	 * Look up the dentry using the NFS file handle.
@@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		goto out;
 	}
 
+	if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+		error = nfsd_setuser_and_check_port(rqstp, exp);
+		if (error) {
+			dput(dentry);
+			goto out;
+		}
+	}
+
 	if (S_ISDIR(dentry->d_inode->i_mode) &&
 			(dentry->d_flags & DCACHE_DISCONNECTED)) {
 		printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -279,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 	if (error)
 		goto out;
 
-	if (!(access & MAY_LOCK)) {
-		/*
-		 * pseudoflavor restrictions are not enforced on NLM,
-		 * which clients virtually always use auth_sys for,
-		 * even while using RPCSEC_GSS for NFS.
-		 */
-		error = check_nfsd_access(exp, rqstp);
-		if (error)
-			goto out;
-	}
+	/*
+	 * pseudoflavor restrictions are not enforced on NLM,
+	 * which clients virtually always use auth_sys for,
+	 * even while using RPCSEC_GSS for NFS.
+	 */
+	if (access & NFSD_MAY_LOCK)
+		goto skip_pseudoflavor_check;
+	/*
+	 * Clients may expect to be able to use auth_sys during mount,
+	 * even if they use gss for everything else; see section 2.3.2
+	 * of rfc 2623.
+	 */
+	if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+			&& exp->ex_path.dentry == dentry)
+		goto skip_pseudoflavor_check;
+
+	error = check_nfsd_access(exp, rqstp);
+	if (error)
+		goto out;
 
+skip_pseudoflavor_check:
 	/* Finally, check access permissions. */
 	error = nfsd_permission(rqstp, exp, dentry, access);
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6cfc96a12483..5cffeca7acef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -215,11 +216,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 		SVCFH_fmt(dirfhp), argp->len, argp->name);
 
 	/* First verify the parent file handle */
-	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC);
+	nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
 	if (nfserr)
 		goto done; /* must fh_put dirfhp even on error */
 
-	/* Check for MAY_WRITE in nfsd_create if necessary */
+	/* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
 
 	nfserr = nfserr_acces;
 	if (!argp->len)
@@ -281,7 +282,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					nfserr = nfsd_permission(rqstp,
 								 newfhp->fh_export,
 								 newfhp->fh_dentry,
-								 MAY_WRITE|MAY_LOCAL_ACCESS);
+								 NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
 					if (nfserr && nfserr != nfserr_rofs)
 						goto out_unlock;
 				}
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle   *argp,
 
 	dprintk("nfsd: STATFS   %s\n", SVCFH_fmt(&argp->fh));
 
-	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+			NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	fh_put(&argp->fh);
 	return nfserr;
 }
@@ -614,6 +616,7 @@ nfserrno (int errno)
 #endif
 		{ nfserr_stale, -ESTALE },
 		{ nfserr_jukebox, -ETIMEDOUT },
+		{ nfserr_jukebox, -ERESTARTSYS },
 		{ nfserr_dropit, -EAGAIN },
 		{ nfserr_dropit, -ENOMEM },
 		{ nfserr_badname, -ESRCH },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 941041f4b136..07e4f5d7baa8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -21,6 +21,7 @@
 #include <linux/smp_lock.h>
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
+#include <linux/kthread.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -36,28 +37,38 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
-/* these signals will be delivered to an nfsd thread 
- * when handling a request
- */
-#define ALLOWED_SIGS	(sigmask(SIGKILL))
-/* these signals will be delivered to an nfsd thread
- * when not handling a request. i.e. when waiting
- */
-#define SHUTDOWN_SIGS	(sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT))
-/* if the last thread dies with SIGHUP, then the exports table is
- * left unchanged ( like 2.4-{0-9} ).  Any other signal will clear
- * the exports table (like 2.2).
- */
-#define	SIG_NOCLEAN	SIGHUP
-
 extern struct svc_program	nfsd_program;
-static void			nfsd(struct svc_rqst *rqstp);
+static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
-       struct svc_serv 		*nfsd_serv;
 static atomic_t			nfsd_busy;
 static unsigned long		nfsd_last_call;
 static DEFINE_SPINLOCK(nfsd_call_lock);
 
+/*
+ * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+ * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
+ *
+ * If (out side the lock) nfsd_serv is non-NULL, then it must point to a
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+ * of nfsd threads must exist and each must listed in ->sp_all_threads in each
+ * entry of ->sv_pools[].
+ *
+ * Transitions of the thread count between zero and non-zero are of particular
+ * interest since the svc_serv needs to be created and initialized at that
+ * point, or freed.
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+ * nfsctl.c. In particular:
+ *
+ *	user_recovery_dirname
+ *	user_lease_time
+ *	nfsd_versions
+ */
+DEFINE_MUTEX(nfsd_mutex);
+struct svc_serv 		*nfsd_serv;
+
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
 static struct svc_version *	nfsd_acl_version[] = {
@@ -145,13 +156,14 @@ int nfsd_vers(int vers, enum vers_op change)
 
 int nfsd_nrthreads(void)
 {
-	if (nfsd_serv == NULL)
-		return 0;
-	else
-		return nfsd_serv->sv_nrthreads;
+	int rv = 0;
+	mutex_lock(&nfsd_mutex);
+	if (nfsd_serv)
+		rv = nfsd_serv->sv_nrthreads;
+	mutex_unlock(&nfsd_mutex);
+	return rv;
 }
 
-static int killsig;	/* signal that was used to kill last nfsd */
 static void nfsd_last_thread(struct svc_serv *serv)
 {
 	/* When last nfsd thread exits we need to do some clean-up */
@@ -162,11 +174,9 @@ static void nfsd_last_thread(struct svc_serv *serv)
 	nfsd_racache_shutdown();
 	nfs4_state_shutdown();
 
-	printk(KERN_WARNING "nfsd: last server has exited\n");
-	if (killsig != SIG_NOCLEAN) {
-		printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
-		nfsd_export_flush();
-	}
+	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
+			    "cache\n");
+	nfsd_export_flush();
 }
 
 void nfsd_reset_versions(void)
@@ -190,13 +200,14 @@ void nfsd_reset_versions(void)
 	}
 }
 
+
 int nfsd_create_serv(void)
 {
 	int err = 0;
-	lock_kernel();
+
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
 	if (nfsd_serv) {
 		svc_get(nfsd_serv);
-		unlock_kernel();
 		return 0;
 	}
 	if (nfsd_max_blksize == 0) {
@@ -217,13 +228,12 @@ int nfsd_create_serv(void)
 	}
 
 	atomic_set(&nfsd_busy, 0);
-	nfsd_serv = svc_create_pooled(&nfsd_program,
-				      nfsd_max_blksize,
-				      nfsd_last_thread,
-				      nfsd, SIG_NOCLEAN, THIS_MODULE);
+	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+				      AF_INET,
+				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
-	unlock_kernel();
+
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
 }
@@ -234,25 +244,24 @@ static int nfsd_init_socks(int port)
 	if (!list_empty(&nfsd_serv->sv_permsocks))
 		return 0;
 
-	error = lockd_up(IPPROTO_UDP);
-	if (error >= 0) {
-		error = svc_create_xprt(nfsd_serv, "udp", port,
+	error = svc_create_xprt(nfsd_serv, "udp", port,
 					SVC_SOCK_DEFAULTS);
-		if (error < 0)
-			lockd_down();
-	}
 	if (error < 0)
 		return error;
 
-	error = lockd_up(IPPROTO_TCP);
-	if (error >= 0) {
-		error = svc_create_xprt(nfsd_serv, "tcp", port,
+	error = lockd_up();
+	if (error < 0)
+		return error;
+
+	error = svc_create_xprt(nfsd_serv, "tcp", port,
 					SVC_SOCK_DEFAULTS);
-		if (error < 0)
-			lockd_down();
-	}
 	if (error < 0)
 		return error;
+
+	error = lockd_up();
+	if (error < 0)
+		return error;
+
 	return 0;
 }
 
@@ -282,6 +291,8 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 	int tot = 0;
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
 	if (nfsd_serv == NULL || n <= 0)
 		return 0;
 
@@ -316,7 +327,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 		nthreads[0] = 1;
 
 	/* apply the new numbers */
-	lock_kernel();
 	svc_get(nfsd_serv);
 	for (i = 0; i < n; i++) {
 		err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
@@ -325,7 +335,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
 			break;
 	}
 	svc_destroy(nfsd_serv);
-	unlock_kernel();
 
 	return err;
 }
@@ -334,8 +343,8 @@ int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
-	
-	lock_kernel();
+
+	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
 	error = -EINVAL;
 	if (nrservs <= 0)
@@ -363,7 +372,7 @@ nfsd_svc(unsigned short port, int nrservs)
  failure:
 	svc_destroy(nfsd_serv);		/* Release server */
  out:
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
 	return error;
 }
 
@@ -391,18 +400,17 @@ update_thread_usage(int busy_threads)
 /*
  * This is the NFS server kernel thread
  */
-static void
-nfsd(struct svc_rqst *rqstp)
+static int
+nfsd(void *vrqstp)
 {
+	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
 	struct fs_struct *fsp;
-	int		err;
-	sigset_t shutdown_mask, allowed_mask;
+	int err, preverr = 0;
 
 	/* Lock module and set up kernel thread */
-	lock_kernel();
-	daemonize("nfsd");
+	mutex_lock(&nfsd_mutex);
 
-	/* After daemonize() this kernel thread shares current->fs
+	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with a
 	 * umask of 0 instead of init's umask. */
 	fsp = copy_fs_struct(current->fs);
@@ -414,14 +422,17 @@ nfsd(struct svc_rqst *rqstp)
 	current->fs = fsp;
 	current->fs->umask = 0;
 
-	siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
-	siginitsetinv(&allowed_mask, ALLOWED_SIGS);
+	/*
+	 * thread is spawned with all signals set to SIG_IGN, re-enable
+	 * the ones that will bring down the thread
+	 */
+	allow_signal(SIGKILL);
+	allow_signal(SIGHUP);
+	allow_signal(SIGINT);
+	allow_signal(SIGQUIT);
 
 	nfsdstats.th_cnt++;
-
-	rqstp->rq_task = current;
-
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
 
 	/*
 	 * We want less throttling in balance_dirty_pages() so that nfs to
@@ -435,26 +446,30 @@ nfsd(struct svc_rqst *rqstp)
 	 * The main request loop
 	 */
 	for (;;) {
-		/* Block all but the shutdown signals */
-		sigprocmask(SIG_SETMASK, &shutdown_mask, NULL);
-
 		/*
 		 * Find a socket with data available and call its
 		 * recvfrom routine.
 		 */
 		while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
 			;
-		if (err < 0)
+		if (err == -EINTR)
 			break;
+		else if (err < 0) {
+			if (err != preverr) {
+				printk(KERN_WARNING "%s: unexpected error "
+					"from svc_recv (%d)\n", __func__, -err);
+				preverr = err;
+			}
+			schedule_timeout_uninterruptible(HZ);
+			continue;
+		}
+
 		update_thread_usage(atomic_read(&nfsd_busy));
 		atomic_inc(&nfsd_busy);
 
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
 
-		/* Process request with signals blocked.  */
-		sigprocmask(SIG_SETMASK, &allowed_mask, NULL);
-
 		svc_process(rqstp);
 
 		/* Unlock export hash tables */
@@ -463,22 +478,10 @@ nfsd(struct svc_rqst *rqstp)
 		atomic_dec(&nfsd_busy);
 	}
 
-	if (err != -EINTR) {
-		printk(KERN_WARNING "nfsd: terminating on error %d\n", -err);
-	} else {
-		unsigned int	signo;
-
-		for (signo = 1; signo <= _NSIG; signo++)
-			if (sigismember(&current->pending.signal, signo) &&
-			    !sigismember(&current->blocked, signo))
-				break;
-		killsig = signo;
-	}
 	/* Clear signals before calling svc_exit_thread() */
 	flush_signals(current);
 
-	lock_kernel();
-
+	mutex_lock(&nfsd_mutex);
 	nfsdstats.th_cnt --;
 
 out:
@@ -486,8 +489,9 @@ out:
 	svc_exit_thread(rqstp);
 
 	/* Release module */
-	unlock_kernel();
+	mutex_unlock(&nfsd_mutex);
 	module_put_and_exit(0);
+	return 0;
 }
 
 static __be32 map_new_errors(u32 vers, __be32 nfserr)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a3a291f771f4..0bc56f6d9276 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
 	spinlock_t		pb_lock;
 } ____cacheline_aligned_in_smp;
 
-static struct raparms *		raparml;
 #define RAPARM_HASH_BITS	4
 #define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
@@ -144,7 +143,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
 	/* Obtain dentry and export. */
-	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
 	if (err)
 		return err;
 
@@ -262,14 +261,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
-	int		accmode = MAY_SATTR;
+	int		accmode = NFSD_MAY_SATTR;
 	int		ftype = 0;
 	__be32		err;
 	int		host_err;
 	int		size_change = 0;
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
-		accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
+		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
 	if (iap->ia_valid & ATTR_SIZE)
 		ftype = S_IFREG;
 
@@ -331,7 +330,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	 */
 	if (iap->ia_valid & ATTR_SIZE) {
 		if (iap->ia_size < inode->i_size) {
-			err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
+			err = nfsd_permission(rqstp, fhp->fh_export, dentry,
+					NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
 			if (err)
 				goto out;
 		}
@@ -410,6 +410,7 @@ out_nfserr:
 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 {
 	ssize_t buflen;
+	ssize_t ret;
 
 	buflen = vfs_getxattr(dentry, key, NULL, 0);
 	if (buflen <= 0)
@@ -419,7 +420,10 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
 	if (!*buf)
 		return -ENOMEM;
 
-	return vfs_getxattr(dentry, key, *buf, buflen);
+	ret = vfs_getxattr(dentry, key, *buf, buflen);
+	if (ret < 0)
+		kfree(*buf);
+	return ret;
 }
 #endif
 
@@ -462,7 +466,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	unsigned int flags = 0;
 
 	/* Get inode */
-	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
+	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
 	if (error)
 		return error;
 
@@ -563,20 +567,20 @@ struct accessmap {
 	int		how;
 };
 static struct accessmap	nfs3_regaccess[] = {
-    {	NFS3_ACCESS_READ,	MAY_READ			},
-    {	NFS3_ACCESS_EXECUTE,	MAY_EXEC			},
-    {	NFS3_ACCESS_MODIFY,	MAY_WRITE|MAY_TRUNC		},
-    {	NFS3_ACCESS_EXTEND,	MAY_WRITE			},
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_TRUNC	},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE			},
 
     {	0,			0				}
 };
 
 static struct accessmap	nfs3_diraccess[] = {
-    {	NFS3_ACCESS_READ,	MAY_READ			},
-    {	NFS3_ACCESS_LOOKUP,	MAY_EXEC			},
-    {	NFS3_ACCESS_MODIFY,	MAY_EXEC|MAY_WRITE|MAY_TRUNC	},
-    {	NFS3_ACCESS_EXTEND,	MAY_EXEC|MAY_WRITE		},
-    {	NFS3_ACCESS_DELETE,	MAY_REMOVE			},
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_LOOKUP,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_EXEC|NFSD_MAY_WRITE	},
+    {	NFS3_ACCESS_DELETE,	NFSD_MAY_REMOVE			},
 
     {	0,			0				}
 };
@@ -589,10 +593,10 @@ static struct accessmap	nfs3_anyaccess[] = {
 	 * mainly at mode bits, and we make sure to ignore read-only
 	 * filesystem checks
 	 */
-    {	NFS3_ACCESS_READ,	MAY_READ			},
-    {	NFS3_ACCESS_EXECUTE,	MAY_EXEC			},
-    {	NFS3_ACCESS_MODIFY,	MAY_WRITE|MAY_LOCAL_ACCESS	},
-    {	NFS3_ACCESS_EXTEND,	MAY_WRITE|MAY_LOCAL_ACCESS	},
+    {	NFS3_ACCESS_READ,	NFSD_MAY_READ			},
+    {	NFS3_ACCESS_EXECUTE,	NFSD_MAY_EXEC			},
+    {	NFS3_ACCESS_MODIFY,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
+    {	NFS3_ACCESS_EXTEND,	NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS	},
 
     {	0,			0				}
 };
@@ -606,7 +610,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 	u32			query, result = 0, sresult = 0;
 	__be32			error;
 
-	error = fh_verify(rqstp, fhp, 0, MAY_NOP);
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
 	if (error)
 		goto out;
 
@@ -678,7 +682,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
 	 * in case a chmod has now revoked permission.
 	 */
-	err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
+	err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE);
 	if (err)
 		goto out;
 
@@ -689,7 +693,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * or any access when mandatory locking enabled
 	 */
 	err = nfserr_perm;
-	if (IS_APPEND(inode) && (access & MAY_WRITE))
+	if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE))
 		goto out;
 	/*
 	 * We must ignore files (but only files) which might have mandatory
@@ -706,14 +710,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+	host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
 	if (host_err == -EWOULDBLOCK)
 		host_err = -ETIMEDOUT;
 	if (host_err) /* NOMEM or WOULDBLOCK */
 		goto out_nfserr;
 
-	if (access & MAY_WRITE) {
-		if (access & MAY_READ)
+	if (access & NFSD_MAY_WRITE) {
+		if (access & NFSD_MAY_READ)
 			flags = O_RDWR|O_LARGEFILE;
 		else
 			flags = O_WRONLY|O_LARGEFILE;
@@ -1069,12 +1073,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (file) {
 		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-				MAY_READ|MAY_OWNER_OVERRIDE);
+				NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
 	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
+		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 		if (err)
 			goto out;
 		err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
@@ -1098,13 +1102,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (file) {
 		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
-				MAY_WRITE|MAY_OWNER_OVERRIDE);
+				NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
 		if (err)
 			goto out;
 		err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
 				stablep);
 	} else {
-		err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
+		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
 		if (err)
 			goto out;
 
@@ -1136,7 +1140,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if ((u64)count > ~(u64)offset)
 		return nfserr_inval;
 
-	if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+	if (err)
 		return err;
 	if (EX_ISSYNC(fhp->fh_export)) {
 		if (file->f_op && file->f_op->fsync) {
@@ -1197,7 +1202,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (isdotent(fname, flen))
 		goto out;
 
-	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
 
@@ -1248,36 +1253,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		iap->ia_mode = 0;
 	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
 
+	err = nfserr_inval;
+	if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
+		printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+		       type);
+		goto out;
+	}
+
+	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+	if (host_err)
+		goto out_nfserr;
+
 	/*
 	 * Get the dir op function pointer.
 	 */
 	err = 0;
 	switch (type) {
 	case S_IFREG:
-		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-		if (host_err)
-			goto out_nfserr;
 		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
 		break;
 	case S_IFDIR:
-		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-		if (host_err)
-			goto out_nfserr;
 		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
 		break;
 	case S_IFCHR:
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
-		host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
-		if (host_err)
-			goto out_nfserr;
 		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
 		break;
-	default:
-	        printk("nfsd: bad file type %o in nfsd_create\n", type);
-		host_err = -EINVAL;
-		goto out_nfserr;
 	}
 	if (host_err < 0) {
 		mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1289,7 +1292,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		write_inode_now(dchild->d_inode, 1);
 	}
 
-
 	err2 = nfsd_create_setattr(rqstp, resfhp, iap);
 	if (err2)
 		err = err2;
@@ -1334,7 +1336,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out;
 	if (!(iap->ia_valid & ATTR_MODE))
 		iap->ia_mode = 0;
-	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
 
@@ -1471,7 +1473,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	__be32		err;
 	int		host_err;
 
-	err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
+	err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
 	if (err)
 		goto out;
 
@@ -1517,7 +1519,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct dentry	*dentry, *dnew;
 	__be32		err, cerr;
 	int		host_err;
-	umode_t		mode;
 
 	err = nfserr_noent;
 	if (!flen || !plen)
@@ -1526,7 +1527,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (isdotent(fname, flen))
 		goto out;
 
-	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
 	fh_lock(fhp);
@@ -1536,11 +1537,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
-	mode = S_IALLUGO;
-	/* Only the MODE ATTRibute is even vaguely meaningful */
-	if (iap && (iap->ia_valid & ATTR_MODE))
-		mode = iap->ia_mode & S_IALLUGO;
-
 	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
 	if (host_err)
 		goto out_nfserr;
@@ -1552,11 +1548,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		else {
 			strncpy(path_alloced, path, plen);
 			path_alloced[plen] = 0;
-			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
+			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced);
 			kfree(path_alloced);
 		}
 	} else
-		host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
+		host_err = vfs_symlink(dentry->d_inode, dnew, path);
 
 	if (!host_err) {
 		if (EX_ISSYNC(fhp->fh_export))
@@ -1591,10 +1587,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	__be32		err;
 	int		host_err;
 
-	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
+	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
-	err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
+	err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
 	if (err)
 		goto out;
 
@@ -1661,10 +1657,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	__be32		err;
 	int		host_err;
 
-	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
+	err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
 	if (err)
 		goto out;
-	err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
+	err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
 	if (err)
 		goto out;
 
@@ -1768,7 +1764,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	err = nfserr_acces;
 	if (!flen || isdotent(fname, flen))
 		goto out;
-	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
+	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
 	if (err)
 		goto out;
 
@@ -1822,6 +1818,115 @@ out:
 }
 
 /*
+ * We do this buffering because we must not call back into the file
+ * system's ->lookup() method from the filldir callback. That may well
+ * deadlock a number of file systems.
+ *
+ * This is based heavily on the implementation of same in XFS.
+ */
+struct buffered_dirent {
+	u64		ino;
+	loff_t		offset;
+	int		namlen;
+	unsigned int	d_type;
+	char		name[];
+};
+
+struct readdir_data {
+	char		*dirent;
+	size_t		used;
+	int		full;
+};
+
+static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
+				 loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct readdir_data *buf = __buf;
+	struct buffered_dirent *de = (void *)(buf->dirent + buf->used);
+	unsigned int reclen;
+
+	reclen = ALIGN(sizeof(struct buffered_dirent) + namlen, sizeof(u64));
+	if (buf->used + reclen > PAGE_SIZE) {
+		buf->full = 1;
+		return -EINVAL;
+	}
+
+	de->namlen = namlen;
+	de->offset = offset;
+	de->ino = ino;
+	de->d_type = d_type;
+	memcpy(de->name, name, namlen);
+	buf->used += reclen;
+
+	return 0;
+}
+
+static int nfsd_buffered_readdir(struct file *file, filldir_t func,
+				 struct readdir_cd *cdp, loff_t *offsetp)
+{
+	struct readdir_data buf;
+	struct buffered_dirent *de;
+	int host_err;
+	int size;
+	loff_t offset;
+
+	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf.dirent)
+		return -ENOMEM;
+
+	offset = *offsetp;
+	cdp->err = nfserr_eof; /* will be cleared on successful read */
+
+	while (1) {
+		unsigned int reclen;
+
+		buf.used = 0;
+		buf.full = 0;
+
+		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		if (buf.full)
+			host_err = 0;
+
+		if (host_err < 0)
+			break;
+
+		size = buf.used;
+
+		if (!size)
+			break;
+
+		de = (struct buffered_dirent *)buf.dirent;
+		while (size > 0) {
+			offset = de->offset;
+
+			if (func(cdp, de->name, de->namlen, de->offset,
+				 de->ino, de->d_type))
+				goto done;
+
+			if (cdp->err != nfs_ok)
+				goto done;
+
+			reclen = ALIGN(sizeof(*de) + de->namlen,
+				       sizeof(u64));
+			size -= reclen;
+			de = (struct buffered_dirent *)((char *)de + reclen);
+		}
+		offset = vfs_llseek(file, 0, SEEK_CUR);
+		if (!buf.full)
+			break;
+	}
+
+ done:
+	free_page((unsigned long)(buf.dirent));
+
+	if (host_err)
+		return nfserrno(host_err);
+
+	*offsetp = offset;
+	return cdp->err;
+}
+
+/*
  * Read entries from a directory.
  * The  NFSv3/4 verifier we ignore for now.
  */
@@ -1830,11 +1935,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	     struct readdir_cd *cdp, filldir_t func)
 {
 	__be32		err;
-	int 		host_err;
 	struct file	*file;
 	loff_t		offset = *offsetp;
 
-	err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
+	err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file);
 	if (err)
 		goto out;
 
@@ -1844,21 +1948,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 		goto out_close;
 	}
 
-	/*
-	 * Read the directory entries. This silly loop is necessary because
-	 * readdir() is not guaranteed to fill up the entire buffer, but
-	 * may choose to do less.
-	 */
-
-	do {
-		cdp->err = nfserr_eof; /* will be cleared on successful read */
-		host_err = vfs_readdir(file, func, cdp);
-	} while (host_err >=0 && cdp->err == nfs_ok);
-	if (host_err)
-		err = nfserrno(host_err);
-	else
-		err = cdp->err;
-	*offsetp = vfs_llseek(file, 0, 1);
+	err = nfsd_buffered_readdir(file, func, cdp, offsetp);
 
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
@@ -1873,9 +1963,9 @@ out:
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
+	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
 	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
@@ -1896,18 +1986,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	struct inode	*inode = dentry->d_inode;
 	int		err;
 
-	if (acc == MAY_NOP)
+	if (acc == NFSD_MAY_NOP)
 		return 0;
 #if 0
 	dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
 		acc,
-		(acc & MAY_READ)?	" read"  : "",
-		(acc & MAY_WRITE)?	" write" : "",
-		(acc & MAY_EXEC)?	" exec"  : "",
-		(acc & MAY_SATTR)?	" sattr" : "",
-		(acc & MAY_TRUNC)?	" trunc" : "",
-		(acc & MAY_LOCK)?	" lock"  : "",
-		(acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "",
+		(acc & NFSD_MAY_READ)?	" read"  : "",
+		(acc & NFSD_MAY_WRITE)?	" write" : "",
+		(acc & NFSD_MAY_EXEC)?	" exec"  : "",
+		(acc & NFSD_MAY_SATTR)?	" sattr" : "",
+		(acc & NFSD_MAY_TRUNC)?	" trunc" : "",
+		(acc & NFSD_MAY_LOCK)?	" lock"  : "",
+		(acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
 		inode->i_mode,
 		IS_IMMUTABLE(inode)?	" immut" : "",
 		IS_APPEND(inode)?	" append" : "",
@@ -1920,18 +2010,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 * system.  But if it is IRIX doing check on write-access for a 
 	 * device special file, we ignore rofs.
 	 */
-	if (!(acc & MAY_LOCAL_ACCESS))
-		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
+	if (!(acc & NFSD_MAY_LOCAL_ACCESS))
+		if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
 			if (exp_rdonly(rqstp, exp) ||
 			    __mnt_is_readonly(exp->ex_path.mnt))
 				return nfserr_rofs;
-			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
+			if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
 				return nfserr_perm;
 		}
-	if ((acc & MAY_TRUNC) && IS_APPEND(inode))
+	if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode))
 		return nfserr_perm;
 
-	if (acc & MAY_LOCK) {
+	if (acc & NFSD_MAY_LOCK) {
 		/* If we cannot rely on authentication in NLM requests,
 		 * just allow locks, otherwise require read permission, or
 		 * ownership
@@ -1939,7 +2029,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 		if (exp->ex_flags & NFSEXP_NOAUTHNLM)
 			return 0;
 		else
-			acc = MAY_READ | MAY_OWNER_OVERRIDE;
+			acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE;
 	}
 	/*
 	 * The file owner always gets access permission for accesses that
@@ -1955,16 +2045,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	 * We must trust the client to do permission checking - using "ACCESS"
 	 * with NFSv3.
 	 */
-	if ((acc & MAY_OWNER_OVERRIDE) &&
+	if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
 	    inode->i_uid == current->fsuid)
 		return 0;
 
-	err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
+	/* This assumes  NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
+	err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
 
 	/* Allow read access to binaries even when mode 111 */
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
-	    acc == (MAY_READ | MAY_OWNER_OVERRIDE))
-		err = permission(inode, MAY_EXEC, NULL);
+	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
+		err = inode_permission(inode, MAY_EXEC);
 
 	return err? nfserrno(err) : 0;
 }
@@ -1972,11 +2063,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 void
 nfsd_racache_shutdown(void)
 {
-	if (!raparml)
-		return;
+	struct raparms *raparm, *last_raparm;
+	unsigned int i;
+
 	dprintk("nfsd: freeing readahead buffers.\n");
-	kfree(raparml);
-	raparml = NULL;
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+		raparm = raparm_hash[i].pb_head;
+		while(raparm) {
+			last_raparm = raparm;
+			raparm = raparm->p_next;
+			kfree(last_raparm);
+		}
+		raparm_hash[i].pb_head = NULL;
+	}
 }
 /*
  * Initialize readahead param cache
@@ -1987,35 +2087,38 @@ nfsd_racache_init(int cache_size)
 	int	i;
 	int	j = 0;
 	int	nperbucket;
+	struct raparms **raparm = NULL;
 
 
-	if (raparml)
+	if (raparm_hash[0].pb_head)
 		return 0;
-	if (cache_size < 2*RAPARM_HASH_SIZE)
-		cache_size = 2*RAPARM_HASH_SIZE;
-	raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
-
-	if (!raparml) {
-		printk(KERN_WARNING
-			"nfsd: Could not allocate memory read-ahead cache.\n");
-		return -ENOMEM;
-	}
+	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+	if (nperbucket < 2)
+		nperbucket = 2;
+	cache_size = nperbucket * RAPARM_HASH_SIZE;
 
 	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
-	for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
-		raparm_hash[i].pb_head = NULL;
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
 		spin_lock_init(&raparm_hash[i].pb_lock);
-	}
-	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
-	for (i = 0; i < cache_size - 1; i++) {
-		if (i % nperbucket == 0)
-			raparm_hash[j++].pb_head = raparml + i;
-		if (i % nperbucket < nperbucket-1)
-			raparml[i].p_next = raparml + i + 1;
+
+		raparm = &raparm_hash[i].pb_head;
+		for (j = 0; j < nperbucket; j++) {
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			if (!*raparm)
+				goto out_nomem;
+			raparm = &(*raparm)->p_next;
+		}
+		*raparm = NULL;
 	}
 
 	nfsdstats.ra_size = cache_size;
 	return 0;
+
+out_nomem:
+	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+	nfsd_racache_shutdown();
+	return -ENOMEM;
 }
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 64965e1c21c4..9b0efdad8910 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -13,9 +13,7 @@
 #include <linux/nls.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
-#ifdef CONFIG_KMOD
 #include <linux/kmod.h>
-#endif
 #include <linux/spinlock.h>
 
 static struct nls_table default_table;
@@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset)
 
 struct nls_table *load_nls(char *charset)
 {
-	struct nls_table *nls;
-#ifdef CONFIG_KMOD
-	int ret;
-#endif
-
-	nls = find_nls(charset);
-	if (nls)
-		return nls;
-
-#ifdef CONFIG_KMOD
-	ret = request_module("nls_%s", charset);
-	if (ret != 0) {
-		printk("Unable to load NLS charset %s\n", charset);
-		return NULL;
-	}
-	nls = find_nls(charset);
-#endif
-	return nls;
+	return try_then_request_module(find_nls(charset), "nls_%s", charset);
 }
 
 void unload_nls(struct nls_table *nls)
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 00e9ccde8e42..b38f944f0667 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1194,7 +1194,7 @@ lock_retry_remap:
 		tbh = bhs[i];
 		if (!tbh)
 			continue;
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			BUG();
 		/* The buffer dirty state is now irrelevant, just clean it. */
 		clear_buffer_dirty(tbh);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 33ff314cc507..9669541d0119 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -665,7 +665,7 @@ lock_retry_remap:
 	for (i = 0; i < nr_bhs; i++) {
 		struct buffer_head *tbh = bhs[i];
 
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			continue;
 		if (unlikely(buffer_uptodate(tbh))) {
 			unlock_buffer(tbh);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3c5550cd11d6..3140a4429af1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			pages[nr] = *cached_page;
 			page_cache_get(*cached_page);
 			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add(lru_pvec);
+				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -2084,7 +2084,7 @@ err_out:
 						OSYNC_METADATA|OSYNC_DATA);
 		}
   	}
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
@@ -2118,7 +2118,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 		goto out;
 	if (!count)
 		goto out;
-	err = remove_suid(file->f_path.dentry);
+	err = file_remove_suid(file);
 	if (err)
 		goto out;
 	file_update_time(file);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 790defb847e7..17d32ca6bc35 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -586,7 +586,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 			struct buffer_head *tbh = bhs[i_bhs];
 
-			if (unlikely(test_set_buffer_locked(tbh)))
+			if (!trylock_buffer(tbh))
 				BUG();
 			BUG_ON(!buffer_uptodate(tbh));
 			clear_buffer_dirty(tbh);
@@ -779,7 +779,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
 		struct buffer_head *tbh = bhs[i_bhs];
 
-		if (unlikely(test_set_buffer_locked(tbh)))
+		if (!trylock_buffer(tbh))
 			BUG();
 		BUG_ON(!buffer_uptodate(tbh));
 		clear_buffer_dirty(tbh);
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e1781c8b1650..2ca00153b6ec 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
 	// TODO: Consider moving this lot to a separate function! (AIA)
 handle_name:
    {
-	struct dentry *real_dent, *new_dent;
 	MFT_RECORD *m;
 	ntfs_attr_search_ctx *ctx;
 	ntfs_inode *ni = NTFS_I(dent_inode);
@@ -255,93 +254,9 @@ handle_name:
 	}
 	nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
 
-	/*
-	 * Note: No need for dent->d_lock lock as i_mutex is held on the
-	 * parent inode.
-	 */
-
-	/* Does a dentry matching the nls_name exist already? */
-	real_dent = d_lookup(dent->d_parent, &nls_name);
-	/* If not, create it now. */
-	if (!real_dent) {
-		real_dent = d_alloc(dent->d_parent, &nls_name);
-		kfree(nls_name.name);
-		if (!real_dent) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		new_dent = d_splice_alias(dent_inode, real_dent);
-		if (new_dent)
-			dput(real_dent);
-		else
-			new_dent = real_dent;
-		ntfs_debug("Done.  (Created new dentry.)");
-		return new_dent;
-	}
+	dent = d_add_ci(dent, dent_inode, &nls_name);
 	kfree(nls_name.name);
-	/* Matching dentry exists, check if it is negative. */
-	if (real_dent->d_inode) {
-		if (unlikely(real_dent->d_inode != dent_inode)) {
-			/* This can happen because bad inodes are unhashed. */
-			BUG_ON(!is_bad_inode(dent_inode));
-			BUG_ON(!is_bad_inode(real_dent->d_inode));
-		}
-		/*
-		 * Already have the inode and the dentry attached, decrement
-		 * the reference count to balance the ntfs_iget() we did
-		 * earlier on.  We found the dentry using d_lookup() so it
-		 * cannot be disconnected and thus we do not need to worry
-		 * about any NFS/disconnectedness issues here.
-		 */
-		iput(dent_inode);
-		ntfs_debug("Done.  (Already had inode and dentry.)");
-		return real_dent;
-	}
-	/*
-	 * Negative dentry: instantiate it unless the inode is a directory and
-	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
-	 * in which case d_move() that in place of the found dentry.
-	 */
-	if (!S_ISDIR(dent_inode->i_mode)) {
-		/* Not a directory; everything is easy. */
-		d_instantiate(real_dent, dent_inode);
-		ntfs_debug("Done.  (Already had negative file dentry.)");
-		return real_dent;
-	}
-	spin_lock(&dcache_lock);
-	if (list_empty(&dent_inode->i_dentry)) {
-		/*
-		 * Directory without a 'disconnected' dentry; we need to do
-		 * d_instantiate() by hand because it takes dcache_lock which
-		 * we already hold.
-		 */
-		list_add(&real_dent->d_alias, &dent_inode->i_dentry);
-		real_dent->d_inode = dent_inode;
-		spin_unlock(&dcache_lock);
-		security_d_instantiate(real_dent, dent_inode);
-		ntfs_debug("Done.  (Already had negative directory dentry.)");
-		return real_dent;
-	}
-	/*
-	 * Directory with a 'disconnected' dentry; get a reference to the
-	 * 'disconnected' dentry.
-	 */
-	new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
-			d_alias);
-	dget_locked(new_dent);
-	spin_unlock(&dcache_lock);
-	/* Do security vodoo. */
-	security_d_instantiate(real_dent, dent_inode);
-	/* Move new_dent in place of real_dent. */
-	d_move(new_dent, real_dent);
-	/* Balance the ntfs_iget() we did above. */
-	iput(dent_inode);
-	/* Throw away real_dent. */
-	dput(real_dent);
-	/* Use new_dent as the actual dentry. */
-	ntfs_debug("Done.  (Already had negative, disconnected directory "
-			"dentry.)");
-	return new_dent;
+	return dent;
 
 eio_err_out:
 	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
@@ -389,8 +304,6 @@ static struct dentry *ntfs_get_parent(struct dentry *child_dent)
 	ntfs_attr_search_ctx *ctx;
 	ATTR_RECORD *attr;
 	FILE_NAME_ATTR *fn;
-	struct inode *parent_vi;
-	struct dentry *parent_dent;
 	unsigned long parent_ino;
 	int err;
 
@@ -430,24 +343,8 @@ try_next:
 	/* Release the search context and the mft record of the child. */
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(ni);
-	/* Get the inode of the parent directory. */
-	parent_vi = ntfs_iget(vi->i_sb, parent_ino);
-	if (IS_ERR(parent_vi) || unlikely(is_bad_inode(parent_vi))) {
-		if (!IS_ERR(parent_vi))
-			iput(parent_vi);
-		ntfs_error(vi->i_sb, "Failed to get parent directory inode "
-				"0x%lx of child inode 0x%lx.", parent_ino,
-				vi->i_ino);
-		return ERR_PTR(-EACCES);
-	}
-	/* Finally get a dentry for the parent directory and return it. */
-	parent_dent = d_alloc_anon(parent_vi);
-	if (unlikely(!parent_dent)) {
-		iput(parent_vi);
-		return ERR_PTR(-ENOMEM);
-	}
-	ntfs_debug("Done for inode 0x%lx.", vi->i_ino);
-	return parent_dent;
+
+	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
 }
 
 static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3e76f3b216bc..4a46743b5077 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache;
 struct kmem_cache *ntfs_big_inode_cache;
 
 /* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void ntfs_big_inode_init_once(void *foo)
 {
 	ntfs_inode *ni = (ntfs_inode *)foo;
 
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e8..4087fbdac327 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
  * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
  * file since it was last opened.  I think the names speak for themselves but
  * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ * documentation: http://www.linux-ntfs.org/
  */
 enum {
 	USN_REASON_DATA_OVERWRITE	= const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
  * Source info flags (32-bit).  Information about the source of the change(s)
  * to the file.  For detailed descriptions of what these mean, see the Linux
  * NTFS project NTFS documentation:
- *	http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ *	http://www.linux-ntfs.org/
  */
 enum {
 	USN_SOURCE_DATA_MANAGEMENT	  = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fdb..589dcdfdfe3c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
 	symlink.o 		\
 	sysfile.o 		\
 	uptodate.o		\
-	ver.o
+	ver.o			\
+	xattr.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e068..0cc2deb9394c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
 
 #include "buffer_head_io.h"
 
+
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+	/*
+	 * last_eb_blk is the block number of the right most leaf extent
+	 * block.  Most on-disk structures containing an extent tree store
+	 * this value for fast access.  The ->eo_set_last_eb_blk() and
+	 * ->eo_get_last_eb_blk() operations access this value.  They are
+	 *  both required.
+	 */
+	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+				   u64 blkno);
+	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * The on-disk structure usually keeps track of how many total
+	 * clusters are stored in this extent tree.  This function updates
+	 * that value.  new_clusters is the delta, and must be
+	 * added to the total.  Required.
+	 */
+	void (*eo_update_clusters)(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   u32 new_clusters);
+
+	/*
+	 * If ->eo_insert_check() exists, it is called before rec is
+	 * inserted into the extent tree.  It is optional.
+	 */
+	int (*eo_insert_check)(struct inode *inode,
+			       struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec);
+	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+
+	/*
+	 * --------------------------------------------------------------
+	 * The remaining are internal to ocfs2_extent_tree and don't have
+	 * accessor functions
+	 */
+
+	/*
+	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+	 * It is required.
+	 */
+	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+	 * it exists.  If it does not, et->et_max_leaf_clusters is set
+	 * to 0 (unlimited).  Optional.
+	 */
+	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
+					  struct ocfs2_extent_tree *et);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno);
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters);
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_insert_check	= ocfs2_dinode_insert_check,
+	.eo_sanity_check	= ocfs2_dinode_sanity_check,
+	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+			"Device %s, asking for sparse allocation: inode %llu, "
+			"cpos %u, clusters %u\n",
+			osb->dev_str,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			rec->e_cpos,
+			OCFS2_I(inode)->ip_clusters);
+
+	return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+
+	di = et->et_object;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ret = -EIO;
+		ocfs2_error(inode->i_sb,
+			"Inode %llu has invalid path root",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+
+	return ret;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	et->et_root_el = &di->id2.i_list;
+}
+
+
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv = et->et_object;
+
+	et->et_root_el = &xv->xr_list;
+}
+
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					      u64 blkno)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->et_object;
+
+	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *) et->et_object;
+
+	return le64_to_cpu(xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct inode *inode,
+					      struct ocfs2_extent_tree *et,
+					      u32 clusters)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->et_object;
+
+	le32_add_cpu(&xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
+						    struct ocfs2_extent_tree *et)
+{
+	et->et_max_leaf_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb,
+					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					     u64 blkno)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
+					     struct ocfs2_extent_tree *et,
+					     u32 clusters)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
+	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     void *obj,
+				     struct ocfs2_extent_tree_operations *ops)
+{
+	et->et_ops = ops;
+	et->et_root_bh = bh;
+	if (!obj)
+		obj = (void *)bh->b_data;
+	et->et_object = obj;
+
+	et->et_ops->eo_fill_root_el(et);
+	if (!et->et_ops->eo_fill_max_leaf_clusters)
+		et->et_max_leaf_clusters = 0;
+	else
+		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, NULL,
+				 &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, xv,
+				 &ocfs2_xattr_value_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					    u64 new_last_eb_blk)
+{
+	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct inode *inode,
+					    struct ocfs2_extent_tree *et,
+					    u32 clusters)
+{
+	et->et_ops->eo_update_clusters(inode, et, clusters);
+}
+
+static inline int ocfs2_et_insert_check(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_extent_rec *rec)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_insert_check)
+		ret = et->et_ops->eo_insert_check(inode, et, rec);
+	return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct inode *inode,
+					struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_sanity_check)
+		ret = et->et_ops->eo_sanity_check(inode, et);
+	return ret;
+}
+
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 					 struct ocfs2_extent_block *eb);
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 }
 
 /*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
-
-	return ocfs2_new_path(di_bh, el);
-}
-
-/*
  * Convenience function to journal all components in a path.
  */
 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe)
+			   struct ocfs2_extent_tree *et)
 {
 	int retval;
-	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_list *el = NULL;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
+	u64 last_eb_blk = 0;
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		retval = -EIO;
-		goto bail;
-	}
+	el = et->et_root_el;
+	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
-	if (fe->i_last_eb_blk) {
-		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &eb_bh, OCFS2_BH_CACHED, inode);
+	if (last_eb_blk) {
+		retval = ocfs2_read_block(inode, last_eb_blk,
+					  &eb_bh);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
 		}
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
-	} else
-		el = &fe->id2.i_list;
+	}
 
 	BUG_ON(el->l_tree_depth != 0);
 
 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 bail:
-	if (eb_bh)
-		brelse(eb_bh);
+	brelse(eb_bh);
 
 	mlog_exit(retval);
 	return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 bail:
 	if (status < 0) {
 		for(i = 0; i < wanted; i++) {
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 			bhs[i] = NULL;
 		}
 	}
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct buffer_head *fe_bh,
+			    struct ocfs2_extent_tree *et,
 			    struct buffer_head *eb_bh,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
 	if (eb_bh) {
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
 	} else
-		el = &fe->id2.i_list;
+		el = et->et_root_el;
 
 	/* we never add a branch to a leaf. */
 	BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	}
 
 	/* Link the new branch into the rest of the tree (el will
-	 * either be on the fe, or the extent block passed in. */
+	 * either be on the root_bh, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	/* fe needs a new last extent block pointer, as does the
 	 * next_leaf on the previously last-extent-block. */
-	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
 
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0)
 		mlog_errno(status);
 	if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 bail:
 	if (new_eb_bhs) {
 		for (i = 0; i < new_blocks; i++)
-			if (new_eb_bhs[i])
-				brelse(new_eb_bhs[i]);
+			brelse(new_eb_bhs[i]);
 		kfree(new_eb_bhs);
 	}
 
@@ -717,16 +1031,15 @@ bail:
 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  handle_t *handle,
 				  struct inode *inode,
-				  struct buffer_head *fe_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_alloc_context *meta_ac,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
 	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
 	mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb_el = &eb->h_list;
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	fe_el = &fe->id2.i_list;
+	root_el = et->et_root_el;
 
 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	/* copy the fe data into the new extent block */
-	eb_el->l_tree_depth = fe_el->l_tree_depth;
-	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		eb_el->l_recs[i] = fe_el->l_recs[i];
+	/* copy the root extent list data into the new extent block */
+	eb_el->l_tree_depth = root_el->l_tree_depth;
+	eb_el->l_next_free_rec = root_el->l_next_free_rec;
+	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = root_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
 
-	/* update fe now */
-	le16_add_cpu(&fe_el->l_tree_depth, 1);
-	fe_el->l_recs[0].e_cpos = 0;
-	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
-	fe_el->l_next_free_rec = cpu_to_le16(1);
+	/* update root_bh now */
+	le16_add_cpu(&root_el->l_tree_depth, 1);
+	root_el->l_recs[0].e_cpos = 0;
+	root_el->l_recs[0].e_blkno = eb->h_blkno;
+	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	root_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
 	 * becomes the allocated extent block */
-	if (fe_el->l_tree_depth == cpu_to_le16(1))
-		fe->i_last_eb_blk = eb->h_blkno;
+	if (root_el->l_tree_depth == cpu_to_le16(1))
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	new_eb_bh = NULL;
 	status = 0;
 bail:
-	if (new_eb_bh)
-		brelse(new_eb_bh);
+	brelse(new_eb_bh);
 
 	mlog_exit(status);
 	return status;
@@ -817,22 +1128,21 @@ bail:
  * 1) a lowest extent block is found, then we pass it back in
  *    *lowest_eb_bh and return '0'
  *
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
  *    pass NULL back in *lowest_eb_bh, but still return '0'
  *
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
  *    which case we return > 0
  *
  * return status < 0 indicates an error.
  */
 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 				    struct inode *inode,
-				    struct buffer_head *fe_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **target_bh)
 {
 	int status = 0, i;
 	u64 blkno;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *el;
 	struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = NULL;
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
+	el = et->et_root_el;
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		if (bh) {
-			brelse(bh);
-			bh = NULL;
-		}
+		brelse(bh);
+		bh = NULL;
 
-		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
-					  inode);
+		status = ocfs2_read_block(inode, blkno, &bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 		if (le16_to_cpu(el->l_next_free_rec) <
 		    le16_to_cpu(el->l_count)) {
-			if (lowest_bh)
-				brelse(lowest_bh);
+			brelse(lowest_bh);
 			lowest_bh = bh;
 			get_bh(lowest_bh);
 		}
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	/* If we didn't find one and the fe doesn't have any room,
 	 * then return '1' */
-	if (!lowest_bh
-	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+	el = et->et_root_el;
+	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
 		status = 1;
 
 	*target_bh = lowest_bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -919,19 +1223,19 @@ bail:
  * *last_eb_bh will be updated by ocfs2_add_branch().
  */
 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
-			   struct buffer_head *di_bh, int *final_depth,
+			   struct ocfs2_extent_tree *et, int *final_depth,
 			   struct buffer_head **last_eb_bh,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+	struct ocfs2_extent_list *el = et->et_root_el;
+	int depth = le16_to_cpu(el->l_tree_depth);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *bh = NULL;
 
 	BUG_ON(meta_ac == NULL);
 
-	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
 	if (shift < 0) {
 		ret = shift;
 		mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
 		 * ocfs2_add_branch). */
-		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
 					     meta_ac, &bh);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
 	mlog(0, "add branch. bh = %p\n", bh);
-	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -990,15 +1294,6 @@ out:
 }
 
 /*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
-	return !rec->e_leaf_clusters;
-}
-
-/*
  * This function will discard the rightmost extent record.
  */
 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
@@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-				       &bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 				     struct ocfs2_path *right_path,
 				     int subtree_index,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     int *deleted)
+				     int *deleted,
+				     struct ocfs2_extent_tree *et)
 {
 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
-	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
 	struct ocfs2_extent_block *eb;
 
@@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 		/*
 		 * Removal of the extent in the left leaf was skipped
@@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		if (right_has_empty)
 			ocfs2_remove_empty_extent(left_leaf_el);
 
-		ret = ocfs2_journal_dirty(handle, di_bh);
+		ret = ocfs2_journal_dirty(handle, et_root_bh);
 		if (ret)
 			mlog_errno(ret);
 
@@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 				    handle_t *handle, int orig_credits,
 				    struct ocfs2_path *path,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    struct ocfs2_path **empty_extent_path)
+				    struct ocfs2_path **empty_extent_path,
+				    struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_root, deleted;
 	u32 right_cpos;
@@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
 						right_path, subtree_root,
-						dealloc, &deleted);
+						dealloc, &deleted, et);
 		if (ret == -EAGAIN) {
 			/*
 			 * The rotation has to temporarily stop due to
@@ -2447,29 +2742,20 @@ out:
 }
 
 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
-				       struct ocfs2_path *path,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+				struct ocfs2_path *path,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_index;
 	u32 cpos;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	/*
-	 * XXX: This code assumes that the root is an inode, which is
-	 * true for now but may change as tree code gets generic.
-	 */
-	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto out;
-	}
 
+	ret = ocfs2_et_sanity_check(inode, et);
+	if (ret)
+		goto out;
 	/*
 	 * There's two ways we handle this depending on
 	 * whether path is the only existing one.
@@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 	} else {
 		/*
 		 * 'path' is also the leftmost path which
@@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 */
 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-		el = &di->id2.i_list;
+		el = et->et_root_el;
 		el->l_tree_depth = 0;
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-		di->i_last_eb_blk = 0;
+		ocfs2_et_set_last_eb_blk(et, 0);
 	}
 
 	ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2570,7 +2856,8 @@ out:
  */
 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+				  struct ocfs2_cached_dealloc_ctxt *dealloc,
+				  struct ocfs2_extent_tree *et)
 {
 	int ret, orig_credits = handle->h_buffer_credits;
 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 	if (path->p_tree_depth == 0) {
 rightmost_no_delete:
 		/*
-		 * In-inode extents. This is trivially handled, so do
+		 * Inline extents. This is trivially handled, so do
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2638,7 +2925,7 @@ rightmost_no_delete:
 		 */
 
 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
-						  dealloc);
+						  dealloc, et);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -2650,7 +2937,7 @@ rightmost_no_delete:
 	 */
 try_rotate:
 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
-				       dealloc, &restart_path);
+				       dealloc, &restart_path, et);
 	if (ret && ret != -EAGAIN) {
 		mlog_errno(ret);
 		goto out;
@@ -2662,7 +2949,7 @@ try_rotate:
 
 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
 					       tmp_path, dealloc,
-					       &restart_path);
+					       &restart_path, et);
 		if (ret && ret != -EAGAIN) {
 			mlog_errno(ret);
 			goto out;
@@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 				handle_t *handle,
 				struct ocfs2_extent_rec *split_rec,
 				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et,
 				int index)
 {
 	int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		    le16_to_cpu(el->l_next_free_rec) == 1) {
 
 			ret = ocfs2_remove_rightmost_path(inode, handle,
-							  right_path, dealloc);
+							  right_path,
+							  dealloc, et);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     struct ocfs2_merge_ctxt *ctxt)
+				     struct ocfs2_merge_ctxt *ctxt,
+				     struct ocfs2_extent_tree *et)
 
 {
 	int ret = 0;
@@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * illegal.
 		 */
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
 		/* The merge left us with an empty extent, remove it. */
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		ret = ocfs2_merge_rec_left(inode, path,
 					   handle, rec,
-					   dealloc,
+					   dealloc, et,
 					   split_index);
 
 		if (ret) {
@@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		}
 
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		/*
 		 * Error from this last rotate is not critical, so
 		 * print but don't bubble it up.
@@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			ret = ocfs2_merge_rec_left(inode,
 						   path,
 						   handle, split_rec,
-						   dealloc,
+						   dealloc, et,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			 * our leaf. Try to rotate it away.
 			 */
 			ret = ocfs2_rotate_tree_left(inode, handle, path,
-						     dealloc);
+						     dealloc, et);
 			if (ret)
 				mlog_errno(ret);
 			ret = 0;
@@ -3356,16 +3647,6 @@ rotate:
 	ocfs2_rotate_leaf(el, insert_rec);
 }
 
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
-						struct ocfs2_dinode *di,
-						u32 clusters)
-{
-	le32_add_cpu(&di->i_clusters, clusters);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
 static void ocfs2_adjust_rightmost_records(struct inode *inode,
 					   handle_t *handle,
 					   struct ocfs2_path *path,
@@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
 }
 
 /*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
  *
  * right_path is the path we want to do the actual insert
  * in. left_path should only be passed in if we need to update that
@@ -3665,7 +3946,7 @@ out:
 
 static int ocfs2_do_insert_extent(struct inode *inode,
 				  handle_t *handle,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_extent_rec *insert_rec,
 				  struct ocfs2_insert_type *type)
 {
@@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	u32 cpos;
 	struct ocfs2_path *right_path = NULL;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_list *el;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	el = &di->id2.i_list;
+	el = et->et_root_el;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
+	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_inode_path(di_bh);
+	right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_update_dinode_clusters(inode, di,
-					     le16_to_cpu(insert_rec->e_leaf_clusters));
+		ocfs2_et_update_clusters(inode, et,
+					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (ret)
 		mlog_errno(ret);
 
@@ -3899,7 +4178,8 @@ out:
 static void ocfs2_figure_contig_type(struct inode *inode,
 				     struct ocfs2_insert_type *insert,
 				     struct ocfs2_extent_list *el,
-				     struct ocfs2_extent_rec *insert_rec)
+				     struct ocfs2_extent_rec *insert_rec,
+				     struct ocfs2_extent_tree *et)
 {
 	int i;
 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 		}
 	}
 	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
+			insert->ins_contig = CONTIG_NONE;
+	}
 }
 
 /*
@@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
  * ocfs2_figure_appending_type() will figure out whether we'll have to
  * insert at the tail of the rightmost leaf.
  *
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
  * then the logic here makes sense.
  */
 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3975,14 +4270,13 @@ set_tail_append:
  * structure.
  */
 static int ocfs2_figure_insert_type(struct inode *inode,
-				    struct buffer_head *di_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **last_eb_bh,
 				    struct ocfs2_extent_rec *insert_rec,
 				    int *free_records,
 				    struct ocfs2_insert_type *insert)
 {
 	int ret;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
@@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 
 	insert->ins_split = SPLIT_NONE;
 
-	el = &di->id2.i_list;
+	el = et->et_root_el;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
 	if (el->l_tree_depth) {
@@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk), &bh,
-				       OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		le16_to_cpu(el->l_next_free_rec);
 
 	if (!insert->ins_tree_depth) {
-		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 		ocfs2_figure_appending_type(insert, el, insert_rec);
 		return 0;
 	}
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
          *     into two types of appends: simple record append, or a
          *     rotate inside the tail leaf.
 	 */
-	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 
 	/*
 	 * The insert code isn't quite ready to deal with all cases of
@@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * the case that we're doing a tail append, so maybe we can
 	 * take advantage of that information somehow.
 	 */
-	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
 		/*
 		 * Ok, ocfs2_find_path() returned us the rightmost
 		 * tree path. This might be an appending insert. There are
@@ -4108,7 +4401,7 @@ out:
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct ocfs2_extent_tree *et,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
@@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
-	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
-
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-			(OCFS2_I(inode)->ip_clusters != cpos),
-			"Device %s, asking for sparse allocation: inode %llu, "
-			"cpos %u, clusters %u\n",
-			osb->dev_str,
-			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
-			OCFS2_I(inode)->ip_clusters);
-
 	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(inode, et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
 
-	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
 					  &free_records, &insert);
 	if (status < 0) {
 		mlog_errno(status);
@@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	     free_records, insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
-		status = ocfs2_grow_tree(inode, handle, fe_bh,
+		status = ocfs2_grow_tree(inode, handle, et,
 					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
 		if (status) {
@@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	}
 
 	/* Finally, we can add clusters. This might rotate the tree for us. */
-	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else
+	else if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
-	if (last_eb_bh)
-		brelse(last_eb_bh);
+	brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+	u8 flags = 0;
+
+	BUG_ON(!clusters_to_add);
+
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
+	free_extents = ocfs2_num_free_extents(osb, inode, et);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(et->et_root_el))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
 
+	BUG_ON(num_bits > clusters_to_add);
+
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, et,
+				     *logical_offset, block,
+				     num_bits, flags, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, wanted = %u\n",
+		     clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
 	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
 	return status;
 }
 
@@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
 static int ocfs2_split_and_insert(struct inode *inode,
 				  handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct buffer_head **last_eb_bh,
 				  int split_index,
 				  struct ocfs2_extent_rec *orig_split_rec,
@@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
 	struct ocfs2_insert_type insert;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_dinode *di;
 
 leftright:
 	/*
@@ -4224,8 +4618,7 @@ leftright:
 	 */
 	rec = path_leaf_el(path)->l_recs[split_index];
 
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-	rightmost_el = &di->id2.i_list;
+	rightmost_el = et->et_root_el;
 
 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
 	if (depth) {
@@ -4236,8 +4629,8 @@ leftright:
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
-				      meta_ac);
+		ret = ocfs2_grow_tree(inode, handle, et,
+				      &depth, last_eb_bh, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4274,8 +4667,7 @@ leftright:
 		do_leftright = 1;
 	}
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
-				     &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4317,8 +4709,9 @@ out:
  * of the tree is required. All other cases will degrade into a less
  * optimal tree layout.
  *
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
  *
  * This code is optimized for readability - several passes might be
  * made over certain portions of the tree. All of those blocks will
@@ -4326,7 +4719,7 @@ out:
  * extra overhead is not expressed in terms of disk reads.
  */
 static int __ocfs2_mark_extent_written(struct inode *inode,
-				       struct buffer_head *di_bh,
+				       struct ocfs2_extent_tree *et,
 				       handle_t *handle,
 				       struct ocfs2_path *path,
 				       int split_index,
@@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	 */
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
-		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		if (ctxt.c_split_covers_rec)
 			el->l_recs[split_index] = *split_rec;
 		else
-			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
 						     split_rec, meta_ac);
 		if (ret)
@@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	} else {
 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
 						split_index, split_rec,
-						dealloc, &ctxt);
+						dealloc, &ctxt, et);
 		if (ret)
 			mlog_errno(ret);
 	}
@@ -4429,7 +4820,8 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
+	 *
+	 * XXX: This is a hack on the extent tree, maybe it should be
+	 * an op?
 	 */
-	ocfs2_extent_map_trunc(inode, 0);
+	if (et->et_ops == &ocfs2_dinode_et_ops)
+		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_inode_path(di_bh);
+	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
-					  index, &split_rec, meta_ac, dealloc);
+	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+					  index, &split_rec, meta_ac,
+					  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4499,13 +4896,12 @@ out:
 	return ret;
 }
 
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 			    handle_t *handle, struct ocfs2_path *path,
 			    int index, u32 new_range,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, depth, credits = handle->h_buffer_credits;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *rightmost_el, *el;
@@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	} else
 		rightmost_el = path_leaf_el(path);
 
-	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+	credits += path->p_tree_depth +
+		   ocfs2_extend_meta_needed(et->et_root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	insert.ins_split = SPLIT_RIGHT;
 	insert.ins_tree_depth = depth;
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4570,7 +4966,8 @@ out:
 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 			      struct ocfs2_path *path, int index,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      u32 cpos, u32 len)
+			      u32 cpos, u32 len,
+			      struct ocfs2_extent_tree *et)
 {
 	int ret;
 	u32 left_cpos, rec_range, trunc_range;
@@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_block *eb;
 
 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 
 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
 
-	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4724,7 +5121,8 @@ out:
 	return ret;
 }
 
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 	u32 rec_range, trunc_range;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_path *path;
+	struct ocfs2_path *path = NULL;
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+		ret = ocfs2_split_tree(inode, et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
@@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 bail:
 	if (tl_inode)
 		iput(tl_inode);
-	if (tl_bh)
-		brelse(tl_bh);
+	brelse(tl_bh);
 
 	if (status < 0 && (*tl_copy)) {
 		kfree(*tl_copy);
@@ -6008,20 +6404,13 @@ bail:
 	return status;
 }
 
-static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	return 0;
 }
 
-static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
-{
-	set_buffer_uptodate(bh);
-	mark_buffer_dirty(bh);
-	return ocfs2_journal_dirty_data(handle, bh);
-}
-
 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 				     unsigned int from, unsigned int to,
 				     struct page *page, int zero, u64 *phys)
@@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 	 * here if they aren't - ocfs2_map_page_blocks()
 	 * might've skipped some
 	 */
-	if (ocfs2_should_order_data(inode)) {
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, &partial,
-					ocfs2_ordered_zero_func);
-		if (ret < 0)
-			mlog_errno(ret);
-	} else {
+	ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, &partial,
+				ocfs2_zero_func);
+	if (ret < 0)
+		mlog_errno(ret);
+	else if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle, page_buffers(page),
 					from, to, &partial,
-					ocfs2_writeback_zero_func);
+					ocfs2_journal_dirty_data);
+#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
@@ -6215,20 +6605,29 @@ out:
 	return ret;
 }
 
-static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+					     struct ocfs2_dinode *di)
 {
 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
 
-	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2) -
+				    xattrsize);
+	else
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2));
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode,
 				  struct ocfs2_dinode *di)
 {
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 	di->id2.i_list.l_tree_depth = 0;
 	di->id2.i_list.l_next_free_rec = 0;
-	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+	di->id2.i_list.l_count = cpu_to_le16(
+		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
 }
 
 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
 	 * We clear the entire i_data structure here so that all
 	 * fields can be properly initialized.
 	 */
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 
-	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+	idata->id_count = cpu_to_le16(
+			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
 }
 
 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
@@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
+	struct ocfs2_extent_tree et;
 
 	has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * this proves to be false, we could always re-build
 		 * the in-inode data from our pages.
 		 */
-		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
+		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+		ret = ocfs2_insert_extent(osb, handle, inode, &et,
 					  0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
@@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
 
 	mlog_entry_void();
 
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_inode_path(fe_bh);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
 	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 		mlog(ML_NOTICE,
 		     "Truncate completion has non-empty dealloc context\n");
 
-	if (tc->tc_last_eb_bh)
-		brelse(tc->tc_last_eb_bh);
+	brelse(tc->tc_last_eb_bh);
 
 	kfree(tc);
 }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd8011..70257c84cfbe 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
 #ifndef OCFS2_ALLOC_H
 #define OCFS2_ALLOC_H
 
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation.  Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+	struct ocfs2_extent_tree_operations	*et_ops;
+	struct buffer_head			*et_root_bh;
+	struct ocfs2_extent_list		*et_root_el;
+	void					*et_object;
+	unsigned int				et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh);
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv);
+
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct ocfs2_extent_tree *et,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
 			struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret);
 struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+			   struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
 {
 	/*
 	 * Rather than do all the work of determining how much we need
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 	 * new tree_depth==0 extent_block, and one block at the new
 	 * top-of-the tree.
 	 */
-	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+	return le16_to_cpu(root_el->l_tree_depth) + 2;
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
@@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
 		return le16_to_cpu(rec->e_leaf_clusters);
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+	return !rec->e_leaf_clusters;
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1db080135c6d..c22543b33420 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  &bh, OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 	err = 0;
 
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(err);
 	return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 {
 	int ret;
 	struct buffer_head *di_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
-			       OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 	}
 
 	if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle,
 					page_buffers(page),
 					from, to, NULL,
 					ocfs2_journal_dirty_data);
-		if (ret < 0) 
+#endif
+		if (ret < 0)
 			mlog_errno(ret);
 	}
 out:
@@ -594,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has a hole at block %llu\n",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 
-	journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset);
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 
 	if (!page_has_buffers(page))
 		return 0;
-	return journal_try_to_free_buffers(journal, page, wait);
+	return jbd2_journal_try_to_free_buffers(journal, page, wait);
 }
 
 static ssize_t ocfs2_direct_IO(int rw,
@@ -1073,12 +1071,19 @@ static void ocfs2_write_failure(struct inode *inode,
 	for(i = 0; i < wc->w_num_pages; i++) {
 		tmppage = wc->w_pages[i];
 
-		if (ocfs2_should_order_data(inode))
-			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
-					  from, to, NULL,
-					  ocfs2_journal_dirty_data);
+		if (page_has_buffers(tmppage)) {
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
+				walk_page_buffers(wc->w_handle,
+						  page_buffers(tmppage),
+						  from, to, NULL,
+						  ocfs2_journal_dirty_data);
+#endif
+			}
 
-		block_commit_write(tmppage, from, to);
+			block_commit_write(tmppage, from, to);
+		}
 	}
 }
 
@@ -1239,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	int ret, i, new, should_zero = 0;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
+	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
 	if (new || unwritten)
@@ -1252,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 * any additional semaphores or cluster locks.
 		 */
 		tmp_pos = cpos;
-		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, 0, wc->w_di_bh,
-						 wc->w_handle, data_ac,
-						 meta_ac, NULL);
+		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+					   &tmp_pos, 1, 0, wc->w_di_bh,
+					   wc->w_handle, data_ac,
+					   meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
@@ -1273,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			goto out;
 		}
 	} else if (unwritten) {
-		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_mark_extent_written(inode, &et,
 						wc->w_handle, cpos, 1, phys,
 						meta_ac, &wc->w_dealloc);
 		if (ret < 0) {
@@ -1662,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
 
 	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
@@ -1709,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    extents_to_split, &data_ac, &meta_ac);
+		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+		     " clusters_to_add = %u, extents_to_split = %u\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+		     clusters_to_alloc, extents_to_split);
+
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_lock_allocators(inode, &et,
+					    clusters_to_alloc, extents_to_split,
+					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+		credits = ocfs2_calc_extend_credits(inode->i_sb,
+						    &di->id2.i_list,
 						    clusters_to_alloc);
 
 	}
@@ -1901,12 +1918,18 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			to = PAGE_CACHE_SIZE;
 		}
 
-		if (ocfs2_should_order_data(inode))
-			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
-					  from, to, NULL,
-					  ocfs2_journal_dirty_data);
-
-		block_commit_write(tmppage, from, to);
+		if (page_has_buffers(tmppage)) {
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
+				walk_page_buffers(wc->w_handle,
+						  page_buffers(tmppage),
+						  from, to, NULL,
+						  ocfs2_journal_dirty_data);
+#endif
+			}
+			block_commit_write(tmppage, from, to);
+		}
 	}
 
 out_write_size:
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b41..7e947c672469 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 	/* remove from dirty list before I/O. */
 	clear_buffer_dirty(bh);
 
-	get_bh(bh); /* for end_buffer_write_sync() */                   
+	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
 	submit_bh(WRITE, bh);
 
@@ -88,22 +88,103 @@ out:
 	return ret;
 }
 
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags,
-		      struct inode *inode)
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[])
+{
+	int status = 0;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	if (!nr) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		goto bail;
+	}
+
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(osb->sb, block++);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "trying to sync read a jbd "
+			     "managed bh (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (buffer_dirty(bh)) {
+			/* This should probably be a BUG, or
+			 * at least return an error. */
+			mlog(ML_ERROR,
+			     "trying to sync read a dirty "
+			     "buffer! (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		lock_buffer(bh);
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "block %llu had the JBD bit set "
+			     "while I was in lock_buffer!",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		clear_buffer_uptodate(bh);
+		get_bh(bh); /* for end_buffer_read_sync() */
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+	}
+
+	for (i = nr; i > 0; i--) {
+		bh = bhs[i - 1];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "the journal got the buffer while it was "
+			     "locked for io! (blocknr = %llu)\n",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. */
+			status = -EIO;
+			put_bh(bh);
+			bhs[i - 1] = NULL;
+		}
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags)
 {
 	int status = 0;
-	struct super_block *sb;
 	int i, ignore_cache = 0;
 	struct buffer_head *bh;
 
-	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
-		   (unsigned long long)block, nr, flags, inode);
+	mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
+		   inode, (unsigned long long)block, nr, flags);
 
+	BUG_ON(!inode);
 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
-	       (!inode || !(flags & OCFS2_BH_CACHED)));
+	       (flags & OCFS2_BH_IGNORE_CACHE));
 
-	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+	if (bhs == NULL) {
 		status = -EINVAL;
 		mlog_errno(status);
 		goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		goto bail;
 	}
 
-	sb = osb->sb;
-
-	if (flags & OCFS2_BH_CACHED && !inode)
-		flags &= ~OCFS2_BH_CACHED;
-
-	if (inode)
-		mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
-			bhs[i] = sb_getblk(sb, block++);
+			bhs[i] = sb_getblk(inode->i_sb, block++);
 			if (bhs[i] == NULL) {
-				if (inode)
-					mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+				mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 				status = -EIO;
 				mlog_errno(status);
 				goto bail;
 			}
 		}
 		bh = bhs[i];
-		ignore_cache = 0;
+		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
 
 		/* There are three read-ahead cases here which we need to
 		 * be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		 *    before our is-it-in-flight check.
 		 */
 
-		if (flags & OCFS2_BH_CACHED &&
-		    !ocfs2_buffer_uptodate(inode, bh)) {
+		if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
 			mlog(ML_UPTODATE,
 			     "bh (%llu), inode %llu not uptodate\n",
 			     (unsigned long long)bh->b_blocknr,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			/* We're using ignore_cache here to say
+			 * "go to disk" */
 			ignore_cache = 1;
 		}
 
 		/* XXX: Can we ever get this and *not* have the cached
 		 * flag set? */
 		if (buffer_jbd(bh)) {
-			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
 					       "managed bh (blocknr = %llu)\n",
 				     (unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
-		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+		if (ignore_cache) {
 			if (buffer_dirty(bh)) {
 				/* This should probably be a BUG, or
 				 * at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 			 * previously read-ahead buffer may have
 			 * completed I/O while we were waiting for the
 			 * buffer lock. */
-			if ((flags & OCFS2_BH_CACHED)
+			if (!(flags & OCFS2_BH_IGNORE_CACHE)
 			    && !(flags & OCFS2_BH_READAHEAD)
 			    && ocfs2_buffer_uptodate(inode, bh)) {
 				unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		/* Always set the buffer in the cache, even if it was
 		 * a forced read, or read-ahead which hasn't yet
 		 * completed. */
-		if (inode)
-			ocfs2_set_buffer_uptodate(inode, bh);
+		ocfs2_set_buffer_uptodate(inode, bh);
 	}
-	if (inode)
-		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
 	     (unsigned long long)block, nr,
-	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+	     flags);
 
 bail:
 
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e5..75e1dcb1ade7 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 
-static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+static inline int ocfs2_read_block(struct inode	       *inode,
 				   u64                  off,
-				   struct buffer_head **bh,
-				   int                  flags,
-				   struct inode        *inode);
+				   struct buffer_head **bh);
 
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct ocfs2_super          *osb,
+int ocfs2_read_blocks(struct inode	  *inode,
 		      u64                  block,
 		      int                  nr,
 		      struct buffer_head  *bhs[],
-		      int                  flags,
-		      struct inode        *inode);
+		      int                  flags);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[]);
 
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 
-#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_IGNORE_CACHE      1
 #define OCFS2_BH_READAHEAD         8
 
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
-				   struct buffer_head **bh, int flags,
-				   struct inode *inode)
+static inline int ocfs2_read_block(struct inode *inode, u64 off,
+				   struct buffer_head **bh)
 {
 	int status = 0;
 
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(osb, off, 1, bh,
-				   flags, inode);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 443d108211ab..6ebaa58e2c03 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -976,7 +976,7 @@ static void o2hb_region_release(struct config_item *item)
 	}
 
 	if (reg->hr_bdev)
-		blkdev_put(reg->hr_bdev);
+		blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 
 	if (reg->hr_slots)
 		kfree(reg->hr_slots);
@@ -1268,7 +1268,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out;
 
 	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
 	if (ret) {
 		reg->hr_bdev = NULL;
 		goto out;
@@ -1358,7 +1358,7 @@ out:
 		iput(inode);
 	if (ret < 0) {
 		if (reg->hr_bdev) {
-			blkdev_put(reg->hr_bdev);
+			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
 			reg->hr_bdev = NULL;
 		}
 	}
@@ -1489,31 +1489,22 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
 		: NULL;
 }
 
-static int o2hb_heartbeat_group_make_item(struct config_group *group,
-					  const char *name,
-					  struct config_item **new_item)
+static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
+							  const char *name)
 {
 	struct o2hb_region *reg = NULL;
-	int ret = 0;
 
 	reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
-	if (reg == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (reg == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
-	*new_item = &reg->hr_item;
-
 	spin_lock(&o2hb_live_lock);
 	list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
 	spin_unlock(&o2hb_live_lock);
-out:
-	if (ret)
-		kfree(reg);
 
-	return ret;
+	return &reg->hr_item;
 }
 
 static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f27529..d8a0cb92cef6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
+	define_mask(XATTR),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94f..57670c680471 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+#define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index d8bfa0eb41b2..52276c02f710 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
 			   "  message id:   %d\n"
 			   "  message type: %u\n"
 			   "  message key:  0x%08x\n"
-			   "  sock acquiry: %lu.%lu\n"
-			   "  send start:   %lu.%lu\n"
-			   "  wait start:   %lu.%lu\n",
+			   "  sock acquiry: %lu.%ld\n"
+			   "  send start:   %lu.%ld\n"
+			   "  wait start:   %lu.%ld\n",
 			   nst, (unsigned long)nst->st_task->pid,
 			   (unsigned long)nst->st_task->tgid,
 			   nst->st_task->comm, nst->st_node,
 			   nst->st_sc, nst->st_id, nst->st_msg_type,
 			   nst->st_msg_key,
 			   nst->st_sock_time.tv_sec,
-			   (unsigned long)nst->st_sock_time.tv_usec,
+			   (long)nst->st_sock_time.tv_usec,
 			   nst->st_send_time.tv_sec,
-			   (unsigned long)nst->st_send_time.tv_usec,
+			   (long)nst->st_send_time.tv_usec,
 			   nst->st_status_time.tv_sec,
-			   nst->st_status_time.tv_usec);
+			   (long)nst->st_status_time.tv_usec);
 	}
 
 	spin_unlock(&o2net_debug_lock);
@@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	return sc; /* unused, just needs to be null when done */
 }
 
-#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec
+#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
 
 static int sc_seq_show(struct seq_file *seq, void *v)
 {
@@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
 			   "  remote node:     %s\n"
 			   "  page off:        %zu\n"
 			   "  handshake ok:    %u\n"
-			   "  timer:           %lu.%lu\n"
-			   "  data ready:      %lu.%lu\n"
-			   "  advance start:   %lu.%lu\n"
-			   "  advance stop:    %lu.%lu\n"
-			   "  func start:      %lu.%lu\n"
-			   "  func stop:       %lu.%lu\n"
+			   "  timer:           %lu.%ld\n"
+			   "  data ready:      %lu.%ld\n"
+			   "  advance start:   %lu.%ld\n"
+			   "  advance stop:    %lu.%ld\n"
+			   "  func start:      %lu.%ld\n"
+			   "  func stop:       %lu.%ld\n"
 			   "  func key:        %u\n"
 			   "  func type:       %u\n",
 			   sc,
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index b364b7052e46..816a3f61330c 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -644,35 +644,23 @@ out:
 	return ret;
 }
 
-static int o2nm_node_group_make_item(struct config_group *group,
-				     const char *name,
-				     struct config_item **new_item)
+static struct config_item *o2nm_node_group_make_item(struct config_group *group,
+						     const char *name)
 {
 	struct o2nm_node *node = NULL;
-	int ret = 0;
 
-	if (strlen(name) > O2NM_MAX_NAME_LEN) {
-		ret = -ENAMETOOLONG;
-		goto out;
-	}
+	if (strlen(name) > O2NM_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
 
 	node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
-	if (node == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (node == NULL)
+		return ERR_PTR(-ENOMEM);
 
 	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
 	config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
 	spin_lock_init(&node->nd_lock);
 
-	*new_item = &node->nd_item;
-
-out:
-	if (ret)
-		kfree(node);
-
-	return ret;
+	return &node->nd_item;
 }
 
 static void o2nm_node_group_drop_item(struct config_group *group,
@@ -756,31 +744,25 @@ static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *gro
 }
 #endif
 
-static int o2nm_cluster_group_make_group(struct config_group *group,
-					 const char *name,
-					 struct config_group **new_group)
+static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
+							  const char *name)
 {
 	struct o2nm_cluster *cluster = NULL;
 	struct o2nm_node_group *ns = NULL;
-	struct config_group *o2hb_group = NULL;
+	struct config_group *o2hb_group = NULL, *ret = NULL;
 	void *defs = NULL;
-	int ret = 0;
 
 	/* this runs under the parent dir's i_mutex; there can be only
 	 * one caller in here at a time */
-	if (o2nm_single_cluster) {
-		ret = -ENOSPC;
-		goto out;
-	}
+	if (o2nm_single_cluster)
+		return ERR_PTR(-ENOSPC);
 
 	cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
 	ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
 	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
 	o2hb_group = o2hb_alloc_hb_set();
-	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) {
-		ret = -ENOMEM;
+	if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
 		goto out;
-	}
 
 	config_group_init_type_name(&cluster->cl_group, name,
 				    &o2nm_cluster_type);
@@ -797,15 +779,16 @@ static int o2nm_cluster_group_make_group(struct config_group *group,
 	cluster->cl_idle_timeout_ms    = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
 	cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
 
-	*new_group = &cluster->cl_group;
+	ret = &cluster->cl_group;
 	o2nm_single_cluster = cluster;
 
 out:
-	if (ret) {
+	if (ret == NULL) {
 		kfree(cluster);
 		kfree(ns);
 		o2hb_free_hb_set(o2hb_group);
 		kfree(defs);
+		ret = ERR_PTR(-ENOMEM);
 	}
 
 	return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd6..2bcf706d9dd3 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
 #ifdef CONFIG_DEBUG_FS
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-		    u32 msgkey, struct task_struct *task, u8 node)
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+			   u32 msgkey, struct task_struct *task, u8 node)
 {
 	INIT_LIST_HEAD(&nst->st_net_debug_item);
 	nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
 	nst->st_node = node;
 }
 
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_sock_time);
 }
 
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_send_time);
 }
 
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
 {
 	do_gettimeofday(&nst->st_status_time);
 }
 
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
 					 struct o2net_sock_container *sc)
 {
 	nst->st_sc = sc;
 }
 
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
 {
 	nst->st_id = msg_id;
 }
+
+#else  /* CONFIG_DEBUG_FS */
+
+static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+				  u32 msgkey, struct task_struct *task, u8 node)
+{
+}
+
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+						struct o2net_sock_container *sc)
+{
+}
+
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+					u32 msg_id)
+{
+}
+
 #endif /* CONFIG_DEBUG_FS */
 
 static inline int o2net_reconnect_delay(void)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b77..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
 	struct timeval			st_send_time;
 	struct timeval			st_status_time;
 };
-
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-		    u32 msgkey, struct task_struct *task, u8 node);
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-				  struct o2net_sock_container *sc);
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
-
 #else
 struct o2net_send_tracking {
 	u32	dummy;
 };
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
-				  u32 msgkey, struct task_struct *task, u8 node)
-{
-}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
-						struct o2net_sock_container *sc)
-{
-}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
-					u32 msg_id)
-{
-}
 #endif	/* CONFIG_DEBUG_FS */
 
 #endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a1875848080..026e6eb85187 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
+static struct buffer_head *ocfs2_bread(struct inode *inode,
+				       int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = 0;
+
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	brelse(bh);
+	bh = NULL;
+
+	*err = -EIO;
+	return NULL;
+}
+
 /*
  * bh passed here can be an inode block or a dir data block, depending
  * on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -260,14 +302,13 @@ restart:
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* read error, skip block & hope for the best */
+		if (ocfs2_read_block(dir, block, &bh)) {
+			/* read error, skip block & hope for the best.
+			 * ocfs2_read_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
 				    block);
-			brelse(bh);
 			goto next;
 		}
 		i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
 				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				if (tmp)
-					brelse(tmp);
+				brelse(tmp);
 			}
 			last_ra_blk = blk;
 			ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
 leave:
 	if (status < 0) {
 		*dirent = NULL;
-		if (*dirent_bh) {
-			brelse(*dirent_bh);
-			*dirent_bh = NULL;
-		}
+		brelse(*dirent_bh);
+		*dirent_bh = NULL;
 	}
 
 	mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 
 	ret = 0;
 bail:
-	if (dirent_bh)
-		brelse(dirent_bh);
+	brelse(dirent_bh);
 
 	mlog_exit(ret);
 	return ret;
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	status = 0;
 bail:
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct buffer_head *dirdata_bh = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 
@@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	di->i_size = cpu_to_le64(sb->s_blocksize);
 	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
-	dir->i_blocks = ocfs2_inode_sector_count(dir);
 
 	/*
 	 * This should never fail as our extent list is empty and all
 	 * related blocks have been journaled already.
 	 */
-	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL);
+	ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+				  0, NULL);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_commit;
 	}
 
+	/*
+	 * Set i_blocks after the extent insert for the most up to
+	 * date ip_clusters value.
+	 */
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
+
 	ret = ocfs2_journal_dirty(handle, di_bh);
 	if (ret) {
 		mlog_errno(ret);
@@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		}
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
-		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL);
+		ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+					  blkno, len, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			goto out_commit;
 		}
 	}
 
@@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
-		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-						    1, 0, parent_fe_bh, handle,
-						    data_ac, meta_ac, NULL);
+		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+					      1, 0, parent_fe_bh, handle,
+					      data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
+	struct ocfs2_extent_tree et;
 
 	mlog_entry_void();
 
@@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
@@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		}
 
 		if (!num_free_extents) {
-			status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
 			if (status < 0) {
 				if (status != -ENOSPC)
 					mlog_errno(status);
@@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1563,8 +1608,7 @@ bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 
 	status = 0;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	*ret_de_bh = bh;
 	bh = NULL;
 out:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 	return ret;
 }
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index e48aba698b77..533a789c3ef8 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	return writelen;
 }
 
-static void dlmfs_init_once(struct kmem_cache *cachep,
-			    void *foo)
+static void dlmfs_init_once(void *foo)
 {
 	struct dlmfs_inode_private *ip =
 		(struct dlmfs_inode_private *) foo;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e4..ec684426034b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
-					  bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  ret_bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 67527cebf214..2f27b332d8b3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -68,14 +68,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 	}
 
-	result = d_alloc_anon(inode);
-
-	if (!result) {
-		iput(inode);
-		mlog_errno(-ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
-	result->d_op = &ocfs2_dentry_ops;
+	result = d_obtain_alias(inode);
+	if (!IS_ERR(result))
+		result->d_op = &ocfs2_dentry_ops;
 
 	mlog_exit_ptr(result);
 	return result;
@@ -86,7 +81,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	int status;
 	u64 blkno;
 	struct dentry *parent;
-	struct inode *inode;
 	struct inode *dir = child->d_inode;
 
 	mlog_entry("(0x%p, '%.*s')\n", child,
@@ -109,21 +103,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 		goto bail_unlock;
 	}
 
-	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0);
-	if (IS_ERR(inode)) {
-		mlog(ML_ERROR, "Unable to create inode %llu\n",
-		     (unsigned long long)blkno);
-		parent = ERR_PTR(-EACCES);
-		goto bail_unlock;
-	}
-
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-
-	parent->d_op = &ocfs2_dentry_ops;
+	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
+	if (!IS_ERR(parent))
+		parent->d_op = &ocfs2_dentry_ops;
 
 bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326fe..2baedac58234 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
+#include <linux/fiemap.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
@@ -282,6 +284,50 @@ out:
 		kfree(new_emi);
 }
 
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	int ret, next_free;
+	u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+
+	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+	el = &eb->h_list;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ret = -EROFS;
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		goto out;
+	}
+
+	if (el->l_tree_depth) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has non zero tree depth in "
+			    "leaf block %llu\n", inode->i_ino,
+			    (unsigned long long)eb_bh->b_blocknr);
+		ret = -EROFS;
+		goto out;
+	}
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+		ret = 1;
+
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
 /*
  * Return the 1st index within el which contains an extent start
  * larger than v_cluster.
@@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+				       &next_eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -373,42 +419,28 @@ out:
 	return ret;
 }
 
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
-		       u32 *p_cluster, u32 *num_clusters,
-		       unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 v_cluster, unsigned int *hole_len,
+				      struct ocfs2_extent_rec *ret_rec,
+				      unsigned int *is_last)
 {
-	int ret, i;
-	unsigned int flags = 0;
-	struct buffer_head *di_bh = NULL;
-	struct buffer_head *eb_bh = NULL;
+	int i, ret, tree_height, len;
 	struct ocfs2_dinode *di;
-	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_block *uninitialized_var(eb);
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
-	u32 coff;
-
-	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = -ERANGE;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
-				      num_clusters, extent_flags);
-	if (ret == 0)
-		goto out;
+	struct buffer_head *eb_bh = NULL;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	memset(ret_rec, 0, sizeof(*ret_rec));
+	if (is_last)
+		*is_last = 0;
 
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	el = &di->id2.i_list;
+	tree_height = le16_to_cpu(el->l_tree_depth);
 
-	if (el->l_tree_depth) {
+	if (tree_height > 0) {
 		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
@@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	i = ocfs2_search_extent_list(el, v_cluster);
 	if (i == -1) {
 		/*
-		 * A hole was found. Return some canned values that
-		 * callers can key on. If asked for, num_clusters will
-		 * be populated with the size of the hole.
+		 * Holes can be larger than the maximum size of an
+		 * extent, so we return their lengths in a seperate
+		 * field.
 		 */
-		*p_cluster = 0;
-		if (num_clusters) {
+		if (hole_len) {
 			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
-							 v_cluster,
-							 num_clusters);
+							 v_cluster, &len);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			*hole_len = len;
+		}
+		goto out_hole;
+	}
+
+	rec = &el->l_recs[i];
+
+	BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+	if (!rec->e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0)", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*ret_rec = *rec;
+
+	/*
+	 * Checking for last extent is potentially expensive - we
+	 * might have to look at the next leaf over to see if it's
+	 * empty.
+	 *
+	 * The first two checks are to see whether the caller even
+	 * cares for this information, and if the extent is at least
+	 * the last in it's list.
+	 *
+	 * If those hold true, then the extent is last if any of the
+	 * additional conditions hold true:
+	 *  - Extent list is in-inode
+	 *  - Extent list is right-most
+	 *  - Extent list is 2nd to rightmost, with empty right-most
+	 */
+	if (is_last) {
+		if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+			if (tree_height == 0)
+				*is_last = 1;
+			else if (eb->h_blkno == di->i_last_eb_blk)
+				*is_last = 1;
+			else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+				ret = ocfs2_last_eb_is_empty(inode, di);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+				if (ret == 1)
+					*is_last = 1;
+			}
+		}
+	}
+
+out_hole:
+	ret = 0;
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+					  u32 v_cluster,
+					  struct ocfs2_extent_rec *rec,
+					  u32 *p_cluster, u32 *num_clusters)
+
+{
+	u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+	*p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+	*p_cluster = *p_cluster + coff;
+
+	if (num_clusters)
+		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
 		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
 	} else {
 		rec = &el->l_recs[i];
-
 		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
 		if (!rec->e_blkno) {
 			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
-				    "record (%u, %u, 0)", inode->i_ino,
+				    "record (%u, %u, 0) in xattr", inode->i_ino,
 				    le32_to_cpu(rec->e_cpos),
 				    ocfs2_rec_clusters(el, rec));
 			ret = -EROFS;
 			goto out;
 		}
-
 		coff = v_cluster - le32_to_cpu(rec->e_cpos);
-
 		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
 						    le64_to_cpu(rec->e_blkno));
 		*p_cluster = *p_cluster + coff;
-
 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	}
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
+{
+	int ret;
+	unsigned int uninitialized_var(hole_len), flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
+
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		flags = rec->e_flags;
+	ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+					 &rec, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (rec.e_blkno == 0ULL) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			*num_clusters = hole_len;
+		}
+	} else {
+		ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+					      p_cluster, num_clusters);
+		flags = rec.e_flags;
 
-		ocfs2_extent_map_insert_rec(inode, rec);
+		ocfs2_extent_map_insert_rec(inode, &rec);
 	}
 
 	if (extent_flags)
@@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 
 out:
 	brelse(di_bh);
-	brelse(eb_bh);
 	return ret;
 }
 
@@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 out:
 	return ret;
 }
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+			       struct fiemap_extent_info *fieinfo,
+			       u64 map_start)
+{
+	int ret;
+	unsigned int id_count;
+	struct ocfs2_dinode *di;
+	u64 phys;
+	u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+	if (map_start < id_count) {
+		phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+		phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+		ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+					      flags);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len)
+{
+	int ret, is_last;
+	u32 mapping_end, cpos;
+	unsigned int hole_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 len_bytes, phys_bytes, virt_bytes;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_extent_rec rec;
+
+	ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	/*
+	 * Handle inline-data separately.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+		goto out_unlock;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	mapping_end -= cpos;
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		u32 fe_flags;
+
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 &hole_size, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL) {
+			cpos += hole_size;
+			continue;
+		}
+
+		fe_flags = 0;
+		if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+			fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+		if (is_last)
+			fe_flags |= FIEMAP_EXTENT_LAST;
+		len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+		phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+		virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+		ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+					      len_bytes, fe_flags);
+		if (ret)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (ret > 0)
+		ret = 0;
+
+out_unlock:
+	brelse(di_bh);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ocfs2_inode_unlock(inode, 0);
+out:
+
+	return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a22..1c4aa8b06f34 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
 int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 				u64 *ret_count, unsigned int *extent_flags);
 
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		 u64 map_start, u64 map_len);
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e8514e8b6ce8..8d3225a78073 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
 		goto bail;
 
 	journal = osb->journal->j_journal;
-	err = journal_force_commit(journal);
+	err = jbd2_journal_force_commit(journal);
 
 bail:
 	mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
 }
 
 /*
- * extend allocation only here.
+ * extend file allocation only here.
  * we'll update all the disk stuff, and oip->alloc_size
  *
  * expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
  * Will return -EAGAIN, and a reason if a restart is needed.
  * If passed in, *reason will always be set, even in error.
  */
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
-	int free_extents;
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	enum ocfs2_alloc_restarted reason = RESTART_NONE;
-	u32 bit_off, num_bits;
-	u64 block;
-	u8 flags = 0;
-
-	BUG_ON(!clusters_to_add);
-
-	if (mark_unwritten)
-		flags = OCFS2_EXT_UNWRITTEN;
-
-	free_extents = ocfs2_num_free_extents(osb, inode, fe);
-	if (free_extents < 0) {
-		status = free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	/* there are two cases which could cause us to EAGAIN in the
-	 * we-need-more-metadata case:
-	 * 1) we haven't reserved *any*
-	 * 2) we are so fragmented, we've needed to add metadata too
-	 *    many times. */
-	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	} else if ((!free_extents)
-		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(fe))) {
-		mlog(0, "filesystem is really fragmented...\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	}
-
-	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
-					clusters_to_add, &bit_off, &num_bits);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
-	BUG_ON(num_bits > clusters_to_add);
-
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
-				     *logical_offset, block, num_bits,
-				     flags, meta_ac);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= num_bits;
-	*logical_offset += num_bits;
-
-	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, clusters = %u, wanted = "
-		     "%u\n", fe->i_clusters, clusters_to_add);
-		status = -EAGAIN;
-		reason = RESTART_TRANS;
-	}
-
-leave:
-	mlog_exit(status);
-	if (reason_ret)
-		*reason_ret = reason;
-	return status;
-}
-
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
-{
-	int ret = 0, num_free_extents;
-	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	*meta_ac = NULL;
-	if (data_ac)
-		*data_ac = NULL;
-
-	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u, extents_to_split = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
-	if (num_free_extents < 0) {
-		ret = num_free_extents;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/*
-	 * Sparse allocation file systems need to be more conservative
-	 * with reserving room for expansion - the actual allocation
-	 * happens while we've got a journal handle open so re-taking
-	 * a cluster lock (because we ran out of room for another
-	 * extent) will violate ordering rules.
-	 *
-	 * Most of the time we'll only be seeing this 1 cluster at a time
-	 * anyway.
-	 *
-	 * Always lock for any unwritten extents - we might want to
-	 * add blocks during a split.
-	 */
-	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
-		if (ret < 0) {
-			if (ret != -ENOSPC)
-				mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	if (clusters_to_add == 0)
-		goto out;
-
-	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
-	if (ret < 0) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
+	int ret;
+	struct ocfs2_extent_tree et;
 
-out:
-	if (ret) {
-		if (*meta_ac) {
-			ocfs2_free_alloc_context(*meta_ac);
-			*meta_ac = NULL;
-		}
-
-		/*
-		 * We cannot have an error and a non null *data_ac.
-		 */
-	}
+	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
+	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+					   clusters_to_add, mark_unwritten,
+					   &et, handle,
+					   data_ac, meta_ac, reason_ret);
 
 	return ret;
 }
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
-				       &meta_ac);
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+	     clusters_to_add);
+	ocfs2_init_dinode_extent_tree(&et, inode, bh);
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
 
 	prev_clusters = OCFS2_I(inode)->ip_clusters;
 
-	status = ocfs2_do_extend_allocation(osb,
-					    inode,
-					    &logical_start,
-					    clusters_to_add,
-					    mark_unwritten,
-					    bh,
-					    handle,
-					    data_ac,
-					    meta_ac,
-					    &why);
+	status = ocfs2_add_inode_data(osb,
+				      inode,
+				      &logical_start,
+				      clusters_to_add,
+				      mark_unwritten,
+				      bh,
+				      handle,
+				      data_ac,
+				      meta_ac,
+				      &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
-							    fe,
+							    &fe->id2.i_list,
 							    clusters_to_add);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
@@ -826,10 +670,8 @@ leave:
 		restart_func = 0;
 		goto restart_all;
 	}
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
+	brelse(bh);
+	bh = NULL;
 
 	mlog_exit(status);
 	return status;
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			goto bail_unlock;
 		}
 
-		if (i_size_read(inode) > attr->ia_size)
+		if (i_size_read(inode) > attr->ia_size) {
+			if (ocfs2_should_order_data(inode)) {
+				status = ocfs2_begin_ordered_truncate(inode,
+								      attr->ia_size);
+				if (status)
+					goto bail_unlock;
+			}
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
-		else
+		} else
 			status = ocfs2_extend_file(inode, bh, attr->ia_size);
 		if (status < 0) {
 			if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
 	if (size_change)
 		ocfs2_rw_unlock(inode, 1);
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1176,7 +1023,7 @@ bail:
 	return err;
 }
 
-int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int ocfs2_permission(struct inode *inode, int mask)
 {
 	int ret;
 
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	struct buffer_head *bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &di_bh,
-				       OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				       &di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 
-	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
 				  dealloc);
 	if (ret) {
 		mlog_errno(ret);
@@ -1766,8 +1614,8 @@ out_inode_unlock:
 out_rw_unlock:
 	ocfs2_rw_unlock(inode, 1);
 
-	mutex_unlock(&inode->i_mutex);
 out:
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
 
@@ -2040,7 +1888,7 @@ out_dio:
 		 */
 		if (old_size != i_size_read(inode) ||
 		    old_clusters != OCFS2_I(inode)->ip_clusters) {
-			ret = journal_force_commit(osb->journal->j_journal);
+			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
@@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 	.fallocate	= ocfs2_fallocate,
+	.fiemap		= ocfs2_fiemap,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
@@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
 	.permission	= ocfs2_permission,
 };
 
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
 const struct file_operations ocfs2_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.lock		= ocfs2_lock,
 	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
@@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.lock		= ocfs2_lock,
+	.flock		= ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.flock		= ocfs2_flock,
+	.splice_read	= ocfs2_file_splice_read,
+	.splice_write	= ocfs2_file_splice_write,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 	.flock		= ocfs2_flock,
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 048ddcaf5c80..e92382cbca5f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
 
 extern const struct file_operations ocfs2_fops;
 extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
 extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
 
 struct ocfs2_file_private {
 	struct file		*fp_file;
@@ -38,32 +41,22 @@ struct ocfs2_file_private {
 	struct ocfs2_lock_res	fp_flock;
 };
 
-enum ocfs2_alloc_restarted {
-	RESTART_NONE = 0,
-	RESTART_TRANS,
-	RESTART_META
-};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask,
-		     struct nameidata *nd);
+int ocfs2_permission(struct inode *inode, int mask);
 
 int ocfs2_should_update_atime(struct inode *inode,
 			      struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec7..4903688f72a9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	struct super_block *sb;
 	struct ocfs2_super *osb;
 	int status = -EINVAL;
+	int use_plocks = 1;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
 		   (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+		use_plocks = 0;
+
 	/* this means that read_inode cannot create a superblock inode
 	 * today.  change if needed. */
 	if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	switch (inode->i_mode & S_IFMT) {
 	    case S_IFREG:
-		    inode->i_fop = &ocfs2_fops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_fops;
+		    else
+			    inode->i_fop = &ocfs2_fops_no_plocks;
 		    inode->i_op = &ocfs2_file_iops;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFDIR:
 		    inode->i_op = &ocfs2_dir_iops;
-		    inode->i_fop = &ocfs2_dops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_dops;
+		    else
+			    inode->i_fop = &ocfs2_dops_no_plocks;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
-	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
-				  can_lock ? inode : NULL);
+	if (can_lock)
+		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
+	else
+		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	 * data and fast symlinks.
 	 */
 	if (fe->i_clusters) {
+		if (ocfs2_should_order_data(inode))
+			ocfs2_begin_ordered_truncate(inode, 0);
+
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/*Free extended attribute resources associated with this inode.*/
+	status = ocfs2_xattr_remove(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+				       &oi->ip_jinode);
 
 bail:
 	mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
 }
 
 /*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
-				int block, int *err, int reada)
-{
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = OCFS2_BH_CACHED;
-
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
-
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
-
-	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
-				  readflags, inode);
-	if (tmperr < 0)
-		goto fail;
-
-	tmperr = 0;
-
-	*err = 0;
-	return bh;
-
-fail:
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
-	*err = -EIO;
-	return NULL;
-}
-
-/*
  * This is called from our getattr.
  */
 int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa0..2f37af9bcc4a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
 	/* protects allocation changes on this inode. */
 	struct rw_semaphore		ip_alloc_sem;
 
+	/* protects extended attribute changes on this inode */
+	struct rw_semaphore		ip_xattr_sem;
+
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
 	struct ocfs2_extent_map		ip_extent_map;
 
 	struct inode			vfs_inode;
+	struct jbd2_inode		ip_jinode;
 };
 
 /*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
 
 extern const struct address_space_operations ocfs2_aops;
 
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
-				int *err, int reada);
 void ocfs2_clear_inode(struct inode *inode);
 void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce995..9fcd36dcc9a0 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
 bail:
 	mutex_unlock(&inode->i_mutex);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a8c19cb3cfdd..81e40677eecb 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-				      int dirty);
+				      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 				 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 		goto finally;
 	}
 
-	journal_lock_updates(journal->j_journal);
-	status = journal_flush(journal->j_journal);
-	journal_unlock_updates(journal->j_journal);
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		up_write(&journal->j_trans_barrier);
 		mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 
 	down_read(&osb->journal->j_trans_barrier);
 
-	handle = journal_start(journal, max_buffs);
+	handle = jbd2_journal_start(journal, max_buffs);
 	if (IS_ERR(handle)) {
 		up_read(&osb->journal->j_trans_barrier);
 
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 
 	BUG_ON(!handle);
 
-	ret = journal_stop(handle);
+	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  * transaction. extend_trans will either extend the current handle by
  * nblocks, or commit it and start a new one with nblocks credits.
  *
- * This might call journal_restart() which will commit dirty buffers
+ * This might call jbd2_journal_restart() which will commit dirty buffers
  * and then restart the transaction. Before calling
  * ocfs2_extend_trans(), any changed blocks should have been
  * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
 #else
-	status = journal_extend(handle, nblocks);
+	status = jbd2_journal_extend(handle, nblocks);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #endif
 
 	if (status > 0) {
-		mlog(0, "journal_extend failed, trying journal_restart\n");
-		status = journal_restart(handle, nblocks);
+		mlog(0,
+		     "jbd2_journal_extend failed, trying "
+		     "jbd2_journal_restart\n");
+		status = jbd2_journal_restart(handle, nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
-		status = journal_get_write_access(handle, bh);
+		status = jbd2_journal_get_write_access(handle, bh);
 		break;
 
 	case OCFS2_JOURNAL_ACCESS_UNDO:
-		status = journal_get_undo_access(handle, bh);
+		status = jbd2_journal_get_undo_access(handle, bh);
 		break;
 
 	default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	mlog_entry("(bh->b_blocknr=%llu)\n",
 		   (unsigned long long)bh->b_blocknr);
 
-	status = journal_dirty_metadata(handle, bh);
+	status = jbd2_journal_dirty_metadata(handle, bh);
 	if (status < 0)
 		mlog(ML_ERROR, "Could not dirty metadata buffer. "
 		     "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 }
 
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int ocfs2_journal_dirty_data(handle_t *handle,
 			     struct buffer_head *bh)
 {
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
 
 	return err;
 }
+#endif
 
-#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	spin_lock(&journal->j_state_lock);
 	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-		journal->j_flags |= JFS_BARRIER;
+		journal->j_flags |= JBD2_BARRIER;
 	else
-		journal->j_flags &= ~JFS_BARRIER;
+		journal->j_flags &= ~JBD2_BARRIER;
 	spin_unlock(&journal->j_state_lock);
 }
 
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
 
 	/* call the kernels journal init function now */
-	j_journal = journal_init_inode(inode);
+	j_journal = jbd2_journal_init_inode(inode);
 	if (j_journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EINVAL;
 		goto done;
 	}
 
-	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "Returned from jbd2_journal_init_inode\n");
 	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
 	if (status < 0) {
 		if (inode_lock)
 			ocfs2_inode_unlock(inode, 1);
-		if (bh != NULL)
-			brelse(bh);
+		brelse(bh);
 		if (inode) {
 			OCFS2_I(inode)->ip_open_count--;
 			iput(inode);
@@ -562,8 +565,18 @@ done:
 	return status;
 }
 
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+	le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+	return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
-				      int dirty)
+				      int dirty, int replayed)
 {
 	int status;
 	unsigned int flags;
@@ -593,6 +606,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 		flags &= ~OCFS2_JOURNAL_DIRTY_FL;
 	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
 
+	if (replayed)
+		ocfs2_bump_recovery_generation(fe);
+
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -626,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	if (journal->j_state != OCFS2_JOURNAL_LOADED)
 		goto done;
 
-	/* need to inc inode use count as journal_destroy will iput. */
+	/* need to inc inode use count - jbd2_journal_destroy will iput. */
 	if (!igrab(inode))
 		BUG();
 
@@ -655,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
 
 	if (ocfs2_mount_local(osb)) {
-		journal_lock_updates(journal->j_journal);
-		status = journal_flush(journal->j_journal);
-		journal_unlock_updates(journal->j_journal);
+		jbd2_journal_lock_updates(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal);
+		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -667,13 +683,13 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 		 * Do not toggle if flush was unsuccessful otherwise
 		 * will leave dirty metadata in a "clean" journal
 		 */
-		status = ocfs2_journal_toggle_dirty(osb, 0);
+		status = ocfs2_journal_toggle_dirty(osb, 0, 0);
 		if (status < 0)
 			mlog_errno(status);
 	}
 
 	/* Shutdown the kernel journal system */
-	journal_destroy(journal->j_journal);
+	jbd2_journal_destroy(journal->j_journal);
 
 	OCFS2_I(inode)->ip_open_count--;
 
@@ -698,19 +714,19 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
 {
 	int olderr;
 
-	olderr = journal_errno(journal);
+	olderr = jbd2_journal_errno(journal);
 	if (olderr) {
 		mlog(ML_ERROR, "File system error %d recorded in "
 		     "journal %u.\n", olderr, slot);
 		mlog(ML_ERROR, "File system on device %s needs checking.\n",
 		     sb->s_id);
 
-		journal_ack_err(journal);
-		journal_clear_err(journal);
+		jbd2_journal_ack_err(journal);
+		jbd2_journal_clear_err(journal);
 	}
 }
 
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 {
 	int status = 0;
 	struct ocfs2_super *osb;
@@ -721,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 
 	osb = journal->j_osb;
 
-	status = journal_load(journal->j_journal);
+	status = jbd2_journal_load(journal->j_journal);
 	if (status < 0) {
 		mlog(ML_ERROR, "Failed to load journal!\n");
 		goto done;
@@ -729,7 +745,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
 
 	ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
 
-	status = ocfs2_journal_toggle_dirty(osb, 1);
+	status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
 	if (status < 0) {
 		mlog_errno(status);
 		goto done;
@@ -765,13 +781,13 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 
 	BUG_ON(!journal);
 
-	status = journal_wipe(journal->j_journal, full);
+	status = jbd2_journal_wipe(journal->j_journal, full);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+	status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -834,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 		/* We are reading journal data which should not
 		 * be put in the uptodate cache */
-		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
-					   p_blkno, p_blocks, bhs, 0,
-					   NULL);
+		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+						p_blkno, p_blocks, bhs);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -852,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 bail:
 	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
-		if (bhs[i])
-			brelse(bhs[i]);
+		brelse(bhs[i]);
 	mlog_exit(status);
 	return status;
 }
@@ -1034,6 +1048,12 @@ restart:
 	spin_unlock(&osb->osb_lock);
 	mlog(0, "All nodes recovered\n");
 
+	/* Refresh all journal recovery generations from disk */
+	status = ocfs2_check_journals_nolocks(osb);
+	status = (status == -EROFS) ? 0 : status;
+	if (status < 0)
+		mlog_errno(status);
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1116,43 @@ out:
 	mlog_exit_void();
 }
 
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+				    int slot_num,
+				    struct buffer_head **bh,
+				    struct inode **ret_inode)
+{
+	int status = -EACCES;
+	struct inode *inode = NULL;
+
+	BUG_ON(slot_num >= osb->max_slots);
+
+	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+					    slot_num);
+	if (!inode || is_bad_inode(inode)) {
+		mlog_errno(status);
+		goto bail;
+	}
+	SET_INODE_JOURNAL(inode);
+
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+				   OCFS2_BH_IGNORE_CACHE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+
+bail:
+	if (inode) {
+		if (status || !ret_inode)
+			iput(inode);
+		else
+			*ret_inode = inode;
+	}
+	return status;
+}
+
 /* Does the actual journal replay and marks the journal inode as
  * clean. Will only replay if the journal inode is marked dirty. */
 static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1166,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	struct ocfs2_dinode *fe;
 	journal_t *journal = NULL;
 	struct buffer_head *bh = NULL;
+	u32 slot_reco_gen;
 
-	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
-					    slot_num);
-	if (inode == NULL) {
-		status = -EACCES;
+	status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+	if (status) {
 		mlog_errno(status);
 		goto done;
 	}
-	if (is_bad_inode(inode)) {
-		status = -EACCES;
-		iput(inode);
-		inode = NULL;
-		mlog_errno(status);
+
+	fe = (struct ocfs2_dinode *)bh->b_data;
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
+	brelse(bh);
+	bh = NULL;
+
+	/*
+	 * As the fs recovery is asynchronous, there is a small chance that
+	 * another node mounted (and recovered) the slot before the recovery
+	 * thread could get the lock. To handle that, we dirty read the journal
+	 * inode for that slot to get the recovery generation. If it is
+	 * different than what we expected, the slot has been recovered.
+	 * If not, it needs recovery.
+	 */
+	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+		mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+		status = -EBUSY;
 		goto done;
 	}
-	SET_INODE_JOURNAL(inode);
+
+	/* Continue with recovery as the journal has not yet been recovered */
 
 	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
@@ -1138,9 +1209,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	fe = (struct ocfs2_dinode *) bh->b_data;
 
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+	slot_reco_gen = ocfs2_get_recovery_generation(fe);
 
 	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
 		mlog(0, "No recovery required for node %d\n", node_num);
+		/* Refresh recovery generation for the slot */
+		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		goto done;
 	}
 
@@ -1157,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	}
 
 	mlog(0, "calling journal_init_inode\n");
-	journal = journal_init_inode(inode);
+	journal = jbd2_journal_init_inode(inode);
 	if (journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EIO;
 		goto done;
 	}
 
-	status = journal_load(journal);
+	status = jbd2_journal_load(journal);
 	if (status < 0) {
 		mlog_errno(status);
 		if (!igrab(inode))
 			BUG();
-		journal_destroy(journal);
+		jbd2_journal_destroy(journal);
 		goto done;
 	}
 
@@ -1177,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	/* wipe the journal */
 	mlog(0, "flushing the journal.\n");
-	journal_lock_updates(journal);
-	status = journal_flush(journal);
-	journal_unlock_updates(journal);
+	jbd2_journal_lock_updates(journal);
+	status = jbd2_journal_flush(journal);
+	jbd2_journal_unlock_updates(journal);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1188,6 +1262,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	flags &= ~OCFS2_JOURNAL_DIRTY_FL;
 	fe->id1.journal1.ij_flags = cpu_to_le32(flags);
 
+	/* Increment recovery generation to indicate successful recovery */
+	ocfs2_bump_recovery_generation(fe);
+	osb->slot_recovery_generations[slot_num] =
+					ocfs2_get_recovery_generation(fe);
+
 	status = ocfs2_write_block(osb, bh, inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -1195,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	if (!igrab(inode))
 		BUG();
 
-	journal_destroy(journal);
+	jbd2_journal_destroy(journal);
 
 done:
 	/* drop the lock on this nodes journal */
@@ -1205,8 +1284,7 @@ done:
 	if (inode)
 		iput(inode);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1252,6 +1330,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
+		if (status == -EBUSY) {
+			mlog(0, "Skipping recovery for slot %u (node %u) "
+			     "as another node has recovered it\n", slot_num,
+			     node_num);
+			status = 0;
+			goto done;
+		}
 		mlog_errno(status);
 		goto done;
 	}
@@ -1334,21 +1419,46 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 {
 	unsigned int node_num;
 	int status, i;
+	u32 gen;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dinode *di;
 
 	/* This is called with the super block cluster lock, so we
 	 * know that the slot map can't change underneath us. */
 
-	spin_lock(&osb->osb_lock);
 	for (i = 0; i < osb->max_slots; i++) {
-		if (i == osb->slot_num)
+		/* Read journal inode to get the recovery generation */
+		status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+		di = (struct ocfs2_dinode *)bh->b_data;
+		gen = ocfs2_get_recovery_generation(di);
+		brelse(bh);
+		bh = NULL;
+
+		spin_lock(&osb->osb_lock);
+		osb->slot_recovery_generations[i] = gen;
+
+		mlog(0, "Slot %u recovery generation is %u\n", i,
+		     osb->slot_recovery_generations[i]);
+
+		if (i == osb->slot_num) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 
 		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
-		if (status == -ENOENT)
+		if (status == -ENOENT) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 
-		if (__ocfs2_recovery_map_test(osb, node_num))
+		if (__ocfs2_recovery_map_test(osb, node_num)) {
+			spin_unlock(&osb->osb_lock);
 			continue;
+		}
 		spin_unlock(&osb->osb_lock);
 
 		/* Ok, we have a slot occupied by another node which
@@ -1364,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 			mlog_errno(status);
 			goto bail;
 		}
-
-		spin_lock(&osb->osb_lock);
 	}
-	spin_unlock(&osb->osb_lock);
 
 	status = 0;
 bail:
@@ -1603,49 +1710,41 @@ static int ocfs2_commit_thread(void *arg)
 	return 0;
 }
 
-/* Look for a dirty journal without taking any cluster locks. Used for
- * hard readonly access to determine whether the file system journals
- * require recovery. */
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
 int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
 {
 	int ret = 0;
 	unsigned int slot;
-	struct buffer_head *di_bh;
+	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	struct inode *journal = NULL;
+	int journal_dirty = 0;
 
 	for(slot = 0; slot < osb->max_slots; slot++) {
-		journal = ocfs2_get_system_file_inode(osb,
-						      JOURNAL_SYSTEM_INODE,
-						      slot);
-		if (!journal || is_bad_inode(journal)) {
-			ret = -EACCES;
-			mlog_errno(ret);
-			goto out;
-		}
-
-		di_bh = NULL;
-		ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
-				       0, journal);
-		if (ret < 0) {
+		ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
 		di = (struct ocfs2_dinode *) di_bh->b_data;
 
+		osb->slot_recovery_generations[slot] =
+					ocfs2_get_recovery_generation(di);
+
 		if (le32_to_cpu(di->id1.journal1.ij_flags) &
 		    OCFS2_JOURNAL_DIRTY_FL)
-			ret = -EROFS;
+			journal_dirty = 1;
 
 		brelse(di_bh);
-		if (ret)
-			break;
+		di_bh = NULL;
 	}
 
 out:
-	if (journal)
-		iput(journal);
-
+	if (journal_dirty)
+		ret = -EROFS;
 	return ret;
 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532ed..d4d14e9a3cea 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
 #define OCFS2_JOURNAL_H
 
 #include <linux/fs.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
@@ -161,7 +166,8 @@ int    ocfs2_journal_init(struct ocfs2_journal *journal,
 void   ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int    ocfs2_journal_wipe(struct ocfs2_journal *journal,
 			  int full);
-int    ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int    ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+			  int replayed);
 int    ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
 void   ocfs2_recovery_thread(struct ocfs2_super *osb,
 			     int node_num);
@@ -214,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- *                             the current handle commits.
+ *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
+ *                           the current handle commits.
  */
 
 /* You must always start_trans with a number of buffs > 0, but it's
@@ -267,8 +273,10 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int                  ocfs2_journal_dirty_data(handle_t *handle,
 					      struct buffer_head *bh);
+#endif
 
 /*
  *  Credit Macros:
@@ -282,6 +290,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
 
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
@@ -339,11 +350,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
 			     + OCFS2_UNLINK_CREDITS)
 
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+					  + OCFS2_INODE_UPDATE_CREDITS \
+					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_dinode *fe,
+					    struct ocfs2_extent_list *root_el,
 					    u32 bits_wanted)
 {
-	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
 	/* bitmap dinode, group desc. + relinked group. */
 	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -354,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * however many metadata chunks needed * a remaining suballoc
 	 * alloc. */
 	sysfile_bitmap_blocks = 1 +
-		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
 
 	/* this does not include *new* metadata blocks, which are
-	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * accounted for in sysfile_bitmap_blocks. root_el +
 	 * prev. last_eb_blk + blocks along edge of tree.
 	 * calc_symlink_credits passes because we just need 1
 	 * credit for the dinode there. */
-	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -414,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	return credits;
 }
 
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+					       loff_t new_size)
+{
+	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+						   new_size);
+}
+
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec88..687b28713c32 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -47,8 +48,6 @@
 
 #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+#ifdef CONFIG_OCFS2_FS_STATS
+
+static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
+#define LA_DEBUG_VER	1
+static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
+				   size_t count, loff_t *ppos)
+{
+	static DEFINE_MUTEX(la_debug_mutex);
+	struct ocfs2_super *osb = file->private_data;
+	int written, ret;
+	char *buf = osb->local_alloc_debug_buf;
+
+	mutex_lock(&la_debug_mutex);
+	memset(buf, 0, LA_DEBUG_BUF_SZ);
+
+	written = snprintf(buf, LA_DEBUG_BUF_SZ,
+			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
+			   LA_DEBUG_VER,
+			   (unsigned long long)osb->la_last_gd,
+			   osb->local_alloc_default_bits,
+			   osb->local_alloc_bits, osb->local_alloc_state);
+
+	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
+
+	mutex_unlock(&la_debug_mutex);
+	return ret;
+}
+
+static const struct file_operations ocfs2_la_debug_fops = {
+	.open =		ocfs2_la_debug_open,
+	.read =		ocfs2_la_debug_read,
+};
+
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
+	if (!osb->local_alloc_debug_buf)
+		return;
+
+	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
+						     S_IFREG|S_IRUSR,
+						     osb->osb_debug_root,
+						     osb,
+						     &ocfs2_la_debug_fops);
+	if (!osb->local_alloc_debug) {
+		kfree(osb->local_alloc_debug_buf);
+		osb->local_alloc_debug_buf = NULL;
+	}
+}
+
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	if (osb->local_alloc_debug)
+		debugfs_remove(osb->local_alloc_debug);
+
+	if (osb->local_alloc_debug_buf)
+		kfree(osb->local_alloc_debug_buf);
+
+	osb->local_alloc_debug_buf = NULL;
+	osb->local_alloc_debug = NULL;
+}
+#else	/* CONFIG_OCFS2_FS_STATS */
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+#endif
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
-	BUG_ON(osb->s_clustersize_bits > 20);
+	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+		osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
 
-	/* Size local alloc windows by the megabyte */
-	return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters)
+{
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
+		if (num_clusters >= osb->local_alloc_default_bits) {
+			cancel_delayed_work(&osb->la_enable_wq);
+			osb->local_alloc_state = OCFS2_LA_ENABLED;
+		}
+	spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+	struct ocfs2_super *osb =
+		container_of(work, struct ocfs2_super,
+			     la_enable_wq.work);
+	spin_lock(&osb->osb_lock);
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+	spin_unlock(&osb->osb_lock);
 }
 
 /*
  * Tell us whether a given allocation should use the local alloc
  * file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
  */
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
-	int la_bits = ocfs2_local_alloc_window_bits(osb);
 	int ret = 0;
+	int la_bits;
+
+	spin_lock(&osb->osb_lock);
+	la_bits = osb->local_alloc_bits;
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+	if (!ocfs2_la_state_enabled(osb))
 		goto bail;
 
 	/* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 bail:
 	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
 	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	spin_unlock(&osb->osb_lock);
 	return ret;
 }
 
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (osb->local_alloc_size == 0)
+	ocfs2_init_la_debug(osb);
+
+	if (osb->local_alloc_bits == 0)
 		goto bail;
 
-	if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+	if (osb->local_alloc_bits >= osb->bitmap_cpg) {
 		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
 		     "than max possible %u. Using defaults.\n",
-		     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
-		osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+		osb->local_alloc_bits =
+			ocfs2_megabytes_to_clusters(osb->sb,
+						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
 	}
 
 	/* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 bail:
 	if (status < 0)
-		if (alloc_bh)
-			brelse(alloc_bh);
+		brelse(alloc_bh);
 	if (inode)
 		iput(inode);
 
-	mlog(0, "Local alloc window bits = %d\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	if (status < 0)
+		ocfs2_shutdown_la_debug(osb);
+
+	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
 	return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
+	cancel_delayed_work(&osb->la_enable_wq);
+	flush_workqueue(ocfs2_wq);
+
+	ocfs2_shutdown_la_debug(osb);
+
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
@@ -295,8 +410,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -372,8 +486,7 @@ bail:
 		*alloc_copy = NULL;
 	}
 
-	if (alloc_bh)
-		brelse(alloc_bh);
+	brelse(alloc_bh);
 
 	if (inode) {
 		mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
 out_mutex:
 	mutex_unlock(&main_bm_inode->i_mutex);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	iput(main_bm_inode);
 
@@ -453,8 +565,48 @@ out:
 	return status;
 }
 
+/* Check to see if the local alloc window is within ac->ac_max_block */
+static int ocfs2_local_alloc_in_range(struct inode *inode,
+				      struct ocfs2_alloc_context *ac,
+				      u32 bits_wanted)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+	int start;
+	u64 block_off;
+
+	if (!ac->ac_max_block)
+		return 1;
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		mlog_errno(-ENOSPC);
+		return 0;
+	}
+
+	/*
+	 * Converting (bm_off + start + bits_wanted) to blocks gives us
+	 * the blkno just past our actual allocation.  This is perfect
+	 * to compare with ac_max_block.
+	 */
+	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
+					     le32_to_cpu(la->la_bm_off) +
+					     start + bits_wanted);
+	mlog(0, "Checking %llu against %llu\n",
+	     (unsigned long long)block_off,
+	     (unsigned long long)ac->ac_max_block);
+	if (block_off > ac->ac_max_block)
+		return 0;
+
+	return 1;
+}
+
 /*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
  *
  * We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	mutex_lock(&local_alloc_inode->i_mutex);
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
-		status = -ENOSPC;
-		goto bail;
-	}
-
-	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
-		mlog(0, "Asking for more than my max window size!\n");
+	/*
+	 * We must double check state and allocator bits because
+	 * another process may have changed them while holding i_mutex.
+	 */
+	spin_lock(&osb->osb_lock);
+	if (!ocfs2_la_state_enabled(osb) ||
+	    (bits_wanted > osb->local_alloc_bits)) {
+		spin_unlock(&osb->osb_lock);
 		status = -ENOSPC;
 		goto bail;
 	}
+	spin_unlock(&osb->osb_lock);
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 				mlog_errno(status);
 			goto bail;
 		}
+
+		/*
+		 * Under certain conditions, the window slide code
+		 * might have reduced the number of bits available or
+		 * disabled the the local alloc entirely. Re-check
+		 * here and return -ENOSPC if necessary.
+		 */
+		status = -ENOSPC;
+		if (!ocfs2_la_state_enabled(osb))
+			goto bail;
+
+		free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+			le32_to_cpu(alloc->id1.bitmap1.i_used);
+		if (bits_wanted > free_bits)
+			goto bail;
+	}
+
+	if (ac->ac_max_block)
+		mlog(0, "Calling in_range for max block %llu\n",
+		     (unsigned long long)ac->ac_max_block);
+
+	if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
+					bits_wanted)) {
+		/*
+		 * The window is outside ac->ac_max_block.
+		 * This errno tells the caller to keep localalloc enabled
+		 * but to get the allocation from the main bitmap.
+		 */
+		status = -EFBIG;
+		goto bail;
 	}
 
 	ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
 	return status;
 }
 
+enum ocfs2_la_event {
+	OCFS2_LA_EVENT_SLIDE,		/* Normal window slide. */
+	OCFS2_LA_EVENT_FRAGMENTED,	/* The global bitmap has
+					 * enough bits theoretically
+					 * free, but a contiguous
+					 * allocation could not be
+					 * found. */
+	OCFS2_LA_EVENT_ENOSPC,		/* Global bitmap doesn't have
+					 * enough bits free to satisfy
+					 * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+				  enum ocfs2_la_event event)
+{
+	unsigned int bits;
+	int state;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+		WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+		goto out_unlock;
+	}
+
+	/*
+	 * ENOSPC and fragmentation are treated similarly for now.
+	 */
+	if (event == OCFS2_LA_EVENT_ENOSPC ||
+	    event == OCFS2_LA_EVENT_FRAGMENTED) {
+		/*
+		 * We ran out of contiguous space in the primary
+		 * bitmap. Drastically reduce the number of bits used
+		 * by local alloc until we have to disable it.
+		 */
+		bits = osb->local_alloc_bits >> 1;
+		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+			/*
+			 * By setting state to THROTTLED, we'll keep
+			 * the number of local alloc bits used down
+			 * until an event occurs which would give us
+			 * reason to assume the bitmap situation might
+			 * have changed.
+			 */
+			osb->local_alloc_state = OCFS2_LA_THROTTLED;
+			osb->local_alloc_bits = bits;
+		} else {
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+				   OCFS2_LA_ENABLE_INTERVAL);
+		goto out_unlock;
+	}
+
+	/*
+	 * Don't increase the size of the local alloc window until we
+	 * know we might be able to fulfill the request. Otherwise, we
+	 * risk bouncing around the global bitmap during periods of
+	 * low space.
+	 */
+	if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+		osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+	state = osb->local_alloc_state;
+	spin_unlock(&osb->osb_lock);
+
+	return state;
+}
+
 static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 						struct ocfs2_alloc_context **ac,
 						struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+retry_enospc:
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
 
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status == -ENOSPC) {
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		ocfs2_free_ac_resource(*ac);
+		memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+		goto retry_enospc;
+	}
 	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+		mlog_errno(status);
 		goto bail;
 	}
 
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 		     "one\n");
 
 	mlog(0, "Allocating %u clusters for a new window.\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	     osb->local_alloc_bits);
 
 	/* Instruct the allocation code to try the most recently used
 	 * cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac,
-				      ocfs2_local_alloc_window_bits(osb),
+	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
+	if (status == -ENOSPC) {
+retry_enospc:
+		/*
+		 * Note: We could also try syncing the journal here to
+		 * allow use of any free bits which the current
+		 * transaction can't give us access to. --Mark
+		 */
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		status = ocfs2_claim_clusters(osb, handle, ac,
+					      osb->local_alloc_bits,
+					      &cluster_off,
+					      &cluster_count);
+		if (status == -ENOSPC)
+			goto retry_enospc;
+		/*
+		 * We only shrunk the *minimum* number of in our
+		 * request - it's entirely possible that the allocator
+		 * might give us more than we asked for.
+		 */
+		if (status == 0) {
+			spin_lock(&osb->osb_lock);
+			osb->local_alloc_bits = cluster_count;
+			spin_unlock(&osb->osb_lock);
+		}
+	}
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
 	/* This will lock the main bitmap for us. */
 	status = ocfs2_local_alloc_reserve_for_window(osb,
 						      &ac,
@@ -976,8 +1277,7 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	if (main_bm_inode)
 		iput(main_bm_inode);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110c..ac5ea9f86653 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
 #endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f87143877..544ac6245175 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/fcntl.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
 #include "locks.h"
 
 static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	else
 		return ocfs2_do_flock(file, inode, cmd, fl);
 }
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(inode))
+		return -ENOLCK;
+
+	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324ec..496d488b271f 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
 #define OCFS2_LOCKS_H
 
 int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
 
 #endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe0140..485a6aa0ad39 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -327,14 +328,9 @@ leave:
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-
-	if (de_bh)
-		brelse(de_bh);
-
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(new_fe_bh);
+	brelse(de_bh);
+	brelse(parent_fe_bh);
 
 	if ((status < 0) && inode)
 		iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	if (de_bh)
-		brelse(de_bh);
-	if (fe_bh)
-		brelse(fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(de_bh);
+	brelse(fe_bh);
+	brelse(parent_fe_bh);
 
 	mlog_exit(err);
 
@@ -851,17 +844,10 @@ leave:
 		iput(orphan_dir);
 	}
 
-	if (fe_bh)
-		brelse(fe_bh);
-
-	if (dirent_bh)
-		brelse(dirent_bh);
-
-	if (parent_node_bh)
-		brelse(parent_node_bh);
-
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
+	brelse(fe_bh);
+	brelse(dirent_bh);
+	brelse(parent_node_bh);
+	brelse(orphan_entry_bh);
 
 	mlog_exit(status);
 
@@ -1372,24 +1358,15 @@ bail:
 
 	if (new_inode)
 		iput(new_inode);
-	if (newfe_bh)
-		brelse(newfe_bh);
-	if (old_inode_bh)
-		brelse(old_inode_bh);
-	if (old_dir_bh)
-		brelse(old_dir_bh);
-	if (new_dir_bh)
-		brelse(new_dir_bh);
-	if (new_de_bh)
-		brelse(new_de_bh);
-	if (old_de_bh)
-		brelse(old_de_bh);
-	if (old_inode_de_bh)
-		brelse(old_inode_de_bh);
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
-	if (insert_entry_bh)
-		brelse(insert_entry_bh);
+	brelse(newfe_bh);
+	brelse(old_inode_bh);
+	brelse(old_dir_bh);
+	brelse(new_dir_bh);
+	brelse(new_de_bh);
+	brelse(old_de_bh);
+	brelse(old_inode_de_bh);
+	brelse(orphan_entry_bh);
+	brelse(insert_entry_bh);
 
 	mlog_exit(status);
 
@@ -1492,8 +1469,7 @@ bail:
 
 	if (bhs) {
 		for(i = 0; i < blocks; i++)
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 		kfree(bhs);
 	}
 
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
-						    new_fe_bh,
-						    handle, data_ac, NULL,
-						    NULL);
+		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+					      new_fe_bh,
+					      handle, data_ac, NULL,
+					      NULL);
 		if (status < 0) {
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
 
 	ocfs2_inode_unlock(dir, 1);
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
-	if (de_bh)
-		brelse(de_bh);
+	brelse(new_fe_bh);
+	brelse(parent_fe_bh);
+	brelse(de_bh);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
 		iput(orphan_dir_inode);
 	}
 
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	status = ocfs2_read_block(osb,
+	status = ocfs2_read_block(orphan_dir_inode,
 				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh, OCFS2_BH_CACHED,
-				  orphan_dir_inode);
+				  &orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
 leave:
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	}
 
 leave:
-	if (target_de_bh)
-		brelse(target_de_bh);
+	brelse(target_de_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1cb814be8ef1..a21a465490c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
 #include <linux/workqueue.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 /* For union ocfs2_dlm_lksb */
 #include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
 
 enum ocfs2_local_alloc_state
 {
-	OCFS2_LA_UNUSED = 0,
-	OCFS2_LA_ENABLED,
-	OCFS2_LA_DISABLED
+	OCFS2_LA_UNUSED = 0,	/* Local alloc will never be used for
+				 * this mountpoint. */
+	OCFS2_LA_ENABLED,	/* Local alloc is in use. */
+	OCFS2_LA_THROTTLED,	/* Local alloc is in use, but number
+				 * of bits has been reduced. */
+	OCFS2_LA_DISABLED	/* Local alloc has temporarily been
+				 * disabled. */
 };
 
 enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
@@ -204,6 +215,8 @@ struct ocfs2_super
 
 	struct ocfs2_slot_info *slot_info;
 
+	u32 *slot_recovery_generations;
+
 	spinlock_t node_map_lock;
 
 	u64 root_blkno;
@@ -212,6 +225,7 @@ struct ocfs2_super
 	u32 bitmap_cpg;
 	u8 *uuid;
 	char *uuid_str;
+	u32 uuid_hash;
 	u8 *vol_label;
 	u64 first_cluster_group_blkno;
 	u32 fs_generation;
@@ -239,6 +253,7 @@ struct ocfs2_super
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
+	unsigned int s_xattr_inline_size;
 
 	atomic_t vol_state;
 	struct mutex recovery_lock;
@@ -250,11 +265,27 @@ struct ocfs2_super
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
-	int local_alloc_size;
-	enum ocfs2_local_alloc_state local_alloc_state;
+	struct delayed_work		la_enable_wq;
+
+	/*
+	 * Must hold local alloc i_mutex and osb->osb_lock to change
+	 * local_alloc_bits. Reads can be done under either lock.
+	 */
+	unsigned int local_alloc_bits;
+	unsigned int local_alloc_default_bits;
+
+	enum ocfs2_local_alloc_state local_alloc_state; /* protected
+							 * by osb_lock */
+
 	struct buffer_head *local_alloc_bh;
+
 	u64 la_last_gd;
 
+#ifdef CONFIG_OCFS2_FS_STATS
+	struct dentry *local_alloc_debug;
+	char *local_alloc_debug_buf;
+#endif
+
 	/* Next two fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
@@ -338,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -552,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 	return pages_per_cluster;
 }
 
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+						       unsigned int megs)
+{
+	BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
 static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 {
 	spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3f1945177629..f24ce3d3f956 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
 #define OCFS2_INODE_SIGNATURE		"INODE01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -90,7 +91,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
-					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
+					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+					 | OCFS2_FEATURE_INCOMPAT_XATTR)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -127,10 +129,6 @@
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
 
-/* Support for the extended slot map */
-#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
-
-
 /*
  * Support for alternate, userspace cluster stacks.  If set, the superblock
  * field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK	0x0080
 
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
  */
 #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE	8
 
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE     256
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
 /*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
 					   before tunefs required */
 	__le16 s_tunefs_flag;
-	__le32 s_reserved1;
+	__le32 s_uuid_hash;		/* hash value of uuid */
 	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
 /*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
 						     stack.  Only valid
 						     with INCOMPAT flag. */
-/*B8*/  __le64 s_reserved2[17];		/* Fill out superblock */
+/*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
+					   for this fs*/
+	__le16 s_reserved0;
+	__le32 s_reserved1;
+/*C0*/  __le64 s_reserved2[16];		/* Fill out superblock */
 /*140*/
 
 	/*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
 					   belongs to */
 	__le16 i_suballoc_bit;		/* Bit offset in suballocator
 					   block group */
-/*10*/	__le32 i_reserved0;
+/*10*/	__le16 i_reserved0;
+	__le16 i_xattr_inline_size;
 	__le32 i_clusters;		/* Cluster count */
 	__le32 i_uid;			/* Owner UID */
 	__le32 i_gid;			/* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
 	__le32 i_atime_nsec;
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
-	__le32 i_attr;
+/*70*/	__le32 i_attr;
 	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
 					   was set in i_flags */
 	__le16 i_dyn_features;
-/*70*/	__le64 i_reserved2[8];
+	__le64 i_xattr_loc;
+/*80*/	__le64 i_reserved2[7];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -660,7 +676,10 @@ struct ocfs2_dinode {
 		struct {		/* Info for journal system
 					   inodes */
 			__le32 ij_flags;	/* Mounted, version, etc. */
-			__le32 ij_pad;
+			__le32 ij_recovery_generation; /* Incremented when the
+							  journal is recovered
+							  after an unclean
+							  shutdown */
 		} journal1;
 	} id1;				/* Inode type dependant 1 */
 /*C0*/	union {
@@ -712,6 +731,136 @@ struct ocfs2_group_desc
 /*40*/	__u8    bg_bitmap[0];
 };
 
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
+	__le16	xe_name_offset;  /* byte offset from the 1st etnry in the local
+				    local xattr storage(inode, xattr block or
+				    xattr bucket). */
+	__u8	xe_name_len;	 /* xattr name len, does't include prefix. */
+	__u8	xe_type;         /* the low 7 bits indicates the name prefix's
+				  * type and the highest 1 bits indicate whether
+				  * the EA is stored in the local storage. */
+	__le64	xe_value_size;	 /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+	__le16	xh_count;                       /* contains the count of how
+						   many records are in the
+						   local xattr storage. */
+	__le16	xh_free_start;                  /* current offset for storing
+						   xattr. */
+	__le16	xh_name_value_len;              /* total length of name/value
+						   length in this bucket. */
+	__le16	xh_num_buckets;                 /* bucket nums in one extent
+						   record, only valid in the
+						   first bucket. */
+	__le64  xh_csum;
+	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * It is used when one extended attribute's size is larger, and we will save it
+ * in an outside cluster. It will stored in a b-tree like file content.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
+	__le32	xr_reserved0;
+	__le64	xr_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/	__le32	xt_clusters;              /* clusters covered by xattr. */
+	__le32	xt_reserved0;
+	__le64	xt_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED	0x1
+#define OCFS2_HASH_SHIFT	5
+#define OCFS2_XATTR_ROUND	3
+#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
+				~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE			4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET 	(OCFS2_XATTR_BUCKET_SIZE \
+						 / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/	__u8	xb_signature[8];     /* Signature for verification */
+	__le16	xb_suballoc_slot;    /* Slot suballocator this
+					block belongs to. */
+	__le16	xb_suballoc_bit;     /* Bit offset in suballocator
+					block group */
+	__le32	xb_fs_generation;    /* Must match super block */
+/*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
+	__le64	xb_csum;
+/*20*/	__le16	xb_flags;            /* Indicates whether this block contains
+					real xattr or a xattr tree. */
+	__le16	xb_reserved0;
+	__le32  xb_reserved1;
+	__le64	xb_reserved2;
+/*30*/	union {
+		struct ocfs2_xattr_header xb_header; /* xattr header if this
+							block contains xattr */
+		struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+							block cotains xattr
+							tree. */
+	} xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL		0x80
+#define OCFS2_XATTR_TYPE_MASK		0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+					 int local)
+{
+	if (local)
+		xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+	else
+		xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+	xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
@@ -725,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
 		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+						   struct ocfs2_dinode *di)
+{
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			xattrsize;
+	else
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -735,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+						struct super_block *sb,
+						struct ocfs2_dinode *di)
+{
+	int size;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			xattrsize;
+	else
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -798,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
 	return 0;
 
 }
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
@@ -881,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
 
 	return 0;
 }
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #endif  /* __KERNEL__ */
 
 
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 000000000000..b91c78f8f558
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_jbd_compat.h
+ *
+ * Compatibility defines for JBD.
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_JBD_COMPAT_H
+#define OCFS2_JBD_COMPAT_H
+
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# error Should not have been included
+#endif
+
+struct jbd2_inode {
+	unsigned int dummy;
+};
+
+#define JBD2_BARRIER			JFS_BARRIER
+#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
+
+#define jbd2_journal_ack_err			journal_ack_err
+#define jbd2_journal_clear_err			journal_clear_err
+#define jbd2_journal_destroy			journal_destroy
+#define jbd2_journal_dirty_metadata		journal_dirty_metadata
+#define jbd2_journal_errno			journal_errno
+#define jbd2_journal_extend			journal_extend
+#define jbd2_journal_flush			journal_flush
+#define jbd2_journal_force_commit		journal_force_commit
+#define jbd2_journal_get_write_access		journal_get_write_access
+#define jbd2_journal_get_undo_access		journal_get_undo_access
+#define jbd2_journal_init_inode			journal_init_inode
+#define jbd2_journal_invalidatepage		journal_invalidatepage
+#define jbd2_journal_load			journal_load
+#define jbd2_journal_lock_updates		journal_lock_updates
+#define jbd2_journal_restart			journal_restart
+#define jbd2_journal_start			journal_start
+#define jbd2_journal_start_commit		journal_start_commit
+#define jbd2_journal_stop			journal_stop
+#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
+#define jbd2_journal_unlock_updates		journal_unlock_updates
+#define jbd2_journal_wipe			journal_wipe
+#define jbd2_log_wait_commit			log_wait_commit
+
+static inline int jbd2_journal_file_inode(handle_t *handle,
+					  struct jbd2_inode *inode)
+{
+	return 0;
+}
+
+static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+						      loff_t new_size)
+{
+	return 0;
+}
+
+static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
+					       struct inode *inode)
+{
+	return;
+}
+
+static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
+						  struct jbd2_inode *jinode)
+{
+	return;
+}
+
+
+#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e9015..ffd48db229a7 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
 		if (cluster > clusters)
 			break;
 
-		ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+		ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
 		if (ret < 0) {
 			mlog_errno(ret);
 			break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
 	 * update the superblock last.
 	 * It doesn't matter if the write failed.
 	 */
-	ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
-			       &super_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+				     &super_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
-			       main_bm_inode);
+	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
 	if (ret < 0) {
 		mlog(ML_ERROR, "Can't read the group descriptor # %llu "
 		     "from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf1..bdda2d8f8508 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
-				si->si_inode);
+	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
+				OCFS2_BH_IGNORE_CACHE);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c6748..faec2d879357 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
 #include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
+#include <linux/dlm_plock.h>
 
 /*
  * The control protocol starts with a handshake.  Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
 }
 
+static int user_plock(struct ocfs2_cluster_connection *conn,
+		      u64 ino,
+		      struct file *file,
+		      int cmd,
+		      struct file_lock *fl)
+{
+	/*
+	 * This more or less just demuxes the plock request into any
+	 * one of three dlm calls.
+	 *
+	 * Internally, fs/dlm will pass these to a misc device, which
+	 * a userspace daemon will read and write to.
+	 *
+	 * For now, cancel requests (which happen internally only),
+	 * are turned into unlocks. Most of this function taken from
+	 * gfs2_lock.
+	 */
+
+	if (cmd == F_CANCELLK) {
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+	else
+		return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
 /*
  * Compare a requested locking protocol version against the current one.
  *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
 	.dlm_unlock	= user_dlm_unlock,
 	.lock_status	= user_dlm_lock_status,
 	.lock_lvb	= user_dlm_lvb,
+	.plock		= user_plock,
 	.dump_lksb	= user_dlm_dump_lksb,
 };
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3a..68b668b0e60a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
 		goto out;
 	}
 
-	/* Ok, the stack is pinned */
-	p->sp_count++;
 	active_stack = p;
-
 	rc = 0;
 
 out:
+	/* If we found it, pin it */
+	if (!rc)
+		active_stack->sp_count++;
+
 	spin_unlock(&ocfs2_stack_lock);
 	return rc;
 }
@@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
+int ocfs2_stack_supports_plocks(void)
+{
+	return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl)
+{
+	WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+	if (active_stack->sp_ops->plock)
+		return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1be..c571af375ef8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
 #include "dlm/dlmapi.h"
 #include <linux/dlm.h>
 
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
 /*
  * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
  * some day, but right now we need it.  Let's fake it.  This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
 	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
 
 	/*
+	 * Cluster-aware posix locks
+	 *
+	 * This is NULL for stacks which do not support posix locks.
+	 */
+	int (*plock)(struct ocfs2_cluster_connection *conn,
+		     u64 ino,
+		     struct file *file,
+		     int cmd,
+		     struct file_lock *fl);
+
+	/*
 	 * This is an optoinal debugging hook.  If provided, the
 	 * stack can dump debugging information about this lock.
 	 */
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl);
+
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb9819..c5ff18b46b57 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
 				  struct ocfs2_chain_list *cl);
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh);
+				   struct buffer_head *bh,
+				   u64 max_block);
 
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found);
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found);
 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 				     struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 data_blkno,
 						u64 *bg_blkno,
 						u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac);
 
-static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
 	struct inode *inode = ac->ac_inode;
 
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 		iput(inode);
 		ac->ac_inode = NULL;
 	}
-	if (ac->ac_bh) {
-		brelse(ac->ac_bh);
-		ac->ac_bh = NULL;
-	}
+	brelse(ac->ac_bh);
+	ac->ac_bh = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
  */
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh,
+				   u64 max_block)
 {
 	int status, credits;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	mlog_entry_void();
 
 	cl = &fe->id2.i_chain;
-	status = ocfs2_reserve_clusters(osb,
-					le16_to_cpu(cl->cl_cpg),
-					&ac);
+	status = ocfs2_reserve_clusters_with_limit(osb,
+						   le16_to_cpu(cl->cl_cpg),
+						   max_block, &ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
 	if (ac)
 		ocfs2_free_alloc_context(ac);
 
-	if (bg_bh)
-		brelse(bg_bh);
+	brelse(bg_bh);
 
 	mlog_exit(status);
 	return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+						 ac->ac_max_block);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	get_bh(bh);
 	ac->ac_bh = bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
 }
 
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
-			       struct ocfs2_alloc_context **ac)
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac)
 {
 	int status;
 	u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_bits_wanted = blocks;
 	(*ac)->ac_which = OCFS2_AC_USE_META;
 	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_extent_list *root_el,
+			       struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_new_metadata_blocks(osb,
+					ocfs2_extend_meta_needed(root_el),
+					ac);
+}
+
 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 					      struct ocfs2_alloc_context *ac)
 {
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
 	/*
+	 * stat(2) can't handle i_ino > 32bits, so we tell the
+	 * lower levels not to allocate us a block group past that
+	 * limit.  The 'inode64' mount option avoids this behavior.
+	 */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+		(*ac)->ac_max_block = (u32)~0U;
+
+	/*
 	 * slot is set when we successfully steal inode from other nodes.
 	 * It is reset in 3 places:
 	 * 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
 /* Callers don't need to care which bitmap (local alloc or main) to
  * use so we figure it out for them, but unfortunately this clutters
  * things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
-			   u32 bits_wanted,
-			   struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac)
 {
 	int status;
 
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 	}
 
 	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_max_block = max_block;
 
 	status = -ENOSPC;
 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
 		status = ocfs2_reserve_local_alloc_bits(osb,
 							bits_wanted,
 							*ac);
-		if ((status < 0) && (status != -ENOSPC)) {
+		if (status == -EFBIG) {
+			/* The local alloc window is outside ac_max_block.
+			 * use the main bitmap. */
+			status = -ENOSPC;
+		} else if ((status < 0) && (status != -ENOSPC)) {
 			mlog_errno(status);
 			goto bail;
-		} else if (status == -ENOSPC) {
-			/* reserve_local_bits will return enospc with
-			 * the local alloc inode still locked, so we
-			 * can change this safely here. */
-			mlog(0, "Disabling local alloc\n");
-			/* We set to OCFS2_LA_DISABLED so that umount
-			 * can clean up what's left of the local
-			 * allocation */
-			osb->local_alloc_state = OCFS2_LA_DISABLED;
 		}
 	}
 
@@ -718,6 +735,13 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+}
+
 /*
  * More or less lifted from ext3. I'll leave their description below:
  *
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found)
 {
 	int search = -ENOSPC;
 	int ret;
+	u64 blkoff;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 tmp_off, tmp_found;
 	unsigned int max_bits, gd_cluster_off;
 
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 		if (ret)
 			return ret;
 
+		if (max_block) {
+			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+							  gd_cluster_off +
+							  tmp_off + tmp_found);
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				return -ENOSPC;
+		}
+
 		/* ocfs2_block_group_find_clear_bits() might
 		 * return success, but we still want to return
 		 * -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 			*bit_off = tmp_off;
 			*bits_found = tmp_found;
 			search = 0; /* success */
+		} else if (tmp_found) {
+			/*
+			 * Don't show bits which we'll be returning
+			 * for allocation to the local alloc bitmap.
+			 */
+			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
 		}
 	}
 
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found)
 {
 	int ret = -ENOSPC;
+	u64 blkoff;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
 
 	BUG_ON(min_bits != 1);
 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
 
-	if (bg->bg_free_bits_count)
+	if (bg->bg_free_bits_count) {
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
 							group_bh, bits_wanted,
 							le16_to_cpu(bg->bg_bits),
 							bit_off, bits_found);
+		if (!ret && max_block) {
+			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
+				*bits_found;
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				ret = -ENOSPC;
+		}
+	}
 
 	return ret;
 }
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	struct ocfs2_group_desc *gd;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
-			       &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	}
 
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
-				  bit_off, &found);
+				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+	status = ocfs2_read_block(alloc_inode,
 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+				  &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	/* for now, the chain search is a bit simplistic. We just use
 	 * the 1st group with any empty bits. */
 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
-					     bits_wanted, min_bits, bit_off,
+					     bits_wanted, min_bits,
+					     ac->ac_max_block, bit_off,
 					     &tmp_bits)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
 
-		if (prev_group_bh) {
-			brelse(prev_group_bh);
-			prev_group_bh = NULL;
-		}
+		brelse(prev_group_bh);
+		prev_group_bh = NULL;
+
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
-					  next_group, &group_bh,
-					  OCFS2_BH_CACHED, alloc_inode);
+		status = ocfs2_read_block(alloc_inode,
+					  next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
-	if (group_bh)
-		brelse(group_bh);
-	if (prev_group_bh)
-		brelse(prev_group_bh);
+	brelse(group_bh);
+	brelse(prev_group_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 {
 	int status = 0;
 	u32 tmp_used;
-	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
 	struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
-				  alloc_inode);
+	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	}
 
 bail:
-	if (group_bh)
-		brelse(group_bh);
+	brelse(group_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
 					  bg_start_bit, bg_blkno,
 					  num_clusters);
-	if (status < 0)
+	if (status < 0) {
 		mlog_errno(status);
+		goto out;
+	}
 
+	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+					 num_clusters);
+
+out:
 	mlog_exit(status);
 	return status;
 }
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
 	}
 }
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+			  struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_to_add == 0)
+		goto out;
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662bd..4df159d8f450 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
 
 typedef int (group_search_t)(struct inode *,
 			     struct buffer_head *,
-			     u32,
-			     u32,
-			     u16 *,
-			     u16 *);
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u64,			/* max_block */
+			     u16 *,			/* *bit_off */
+			     u16 *);			/* *bits_found */
 
 struct ocfs2_alloc_context {
 	struct inode *ac_inode;    /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
 	group_search_t *ac_group_search;
 
 	u64    ac_last_group;
+	u64    ac_max_block;  /* Highest block number to allocate. 0 is
+				 is the same as ~0 - unlimited */
 };
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 	return ac->ac_bits_wanted - ac->ac_bits_given;
 }
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
  * apis above. */
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 				      struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 
 /* given a cluster offset, calculate which block group it belongs to
  * and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ccecfe5094fa..304b63ac78cf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -154,10 +155,13 @@ enum {
 	Opt_localalloc,
 	Opt_localflocks,
 	Opt_stack,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_inode64,
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_barrier, "barrier=%u"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
@@ -173,6 +177,9 @@ static match_table_t tokens = {
 	{Opt_localalloc, "localalloc=%d"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_stack, "cluster_stack=%s"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_inode64, "inode64"},
 	{Opt_err, NULL}
 };
 
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 	}
 
-	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+	if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+				      &target)) {
 		if (wait)
-			log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
-					target);
+			jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					     target);
 	}
 	return 0;
 }
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
 	if (!oi)
 		return NULL;
 
+	jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
 	return &oi->vfs_inode;
 }
 
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		goto out;
 	}
 
+	/* Probably don't want this on remount; it might
+	 * mess with other nodes */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+	    (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+		goto out;
+	}
+
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		/* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
-	osb->local_alloc_size = parsed_options.localalloc_opt;
+	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	return status;
 
 read_super_error:
-	if (bh != NULL)
-		brelse(bh);
+	brelse(bh);
 
 	if (inode)
 		iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_data_writeback:
 			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
+		case Opt_user_xattr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_nouser_xattr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+			break;
 		case Opt_atime_quantum:
 			if (match_int(&args[0], &option)) {
 				status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (option < 0)
 				return 0;
 			if (option == 0)
-				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			mopt->commit_interval = HZ * option;
 			break;
 		case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
 			       OCFS2_STACK_LABEL_LEN);
 			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
 			break;
+		case Opt_inode64:
+			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
 	unsigned long opts = osb->s_mount_opt;
+	unsigned int local_alloc_megs;
 
 	if (opts & OCFS2_MOUNT_HB_LOCAL)
 		seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",commit=%u",
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
-	if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
-		seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
 		seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
 
+	if (opts & OCFS2_MOUNT_NOUSERXATTR)
+		seq_printf(s, ",nouser_xattr");
+	else
+		seq_printf(s, ",user_xattr");
+
+	if (opts & OCFS2_MOUNT_INODE64)
+		seq_printf(s, ",inode64");
+
 	return 0;
 }
 
@@ -1118,7 +1155,7 @@ bail:
 	return status;
 }
 
-static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
+static void ocfs2_inode_init_once(void *data)
 {
 	struct ocfs2_inode_info *oi = data;
 
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
 	oi->ip_dir_start_lookup = 0;
 
 	init_rwsem(&oi->ip_alloc_sem);
+	init_rwsem(&oi->ip_xattr_sem);
 	mutex_init(&oi->ip_io_mutex);
 
 	oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->slot_num = OCFS2_INVALID_SLOT;
 
+	osb->s_xattr_inline_size = le16_to_cpu(
+					di->id2.i_super.s_xattr_inline_size);
+
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
+	INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
@@ -1442,6 +1485,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
 
+	osb->slot_recovery_generations =
+		kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+			GFP_KERNEL);
+	if (!osb->slot_recovery_generations) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	init_waitqueue_head(&osb->osb_wipe_event);
 	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
 					sizeof(*osb->osb_orphan_wipes),
@@ -1559,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->first_cluster_group_blkno =
 		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
 	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
 	mlog(0, "vol_label: %s\n", osb->vol_label);
 	mlog(0, "uuid: %s\n", osb->uuid_str);
 	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
@@ -1703,7 +1756,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	local = ocfs2_mount_local(osb);
 
 	/* will play back anything left in the journal. */
-	status = ocfs2_journal_load(osb->journal, local);
+	status = ocfs2_journal_load(osb->journal, local, dirty);
 	if (status < 0) {
 		mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
 		goto finally;
@@ -1768,6 +1821,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	ocfs2_free_slot_info(osb);
 
 	kfree(osb->osb_orphan_wipes);
+	kfree(osb->slot_recovery_generations);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs2_initalize_osb(),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25b..cbd03dfdc7b9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
 #include "inode.h"
 #include "journal.h"
 #include "symlink.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
 		kunmap(page);
 		page_cache_release(page);
 	}
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	return ERR_PTR(status);
 }
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
 const struct inode_operations ocfs2_fast_symlink_inode_operations = {
 	.readlink	= ocfs2_readlink,
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b23..187b99ff0368 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+#endif
 
 #define MLOG_MASK_PREFIX ML_UPTODATE
 
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 	ci->ci_num_cached--;
 }
 
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
-			     struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct inode *inode,
+					  sector_t block)
 {
 	int index;
-	sector_t block = bh->b_blocknr;
 	struct ocfs2_meta_cache_item *item = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
 }
 
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	sector_t block = bh->b_blocknr;
+
+	ocfs2_remove_block_from_cache(inode, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len)
+{
+	unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+
+	for (i = 0; i < b_len; i++, block++)
+		ocfs2_remove_block_from_cache(inode, block);
+}
+
 int __init init_ocfs2_uptodate_cache(void)
 {
 	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a8..531b4b3a0c47 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh);
 void ocfs2_remove_from_cache(struct inode *inode,
 			     struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len);
 int ocfs2_buffer_read_ahead(struct inode *inode,
 			    struct buffer_head *bh);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 000000000000..802c41492214
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4832 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_XATTR
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+
+
+struct ocfs2_xattr_def_value_root {
+	struct ocfs2_xattr_value_root	xv;
+	struct ocfs2_extent_rec		er;
+};
+
+struct ocfs2_xattr_bucket {
+	struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+	struct ocfs2_xattr_header *xh;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE	80
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+	.xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+	&ocfs2_xattr_user_handler,
+	&ocfs2_xattr_trusted_handler,
+	NULL
+};
+
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+};
+
+struct ocfs2_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ocfs2_xattr_search {
+	struct buffer_head *inode_bh;
+	/*
+	 * xattr_bh point to the block buffer head which has extended attribute
+	 * when extended attribute in inode, xattr_bh is equal to inode_bh.
+	 */
+	struct buffer_head *xattr_bh;
+	struct ocfs2_xattr_header *header;
+	struct ocfs2_xattr_bucket bucket;
+	void *base;
+	void *end;
+	struct ocfs2_xattr_entry *here;
+	int not_found;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset);
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					struct ocfs2_xattr_tree_root *xt,
+					char *buffer,
+					size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs);
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh);
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+	struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+		handler = ocfs2_xattr_handler_map[name_index];
+
+	return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+				 const char *name,
+				 int name_len)
+{
+	/* Get hash value of uuid from super block */
+	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+	int i;
+
+	/* hash extended attribute name */
+	for (i = 0; i < name_len; i++) {
+		hash = (hash << OCFS2_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	return hash;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static void ocfs2_xattr_hash_entry(struct inode *inode,
+				   struct ocfs2_xattr_header *header,
+				   struct ocfs2_xattr_entry *entry)
+{
+	u32 hash = 0;
+	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+
+	hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
+	entry->xe_name_hash = cpu_to_le32(hash);
+
+	return;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+					 u32 clusters_to_add,
+					 struct buffer_head *xattr_bh,
+					 struct ocfs2_xattr_value_root *xv)
+{
+	int status = 0;
+	int restart_func = 0;
+	int credits = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_extent_tree et;
+
+	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+
+restart_all:
+
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	status = ocfs2_journal_access(handle, inode, xattr_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = le32_to_cpu(xv->xr_clusters);
+	status = ocfs2_add_clusters_in_btree(osb,
+					     inode,
+					     &logical_start,
+					     clusters_to_add,
+					     0,
+					     &et,
+					     handle,
+					     data_ac,
+					     meta_ac,
+					     &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, xattr_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    et.et_root_el,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+leave:
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+
+	return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xv->xr_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+				   u32 old_clusters,
+				   u32 new_clusters,
+				   struct buffer_head *root_bh,
+				   struct ocfs2_xattr_value_root *xv)
+{
+	int ret = 0;
+	u32 trunc_len, cpos, phys_cpos, alloc_size;
+	u64 block;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (old_clusters <= new_clusters)
+		return 0;
+
+	cpos = new_clusters;
+	trunc_len = old_clusters - new_clusters;
+	while (trunc_len) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+					       &alloc_size, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+						 phys_cpos, alloc_size,
+						 &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+		ocfs2_remove_xattr_clusters_from_cache(inode, block,
+						       alloc_size);
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      int len)
+{
+	int ret;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+	if (new_clusters == old_clusters)
+		return 0;
+
+	if (new_clusters > old_clusters)
+		ret = ocfs2_xattr_extend_allocation(inode,
+						    new_clusters - old_clusters,
+						    root_bh, xv);
+	else
+		ret = ocfs2_xattr_shrink_size(inode,
+					      old_clusters, new_clusters,
+					      root_bh, xv);
+
+	return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+				  size_t *result, const char *prefix,
+				  const char *name, int name_len)
+{
+	char *p = buffer + *result;
+	int prefix_len = strlen(prefix);
+	int total_len = prefix_len + name_len + 1;
+
+	*result += total_len;
+
+	/* we are just looking for how big our buffer needs to be */
+	if (!size)
+		return 0;
+
+	if (*result > size)
+		return -ERANGE;
+
+	memcpy(p, prefix, prefix_len);
+	memcpy(p + prefix_len, name, name_len);
+	p[prefix_len + name_len] = '\0';
+
+	return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+				    struct ocfs2_xattr_header *header,
+				    char *buffer, size_t buffer_size)
+{
+	size_t result = 0;
+	int i, type, ret;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			name = (const char *)header +
+				le16_to_cpu(entry->xe_name_offset);
+
+			ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+						     &result, prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return result;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		ret = ocfs2_xattr_list_entries(inode, header,
+					       buffer, buffer_size);
+	} else {
+		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+						   buffer, buffer_size);
+	}
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+			char *buffer,
+			size_t size)
+{
+	int ret = 0, i_ret = 0, b_ret = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return ret;
+
+	ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+	if (i_ret < 0)
+		b_ret = 0;
+	else {
+		if (buffer) {
+			buffer += i_ret;
+			size -= i_ret;
+		}
+		b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+					       buffer, size);
+		if (b_ret < 0)
+			i_ret = 0;
+	}
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(dentry->d_inode, 0);
+
+	brelse(di_bh);
+
+	return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *entry;
+	size_t name_len;
+	int i, cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	entry = xs->here;
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		cmp = name_index - ocfs2_xattr_get_type(entry);
+		if (!cmp)
+			cmp = name_len - entry->xe_name_len;
+		if (!cmp)
+			cmp = memcmp(name, (xs->base +
+				     le16_to_cpu(entry->xe_name_offset)),
+				     name_len);
+		if (cmp == 0)
+			break;
+		entry += 1;
+	}
+	xs->here = entry;
+
+	return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_value_root *xv,
+					 void *buffer,
+					 size_t len)
+{
+	u32 cpos, p_cluster, num_clusters, bpc, clusters;
+	u64 blkno;
+	int i, ret = 0;
+	size_t cplen, blocksize;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = &xv->xr_list;
+	clusters = le32_to_cpu(xv->xr_clusters);
+	bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	blocksize = inode->i_sb->s_blocksize;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		/* Copy ocfs2_xattr_value */
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(inode, blkno, &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cplen = len >= blocksize ? blocksize : len;
+			memcpy(buffer, bh->b_data, cplen);
+			len -= cplen;
+			buffer += cplen;
+
+			brelse(bh);
+			bh = NULL;
+			if (len == 0)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return -ENODATA;
+
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret)
+		return ret;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		if (size > buffer_size)
+			return -ERANGE;
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + le16_to_cpu(
+				 xs->here->xe_name_offset) +
+				OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = -ENODATA, name_offset, name_len, block_off, i;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret)
+		goto cleanup;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		ret = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+
+		name_offset = le16_to_cpu(xs->here->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+		i = xs->here - xs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								xs->bucket.xh,
+								i,
+								&block_off,
+								&name_offset);
+			xs->base = xs->bucket.bhs[block_off]->b_data;
+		}
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       name_offset + name_len, size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + name_offset + name_len);
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
+	}
+	ret = size;
+cleanup:
+	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+		brelse(xs->bucket.bhs[i]);
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+	brelse(blk_bh);
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    void *buffer,
+		    size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		ret = -ENODATA;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+				    buffer_size, &xis);
+	if (ret == -ENODATA)
+		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+					    buffer_size, &xbs);
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   struct ocfs2_xattr_value_root *xv,
+					   const void *value,
+					   int value_len)
+{
+	int ret = 0, i, cp_len, credits;
+	u16 blocksize = inode->i_sb->s_blocksize;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	handle_t *handle;
+
+	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+	credits = clusters * bpc;
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(inode, blkno, &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_journal_access(handle,
+						   inode,
+						   bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			cp_len = value_len > blocksize ? blocksize : value_len;
+			memcpy(bh->b_data, value, cp_len);
+			value_len -= cp_len;
+			value += cp_len;
+			if (cp_len < blocksize)
+				memset(bh->b_data + cp_len, 0,
+				       blocksize - cp_len);
+
+			ret = ocfs2_journal_dirty(handle, bh);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			brelse(bh);
+			bh = NULL;
+
+			/*
+			 * XXX: do we need to empty all the following
+			 * blocks in this cluster?
+			 */
+			if (!value_len)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+static int ocfs2_xattr_cleanup(struct inode *inode,
+			       struct ocfs2_xattr_info *xi,
+			       struct ocfs2_xattr_search *xs,
+			       size_t offs)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+	size_t name_len = strlen(xi->name);
+	void *val = xs->base + offs;
+	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	/* Decrease xattr count */
+	le16_add_cpu(&xs->header->xh_count, -1);
+	/* Remove the xattr entry and tree root which has already be set*/
+	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
+	memset(val, 0, size);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_update_entry(struct inode *inode,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xs,
+				    size_t offs)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xs->here->xe_name_offset = cpu_to_le16(offs);
+	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+	if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+		ocfs2_xattr_set_local(xs->here, 1);
+	else
+		ocfs2_xattr_set_local(xs->here, 0);
+	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_value_outside()
+ *
+ * Set large size value in B tree.
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 size_t offs)
+{
+	size_t name_len = strlen(xi->name);
+	void *val = xs->base + offs;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	int ret = 0;
+
+	memset(val, 0, size);
+	memcpy(val, xi->name, name_len);
+	xv = (struct ocfs2_xattr_value_root *)
+		(val + OCFS2_XATTR_SIZE(name_len));
+	xv->xr_clusters = 0;
+	xv->xr_last_eb_blk = 0;
+	xv->xr_list.l_tree_depth = 0;
+	xv->xr_list.l_count = cpu_to_le16(1);
+	xv->xr_list.l_next_free_rec = 0;
+
+	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+					 xi->value_len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+					      xi->value_len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_entry_local()
+ *
+ * Set, replace or remove extended attribute in local.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_entry *last,
+					size_t min_offs)
+{
+	size_t name_len = strlen(xi->name);
+	int i;
+
+	if (xi->value && xs->not_found) {
+		/* Insert the new xattr entry. */
+		le16_add_cpu(&xs->header->xh_count, 1);
+		ocfs2_xattr_set_type(last, xi->name_index);
+		ocfs2_xattr_set_local(last, 1);
+		last->xe_name_len = name_len;
+	} else {
+		void *first_val;
+		void *val;
+		size_t offs, size;
+
+		first_val = xs->base + min_offs;
+		offs = le16_to_cpu(xs->here->xe_name_offset);
+		val = xs->base + offs;
+
+		if (le64_to_cpu(xs->here->xe_value_size) >
+		    OCFS2_XATTR_INLINE_SIZE)
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len)) {
+			/* The old and the new value have the
+			   same size. Just replace the value. */
+			ocfs2_xattr_set_local(xs->here, 1);
+			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+			/* Clear value bytes. */
+			memset(val + OCFS2_XATTR_SIZE(name_len),
+			       0,
+			       OCFS2_XATTR_SIZE(xi->value_len));
+			memcpy(val + OCFS2_XATTR_SIZE(name_len),
+			       xi->value,
+			       xi->value_len);
+			return;
+		}
+		/* Remove the old name+value. */
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		xs->here->xe_name_hash = 0;
+		xs->here->xe_name_offset = 0;
+		ocfs2_xattr_set_local(xs->here, 1);
+		xs->here->xe_value_size = 0;
+
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = xs->header->xh_entries;
+		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+			size_t o = le16_to_cpu(last->xe_name_offset);
+
+			if (o < offs)
+				last->xe_name_offset = cpu_to_le16(o + size);
+			last += 1;
+		}
+
+		if (!xi->value) {
+			/* Remove the old entry. */
+			last -= 1;
+			memmove(xs->here, xs->here + 1,
+				(void *)last - (void *)xs->here);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xs->header->xh_count, -1);
+		}
+	}
+	if (xi->value) {
+		/* Insert the new name+value. */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len);
+		void *val = xs->base + min_offs - size;
+
+		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+		memset(val, 0, size);
+		memcpy(val, xi->name, name_len);
+		memcpy(val + OCFS2_XATTR_SIZE(name_len),
+		       xi->value,
+		       xi->value_len);
+		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+		ocfs2_xattr_set_local(xs->here, 1);
+		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+	}
+
+	return;
+}
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Set extended attribute entry into inode or block.
+ *
+ * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
+ * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * then set value in B tree with set_value_outside().
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs,
+				 int flag)
+{
+	struct ocfs2_xattr_entry *last;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+	size_t size_l = 0;
+	handle_t *handle = NULL;
+	int free, i, ret;
+	struct ocfs2_xattr_info xi_l = {
+		.name_index = xi->name_index,
+		.name = xi->name,
+		.value = xi->value,
+		.value_len = xi->value_len,
+	};
+
+	/* Compute min_offs, last and free space. */
+	last = xs->header->xh_entries;
+
+	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	if (free < 0)
+		return -EFAULT;
+
+	if (!xs->not_found) {
+		size_t size = 0;
+		if (ocfs2_xattr_is_local(xs->here))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		free += (size + sizeof(struct ocfs2_xattr_entry));
+	}
+	/* Check free space in inode or block */
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_ROOT_SIZE) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+		xi_l.value = (void *)&def_xv;
+		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->value) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	if (!xs->not_found) {
+		/* For existing extended attribute */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		void *val = xs->base + offs;
+
+		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
+			/* Replace existing local xattr with tree root */
+			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+							    offs);
+			if (ret < 0)
+				mlog_errno(ret);
+			goto out;
+		} else if (!ocfs2_xattr_is_local(xs->here)) {
+			/* For existing xattr which has value outside */
+			struct ocfs2_xattr_value_root *xv = NULL;
+			xv = (struct ocfs2_xattr_value_root *)(val +
+				OCFS2_XATTR_SIZE(name_len));
+
+			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+				/*
+				 * If new value need set outside also,
+				 * first truncate old value to new value,
+				 * then set new value with set_value_outside().
+				 */
+				ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								xv,
+								xi->value,
+								xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_xattr_update_entry(inode,
+							       xi,
+							       xs,
+							       offs);
+				if (ret < 0)
+					mlog_errno(ret);
+				goto out;
+			} else {
+				/*
+				 * If new value need set in local,
+				 * just trucate old value to zero.
+				 */
+				 ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 0);
+				if (ret < 0)
+					mlog_errno(ret);
+			}
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		/* set extended attribute in external block. */
+		ret = ocfs2_extend_trans(handle,
+					 OCFS2_INODE_UPDATE_CREDITS +
+					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Set value in local, include set tree root in local.
+	 * This is the first step for value size >INLINE_SIZE.
+	 */
+	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+	    (flag & OCFS2_INLINE_XATTR_FL)) {
+		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+		unsigned int xattrsize = osb->s_xattr_inline_size;
+
+		/*
+		 * Adjust extent record count or inline data size
+		 * to reserve space for extended attribute.
+		 */
+		if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+			struct ocfs2_inline_data *idata = &di->id2.i_data;
+			le16_add_cpu(&idata->id_count, -xattrsize);
+		} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+			struct ocfs2_extent_list *el = &di->id2.i_list;
+			le16_add_cpu(&el->l_count, -(xattrsize /
+					sizeof(struct ocfs2_extent_rec)));
+		}
+		di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+	}
+	/* Update xattr flag */
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= flag;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+	/* Update inode ctime */
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * Set value outside in B tree.
+		 * This is the second step for value size > INLINE_SIZE.
+		 */
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+		if (ret < 0) {
+			int ret2;
+
+			mlog_errno(ret);
+			/*
+			 * If set value outside failed, we have to clean
+			 * the junk tree root we have already set in local.
+			 */
+			ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+			if (ret2 < 0)
+				mlog_errno(ret2);
+		}
+	}
+out:
+	return ret;
+
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *header)
+{
+	int ret = 0, i;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+		if (!ocfs2_xattr_is_local(entry)) {
+			struct ocfs2_xattr_value_root *xv;
+			void *val;
+
+			val = (void *)header +
+				le16_to_cpu(entry->xe_name_offset);
+			xv = (struct ocfs2_xattr_value_root *)
+				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+			ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+				    struct buffer_head *di_bh)
+{
+
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_xattr_header *header;
+	int ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_remove_value_outside(inode, di_bh, header);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+				    struct buffer_head *blk_bh)
+{
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	} else
+		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+
+	return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+				  u64 block)
+{
+	struct inode *xb_alloc_inode;
+	struct buffer_head *xb_alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret = 0;
+	u64 blk, bg_blkno;
+	u16 bit;
+
+	ret = ocfs2_read_block(inode, block, &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	blk = le64_to_cpu(xb->xb_blkno);
+	bit = le16_to_cpu(xb->xb_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!xb_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&xb_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	ocfs2_inode_unlock(xb_alloc_inode, 1);
+	brelse(xb_alloc_bh);
+out_mutex:
+	mutex_unlock(&xb_alloc_inode->i_mutex);
+	iput(xb_alloc_inode);
+out:
+	brelse(blk_bh);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *handle;
+	int ret;
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (di->i_xattr_loc) {
+		ret = ocfs2_xattr_free_block(inode,
+					     le64_to_cpu(di->i_xattr_loc));
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	di->i_xattr_loc = 0;
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+					struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	int free;
+
+	if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+	} else if (ocfs2_inode_is_fast_symlink(inode)) {
+		free = ocfs2_fast_symlink_chars(inode->i_sb) -
+			le64_to_cpu(di->i_size);
+	} else {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		free = (le16_to_cpu(el->l_count) -
+			le16_to_cpu(el->l_next_free_rec)) *
+			sizeof(struct ocfs2_extent_rec);
+	}
+	if (free >= xattrsize)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+	int has_space = 0;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		down_read(&oi->ip_alloc_sem);
+		has_space = ocfs2_xattr_has_space_inline(inode, di);
+		up_read(&oi->ip_alloc_sem);
+		if (!has_space)
+			return 0;
+	}
+
+	xs->xattr_bh = xs->inode_bh;
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	else
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	/* Find the named attribute. */
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+		if (ret && ret != -ENODATA)
+			return ret;
+		xs->not_found = ret;
+	}
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return -ENOSPC;
+
+	down_write(&oi->ip_alloc_sem);
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		if (!ocfs2_xattr_has_space_inline(inode, di)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	ret = ocfs2_xattr_set_entry(inode, xi, xs,
+				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+out:
+	up_write(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+			ret = -EFAULT;
+			goto cleanup;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret && ret != -ENODATA) {
+		xs->xattr_bh = NULL;
+		goto cleanup;
+	}
+	xs->not_found = ret;
+	return 0;
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/*
+ * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
+ * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
+ * re-initialized.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+		le16_to_cpu(el->l_next_free_rec) != 0);
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+	ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_block *xblk = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	int ret;
+
+	if (!xs->xattr_bh) {
+		/*
+		 * Alloc one external block for extended attribute
+		 * outside of inode.
+		 */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+					   &suballoc_bit_start, &num_got,
+					   &first_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		new_bh = sb_getblk(inode->i_sb, first_blkno);
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		/* Initialize ocfs2_xattr_block */
+		xs->xattr_bh = new_bh;
+		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+		memset(xblk, 0, inode->i_sb->s_blocksize);
+		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+		xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+		xs->header = &xblk->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+		xs->here = xs->header->xh_entries;
+
+
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		di->i_xattr_loc = cpu_to_le64(first_blkno);
+		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+		if (ret < 0)
+			mlog_errno(ret);
+out_commit:
+		ocfs2_commit_trans(osb, handle);
+out:
+		if (meta_ac)
+			ocfs2_free_alloc_context(meta_ac);
+		if (ret < 0)
+			return ret;
+	} else
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		/* Set extended attribute into external block */
+		ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+		if (!ret || ret != -ENOSPC)
+			goto end;
+
+		ret = ocfs2_xattr_create_index_block(inode, xs);
+		if (ret)
+			goto end;
+	}
+
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+		ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    const void *value,
+		    size_t value_len,
+		    int flags)
+{
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	/*
+	 * Scan inode and external block to find the same name
+	 * extended attribute and collect search infomation.
+	 */
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (xis.not_found && xbs.not_found) {
+		ret = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		ret = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		ret = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+
+	if (!value) {
+		/* Remove existing extended attribute */
+		if (!xis.not_found)
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		else if (!xbs.not_found)
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+	} else {
+		/* We always try to set extended attribute into inode first*/
+		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		if (!ret && !xbs.not_found) {
+			/*
+			 * If succeed and that extended attribute existing in
+			 * external block, then we will remove it.
+			 */
+			xi.value = NULL;
+			xi.value_len = 0;
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		} else if (ret == -ENOSPC) {
+			if (di->i_xattr_loc && !xbs.xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode, name_index,
+							     name, &xbs);
+				if (ret)
+					goto cleanup;
+			}
+			/*
+			 * If no space in inode, we will set extended attribute
+			 * into external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			if (ret)
+				goto cleanup;
+			if (!xis.not_found) {
+				/*
+				 * If succeed and that extended attribute
+				 * existing in inode, we will remove it.
+				 */
+				xi.value = NULL;
+				xi.value_len = 0;
+				ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+			}
+		}
+	}
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+	brelse(xbs.xattr_bh);
+	for (i = 0; i < blk_per_bucket; i++)
+		brelse(xbs.bucket.bhs[i]);
+
+	return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+			       u32 name_hash,
+			       u64 *p_blkno,
+			       u32 *e_cpos,
+			       u32 *num_clusters,
+			       struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	u64 e_blkno = 0;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+			e_blkno = le64_to_cpu(rec->e_blkno);
+			break;
+		}
+	}
+
+	if (!e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in xattr", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*p_blkno = le64_to_cpu(rec->e_blkno);
+	*num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	if (e_cpos)
+		*e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+				struct ocfs2_xattr_bucket *bucket,
+				void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+				   struct buffer_head *header_bh,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u16 *xe_index,
+				   int *found)
+{
+	int i, ret = 0, cmp = 1, block_off, new_offset;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t name_len = strlen(name);
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct buffer_head *name_bh = NULL;
+	char *xe_name;
+
+	/*
+	 * We don't use binary search in the bucket because there
+	 * may be multiple entries with the same name hash.
+	 */
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash))
+			continue;
+		else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+			break;
+
+		cmp = name_index - ocfs2_xattr_get_type(xe);
+		if (!cmp)
+			cmp = name_len - xe->xe_name_len;
+		if (cmp)
+			continue;
+
+		ret = ocfs2_xattr_bucket_get_name_value(inode,
+							xh,
+							i,
+							&block_off,
+							&new_offset);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
+				       &name_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		xe_name = name_bh->b_data + new_offset;
+
+		cmp = memcmp(name, xe_name, name_len);
+		brelse(name_bh);
+		name_bh = NULL;
+
+		if (cmp == 0) {
+			*xe_index = i;
+			*found = 1;
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u64 p_blkno,
+				   u32 first_hash,
+				   u32 num_clusters,
+				   struct ocfs2_xattr_search *xs)
+{
+	int ret, found = 0;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lower_bh = NULL;
+	struct ocfs2_xattr_header *xh = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 index = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int low_bucket = 0, bucket, high_bucket;
+	u32 last_hash;
+	u64 blkno;
+
+	ret = ocfs2_read_block(inode, p_blkno, &bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+
+	while (low_bucket <= high_bucket) {
+		brelse(bh);
+		bh = NULL;
+		bucket = (low_bucket + high_bucket) / 2;
+
+		blkno = p_blkno + bucket * blk_per_bucket;
+
+		ret = ocfs2_read_block(inode, blkno, &bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xe = &xh->xh_entries[0];
+		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+			high_bucket = bucket - 1;
+			continue;
+		}
+
+		/*
+		 * Check whether the hash of the last entry in our
+		 * bucket is larger than the search one. for an empty
+		 * bucket, the last one is also the first one.
+		 */
+		if (xh->xh_count)
+			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+		last_hash = le32_to_cpu(xe->xe_name_hash);
+
+		/* record lower_bh which may be the insert place. */
+		brelse(lower_bh);
+		lower_bh = bh;
+		bh = NULL;
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+			low_bucket = bucket + 1;
+			continue;
+		}
+
+		/* the searched xattr should reside in this bucket if exists. */
+		ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+					      name_index, name, name_hash,
+					      &index, &found);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/*
+	 * Record the bucket we have found.
+	 * When the xattr's hash value is in the gap of 2 buckets, we will
+	 * always set it to the previous bucket.
+	 */
+	if (!lower_bh) {
+		/*
+		 * We can't find any bucket whose first name_hash is less
+		 * than the find name_hash.
+		 */
+		BUG_ON(bh->b_blocknr != p_blkno);
+		lower_bh = bh;
+		bh = NULL;
+	}
+	xs->bucket.bhs[0] = lower_bh;
+	xs->bucket.xh = (struct ocfs2_xattr_header *)
+					xs->bucket.bhs[0]->b_data;
+	lower_bh = NULL;
+
+	xs->header = xs->bucket.xh;
+	xs->base = xs->bucket.bhs[0]->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (found) {
+		/*
+		 * If we have found the xattr enty, read all the blocks in
+		 * this bucket.
+		 */
+		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xs->here = &xs->header->xh_entries[index];
+		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
+		     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+	} else
+		ret = -ENODATA;
+
+out:
+	brelse(bh);
+	brelse(lower_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u64 p_blkno = 0;
+	u32 first_hash, num_clusters = 0;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return -ENODATA;
+
+	mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
+	     name, name_hash, name_index);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+
+	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+				      p_blkno, first_hash, num_clusters, xs);
+
+out:
+	return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+				       u64 blkno,
+				       u32 clusters,
+				       xattr_bucket_func *func,
+				       void *para)
+{
+	int i, j, ret = 0;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+	u32 num_buckets = clusters * bpc;
+	struct ocfs2_xattr_bucket bucket;
+
+	memset(&bucket, 0, sizeof(bucket));
+
+	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+	     clusters, blkno);
+
+	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+					bucket.bhs, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+		if (func) {
+			ret = func(inode, &bucket, para);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		for (j = 0; j < blk_per_bucket; j++)
+			brelse(bucket.bhs[j]);
+		memset(&bucket, 0, sizeof(bucket));
+	}
+
+out:
+	for (j = 0; j < blk_per_bucket; j++)
+		brelse(bucket.bhs[j]);
+
+	return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+	char *buffer;
+	size_t buffer_size;
+	size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset)
+{
+	u16 name_offset;
+
+	if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+		return -EINVAL;
+
+	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+	*block_off = name_offset >> inode->i_sb->s_blocksize_bits;
+	*new_offset = name_offset % inode->i_sb->s_blocksize;
+
+	return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   void *para)
+{
+	int ret = 0, type;
+	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+	int i, block_off, new_offset;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								bucket->xh,
+								i,
+								&block_off,
+								&new_offset);
+			if (ret)
+				break;
+
+			name = (const char *)bucket->bhs[block_off]->b_data +
+				new_offset;
+			ret = ocfs2_xattr_list_entry(xl->buffer,
+						     xl->buffer_size,
+						     &xl->result,
+						     prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct ocfs2_xattr_tree_root *xt,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	struct ocfs2_extent_list *el = &xt->xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+	u64 p_blkno = 0;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+		.result = 0,
+	};
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_list_xattr_bucket,
+						  &xl);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+	ret = xl.result;
+out:
+	return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_hash = le32_to_cpu(l->xe_name_hash);
+	u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+	if (l_hash > r_hash)
+		return 1;
+	if (l_hash < r_hash)
+		return -1;
+	return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+	struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+	tmp = *l;
+	memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+	memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+					   struct buffer_head *xb_bh,
+					   struct buffer_head *xh_bh,
+					   struct buffer_head *data_bh)
+{
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 offset, size, off_change;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	u16 count = le16_to_cpu(xb_xh->xh_count);
+	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+	mlog(0, "cp xattr from block %llu to bucket %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr,
+	     (unsigned long long)xh_bh->b_blocknr);
+
+	memset(xh_bh->b_data, 0, blocksize);
+	if (data_bh)
+		memset(data_bh->b_data, 0, blocksize);
+	/*
+	 * Since the xe_name_offset is based on ocfs2_xattr_header,
+	 * there is a offset change corresponding to the change of
+	 * ocfs2_xattr_header's position.
+	 */
+	off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	xe = &xb_xh->xh_entries[count - 1];
+	offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+	size = blocksize - offset;
+
+	/* copy all the names and values. */
+	if (data_bh)
+		target = data_bh->b_data;
+	memcpy(target + offset, src + offset, size);
+
+	/* Init new header now. */
+	xh->xh_count = xb_xh->xh_count;
+	xh->xh_num_buckets = cpu_to_le16(1);
+	xh->xh_name_value_len = cpu_to_le16(size);
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+	/* copy all the entries. */
+	target = xh_bh->b_data;
+	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+	size = count * sizeof(struct ocfs2_xattr_entry);
+	memcpy(target + offset, (char *)xb_xh + offset, size);
+
+	/* Change the xe offset for all the xe because of the move. */
+	off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+		 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	for (i = 0; i < count; i++)
+		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+	     offset, size, off_change);
+
+	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+					   struct ocfs2_xattr_search *xs,
+					   struct buffer_head *old_bh,
+					   struct buffer_head *new_bh)
+{
+	int ret = 0;
+	char *buf = old_bh->b_data;
+	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	xs->bucket.bhs[0] = new_bh;
+	get_bh(new_bh);
+	xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
+	xs->header = xs->bucket.xh;
+
+	xs->base = new_bh->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (!xs->not_found) {
+		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+			ret = ocfs2_read_blocks(inode,
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					0);
+			if (ret) {
+				mlog_errno(ret);
+				return ret;
+			}
+
+			i = xs->here - old_xh->xh_entries;
+			xs->here = &xs->header->xh_entries[i];
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs)
+{
+	int ret, credits = OCFS2_SUBALLOC_ALLOC;
+	u32 bit_off, len;
+	u64 blkno;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_alloc_context *data_ac;
+	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+	struct buffer_head *xb_bh = xs->xattr_bh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xr;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog(0, "create xattr index block for %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr);
+
+	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * XXX:
+	 * We can use this lock for now, and maybe move to a dedicated mutex
+	 * if performance becomes a problem later.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * 3 more credits, one for xattr block update, one for the 1st block
+	 * of the new xattr bucket and one for the value/data.
+	 */
+	credits += 3;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_sem;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * The bucket may spread in many blocks, and
+	 * we will only touch the 1st block and the last block
+	 * in the whole bucket(one for entry and one for data).
+	 */
+	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+
+	xh_bh = sb_getblk(inode->i_sb, blkno);
+	if (!xh_bh) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+
+	ret = ocfs2_journal_access(handle, inode, xh_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (bpb > 1) {
+		data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
+		if (!data_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, data_bh);
+
+		ret = ocfs2_journal_access(handle, inode, data_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+	ocfs2_journal_dirty(handle, xh_bh);
+	if (data_bh)
+		ocfs2_journal_dirty(handle, data_bh);
+
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+
+	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xr = &xb->xb_attrs.xb_root;
+	xr->xt_clusters = cpu_to_le32(1);
+	xr->xt_last_eb_blk = 0;
+	xr->xt_list.l_tree_depth = 0;
+	xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+	xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+	xr->xt_list.l_recs[0].e_cpos = 0;
+	xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_journal_dirty(handle, xb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_sem:
+	up_write(&oi->ip_alloc_sem);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	brelse(xh_bh);
+	brelse(data_bh);
+
+	return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+	u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+	if (l_name_offset < r_name_offset)
+		return 1;
+	if (l_name_offset > r_name_offset)
+		return -1;
+	return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     struct ocfs2_xattr_bucket *bucket)
+{
+	int ret, i;
+	size_t end, offset, len, value_len;
+	struct ocfs2_xattr_header *xh;
+	char *entries, *buf, *bucket_buf = NULL;
+	u64 blkno = bucket->bhs[0]->b_blocknr;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u16 xh_free_start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	handle_t *handle;
+	struct buffer_head **bhs;
+	struct ocfs2_xattr_entry *xe;
+
+	bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
+	if (ret)
+		goto out;
+
+	/*
+	 * In order to make the operation more efficient and generic,
+	 * we copy all the blocks into a contiguous memory and do the
+	 * defragment there, so if anything is error, we will not touch
+	 * the real block.
+	 */
+	bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket_buf) {
+		ret = -EIO;
+		goto out;
+	}
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+		memcpy(buf, bhs[i]->b_data, blocksize);
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto commit;
+		}
+	}
+
+	xh = (struct ocfs2_xattr_header *)bucket_buf;
+	entries = (char *)xh->xh_entries;
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+	mlog(0, "adjust xattr bucket in %llu, count = %u, "
+	     "xh_free_start = %u, xh_name_value_len = %u.\n",
+	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	/*
+	 * sort all the entries by their offset.
+	 * the largest will be the first, so that we can
+	 * move them to the end one by one.
+	 */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe_offset, swap_xe);
+
+	/* Move all name/values to the end of the bucket. */
+	xe = xh->xh_entries;
+	end = OCFS2_XATTR_BUCKET_SIZE;
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+		offset = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			value_len = OCFS2_XATTR_SIZE(
+					le64_to_cpu(xe->xe_value_size));
+		else
+			value_len = OCFS2_XATTR_ROOT_SIZE;
+		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+		/*
+		 * We must make sure that the name/value pair
+		 * exist in the same block. So adjust end to
+		 * the previous block end if needed.
+		 */
+		if (((end - len) / blocksize !=
+			(end - 1) / blocksize))
+			end = end - end % blocksize;
+
+		if (end > offset + len) {
+			memmove(bucket_buf + end - len,
+				bucket_buf + offset, len);
+			xe->xe_name_offset = cpu_to_le16(end - len);
+		}
+
+		mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+				"bucket %llu\n", (unsigned long long)blkno);
+
+		end -= len;
+	}
+
+	mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+			"bucket %llu\n", (unsigned long long)blkno);
+
+	if (xh_free_start == end)
+		goto commit;
+
+	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+	xh->xh_free_start = cpu_to_le16(end);
+
+	/* sort the entries by their name_hash. */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+		memcpy(bhs[i]->b_data, buf, blocksize);
+		ocfs2_journal_dirty(handle, bhs[i]);
+	}
+
+commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+
+	if (bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(bhs[i]);
+	}
+	kfree(bhs);
+
+	kfree(bucket_buf);
+	return ret;
+}
+
+/*
+ * Move half nums of the xattr bucket in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extend record.
+ *
+ * first_bh is the first buffer_head of a series of bucket in the same
+ * extent rec and header_bh is the header of one bucket in this cluster.
+ * They will be updated if we move the data header_bh contains to the new
+ * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+					       handle_t *handle,
+					       struct buffer_head **first_bh,
+					       struct buffer_head **header_bh,
+					       u64 new_blkno,
+					       u64 prev_blkno,
+					       u32 num_clusters,
+					       u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+	struct ocfs2_xattr_header *new_xh;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+	prev_bh = *first_bh;
+	get_bh(prev_bh);
+	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+
+	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+	     prev_blkno, new_blkno);
+
+	/*
+	 * We need to update the 1st half of the new cluster and
+	 * 1 more for the update of the 1st bucket of the previous
+	 * extent record.
+	 */
+	credits = bpc / 2 + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, prev_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+		old_bh = new_bh = NULL;
+		new_bh = sb_getblk(inode->i_sb, new_blkno);
+		if (!new_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+		if (i == 0) {
+			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+			new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+
+			if (first_hash)
+				*first_hash = le32_to_cpu(
+					new_xh->xh_entries[0].xe_name_hash);
+			new_first_bh = new_bh;
+			get_bh(new_first_bh);
+		}
+
+		ocfs2_journal_dirty(handle, new_bh);
+
+		if (*header_bh == old_bh) {
+			brelse(*header_bh);
+			*header_bh = new_bh;
+			get_bh(*header_bh);
+
+			brelse(*first_bh);
+			*first_bh = new_first_bh;
+			get_bh(*first_bh);
+		}
+		brelse(new_bh);
+		brelse(old_bh);
+	}
+
+	le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
+
+	ocfs2_journal_dirty(handle, prev_bh);
+out:
+	brelse(prev_bh);
+	brelse(new_first_bh);
+	return ret;
+}
+
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+				   u64 blkno,
+				   struct buffer_head **bhs,
+				   int new)
+{
+	int ret = 0;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	if (!new)
+		return ocfs2_read_blocks(inode, blkno,
+					 blk_per_bucket, bhs, 0);
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
+		if (bhs[i] == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+				   handle_t *handle,
+				   u64 blk,
+				   u64 new_blk,
+				   u32 *first_hash,
+				   int new_bucket_head)
+{
+	int ret, i;
+	u16 count, start, len, name_value_len, xe_len, name_offset;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	     blk, new_blk);
+
+	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* copy the whole bucket to the new first. */
+	for (i = 0; i < blk_per_bucket; i++)
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+
+	/* update the new bucket. */
+	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = count / 2;
+
+	/*
+	 * Calculate the total name/value len and xh_free_start for
+	 * the old bucket first.
+	 */
+	name_offset = OCFS2_XATTR_BUCKET_SIZE;
+	name_value_len = 0;
+	for (i = 0; i < start; i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		name_value_len += xe_len;
+		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+			name_offset = le16_to_cpu(xe->xe_name_offset);
+	}
+
+	/*
+	 * Now begin the modification to the new bucket.
+	 *
+	 * In the new bucket, We just move the xattr entry to the beginning
+	 * and don't touch the name/value. So there will be some holes in the
+	 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+	 * called.
+	 */
+	xe = &xh->xh_entries[start];
+	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+	     (int)((char *)xe - (char *)xh),
+	     (int)((char *)xh->xh_entries - (char *)xh));
+	memmove((char *)xh->xh_entries, (char *)xe, len);
+	xe = &xh->xh_entries[count - start];
+	len = sizeof(struct ocfs2_xattr_entry) * start;
+	memset((char *)xe, 0, len);
+
+	le16_add_cpu(&xh->xh_count, -start);
+	le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+	/* Calculate xh_free_start for the new bucket. */
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		if (le16_to_cpu(xe->xe_name_offset) <
+		    le16_to_cpu(xh->xh_free_start))
+			xh->xh_free_start = xe->xe_name_offset;
+	}
+
+	/* set xh->xh_num_buckets for the new xh. */
+	if (new_bucket_head)
+		xh->xh_num_buckets = cpu_to_le16(1);
+	else
+		xh->xh_num_buckets = 0;
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+	/* store the first_hash of the new bucket. */
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+	/*
+	 * Now only update the 1st block of the old bucket.
+	 * Please note that the entry has been sorted already above.
+	 */
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	memset(&xh->xh_entries[start], 0,
+	       sizeof(struct ocfs2_xattr_entry) * (count - start));
+	xh->xh_count = cpu_to_le16(start);
+	xh->xh_free_start = cpu_to_le16(name_offset);
+	xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+	ocfs2_journal_dirty(handle, s_bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new)
+{
+	int ret, i;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+
+	BUG_ON(s_blkno == t_blkno);
+
+	mlog(0, "cp bucket %llu to %llu, target is %d\n",
+	     s_blkno, t_blkno, t_is_new);
+
+	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	if (ret)
+		goto out;
+
+	t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret)
+			goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+	}
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_num_buckets will be initialized as the bucket num in the cluster.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+				  handle_t *handle,
+				  struct buffer_head *first_bh,
+				  u64 src_blk,
+				  u64 to_blk,
+				  u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_header *xh;
+	u64 to_blk_start = to_blk;
+
+	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+
+	/*
+	 * We need to update the new cluster and 1 more for the update of
+	 * the 1st bucket of the previous extent rec.
+	 */
+	credits = bpc + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < num_buckets; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    src_blk, to_blk, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	}
+
+	/* update the old bucket header. */
+	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+
+	ocfs2_journal_dirty(handle, first_bh);
+
+	/* update the new bucket header. */
+	ret = ocfs2_read_block(inode, to_blk_start, &bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh->xh_num_buckets = cpu_to_le16(num_buckets);
+
+	ocfs2_journal_dirty(handle, bh);
+
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Move half of the xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_half_xattr_cluster(struct inode *inode,
+				    handle_t *handle,
+				    u64 prev_blk,
+				    u64 new_blk,
+				    u32 *first_hash)
+{
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int ret, credits = 2 * blk_per_bucket;
+
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+					new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ *    than 1 bucket, so just move half nums of bucket into the new cluster and
+ *    update the first_bh and header_bh if the insert bucket has been moved
+ *    to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ *    a) If the previous extent rec has more than one cluster and the insert
+ *       place isn't in the last cluster, copy the entire last cluster to the
+ *       new one. This time, we don't need to upate the first_bh and header_bh
+ *       since they will not be moved into the new cluster.
+ *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ *       the new one. And we set the extend flag to zero if the insert place is
+ *       moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+					    handle_t *handle,
+					    struct buffer_head **first_bh,
+					    struct buffer_head **header_bh,
+					    u64 new_blk,
+					    u64 prev_blk,
+					    u32 prev_clusters,
+					    u32 *v_start,
+					    int *extend)
+{
+	int ret = 0;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+	     prev_blk, prev_clusters, new_blk);
+
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+							  handle,
+							  first_bh,
+							  header_bh,
+							  new_blk,
+							  prev_blk,
+							  prev_clusters,
+							  v_start);
+	else {
+		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+			ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+						     last_blk, new_blk,
+						     v_start);
+		else {
+			ret = ocfs2_half_xattr_cluster(inode, handle,
+						       last_blk, new_blk,
+						       v_start);
+
+			if ((*header_bh)->b_blocknr == last_blk && extend)
+				*extend = 0;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+				       struct buffer_head *root_bh,
+				       struct buffer_head **first_bh,
+				       struct buffer_head **header_bh,
+				       u32 *num_clusters,
+				       u32 prev_cpos,
+				       u64 prev_blkno,
+				       int *extend)
+{
+	int ret, credits;
+	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 prev_clusters = *num_clusters;
+	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+	u64 block;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
+
+	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+	     "previous xattr blkno = %llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     prev_cpos, prev_blkno);
+
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				    &data_ac, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				     clusters_to_add, &bit_off, &num_bits);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (prev_blkno + prev_clusters * bpc == block &&
+	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+		/*
+		 * If this cluster is contiguous with the old one and
+		 * adding this new cluster, we don't surpass the limit of
+		 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+		 * initialized and used like other buckets in the previous
+		 * cluster.
+		 * So add it as a contiguous one. The caller will handle
+		 * its init process.
+		 */
+		v_start = prev_cpos + prev_clusters;
+		*num_clusters = prev_clusters + num_bits;
+		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+		     num_bits);
+	} else {
+		ret = ocfs2_adjust_xattr_cross_cluster(inode,
+						       handle,
+						       first_bh,
+						       header_bh,
+						       block,
+						       prev_blkno,
+						       prev_clusters,
+						       &v_start,
+						       extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+
+	if (handle->h_buffer_credits < credits) {
+		/*
+		 * The journal has been restarted before, and don't
+		 * have enough space for the insertion, so extend it
+		 * here.
+		 */
+		ret = ocfs2_extend_trans(handle, credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+	     num_bits, block, v_start);
+	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+				  num_bits, 0, meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+leave:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     struct buffer_head *first_bh,
+				     struct buffer_head *start_bh,
+				     u32 num_clusters)
+{
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 start_blk = start_bh->b_blocknr, end_blk;
+	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+	handle_t *handle;
+	struct ocfs2_xattr_header *first_xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+
+	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+	     "from %llu, len = %u\n", start_blk,
+	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+	BUG_ON(bucket >= num_buckets);
+
+	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+	/*
+	 * We will touch all the buckets after the start_bh(include it).
+	 * Add one more bucket and modify the first_bh.
+	 */
+	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	while (end_blk != start_blk) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+					    end_blk + blk_per_bucket, 0);
+		if (ret)
+			goto commit;
+		end_blk -= blk_per_bucket;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+				      start_blk + blk_per_bucket, NULL, 0);
+
+	le16_add_cpu(&first_xh->xh_num_buckets, 1);
+	ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block.
+ * We will move all the buckets starting from header_bh to the next place. As
+ * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * We will allocate a new cluster if current cluster is full and adjust
+ * header_bh and first_bh if the insert place is moved to the new cluster.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+				      struct buffer_head *xb_bh,
+				      struct buffer_head *header_bh)
+{
+	struct ocfs2_xattr_header *first_xh = NULL;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int ret, num_buckets, extend = 1;
+	u64 p_blkno;
+	u32 e_cpos, num_clusters;
+
+	mlog(0, "Add new xattr bucket starting form %llu\n",
+	     (unsigned long long)header_bh->b_blocknr);
+
+	/*
+	 * Add refrence for header_bh here because it may be
+	 * changed in ocfs2_add_new_xattr_cluster and we need
+	 * to free it in the end.
+	 */
+	get_bh(header_bh);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+	if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+		ret = ocfs2_add_new_xattr_cluster(inode,
+						  xb_bh,
+						  &first_bh,
+						  &header_bh,
+						  &num_clusters,
+						  e_cpos,
+						  p_blkno,
+						  &extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (extend)
+		ret = ocfs2_extend_xattr_bucket(inode,
+						first_bh,
+						header_bh,
+						num_clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(first_bh);
+	brelse(header_bh);
+	return ret;
+}
+
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					int offs)
+{
+	int block_off = offs >> inode->i_sb->s_blocksize_bits;
+
+	offs = offs % inode->i_sb->s_blocksize;
+	return bucket->bhs[block_off]->b_data + offs;
+}
+
+/*
+ * Handle the normal xattr set, including replace, delete and new.
+ *
+ * Note: "local" indicates the real data's locality. So we can't
+ * just its bucket locality by its length.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 u32 name_hash,
+					 int local)
+{
+	struct ocfs2_xattr_entry *last, *xe;
+	int name_len = strlen(xi->name);
+	struct ocfs2_xattr_header *xh = xs->header;
+	u16 count = le16_to_cpu(xh->xh_count), start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	char *val;
+	size_t offs, size, new_size;
+
+	last = &xh->xh_entries[count];
+	if (!xs->not_found) {
+		xe = xs->here;
+		offs = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		/*
+		 * If the new value will be stored outside, xi->value has been
+		 * initalized as an empty ocfs2_xattr_value_root, and the same
+		 * goes with xi->value_len, so we can set new_size safely here.
+		 * See ocfs2_xattr_set_in_bucket.
+		 */
+		new_size = OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len);
+
+		le16_add_cpu(&xh->xh_name_value_len, -size);
+		if (xi->value) {
+			if (new_size > size)
+				goto set_new_name_value;
+
+			/* Now replace the old value with new one. */
+			if (local)
+				xe->xe_value_size = cpu_to_le64(xi->value_len);
+			else
+				xe->xe_value_size = 0;
+
+			val = ocfs2_xattr_bucket_get_val(inode,
+							 &xs->bucket, offs);
+			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+			       size - OCFS2_XATTR_SIZE(name_len));
+			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+				memcpy(val + OCFS2_XATTR_SIZE(name_len),
+				       xi->value, xi->value_len);
+
+			le16_add_cpu(&xh->xh_name_value_len, new_size);
+			ocfs2_xattr_set_local(xe, local);
+			return;
+		} else {
+			/*
+			 * Remove the old entry if there is more than one.
+			 * We don't remove the last entry so that we can
+			 * use it to indicate the hash value of the empty
+			 * bucket.
+			 */
+			last -= 1;
+			le16_add_cpu(&xh->xh_count, -1);
+			if (xh->xh_count) {
+				memmove(xe, xe + 1,
+					(void *)last - (void *)xe);
+				memset(last, 0,
+				       sizeof(struct ocfs2_xattr_entry));
+			} else
+				xh->xh_free_start =
+					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+
+			return;
+		}
+	} else {
+		/* find a new entry for insert. */
+		int low = 0, high = count - 1, tmp;
+		struct ocfs2_xattr_entry *tmp_xe;
+
+		while (low <= high && count) {
+			tmp = (low + high) / 2;
+			tmp_xe = &xh->xh_entries[tmp];
+
+			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+				low = tmp + 1;
+			else if (name_hash <
+				 le32_to_cpu(tmp_xe->xe_name_hash))
+				high = tmp - 1;
+			else {
+				low = tmp;
+				break;
+			}
+		}
+
+		xe = &xh->xh_entries[low];
+		if (low != count)
+			memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+		le16_add_cpu(&xh->xh_count, 1);
+		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+		xe->xe_name_hash = cpu_to_le32(name_hash);
+		xe->xe_name_len = name_len;
+		ocfs2_xattr_set_type(xe, xi->name_index);
+	}
+
+set_new_name_value:
+	/* Insert the new name+value. */
+	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+
+	/*
+	 * We must make sure that the name/value pair
+	 * exists in the same block.
+	 */
+	offs = le16_to_cpu(xh->xh_free_start);
+	start = offs - size;
+
+	if (start >> inode->i_sb->s_blocksize_bits !=
+	    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+		offs = offs - offs % blocksize;
+		xh->xh_free_start = cpu_to_le16(offs);
+	}
+
+	val = ocfs2_xattr_bucket_get_val(inode,
+					 &xs->bucket, offs - size);
+	xe->xe_name_offset = cpu_to_le16(offs - size);
+
+	memset(val, 0, size);
+	memcpy(val, xi->name, name_len);
+	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+
+	xe->xe_value_size = cpu_to_le64(xi->value_len);
+	ocfs2_xattr_set_local(xe, local);
+	xs->here = xe;
+	le16_add_cpu(&xh->xh_free_start, -size);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+	return;
+}
+
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+					     handle_t *handle,
+					     struct ocfs2_xattr_search *xs,
+					     struct buffer_head **bhs,
+					     u16 bh_num)
+{
+	int ret = 0, off, block_off;
+	struct ocfs2_xattr_entry *xe = xs->here;
+
+	/*
+	 * First calculate all the blocks we should journal_access
+	 * and journal_dirty. The first block should always be touched.
+	 */
+	ret = ocfs2_journal_dirty(handle, bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+	/* calc the data. */
+	off = le16_to_cpu(xe->xe_name_offset);
+	block_off = off >> inode->i_sb->s_blocksize_bits;
+	ret = ocfs2_journal_dirty(handle, bhs[block_off]);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * Set the xattr entry in the specified bucket.
+ * The bucket is indicated by xs->bucket and it should have the enough
+ * space for the xattr insertion.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   struct ocfs2_xattr_info *xi,
+					   struct ocfs2_xattr_search *xs,
+					   u32 name_hash,
+					   int local)
+{
+	int i, ret;
+	handle_t *handle = NULL;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
+	     (unsigned long)xi->value_len, xi->name_index,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+
+	if (!xs->bucket.bhs[1]) {
+		ret = ocfs2_read_blocks(inode,
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+
+	/*Only dirty the blocks we have touched in set xattr. */
+	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+						xs->bucket.bhs, blk_per_bucket);
+	if (ret)
+		mlog_errno(ret);
+out:
+	ocfs2_commit_trans(osb, handle);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+					 struct buffer_head *xe_bh,
+					 struct ocfs2_xattr_entry *xe,
+					 u64 new_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, 1);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xe->xe_value_size = cpu_to_le64(new_size);
+
+	ret = ocfs2_journal_dirty(handle, xe_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+					     struct buffer_head *header_bh,
+					     int xe_off,
+					     int len)
+{
+	int ret, offset;
+	u64 value_blk;
+	struct buffer_head *value_bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	xe = &xh->xh_entries[xe_off];
+
+	BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	value_blk = offset / blocksize;
+
+	/* We don't allow ocfs2_xattr_value to be stored in different block. */
+	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+	value_blk += header_bh->b_blocknr;
+
+	ret = ocfs2_read_block(inode, value_blk, &value_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(value_bh->b_data + offset % blocksize);
+
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	brelse(value_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						int len)
+{
+	int ret, offset;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+
+	BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+
+	offset = xe - xh->xh_entries;
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+						offset, len);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						char *val,
+						int value_len)
+{
+	int offset;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe = xs->here;
+
+	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+
+	return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+	     cpos, len, (unsigned long long)blkno);
+
+	ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+					 struct ocfs2_xattr_search *xs)
+{
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_header *xh = xs->bucket.xh;
+	struct ocfs2_xattr_entry *last = &xh->xh_entries[
+						le16_to_cpu(xh->xh_count) - 1];
+	int ret = 0;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Remove the old entry. */
+	memmove(xs->here, xs->here + 1,
+		(void *)last - (void *)xs->here);
+	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+	le16_add_cpu(&xh->xh_count, -1);
+
+	ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+}
+
+/*
+ * Set the xattr name/value in the bucket specified in xs.
+ *
+ * As the new value in xi may be stored in the bucket or in an outside cluster,
+ * we divide the whole process into 3 steps:
+ * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
+ * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
+ * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
+ * 4. If the clusters for the new outside value can't be allocated, we need
+ *    to free the xattr we allocated in set.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret, local = 1;
+	size_t value_len;
+	char *val = (char *)xi->value;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
+					      strlen(xi->name));
+
+	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
+		/*
+		 * We need to truncate the xattr storage first.
+		 *
+		 * If both the old and new value are stored to
+		 * outside block, we only need to truncate
+		 * the storage and then set the value outside.
+		 *
+		 * If the new value should be stored within block,
+		 * we should free all the outside block first and
+		 * the modification to the xattr block will be done
+		 * by following steps.
+		 */
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+			value_len = xi->value_len;
+		else
+			value_len = 0;
+
+		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+							   value_len);
+		if (ret)
+			goto out;
+
+		if (value_len)
+			goto set_value_outside;
+	}
+
+	value_len = xi->value_len;
+	/* So we have to handle the inside block change now. */
+	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * If the new value will be stored outside of block,
+		 * initalize a new empty value root and insert it first.
+		 */
+		local = 0;
+		xi->value = &def_xv;
+		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+	}
+
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+		goto out;
+
+	/* allocate the space now for the outside block storage. */
+	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+						   value_len);
+	if (ret) {
+		mlog_errno(ret);
+
+		if (xs->not_found) {
+			/*
+			 * We can't allocate enough clusters for outside
+			 * storage and we have allocated xattr already,
+			 * so need to remove it.
+			 */
+			ocfs2_xattr_bucket_remove_xs(inode, xs);
+		}
+		goto out;
+	}
+
+set_value_outside:
+	ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+out:
+	return ret;
+}
+
+/* check whether the xattr bucket is filled up with the same hash value. */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+					      struct ocfs2_xattr_bucket *bucket)
+{
+	struct ocfs2_xattr_header *xh = bucket->xh;
+
+	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+	    xh->xh_entries[0].xe_name_hash) {
+		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+		     "hash = %u\n",
+		     (unsigned long long)bucket->bhs[0]->b_blocknr,
+		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	u16 count, header_size, xh_free_start;
+	int i, free, max_free, need, old;
+	size_t value_size = 0, name_len = strlen(xi->name);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	int ret, allocation = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+	xh = xs->header;
+	count = le16_to_cpu(xh->xh_count);
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+	header_size = sizeof(struct ocfs2_xattr_header) +
+			count * sizeof(struct ocfs2_xattr_entry);
+	max_free = OCFS2_XATTR_BUCKET_SIZE -
+		le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
+			"of %u which exceed block size\n",
+			(unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+			header_size);
+
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	else if (xi->value)
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (xs->not_found)
+		need = sizeof(struct ocfs2_xattr_entry) +
+			OCFS2_XATTR_SIZE(name_len) + value_size;
+	else {
+		need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+		/*
+		 * We only replace the old value if the new length is smaller
+		 * than the old one. Otherwise we will allocate new space in the
+		 * bucket to store it.
+		 */
+		xe = xs->here;
+		if (ocfs2_xattr_is_local(xe))
+			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		if (old >= value_size)
+			need = 0;
+	}
+
+	free = xh_free_start - header_size;
+	/*
+	 * We need to make sure the new name/value pair
+	 * can exist in the same block.
+	 */
+	if (xh_free_start % blocksize < need)
+		free -= xh_free_start % blocksize;
+
+	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
+	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
+	     " %u\n", xs->not_found,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+		if (need <= max_free &&
+		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+			/*
+			 * We can create the space by defragment. Since only the
+			 * name/value will be moved, the xe shouldn't be changed
+			 * in xs.
+			 */
+			ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xh_free_start = le16_to_cpu(xh->xh_free_start);
+			free = xh_free_start - header_size;
+			if (xh_free_start % blocksize < need)
+				free -= xh_free_start % blocksize;
+
+			if (free >= need)
+				goto xattr_set;
+
+			mlog(0, "Can't get enough space for xattr insert by "
+			     "defragment. Need %u bytes, but we have %d, so "
+			     "allocate new bucket for it.\n", need, free);
+		}
+
+		/*
+		 * We have to add new buckets or clusters and one
+		 * allocation should leave us enough space for insert.
+		 */
+		BUG_ON(allocation);
+
+		/*
+		 * We do not allow for overlapping ranges between buckets. And
+		 * the maximum number of collisions we will allow for then is
+		 * one bucket's worth, so check it here whether we need to
+		 * add a new bucket for the insert.
+		 */
+		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_add_new_xattr_bucket(inode,
+						 xs->xattr_bh,
+						 xs->bucket.bhs[0]);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(xs->bucket.bhs[i]);
+
+		memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+						   xi->name_index,
+						   xi->name, xs);
+		if (ret && ret != -ENODATA)
+			goto out;
+		xs->not_found = ret;
+		allocation = 1;
+		goto try_again;
+	}
+
+xattr_set:
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para)
+{
+	int ret = 0;
+	struct ocfs2_xattr_header *xh = bucket->xh;
+	u16 i;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_xattr_bucket_value_truncate(inode,
+							bucket->bhs[0],
+							i, 0);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+	u64 p_blkno;
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_delete_xattr_in_bucket,
+						  NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+					     p_blkno, e_cpos, num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * 'trusted' attributes support
+ */
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ocfs2_xattr_trusted_list,
+	.get	= ocfs2_xattr_trusted_get,
+	.set	= ocfs2_xattr_trusted_set,
+};
+
+
+/*
+ * 'user' attributes support
+ */
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ocfs2_xattr_user_list,
+	.get	= ocfs2_xattr_user_get,
+	.set	= ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 000000000000..c25c7c62a059
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+	OCFS2_XATTR_INDEX_USER = 1,
+	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	OCFS2_XATTR_INDEX_TRUSTED,
+	OCFS2_XATTR_INDEX_SECURITY,
+	OCFS2_XATTR_MAX
+};
+
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+			   size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
+#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile
new file mode 100644
index 000000000000..8b82b63f1129
--- /dev/null
+++ b/fs/omfs/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_OMFS_FS) += omfs.o
+
+omfs-y := bitmap.o dir.o file.o inode.o
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
new file mode 100644
index 000000000000..e1c0ec0ae989
--- /dev/null
+++ b/fs/omfs/bitmap.c
@@ -0,0 +1,193 @@
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <asm/div64.h>
+#include "omfs.h"
+
+unsigned long omfs_count_free(struct super_block *sb)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int nbits = sb->s_blocksize * 8;
+
+	for (i = 0; i < sbi->s_imap_size; i++)
+		sum += nbits - bitmap_weight(sbi->s_imap[i], nbits);
+
+	return sum;
+}
+
+/*
+ *  Counts the run of zero bits starting at bit up to max.
+ *  It handles the case where a run might spill over a buffer.
+ *  Called with bitmap lock.
+ */
+static int count_run(unsigned long **addr, int nbits,
+		int addrlen, int bit, int max)
+{
+	int count = 0;
+	int x;
+
+	for (; addrlen > 0; addrlen--, addr++) {
+		x = find_next_bit(*addr, nbits, bit);
+		count += x - bit;
+
+		if (x < nbits || count > max)
+			return min(count, max);
+
+		bit = 0;
+	}
+	return min(count, max);
+}
+
+/*
+ * Sets or clears the run of count bits starting with bit.
+ * Called with bitmap lock.
+ */
+static int set_run(struct super_block *sb, int map,
+		int nbits, int bit, int count, int set)
+{
+	int i;
+	int err;
+	struct buffer_head *bh;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+
+ 	err = -ENOMEM;
+	bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+	if (!bh)
+		goto out;
+
+	for (i = 0; i < count; i++, bit++) {
+		if (bit >= nbits) {
+			bit = 0;
+			map++;
+
+			mark_buffer_dirty(bh);
+			brelse(bh);
+			bh = sb_bread(sb,
+				clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+			if (!bh)
+				goto out;
+		}
+		if (set) {
+			set_bit(bit, sbi->s_imap[map]);
+			set_bit(bit, (unsigned long *)bh->b_data);
+		} else {
+			clear_bit(bit, sbi->s_imap[map]);
+			clear_bit(bit, (unsigned long *)bh->b_data);
+		}
+	}
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	err = 0;
+out:
+	return err;
+}
+
+/*
+ * Tries to allocate exactly one block.  Returns true if sucessful.
+ */
+int omfs_allocate_block(struct super_block *sb, u64 block)
+{
+	struct buffer_head *bh;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	unsigned int map, bit;
+	int ret = 0;
+	u64 tmp;
+
+	tmp = block;
+	bit = do_div(tmp, bits_per_entry);
+	map = tmp;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map]))
+		goto out;
+
+	if (sbi->s_bitmap_ino > 0) {
+		bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+		if (!bh)
+			goto out;
+
+		set_bit(bit, (unsigned long *)bh->b_data);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+	}
+	ret = 1;
+out:
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
+
+
+/*
+ *  Tries to allocate a set of blocks.	The request size depends on the
+ *  type: for inodes, we must allocate sbi->s_mirrors blocks, and for file
+ *  blocks, we try to allocate sbi->s_clustersize, but can always get away
+ *  with just one block.
+ */
+int omfs_allocate_range(struct super_block *sb,
+			int min_request,
+			int max_request,
+			u64 *return_block,
+			int *return_size)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	int ret = 0;
+	int i, run, bit;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	for (i = 0; i < sbi->s_imap_size; i++) {
+		bit = 0;
+		while (bit < bits_per_entry) {
+			bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry,
+				bit);
+
+			if (bit == bits_per_entry)
+				break;
+
+			run = count_run(&sbi->s_imap[i], bits_per_entry,
+				sbi->s_imap_size-i, bit, max_request);
+
+			if (run >= min_request)
+				goto found;
+			bit += run;
+		}
+	}
+	ret = -ENOSPC;
+	goto out;
+
+found:
+	*return_block = i * bits_per_entry + bit;
+	*return_size = run;
+	ret = set_run(sb, i, bits_per_entry, bit, run, 1);
+
+out:
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
+
+/*
+ * Clears count bits starting at a given block.
+ */
+int omfs_clear_range(struct super_block *sb, u64 block, int count)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int bits_per_entry = 8 * sb->s_blocksize;
+	u64 tmp;
+	unsigned int map, bit;
+	int ret;
+
+	tmp = block;
+	bit = do_div(tmp, bits_per_entry);
+	map = tmp;
+
+	if (map >= sbi->s_imap_size)
+		return 0;
+
+	mutex_lock(&sbi->s_bitmap_lock);
+	ret = set_run(sb, map, bits_per_entry, bit, count, 0);
+	mutex_unlock(&sbi->s_bitmap_lock);
+	return ret;
+}
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
new file mode 100644
index 000000000000..c7275cfbdcfb
--- /dev/null
+++ b/fs/omfs/dir.c
@@ -0,0 +1,505 @@
+/*
+ * OMFS (as used by RIO Karma) directory operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/buffer_head.h>
+#include "omfs.h"
+
+static int omfs_hash(const char *name, int namelen, int mod)
+{
+	int i, hash = 0;
+	for (i = 0; i < namelen; i++)
+		hash ^= tolower(name[i]) << (i % 24);
+	return hash % mod;
+}
+
+/*
+ * Finds the bucket for a given name and reads the containing block;
+ * *ofs is set to the offset of the first list entry.
+ */
+static struct buffer_head *omfs_get_bucket(struct inode *dir,
+		const char *name, int namelen, int *ofs)
+{
+	int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
+	int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
+	int bucket = omfs_hash(name, namelen, nbuckets);
+
+	*ofs = OMFS_DIR_START + bucket * 8;
+	return sb_bread(dir->i_sb, block);
+}
+
+static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
+				const char *name, int namelen,
+				u64 *prev_block)
+{
+	struct buffer_head *bh;
+	struct omfs_inode *oi;
+	int err = -ENOENT;
+	*prev_block = ~0;
+
+	while (block != ~0) {
+		bh = sb_bread(dir->i_sb,
+			clus_to_blk(OMFS_SB(dir->i_sb), block));
+		if (!bh) {
+			err = -EIO;
+			goto err;
+		}
+
+		oi = (struct omfs_inode *) bh->b_data;
+		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) {
+			brelse(bh);
+			goto err;
+		}
+
+		if (strncmp(oi->i_name, name, namelen) == 0)
+			return bh;
+
+		*prev_block = block;
+		block = be64_to_cpu(oi->i_sibling);
+		brelse(bh);
+	}
+err:
+	return ERR_PTR(err);
+}
+
+static struct buffer_head *omfs_find_entry(struct inode *dir,
+					   const char *name, int namelen)
+{
+	struct buffer_head *bh;
+	int ofs;
+	u64 block, dummy;
+
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		return ERR_PTR(-EIO);
+
+	block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs]));
+	brelse(bh);
+
+	return omfs_scan_list(dir, block, name, namelen, &dummy);
+}
+
+int omfs_make_empty(struct inode *inode, struct super_block *sb)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	int block = clus_to_blk(sbi, inode->i_ino);
+	struct buffer_head *bh;
+	struct omfs_inode *oi;
+
+	bh = sb_bread(sb, block);
+	if (!bh)
+		return -ENOMEM;
+
+	memset(bh->b_data, 0, sizeof(struct omfs_inode));
+
+	if (inode->i_mode & S_IFDIR) {
+		memset(&bh->b_data[OMFS_DIR_START], 0xff,
+			sbi->s_sys_blocksize - OMFS_DIR_START);
+	} else
+		omfs_make_empty_table(bh, OMFS_EXTENT_START);
+
+	oi = (struct omfs_inode *) bh->b_data;
+	oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+	oi->i_sibling = ~cpu_to_be64(0ULL);
+
+	mark_buffer_dirty(bh);
+	brelse(bh);
+	return 0;
+}
+
+static int omfs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct omfs_inode *oi;
+	struct buffer_head *bh;
+	u64 block;
+	__be64 *entry;
+	int ofs;
+
+	/* just prepend to head of queue in proper bucket */
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		goto out;
+
+	entry = (__be64 *) &bh->b_data[ofs];
+	block = be64_to_cpu(*entry);
+	*entry = cpu_to_be64(inode->i_ino);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	/* now set the sibling and parent pointers on the new inode */
+	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino));
+	if (!bh)
+		goto out;
+
+	oi = (struct omfs_inode *) bh->b_data;
+	memcpy(oi->i_name, name, namelen);
+	memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen);
+	oi->i_sibling = cpu_to_be64(block);
+	oi->i_parent = cpu_to_be64(dir->i_ino);
+	mark_buffer_dirty(bh);
+	brelse(bh);
+
+	dir->i_ctime = CURRENT_TIME_SEC;
+
+	/* mark affected inodes dirty to rebuild checksums */
+	mark_inode_dirty(dir);
+	mark_inode_dirty(inode);
+	return 0;
+out:
+	return -ENOMEM;
+}
+
+static int omfs_delete_entry(struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct inode *dirty;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct omfs_inode *oi;
+	struct buffer_head *bh, *bh2;
+	__be64 *entry, next;
+	u64 block, prev;
+	int ofs;
+	int err = -ENOMEM;
+
+	/* delete the proper node in the bucket's linked list */
+	bh = omfs_get_bucket(dir, name, namelen, &ofs);
+	if (!bh)
+		goto out;
+
+	entry = (__be64 *) &bh->b_data[ofs];
+	block = be64_to_cpu(*entry);
+
+	bh2 = omfs_scan_list(dir, block, name, namelen, &prev);
+	if (IS_ERR(bh2)) {
+		err = PTR_ERR(bh2);
+		goto out_free_bh;
+	}
+
+	oi = (struct omfs_inode *) bh2->b_data;
+	next = oi->i_sibling;
+	brelse(bh2);
+
+	if (prev != ~0) {
+		/* found in middle of list, get list ptr */
+		brelse(bh);
+		bh = sb_bread(dir->i_sb,
+			clus_to_blk(OMFS_SB(dir->i_sb), prev));
+		if (!bh)
+			goto out;
+
+		oi = (struct omfs_inode *) bh->b_data;
+		entry = &oi->i_sibling;
+	}
+
+	*entry = next;
+	mark_buffer_dirty(bh);
+
+	if (prev != ~0) {
+		dirty = omfs_iget(dir->i_sb, prev);
+		if (!IS_ERR(dirty)) {
+			mark_inode_dirty(dirty);
+			iput(dirty);
+		}
+	}
+
+	err = 0;
+out_free_bh:
+	brelse(bh);
+out:
+	return err;
+}
+
+static int omfs_dir_is_empty(struct inode *inode)
+{
+	int nbuckets = (inode->i_size - OMFS_DIR_START) / 8;
+	struct buffer_head *bh;
+	u64 *ptr;
+	int i;
+
+	bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb),
+			inode->i_ino));
+
+	if (!bh)
+		return 0;
+
+	ptr = (u64 *) &bh->b_data[OMFS_DIR_START];
+
+	for (i = 0; i < nbuckets; i++, ptr++)
+		if (*ptr != ~0)
+			break;
+
+	brelse(bh);
+	return *ptr != ~0;
+}
+
+static int omfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int ret;
+	struct inode *inode = dentry->d_inode;
+
+	ret = omfs_delete_entry(dentry);
+	if (ret)
+		goto end_unlink;
+
+	inode_dec_link_count(inode);
+	mark_inode_dirty(dir);
+
+end_unlink:
+	return ret;
+}
+
+static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err = -ENOTEMPTY;
+	struct inode *inode = dentry->d_inode;
+
+	if (omfs_dir_is_empty(inode)) {
+		err = omfs_unlink(dir, dentry);
+		if (!err)
+			inode_dec_link_count(inode);
+	}
+	return err;
+}
+
+static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int err;
+	struct inode *inode = omfs_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	err = omfs_make_empty(inode, dir->i_sb);
+	if (err)
+		goto out_free_inode;
+
+	err = omfs_add_link(dentry, inode);
+	if (err)
+		goto out_free_inode;
+
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_free_inode:
+	iput(inode);
+	return err;
+}
+
+static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	return omfs_add_node(dir, dentry, mode | S_IFDIR);
+}
+
+static int omfs_create(struct inode *dir, struct dentry *dentry, int mode,
+		struct nameidata *nd)
+{
+	return omfs_add_node(dir, dentry, mode | S_IFREG);
+}
+
+static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct buffer_head *bh;
+	struct inode *inode = NULL;
+
+	if (dentry->d_name.len > OMFS_NAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
+	if (!IS_ERR(bh)) {
+		struct omfs_inode *oi = (struct omfs_inode *)bh->b_data;
+		ino_t ino = be64_to_cpu(oi->i_head.h_self);
+		brelse(bh);
+		inode = omfs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	d_add(dentry, inode);
+	return NULL;
+}
+
+/* sanity check block's self pointer */
+int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+	u64 fsblock)
+{
+	int is_bad;
+	u64 ino = be64_to_cpu(header->h_self);
+	is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) ||
+		(ino > sbi->s_num_blocks));
+
+	if (is_bad)
+		printk(KERN_WARNING "omfs: bad hash chain detected\n");
+
+	return is_bad;
+}
+
+static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+		u64 fsblock, int hindex)
+{
+	struct inode *dir = filp->f_dentry->d_inode;
+	struct buffer_head *bh;
+	struct omfs_inode *oi;
+	u64 self;
+	int res = 0;
+	unsigned char d_type;
+
+	/* follow chain in this bucket */
+	while (fsblock != ~0) {
+		bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb),
+				fsblock));
+		if (!bh)
+			goto out;
+
+		oi = (struct omfs_inode *) bh->b_data;
+		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
+			brelse(bh);
+			goto out;
+		}
+
+		self = fsblock;
+		fsblock = be64_to_cpu(oi->i_sibling);
+
+		/* skip visited nodes */
+		if (hindex) {
+			hindex--;
+			brelse(bh);
+			continue;
+		}
+
+		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
+
+		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
+			OMFS_NAMELEN), filp->f_pos, self, d_type);
+		if (res == 0)
+			filp->f_pos++;
+		brelse(bh);
+	}
+out:
+	return res;
+}
+
+static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *new_inode = new_dentry->d_inode;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct buffer_head *bh;
+	int is_dir;
+	int err;
+
+	is_dir = S_ISDIR(old_inode->i_mode);
+
+	if (new_inode) {
+		/* overwriting existing file/dir */
+		err = -ENOTEMPTY;
+		if (is_dir && !omfs_dir_is_empty(new_inode))
+			goto out;
+
+		err = -ENOENT;
+		bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
+			new_dentry->d_name.len);
+		if (IS_ERR(bh))
+			goto out;
+		brelse(bh);
+
+		err = omfs_unlink(new_dir, new_dentry);
+		if (err)
+			goto out;
+	}
+
+	/* since omfs locates files by name, we need to unlink _before_
+	 * adding the new link or we won't find the old one */
+	inode_inc_link_count(old_inode);
+	err = omfs_unlink(old_dir, old_dentry);
+	if (err) {
+		inode_dec_link_count(old_inode);
+		goto out;
+	}
+
+	err = omfs_add_link(new_dentry, old_inode);
+	if (err)
+		goto out;
+
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+out:
+	return err;
+}
+
+static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *dir = filp->f_dentry->d_inode;
+	struct buffer_head *bh;
+	loff_t offset, res;
+	unsigned int hchain, hindex;
+	int nbuckets;
+	u64 fsblock;
+	int ret = -EINVAL;
+
+	if (filp->f_pos >> 32)
+		goto success;
+
+	switch ((unsigned long) filp->f_pos) {
+	case 0:
+		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
+			goto success;
+		filp->f_pos++;
+		/* fall through */
+	case 1:
+		if (filldir(dirent, "..", 2, 1,
+		    parent_ino(filp->f_dentry), DT_DIR) < 0)
+			goto success;
+		filp->f_pos = 1 << 20;
+		/* fall through */
+	}
+
+	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
+
+	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
+	hchain = (filp->f_pos >> 20) - 1;
+	hindex = filp->f_pos & 0xfffff;
+
+	bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino));
+	if (!bh)
+		goto out;
+
+	offset = OMFS_DIR_START + hchain * 8;
+
+	for (; hchain < nbuckets; hchain++, offset += 8) {
+		fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
+
+		res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
+		hindex = 0;
+		if (res < 0)
+			break;
+
+		filp->f_pos = (hchain+2) << 20;
+	}
+	brelse(bh);
+success:
+	ret = 0;
+out:
+	return ret;
+}
+
+struct inode_operations omfs_dir_inops = {
+	.lookup = omfs_lookup,
+	.mkdir = omfs_mkdir,
+	.rename = omfs_rename,
+	.create = omfs_create,
+	.unlink = omfs_unlink,
+	.rmdir = omfs_rmdir,
+};
+
+struct file_operations omfs_dir_operations = {
+	.read = generic_read_dir,
+	.readdir = omfs_readdir,
+	.llseek = generic_file_llseek,
+};
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
new file mode 100644
index 000000000000..834b2331f6b3
--- /dev/null
+++ b/fs/omfs/file.c
@@ -0,0 +1,365 @@
+/*
+ * OMFS (as used by RIO Karma) file operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include "omfs.h"
+
+static int omfs_sync_file(struct file *file, struct dentry *dentry,
+		int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return err;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return err;
+	err |= omfs_sync_inode(inode);
+	return err ? -EIO : 0;
+}
+
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+	return (sbi->s_sys_blocksize - offset -
+		sizeof(struct omfs_extent)) /
+		sizeof(struct omfs_extent_entry) + 1;
+}
+
+void omfs_make_empty_table(struct buffer_head *bh, int offset)
+{
+	struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
+
+	oe->e_next = ~cpu_to_be64(0ULL);
+	oe->e_extent_count = cpu_to_be32(1),
+	oe->e_fill = cpu_to_be32(0x22),
+	oe->e_entry.e_cluster = ~cpu_to_be64(0ULL);
+	oe->e_entry.e_blocks = ~cpu_to_be64(0ULL);
+}
+
+int omfs_shrink_inode(struct inode *inode)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	struct omfs_extent *oe;
+	struct omfs_extent_entry *entry;
+	struct buffer_head *bh;
+	u64 next, last;
+	u32 extent_count;
+	u32 max_extents;
+	int ret;
+
+	/* traverse extent table, freeing each entry that is greater
+	 * than inode->i_size;
+	 */
+	next = inode->i_ino;
+
+	/* only support truncate -> 0 for now */
+	ret = -EIO;
+	if (inode->i_size != 0)
+		goto out;
+
+	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+	if (!bh)
+		goto out;
+
+	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+	for (;;) {
+
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
+
+		extent_count = be32_to_cpu(oe->e_extent_count);
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
+		last = next;
+		next = be64_to_cpu(oe->e_next);
+		entry = &oe->e_entry;
+
+		/* ignore last entry as it is the terminator */
+		for (; extent_count > 1; extent_count--) {
+			u64 start, count;
+			start = be64_to_cpu(entry->e_cluster);
+			count = be64_to_cpu(entry->e_blocks);
+
+			omfs_clear_range(inode->i_sb, start, (int) count);
+			entry++;
+		}
+		omfs_make_empty_table(bh, (char *) oe - bh->b_data);
+		mark_buffer_dirty(bh);
+		brelse(bh);
+
+		if (last != inode->i_ino)
+			omfs_clear_range(inode->i_sb, last, sbi->s_mirrors);
+
+		if (next == ~0)
+			break;
+
+		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		if (!bh)
+			goto out;
+		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+	}
+	ret = 0;
+out:
+	return ret;
+out_brelse:
+	brelse(bh);
+	return ret;
+}
+
+static void omfs_truncate(struct inode *inode)
+{
+	omfs_shrink_inode(inode);
+	mark_inode_dirty(inode);
+}
+
+/*
+ * Add new blocks to the current extent, or create new entries/continuations
+ * as necessary.
+ */
+static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
+			u64 *ret_block)
+{
+	struct omfs_extent_entry *terminator;
+	struct omfs_extent_entry *entry = &oe->e_entry;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	u32 extent_count = be32_to_cpu(oe->e_extent_count);
+	u64 new_block = 0;
+	u32 max_count;
+	int new_count;
+	int ret = 0;
+
+	/* reached the end of the extent table with no blocks mapped.
+	 * there are three possibilities for adding: grow last extent,
+	 * add a new extent to the current extent table, and add a
+	 * continuation inode.  in last two cases need an allocator for
+	 * sbi->s_cluster_size
+	 */
+
+	/* TODO: handle holes */
+
+	/* should always have a terminator */
+	if (extent_count < 1)
+		return -EIO;
+
+	/* trivially grow current extent, if next block is not taken */
+	terminator = entry + extent_count - 1;
+	if (extent_count > 1) {
+		entry = terminator-1;
+		new_block = be64_to_cpu(entry->e_cluster) +
+			be64_to_cpu(entry->e_blocks);
+
+		if (omfs_allocate_block(inode->i_sb, new_block)) {
+			entry->e_blocks =
+				cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1);
+			terminator->e_blocks = ~(cpu_to_be64(
+				be64_to_cpu(~terminator->e_blocks) + 1));
+			goto out;
+		}
+	}
+	max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+	/* TODO: add a continuation block here */
+	if (be32_to_cpu(oe->e_extent_count) > max_count-1)
+		return -EIO;
+
+	/* try to allocate a new cluster */
+	ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize,
+		&new_block, &new_count);
+	if (ret)
+		goto out_fail;
+
+	/* copy terminator down an entry */
+	entry = terminator;
+	terminator++;
+	memcpy(terminator, entry, sizeof(struct omfs_extent_entry));
+
+	entry->e_cluster = cpu_to_be64(new_block);
+	entry->e_blocks = cpu_to_be64((u64) new_count);
+
+	terminator->e_blocks = ~(cpu_to_be64(
+		be64_to_cpu(~terminator->e_blocks) + (u64) new_count));
+
+	/* write in new entry */
+	oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count));
+
+out:
+	*ret_block = new_block;
+out_fail:
+	return ret;
+}
+
+/*
+ * Scans across the directory table for a given file block number.
+ * If block not found, return 0.
+ */
+static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent,
+			sector_t block, int count, int *left)
+{
+	/* count > 1 because of terminator */
+	sector_t searched = 0;
+	for (; count > 1; count--) {
+		int numblocks = clus_to_blk(OMFS_SB(inode->i_sb),
+			be64_to_cpu(ent->e_blocks));
+
+		if (block >= searched  &&
+		    block < searched + numblocks) {
+			/*
+			 * found it at cluster + (block - searched)
+			 * numblocks - (block - searched) is remainder
+			 */
+			*left = numblocks - (block - searched);
+			return clus_to_blk(OMFS_SB(inode->i_sb),
+				be64_to_cpu(ent->e_cluster)) +
+				block - searched;
+		}
+		searched += numblocks;
+		ent++;
+	}
+	return 0;
+}
+
+static int omfs_get_block(struct inode *inode, sector_t block,
+			  struct buffer_head *bh_result, int create)
+{
+	struct buffer_head *bh;
+	sector_t next, offset;
+	int ret;
+	u64 new_block;
+	u32 max_extents;
+	int extent_count;
+	struct omfs_extent *oe;
+	struct omfs_extent_entry *entry;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	int max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int remain;
+
+	ret = -EIO;
+	bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino));
+	if (!bh)
+		goto out;
+
+	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+	next = inode->i_ino;
+
+	for (;;) {
+
+		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+			goto out_brelse;
+
+		extent_count = be32_to_cpu(oe->e_extent_count);
+		next = be64_to_cpu(oe->e_next);
+		entry = &oe->e_entry;
+
+		if (extent_count > max_extents)
+			goto out_brelse;
+
+		offset = find_block(inode, entry, block, extent_count, &remain);
+		if (offset > 0) {
+			ret = 0;
+			map_bh(bh_result, inode->i_sb, offset);
+			if (remain > max_blocks)
+				remain = max_blocks;
+			bh_result->b_size = (remain << inode->i_blkbits);
+			goto out_brelse;
+		}
+		if (next == ~0)
+			break;
+
+		brelse(bh);
+		bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+		if (!bh)
+			goto out;
+		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+	}
+	if (create) {
+		ret = omfs_grow_extent(inode, oe, &new_block);
+		if (ret == 0) {
+			mark_buffer_dirty(bh);
+			mark_inode_dirty(inode);
+			map_bh(bh_result, inode->i_sb,
+					clus_to_blk(sbi, new_block));
+		}
+	}
+out_brelse:
+	brelse(bh);
+out:
+	return ret;
+}
+
+static int omfs_readpage(struct file *file, struct page *page)
+{
+	return block_read_full_page(page, omfs_get_block);
+}
+
+static int omfs_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, omfs_get_block);
+}
+
+static int omfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, omfs_get_block, wbc);
+}
+
+static int
+omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, omfs_get_block);
+}
+
+static int omfs_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+	return block_write_begin(file, mapping, pos, len, flags,
+				pagep, fsdata, omfs_get_block);
+}
+
+static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
+{
+	return generic_block_bmap(mapping, block, omfs_get_block);
+}
+
+struct file_operations omfs_file_operations = {
+	.llseek = generic_file_llseek,
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = generic_file_aio_read,
+	.aio_write = generic_file_aio_write,
+	.mmap = generic_file_mmap,
+	.fsync = omfs_sync_file,
+	.splice_read = generic_file_splice_read,
+};
+
+struct inode_operations omfs_file_inops = {
+	.truncate = omfs_truncate
+};
+
+struct address_space_operations omfs_aops = {
+	.readpage = omfs_readpage,
+	.readpages = omfs_readpages,
+	.writepage = omfs_writepage,
+	.writepages = omfs_writepages,
+	.sync_page = block_sync_page,
+	.write_begin = omfs_write_begin,
+	.write_end = generic_write_end,
+	.bmap = omfs_bmap,
+};
+
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
new file mode 100644
index 000000000000..cbf047a847c5
--- /dev/null
+++ b/fs/omfs/inode.c
@@ -0,0 +1,553 @@
+/*
+ * Optimized MPEG FS - inode and super operations.
+ * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/vmalloc.h>
+#include <linux/crc-itu-t.h>
+#include "omfs.h"
+
+MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
+MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
+MODULE_LICENSE("GPL");
+
+struct inode *omfs_new_inode(struct inode *dir, int mode)
+{
+	struct inode *inode;
+	u64 new_block;
+	int err;
+	int len;
+	struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb);
+
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors,
+			&new_block, &len);
+	if (err)
+		goto fail;
+
+	inode->i_ino = new_block;
+	inode->i_mode = mode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_blocks = 0;
+	inode->i_mapping->a_ops = &omfs_aops;
+
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	switch (mode & S_IFMT) {
+	case S_IFDIR:
+		inode->i_op = &omfs_dir_inops;
+		inode->i_fop = &omfs_dir_operations;
+		inode->i_size = sbi->s_sys_blocksize;
+		inc_nlink(inode);
+		break;
+	case S_IFREG:
+		inode->i_op = &omfs_file_inops;
+		inode->i_fop = &omfs_file_operations;
+		inode->i_size = 0;
+		break;
+	}
+
+	insert_inode_hash(inode);
+	mark_inode_dirty(inode);
+	return inode;
+fail:
+	make_bad_inode(inode);
+	iput(inode);
+	return ERR_PTR(err);
+}
+
+/*
+ * Update the header checksums for a dirty inode based on its contents.
+ * Caller is expected to hold the buffer head underlying oi and mark it
+ * dirty.
+ */
+static void omfs_update_checksums(struct omfs_inode *oi)
+{
+	int xor, i, ofs = 0, count;
+	u16 crc = 0;
+	unsigned char *ptr = (unsigned char *) oi;
+
+	count = be32_to_cpu(oi->i_head.h_body_size);
+	ofs = sizeof(struct omfs_header);
+
+	crc = crc_itu_t(crc, ptr + ofs, count);
+	oi->i_head.h_crc = cpu_to_be16(crc);
+
+	xor = ptr[0];
+	for (i = 1; i < OMFS_XOR_COUNT; i++)
+		xor ^= ptr[i];
+
+	oi->i_head.h_check_xor = xor;
+}
+
+static int omfs_write_inode(struct inode *inode, int wait)
+{
+	struct omfs_inode *oi;
+	struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+	struct buffer_head *bh, *bh2;
+	unsigned int block;
+	u64 ctime;
+	int i;
+	int ret = -EIO;
+	int sync_failed = 0;
+
+	/* get current inode since we may have written sibling ptrs etc. */
+	block = clus_to_blk(sbi, inode->i_ino);
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		goto out;
+
+	oi = (struct omfs_inode *) bh->b_data;
+
+	oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+	if (S_ISDIR(inode->i_mode))
+		oi->i_type = OMFS_DIR;
+	else if (S_ISREG(inode->i_mode))
+		oi->i_type = OMFS_FILE;
+	else {
+		printk(KERN_WARNING "omfs: unknown file type: %d\n",
+			inode->i_mode);
+		goto out_brelse;
+	}
+
+	oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize -
+		sizeof(struct omfs_header));
+	oi->i_head.h_version = 1;
+	oi->i_head.h_type = OMFS_INODE_NORMAL;
+	oi->i_head.h_magic = OMFS_IMAGIC;
+	oi->i_size = cpu_to_be64(inode->i_size);
+
+	ctime = inode->i_ctime.tv_sec * 1000LL +
+		((inode->i_ctime.tv_nsec + 999)/1000);
+	oi->i_ctime = cpu_to_be64(ctime);
+
+	omfs_update_checksums(oi);
+
+	mark_buffer_dirty(bh);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			sync_failed = 1;
+	}
+
+	/* if mirroring writes, copy to next fsblock */
+	for (i = 1; i < sbi->s_mirrors; i++) {
+		bh2 = sb_bread(inode->i_sb, block + i *
+			(sbi->s_blocksize / sbi->s_sys_blocksize));
+		if (!bh2)
+			goto out_brelse;
+
+		memcpy(bh2->b_data, bh->b_data, bh->b_size);
+		mark_buffer_dirty(bh2);
+		if (wait) {
+			sync_dirty_buffer(bh2);
+			if (buffer_req(bh2) && !buffer_uptodate(bh2))
+				sync_failed = 1;
+		}
+		brelse(bh2);
+	}
+	ret = (sync_failed) ? -EIO : 0;
+out_brelse:
+	brelse(bh);
+out:
+	return ret;
+}
+
+int omfs_sync_inode(struct inode *inode)
+{
+	return omfs_write_inode(inode, 1);
+}
+
+/*
+ * called when an entry is deleted, need to clear the bits in the
+ * bitmaps.
+ */
+static void omfs_delete_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_size = 0;
+		omfs_shrink_inode(inode);
+	}
+
+	omfs_clear_range(inode->i_sb, inode->i_ino, 2);
+	clear_inode(inode);
+}
+
+struct inode *omfs_iget(struct super_block *sb, ino_t ino)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	struct omfs_inode *oi;
+	struct buffer_head *bh;
+	unsigned int block;
+	u64 ctime;
+	unsigned long nsecs;
+	struct inode *inode;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	block = clus_to_blk(sbi, ino);
+	bh = sb_bread(inode->i_sb, block);
+	if (!bh)
+		goto iget_failed;
+
+	oi = (struct omfs_inode *)bh->b_data;
+
+	/* check self */
+	if (ino != be64_to_cpu(oi->i_head.h_self))
+		goto fail_bh;
+
+	inode->i_uid = sbi->s_uid;
+	inode->i_gid = sbi->s_gid;
+
+	ctime = be64_to_cpu(oi->i_ctime);
+	nsecs = do_div(ctime, 1000) * 1000L;
+
+	inode->i_atime.tv_sec = ctime;
+	inode->i_mtime.tv_sec = ctime;
+	inode->i_ctime.tv_sec = ctime;
+	inode->i_atime.tv_nsec = nsecs;
+	inode->i_mtime.tv_nsec = nsecs;
+	inode->i_ctime.tv_nsec = nsecs;
+
+	inode->i_mapping->a_ops = &omfs_aops;
+
+	switch (oi->i_type) {
+	case OMFS_DIR:
+		inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
+		inode->i_op = &omfs_dir_inops;
+		inode->i_fop = &omfs_dir_operations;
+		inode->i_size = sbi->s_sys_blocksize;
+		inc_nlink(inode);
+		break;
+	case OMFS_FILE:
+		inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask);
+		inode->i_fop = &omfs_file_operations;
+		inode->i_size = be64_to_cpu(oi->i_size);
+		break;
+	}
+	brelse(bh);
+	unlock_new_inode(inode);
+	return inode;
+fail_bh:
+	brelse(bh);
+iget_failed:
+	iget_failed(inode);
+	return ERR_PTR(-EIO);
+}
+
+static void omfs_put_super(struct super_block *sb)
+{
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	kfree(sbi->s_imap);
+	kfree(sbi);
+	sb->s_fs_info = NULL;
+}
+
+static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *s = dentry->d_sb;
+	struct omfs_sb_info *sbi = OMFS_SB(s);
+	buf->f_type = OMFS_MAGIC;
+	buf->f_bsize = sbi->s_blocksize;
+	buf->f_blocks = sbi->s_num_blocks;
+	buf->f_files = sbi->s_num_blocks;
+	buf->f_namelen = OMFS_NAMELEN;
+
+	buf->f_bfree = buf->f_bavail = buf->f_ffree =
+		omfs_count_free(s);
+	return 0;
+}
+
+static struct super_operations omfs_sops = {
+	.write_inode	= omfs_write_inode,
+	.delete_inode	= omfs_delete_inode,
+	.put_super	= omfs_put_super,
+	.statfs		= omfs_statfs,
+	.show_options	= generic_show_options,
+};
+
+/*
+ * For Rio Karma, there is an on-disk free bitmap whose location is
+ * stored in the root block.  For ReplayTV, there is no such free bitmap
+ * so we have to walk the tree.  Both inodes and file data are allocated
+ * from the same map.  This array can be big (300k) so we allocate
+ * in units of the blocksize.
+ */
+static int omfs_get_imap(struct super_block *sb)
+{
+	int bitmap_size;
+	int array_size;
+	int count;
+	struct omfs_sb_info *sbi = OMFS_SB(sb);
+	struct buffer_head *bh;
+	unsigned long **ptr;
+	sector_t block;
+
+	bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8);
+	array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize);
+
+	if (sbi->s_bitmap_ino == ~0ULL)
+		goto out;
+
+	sbi->s_imap_size = array_size;
+	sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL);
+	if (!sbi->s_imap)
+		goto nomem;
+
+	block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+	ptr = sbi->s_imap;
+	for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
+		bh = sb_bread(sb, block++);
+		if (!bh)
+			goto nomem_free;
+		*ptr = kmalloc(sb->s_blocksize, GFP_KERNEL);
+		if (!*ptr) {
+			brelse(bh);
+			goto nomem_free;
+		}
+		memcpy(*ptr, bh->b_data, sb->s_blocksize);
+		if (count < sb->s_blocksize)
+			memset((void *)*ptr + count, 0xff,
+				sb->s_blocksize - count);
+		brelse(bh);
+		ptr++;
+	}
+out:
+	return 0;
+
+nomem_free:
+	for (count = 0; count < array_size; count++)
+		kfree(sbi->s_imap[count]);
+
+	kfree(sbi->s_imap);
+nomem:
+	sbi->s_imap = NULL;
+	sbi->s_imap_size = 0;
+	return -ENOMEM;
+}
+
+enum {
+	Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
+};
+
+static const match_table_t tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_dmask, "dmask=%o"},
+	{Opt_fmask, "fmask=%o"},
+};
+
+static int parse_options(char *options, struct omfs_sb_info *sbi)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_uid = option;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_gid = option;
+			break;
+		case Opt_umask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_fmask = sbi->s_dmask = option;
+			break;
+		case Opt_dmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_dmask = option;
+			break;
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			sbi->s_fmask = option;
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int omfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head *bh, *bh2;
+	struct omfs_super_block *omfs_sb;
+	struct omfs_root_block *omfs_rb;
+	struct omfs_sb_info *sbi;
+	struct inode *root;
+	sector_t start;
+	int ret = -EINVAL;
+
+	save_mount_options(sb, (char *) data);
+
+	sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+
+	sbi->s_uid = current->uid;
+	sbi->s_gid = current->gid;
+	sbi->s_dmask = sbi->s_fmask = current->fs->umask;
+
+	if (!parse_options((char *) data, sbi))
+		goto end;
+
+	sb->s_maxbytes = 0xffffffff;
+
+	sb_set_blocksize(sb, 0x200);
+
+	bh = sb_bread(sb, 0);
+	if (!bh)
+		goto end;
+
+	omfs_sb = (struct omfs_super_block *)bh->b_data;
+
+	if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) {
+		if (!silent)
+			printk(KERN_ERR "omfs: Invalid superblock (%x)\n",
+				   omfs_sb->s_magic);
+		goto out_brelse_bh;
+	}
+	sb->s_magic = OMFS_MAGIC;
+
+	sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks);
+	sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize);
+	sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors);
+	sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block);
+	sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize);
+	mutex_init(&sbi->s_bitmap_lock);
+
+	if (sbi->s_sys_blocksize > PAGE_SIZE) {
+		printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n",
+			sbi->s_sys_blocksize);
+		goto out_brelse_bh;
+	}
+
+	if (sbi->s_blocksize < sbi->s_sys_blocksize ||
+	    sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR "omfs: block size (%d) is out of range\n",
+			sbi->s_blocksize);
+		goto out_brelse_bh;
+	}
+
+	/*
+	 * Use sys_blocksize as the fs block since it is smaller than a
+	 * page while the fs blocksize can be larger.
+	 */
+	sb_set_blocksize(sb, sbi->s_sys_blocksize);
+
+	/*
+	 * ...and the difference goes into a shift.  sys_blocksize is always
+	 * a power of two factor of blocksize.
+	 */
+	sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
+		get_bitmask_order(sbi->s_sys_blocksize);
+
+	start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block));
+	bh2 = sb_bread(sb, start);
+	if (!bh2)
+		goto out_brelse_bh;
+
+	omfs_rb = (struct omfs_root_block *)bh2->b_data;
+
+	sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap);
+	sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize);
+
+	if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) {
+		printk(KERN_ERR "omfs: block count discrepancy between "
+			"super and root blocks (%llx, %llx)\n",
+			(unsigned long long)sbi->s_num_blocks,
+			(unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks));
+		goto out_brelse_bh2;
+	}
+
+	ret = omfs_get_imap(sb);
+	if (ret)
+		goto out_brelse_bh2;
+
+	sb->s_op = &omfs_sops;
+
+	root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir));
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out_brelse_bh2;
+	}
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		goto out_brelse_bh2;
+	}
+	printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);
+
+	ret = 0;
+out_brelse_bh2:
+	brelse(bh2);
+out_brelse_bh:
+	brelse(bh);
+end:
+	return ret;
+}
+
+static int omfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *m)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+}
+
+static struct file_system_type omfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "omfs",
+	.get_sb = omfs_get_sb,
+	.kill_sb = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV,
+};
+
+static int __init init_omfs_fs(void)
+{
+	return register_filesystem(&omfs_fs_type);
+}
+
+static void __exit exit_omfs_fs(void)
+{
+	unregister_filesystem(&omfs_fs_type);
+}
+
+module_init(init_omfs_fs);
+module_exit(exit_omfs_fs);
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
new file mode 100644
index 000000000000..2bc0f0670406
--- /dev/null
+++ b/fs/omfs/omfs.h
@@ -0,0 +1,67 @@
+#ifndef _OMFS_H
+#define _OMFS_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "omfs_fs.h"
+
+/* In-memory structures */
+struct omfs_sb_info {
+	u64 s_num_blocks;
+	u64 s_bitmap_ino;
+	u64 s_root_ino;
+	u32 s_blocksize;
+	u32 s_mirrors;
+	u32 s_sys_blocksize;
+	u32 s_clustersize;
+	int s_block_shift;
+	unsigned long **s_imap;
+	int s_imap_size;
+	struct mutex s_bitmap_lock;
+	int s_uid;
+	int s_gid;
+	int s_dmask;
+	int s_fmask;
+};
+
+/* convert a cluster number to a scaled block number */
+static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block)
+{
+	return block << sbi->s_block_shift;
+}
+
+static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* bitmap.c */
+extern unsigned long omfs_count_free(struct super_block *sb);
+extern int omfs_allocate_block(struct super_block *sb, u64 block);
+extern int omfs_allocate_range(struct super_block *sb, int min_request,
+			int max_request, u64 *return_block, int *return_size);
+extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
+
+/* dir.c */
+extern struct file_operations omfs_dir_operations;
+extern struct inode_operations omfs_dir_inops;
+extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
+extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+			u64 fsblock);
+
+/* file.c */
+extern struct file_operations omfs_file_operations;
+extern struct inode_operations omfs_file_inops;
+extern struct address_space_operations omfs_aops;
+extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
+extern int omfs_shrink_inode(struct inode *inode);
+
+/* inode.c */
+extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
+extern struct inode *omfs_new_inode(struct inode *dir, int mode);
+extern int omfs_reserve_block(struct super_block *sb, sector_t block);
+extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino);
+extern int omfs_sync_inode(struct inode *inode);
+
+#endif
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
new file mode 100644
index 000000000000..12cca245d6e8
--- /dev/null
+++ b/fs/omfs/omfs_fs.h
@@ -0,0 +1,80 @@
+#ifndef _OMFS_FS_H
+#define _OMFS_FS_H
+
+/* OMFS On-disk structures */
+
+#define OMFS_MAGIC 0xC2993D87
+#define OMFS_IMAGIC 0xD2
+
+#define OMFS_DIR 'D'
+#define OMFS_FILE 'F'
+#define OMFS_INODE_NORMAL 'e'
+#define OMFS_INODE_CONTINUATION 'c'
+#define OMFS_INODE_SYSTEM 's'
+#define OMFS_NAMELEN 256
+#define OMFS_DIR_START 0x1b8
+#define OMFS_EXTENT_START 0x1d0
+#define OMFS_EXTENT_CONT 0x40
+#define OMFS_XOR_COUNT 19
+#define OMFS_MAX_BLOCK_SIZE 8192
+
+struct omfs_super_block {
+	char s_fill1[256];
+	__be64 s_root_block;		/* block number of omfs_root_block */
+	__be64 s_num_blocks;		/* total number of FS blocks */
+	__be32 s_magic;			/* OMFS_MAGIC */
+	__be32 s_blocksize;		/* size of a block */
+	__be32 s_mirrors;		/* # of mirrors of system blocks */
+	__be32 s_sys_blocksize;		/* size of non-data blocks */
+};
+
+struct omfs_header {
+	__be64 h_self;			/* FS block where this is located */
+	__be32 h_body_size;		/* size of useful data after header */
+	__be16 h_crc;			/* crc-ccitt of body_size bytes */
+	char h_fill1[2];
+	u8 h_version;			/* version, always 1 */
+	char h_type;			/* OMFS_INODE_X */
+	u8 h_magic;			/* OMFS_IMAGIC */
+	u8 h_check_xor;			/* XOR of header bytes before this */
+	__be32 h_fill2;
+};
+
+struct omfs_root_block {
+	struct omfs_header r_head;	/* header */
+	__be64 r_fill1;
+	__be64 r_num_blocks;		/* total number of FS blocks */
+	__be64 r_root_dir;		/* block # of root directory */
+	__be64 r_bitmap;		/* block # of free space bitmap */
+	__be32 r_blocksize;		/* size of a block */
+	__be32 r_clustersize;		/* size allocated for data blocks */
+	__be64 r_mirrors;		/* # of mirrors of system blocks */
+	char r_name[OMFS_NAMELEN];	/* partition label */
+};
+
+struct omfs_inode {
+	struct omfs_header i_head;	/* header */
+	__be64 i_parent;		/* parent containing this inode */
+	__be64 i_sibling;		/* next inode in hash bucket */
+	__be64 i_ctime;			/* ctime, in milliseconds */
+	char i_fill1[35];
+	char i_type;			/* OMFS_[DIR,FILE] */
+	__be32 i_fill2;
+	char i_fill3[64];
+	char i_name[OMFS_NAMELEN];	/* filename */
+	__be64 i_size;			/* size of file, in bytes */
+};
+
+struct omfs_extent_entry {
+	__be64 e_cluster;		/* start location of a set of blocks */
+	__be64 e_blocks;		/* number of blocks after e_cluster */
+};
+
+struct omfs_extent {
+	__be64 e_next;			/* next extent table location */
+	__be32 e_extent_count;		/* total # extents in this table */
+	__be32 e_fill;
+	struct omfs_extent_entry e_entry;	/* start of extent entries */
+};
+
+#endif
diff --git a/fs/open.c b/fs/open.c
index a99ad09c3197..83cdb9dee0c1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -64,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 		memcpy(buf, &st, sizeof(st));
 	else {
 		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail) &
+			if ((st.f_blocks | st.f_bfree | st.f_bavail |
+			     st.f_bsize | st.f_frsize) &
 			    0xffffffff00000000ULL)
 				return -EOVERFLOW;
 			/*
@@ -121,37 +122,37 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 	return 0;
 }
 
-asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
+asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(nd.path.dentry, &tmp);
+		error = vfs_statfs_native(path.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
-		path_put(&nd.path);
+		path_put(&path);
 	}
 	return error;
 }
 
 
-asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 __user *buf)
+asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
 {
-	struct nameidata nd;
+	struct path path;
 	long error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(nd.path.dentry, &tmp);
+		error = vfs_statfs64(path.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
-		path_put(&nd.path);
+		path_put(&path);
 	}
 	return error;
 }
@@ -222,20 +223,20 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	return err;
 }
 
-static long do_sys_truncate(const char __user * path, loff_t length)
+static long do_sys_truncate(const char __user *pathname, loff_t length)
 {
-	struct nameidata nd;
-	struct inode * inode;
+	struct path path;
+	struct inode *inode;
 	int error;
 
 	error = -EINVAL;
 	if (length < 0)	/* sorry, but loff_t says... */
 		goto out;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (error)
 		goto out;
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 
 	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
 	error = -EISDIR;
@@ -246,16 +247,16 @@ static long do_sys_truncate(const char __user * path, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		goto dput_and_out;
 
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto dput_and_out;
 
-	error = vfs_permission(&nd, MAY_WRITE);
+	error = inode_permission(inode, MAY_WRITE);
 	if (error)
 		goto mnt_drop_write_and_out;
 
 	error = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+	if (IS_APPEND(inode))
 		goto mnt_drop_write_and_out;
 
 	error = get_write_access(inode);
@@ -273,15 +274,15 @@ static long do_sys_truncate(const char __user * path, loff_t length)
 	error = locks_verify_truncate(inode, NULL, length);
 	if (!error) {
 		DQUOT_INIT(inode);
-		error = do_truncate(nd.path.dentry, length, 0, NULL);
+		error = do_truncate(path.dentry, length, 0, NULL);
 	}
 
 put_write_and_out:
 	put_write_access(inode);
 mnt_drop_write_and_out:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 dput_and_out:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -424,7 +425,8 @@ out:
  */
 asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 {
-	struct nameidata nd;
+	struct path path;
+	struct inode *inode;
 	int old_fsuid, old_fsgid;
 	kernel_cap_t uninitialized_var(old_cap);  /* !SECURE_NO_SETUID_FIXUP */
 	int res;
@@ -447,7 +449,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 		 * FIXME: There is a race here against sys_capset.  The
 		 * capabilities can change yet we will restore the old
 		 * value below.  We should hold task_capabilities_lock,
-		 * but we cannot because user_path_walk can sleep.
+		 * but we cannot because user_path_at can sleep.
 		 */
 #endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
 		if (current->uid)
@@ -456,14 +458,25 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 			old_cap = cap_set_effective(current->cap_permitted);
 	}
 
-	res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+	res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 	if (res)
 		goto out;
 
-	res = vfs_permission(&nd, mode);
+	inode = path.dentry->d_inode;
+
+	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+		/*
+		 * MAY_EXEC on regular files is denied if the fs is mounted
+		 * with the "noexec" flag.
+		 */
+		res = -EACCES;
+		if (path.mnt->mnt_flags & MNT_NOEXEC)
+			goto out_path_release;
+	}
+
+	res = inode_permission(inode, mode | MAY_ACCESS);
 	/* SuS v2 requires we report a read only fs too */
-	if(res || !(mode & S_IWOTH) ||
-	   special_file(nd.path.dentry->d_inode->i_mode))
+	if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
 		goto out_path_release;
 	/*
 	 * This is a rare case where using __mnt_is_readonly()
@@ -475,11 +488,11 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 	 * inherently racy and know that the fs may change
 	 * state before we even see this result.
 	 */
-	if (__mnt_is_readonly(nd.path.mnt))
+	if (__mnt_is_readonly(path.mnt))
 		res = -EROFS;
 
 out_path_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	current->fsuid = old_fsuid;
 	current->fsgid = old_fsgid;
@@ -497,22 +510,21 @@ asmlinkage long sys_access(const char __user *filename, int mode)
 
 asmlinkage long sys_chdir(const char __user * filename)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = __user_walk(filename,
-			    LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd);
+	error = user_path_dir(filename, &path);
 	if (error)
 		goto out;
 
-	error = vfs_permission(&nd, MAY_EXEC);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
 	if (error)
 		goto dput_and_out;
 
-	set_fs_pwd(current->fs, &nd.path);
+	set_fs_pwd(current->fs, &path);
 
 dput_and_out:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -534,7 +546,7 @@ asmlinkage long sys_fchdir(unsigned int fd)
 	if (!S_ISDIR(inode->i_mode))
 		goto out_putf;
 
-	error = file_permission(file, MAY_EXEC);
+	error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
 	if (!error)
 		set_fs_pwd(current->fs, &file->f_path);
 out_putf:
@@ -545,14 +557,14 @@ out:
 
 asmlinkage long sys_chroot(const char __user * filename)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+	error = user_path_dir(filename, &path);
 	if (error)
 		goto out;
 
-	error = vfs_permission(&nd, MAY_EXEC);
+	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
 	if (error)
 		goto dput_and_out;
 
@@ -560,11 +572,10 @@ asmlinkage long sys_chroot(const char __user * filename)
 	if (!capable(CAP_SYS_CHROOT))
 		goto dput_and_out;
 
-	set_fs_root(current->fs, &nd.path);
-	set_fs_altroot();
+	set_fs_root(current->fs, &path);
 	error = 0;
 dput_and_out:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -589,9 +600,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	err = mnt_want_write(file->f_path.mnt);
 	if (err)
 		goto out_putf;
-	err = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out_drop_write;
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
@@ -599,8 +607,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	err = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
-
-out_drop_write:
 	mnt_drop_write(file->f_path.mnt);
 out_putf:
 	fput(file);
@@ -611,36 +617,29 @@ out:
 asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
 			     mode_t mode)
 {
-	struct nameidata nd;
-	struct inode * inode;
+	struct path path;
+	struct inode *inode;
 	int error;
 	struct iattr newattrs;
 
-	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
+	error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
 	if (error)
 		goto out;
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto dput_and_out;
-
-	error = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out_drop_write;
-
 	mutex_lock(&inode->i_mutex);
 	if (mode == (mode_t) -1)
 		mode = inode->i_mode;
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	error = notify_change(nd.path.dentry, &newattrs);
+	error = notify_change(path.dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
-
-out_drop_write:
-	mnt_drop_write(nd.path.mnt);
+	mnt_drop_write(path.mnt);
 dput_and_out:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -652,18 +651,10 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
 
 static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 {
-	struct inode * inode;
+	struct inode *inode = dentry->d_inode;
 	int error;
 	struct iattr newattrs;
 
-	error = -ENOENT;
-	if (!(inode = dentry->d_inode)) {
-		printk(KERN_ERR "chown_common: NULL inode\n");
-		goto out;
-	}
-	error = -EPERM;
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		goto out;
 	newattrs.ia_valid =  ATTR_CTIME;
 	if (user != (uid_t) -1) {
 		newattrs.ia_valid |= ATTR_UID;
@@ -679,25 +670,25 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 	mutex_lock(&inode->i_mutex);
 	error = notify_change(dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
-out:
+
 	return error;
 }
 
 asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk(filename, &nd);
+	error = user_path(filename, &path);
 	if (error)
 		goto out;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(nd.path.dentry, user, group);
-	mnt_drop_write(nd.path.mnt);
+	error = chown_common(path.dentry, user, group);
+	mnt_drop_write(path.mnt);
 out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -705,7 +696,7 @@ out:
 asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 			     gid_t group, int flag)
 {
-	struct nameidata nd;
+	struct path path;
 	int error = -EINVAL;
 	int follow;
 
@@ -713,35 +704,35 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 		goto out;
 
 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
-	error = __user_walk_fd(dfd, filename, follow, &nd);
+	error = user_path_at(dfd, filename, follow, &path);
 	if (error)
 		goto out;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(nd.path.dentry, user, group);
-	mnt_drop_write(nd.path.mnt);
+	error = chown_common(path.dentry, user, group);
+	mnt_drop_write(path.mnt);
 out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
 
 asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk_link(filename, &nd);
+	error = user_lpath(filename, &path);
 	if (error)
 		goto out;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto out_release;
-	error = chown_common(nd.path.dentry, user, group);
-	mnt_drop_write(nd.path.mnt);
+	error = chown_common(path.dentry, user, group);
+	mnt_drop_write(path.mnt);
 out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return error;
 }
@@ -807,7 +798,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	int error;
 
 	f->f_flags = flags;
-	f->f_mode = ((flags+1) & O_ACCMODE) | FMODE_LSEEK |
+	f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
@@ -972,71 +963,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 }
 EXPORT_SYMBOL(dentry_open);
 
-/*
- * Find an empty file descriptor entry, and mark it busy.
- */
-int get_unused_fd_flags(int flags)
-{
-	struct files_struct * files = current->files;
-	int fd, error;
-	struct fdtable *fdt;
-
-  	error = -EMFILE;
-	spin_lock(&files->file_lock);
-
-repeat:
-	fdt = files_fdtable(files);
-	fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
-				files->next_fd);
-
-	/*
-	 * N.B. For clone tasks sharing a files structure, this test
-	 * will limit the total number of files that can be opened.
-	 */
-	if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
-		goto out;
-
-	/* Do we need to expand the fd array or fd set?  */
-	error = expand_files(files, fd);
-	if (error < 0)
-		goto out;
-
-	if (error) {
-		/*
-	 	 * If we needed to expand the fs array we
-		 * might have blocked - try again.
-		 */
-		error = -EMFILE;
-		goto repeat;
-	}
-
-	FD_SET(fd, fdt->open_fds);
-	if (flags & O_CLOEXEC)
-		FD_SET(fd, fdt->close_on_exec);
-	else
-		FD_CLR(fd, fdt->close_on_exec);
-	files->next_fd = fd + 1;
-#if 1
-	/* Sanity check */
-	if (fdt->fd[fd] != NULL) {
-		printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
-		fdt->fd[fd] = NULL;
-	}
-#endif
-	error = fd;
-
-out:
-	spin_unlock(&files->file_lock);
-	return error;
-}
-
-int get_unused_fd(void)
-{
-	return get_unused_fd_flags(0);
-}
-
-EXPORT_SYMBOL(get_unused_fd);
-
 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
 {
 	struct fdtable *fdt = files_fdtable(files);
@@ -1215,8 +1141,7 @@ EXPORT_SYMBOL(sys_close);
 asmlinkage long sys_vhangup(void)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
-		/* XXX: this needs locking */
-		tty_vhangup(current->signal->tty);
+		tty_vhangup_self();
 		return 0;
 	}
 	return -EPERM;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d17b4fd204e1..d41bdc784de4 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -167,6 +167,7 @@ static int openpromfs_readdir(struct file *, void *, filldir_t);
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
 	.readdir	= openpromfs_readdir,
+	.llseek		= generic_file_llseek,
 };
 
 static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
@@ -430,7 +431,7 @@ static struct file_system_type openprom_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static void op_inode_init_once(struct kmem_cache * cachep, void *data)
+static void op_inode_init_once(void *data)
 {
 	struct op_inode_info *oi = (struct op_inode_info *) data;
 
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 3d3e16631472..a97b477ac0fc 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
 	id = data[0x1fc] & 15;
 	put_dev_sector(sect);
 
-#ifdef CONFIG_BLK_DEV_MFM
-	if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) {
-		extern void xd_set_geometry(struct block_device *,
-			unsigned char, unsigned char, unsigned int);
-		xd_set_geometry(bdev, dr->secspertrack, heads, 1);
-		invalidate_bh_lrus();
-		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
-	}
-#endif
-
 	/*
 	 * Work out start of non-adfs partition.
 	 */
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6149e4b58c88..633f7a0ebb2c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
  * a pointer to that same buffer (for convenience).
  */
 
-char *disk_name(struct gendisk *hd, int part, char *buf)
+char *disk_name(struct gendisk *hd, int partno, char *buf)
 {
-	if (!part)
+	if (!partno)
 		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
 	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
-		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
 	else
-		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
 
 	return buf;
 }
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
-	return disk_name(bdev->bd_disk, part, buf);
+	return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
 }
 
 EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
-	state->limit = hd->minors;
+	state->limit = disk_max_parts(hd);
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
@@ -196,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	return ERR_PTR(res);
 }
 
+static ssize_t part_partition_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%d\n", p->partno);
+}
+
 static ssize_t part_start_show(struct device *dev,
 			       struct device_attribute *attr, char *buf)
 {
@@ -204,21 +211,22 @@ static ssize_t part_start_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
 
-static ssize_t part_size_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_size_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
-static ssize_t part_stat_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_stat_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
+	int cpu;
 
-	preempt_disable();
-	part_round_stats(p);
-	preempt_enable();
+	cpu = part_stat_lock();
+	part_round_stats(cpu, p);
+	part_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -238,17 +246,17 @@ static ssize_t part_stat_show(struct device *dev,
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t part_fail_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_fail_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 
 	return sprintf(buf, "%d\n", p->make_it_fail);
 }
 
-static ssize_t part_fail_store(struct device *dev,
-			       struct device_attribute *attr,
-			       const char *buf, size_t count)
+ssize_t part_fail_store(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t count)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	int i;
@@ -260,6 +268,7 @@ static ssize_t part_fail_store(struct device *dev,
 }
 #endif
 
+static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
@@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail =
 #endif
 
 static struct attribute *part_attrs[] = {
+	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_stat.attr,
@@ -300,40 +310,34 @@ struct device_type part_type = {
 	.release	= part_release,
 };
 
-static inline void partition_sysfs_add_subdir(struct hd_struct *p)
+static void delete_partition_rcu_cb(struct rcu_head *head)
 {
-	struct kobject *k;
+	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
 
-	k = kobject_get(&p->dev.kobj);
-	p->holder_dir = kobject_create_and_add("holders", k);
-	kobject_put(k);
+	part->start_sect = 0;
+	part->nr_sects = 0;
+	part_stat_set_all(part, 0);
+	put_device(part_to_dev(part));
 }
 
-static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
+void delete_partition(struct gendisk *disk, int partno)
 {
-	struct kobject *k;
+	struct disk_part_tbl *ptbl = disk->part_tbl;
+	struct hd_struct *part;
 
-	k = kobject_get(&disk->dev.kobj);
-	disk->holder_dir = kobject_create_and_add("holders", k);
-	disk->slave_dir = kobject_create_and_add("slaves", k);
-	kobject_put(k);
-}
-
-void delete_partition(struct gendisk *disk, int part)
-{
-	struct hd_struct *p = disk->part[part-1];
-
-	if (!p)
+	if (partno >= ptbl->len)
 		return;
-	if (!p->nr_sects)
+
+	part = ptbl->part[partno];
+	if (!part)
 		return;
-	disk->part[part-1] = NULL;
-	p->start_sect = 0;
-	p->nr_sects = 0;
-	part_stat_set_all(p, 0);
-	kobject_put(p->holder_dir);
-	device_del(&p->dev);
-	put_device(&p->dev);
+
+	blk_free_devt(part_devt(part));
+	rcu_assign_pointer(ptbl->part[partno], NULL);
+	kobject_put(part->holder_dir);
+	device_del(part_to_dev(part));
+
+	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -344,86 +348,132 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
+int add_partition(struct gendisk *disk, int partno,
+		  sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
+	dev_t devt = MKDEV(0, 0);
+	struct device *ddev = disk_to_dev(disk);
+	struct device *pdev;
+	struct disk_part_tbl *ptbl;
+	const char *dname;
 	int err;
 
+	err = disk_expand_part_tbl(disk, partno);
+	if (err)
+		return err;
+	ptbl = disk->part_tbl;
+
+	if (ptbl->part[partno])
+		return -EBUSY;
+
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return;
+		return -ENOMEM;
 
 	if (!init_part_stats(p)) {
-		kfree(p);
-		return;
+		err = -ENOMEM;
+		goto out_free;
 	}
+	pdev = part_to_dev(p);
+
 	p->start_sect = start;
 	p->nr_sects = len;
-	p->partno = part;
-	p->policy = disk->policy;
+	p->partno = partno;
+	p->policy = get_disk_ro(disk);
 
-	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-		"%sp%d", disk->dev.bus_id, part);
+	dname = dev_name(ddev);
+	if (isdigit(dname[strlen(dname) - 1]))
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
 	else
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-			 "%s%d", disk->dev.bus_id, part);
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
+
+	device_initialize(pdev);
+	pdev->class = &block_class;
+	pdev->type = &part_type;
+	pdev->parent = ddev;
 
-	device_initialize(&p->dev);
-	p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
-	p->dev.class = &block_class;
-	p->dev.type = &part_type;
-	p->dev.parent = &disk->dev;
-	disk->part[part-1] = p;
+	err = blk_alloc_devt(p, &devt);
+	if (err)
+		goto out_free;
+	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
-	p->dev.uevent_suppress = 1;
-	device_add(&p->dev);
-	partition_sysfs_add_subdir(p);
-	p->dev.uevent_suppress = 0;
-	if (flags & ADDPART_FLAG_WHOLEDISK)
-		err = device_create_file(&p->dev, &dev_attr_whole_disk);
+	pdev->uevent_suppress = 1;
+	err = device_add(pdev);
+	if (err)
+		goto out_put;
+
+	err = -ENOMEM;
+	p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
+	if (!p->holder_dir)
+		goto out_del;
+
+	pdev->uevent_suppress = 0;
+	if (flags & ADDPART_FLAG_WHOLEDISK) {
+		err = device_create_file(pdev, &dev_attr_whole_disk);
+		if (err)
+			goto out_del;
+	}
+
+	/* everything is up and running, commence */
+	INIT_RCU_HEAD(&p->rcu_head);
+	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
-	if (!disk->dev.uevent_suppress)
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
+	if (!ddev->uevent_suppress)
+		kobject_uevent(&pdev->kobj, KOBJ_ADD);
+
+	return 0;
+
+out_free:
+	kfree(p);
+	return err;
+out_del:
+	kobject_put(p->holder_dir);
+	device_del(pdev);
+out_put:
+	put_device(pdev);
+	blk_free_devt(devt);
+	return err;
 }
 
 /* Not exported, helper to add_disk(). */
 void register_disk(struct gendisk *disk)
 {
+	struct device *ddev = disk_to_dev(disk);
 	struct block_device *bdev;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	char *s;
-	int i;
-	struct hd_struct *p;
 	int err;
 
-	disk->dev.parent = disk->driverfs_dev;
-	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+	ddev->parent = disk->driverfs_dev;
 
-	strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+	strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
 	/* ewww... some of these buggers have / in the name... */
-	s = strchr(disk->dev.bus_id, '/');
+	s = strchr(ddev->bus_id, '/');
 	if (s)
 		*s = '!';
 
 	/* delay uevents, until we scanned partition table */
-	disk->dev.uevent_suppress = 1;
+	ddev->uevent_suppress = 1;
 
-	if (device_add(&disk->dev))
+	if (device_add(ddev))
 		return;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &disk->dev.kobj,
-				kobject_name(&disk->dev.kobj));
+	err = sysfs_create_link(block_depr, &ddev->kobj,
+				kobject_name(&ddev->kobj));
 	if (err) {
-		device_del(&disk->dev);
+		device_del(ddev);
 		return;
 	}
 #endif
-	disk_sysfs_add_subdirs(disk);
+	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
 	/* No minors to use for partitions */
-	if (disk->minors == 1)
+	if (!disk_partitionable(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
@@ -435,58 +485,93 @@ void register_disk(struct gendisk *disk)
 		goto exit;
 
 	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ, 0);
+	err = blkdev_get(bdev, FMODE_READ);
 	if (err < 0)
 		goto exit;
-	blkdev_put(bdev);
+	blkdev_put(bdev, FMODE_READ);
 
 exit:
 	/* announce disk after possible partitions are created */
-	disk->dev.uevent_suppress = 0;
-	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
+	ddev->uevent_suppress = 0;
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
-	for (i = 1; i < disk->minors; i++) {
-		p = disk->part[i-1];
-		if (!p || !p->nr_sects)
-			continue;
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
-	}
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
+	disk_part_iter_exit(&piter);
 }
 
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	struct parsed_partitions *state;
-	int p, res;
+	int p, highest, res;
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
 	res = invalidate_partition(disk, 0);
 	if (res)
 		return res;
-	bdev->bd_invalidated = 0;
-	for (p = 1; p < disk->minors; p++)
-		delete_partition(disk, p);
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	while ((part = disk_part_iter_next(&piter)))
+		delete_partition(disk, part->partno);
+	disk_part_iter_exit(&piter);
+
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
+	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */
 		return -EIO;
 
 	/* tell userspace that the media / partition table may have changed */
-	kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+
+	/* Detect the highest partition number and preallocate
+	 * disk->part_tbl.  This is an optimization and not strictly
+	 * necessary.
+	 */
+	for (p = 1, highest = 0; p < state->limit; p++)
+		if (state->parts[p].size)
+			highest = p;
 
+	disk_expand_part_tbl(disk, highest);
+
+	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
 		sector_t from = state->parts[p].from;
 		if (!size)
 			continue;
+		if (from >= get_capacity(disk)) {
+			printk(KERN_WARNING
+			       "%s: p%d ignored, start %llu is behind the end of the disk\n",
+			       disk->disk_name, p, (unsigned long long) from);
+			continue;
+		}
 		if (from + size > get_capacity(disk)) {
-			printk(" %s: p%d exceeds device capacity\n",
-				disk->disk_name, p);
+			/*
+			 * we can not ignore partitions of broken tables
+			 * created by for example camera firmware, but we
+			 * limit them to the end of the disk to avoid
+			 * creating invalid block devices
+			 */
+			printk(KERN_WARNING
+			       "%s: p%d size %llu limited to end of disk\n",
+			       disk->disk_name, p, (unsigned long long) size);
+			size = get_capacity(disk) - from;
+		}
+		res = add_partition(disk, p, from, size, state->parts[p].flags);
+		if (res) {
+			printk(KERN_ERR " %s: p%d could not be added: %d\n",
+				disk->disk_name, p, -res);
+			continue;
 		}
-		add_partition(disk, p, from, size, state->parts[p].flags);
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags & ADDPART_FLAG_RAID)
 			md_autodetect_dev(bdev->bd_dev+p);
@@ -519,25 +604,31 @@ EXPORT_SYMBOL(read_dev_sector);
 
 void del_gendisk(struct gendisk *disk)
 {
-	int p;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 
 	/* invalidate stuff */
-	for (p = disk->minors - 1; p > 0; p--) {
-		invalidate_partition(disk, p);
-		delete_partition(disk, p);
+	disk_part_iter_init(&piter, disk,
+			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+	while ((part = disk_part_iter_next(&piter))) {
+		invalidate_partition(disk, part->partno);
+		delete_partition(disk, part->partno);
 	}
+	disk_part_iter_exit(&piter);
+
 	invalidate_partition(disk, 0);
-	disk->capacity = 0;
+	blk_free_devt(disk_to_dev(disk)->devt);
+	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
-	disk_stat_set_all(disk, 0);
-	disk->stamp = 0;
+	part_stat_set_all(&disk->part0, 0);
+	disk->part0.stamp = 0;
 
-	kobject_put(disk->holder_dir);
+	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, disk->dev.bus_id);
+	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 #endif
-	device_del(&disk->dev);
+	device_del(disk_to_dev(disk));
 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
  * add_gd_partition adds a partitions details to the devices partition
  * description.
  */
-enum { MAX_PART = 256 };
-
 struct parsed_partitions {
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
 		sector_t size;
 		int flags;
-	} parts[MAX_PART];
+	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
 };
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index e7b07006bc41..038a6022152f 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,13 +95,6 @@
 #include "check.h"
 #include "efi.h"
 
-#undef EFI_DEBUG
-#ifdef EFI_DEBUG
-#define Dprintk(x...) printk(KERN_DEBUG x)
-#else
-#define Dprintk(x...)
-#endif
-
 /* This allows a kernel command line option 'gpt' to override
  * the test for invalid PMBR.  Not __initdata because reloading
  * the partition tables happens after init too.
@@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 
 	/* Check the GUID Partition Table signature */
 	if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
-		Dprintk("GUID Partition Table Header signature is wrong:"
-			"%lld != %lld\n",
-			(unsigned long long)le64_to_cpu((*gpt)->signature),
-			(unsigned long long)GPT_HEADER_SIGNATURE);
+		pr_debug("GUID Partition Table Header signature is wrong:"
+			 "%lld != %lld\n",
+			 (unsigned long long)le64_to_cpu((*gpt)->signature),
+			 (unsigned long long)GPT_HEADER_SIGNATURE);
 		goto fail;
 	}
 
@@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
 
 	if (crc != origcrc) {
-		Dprintk
-		    ("GUID Partition Table Header CRC is wrong: %x != %x\n",
-		     crc, origcrc);
+		pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
+			 crc, origcrc);
 		goto fail;
 	}
 	(*gpt)->header_crc32 = cpu_to_le32(origcrc);
@@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	/* Check that the my_lba entry points to the LBA that contains
 	 * the GUID Partition Table */
 	if (le64_to_cpu((*gpt)->my_lba) != lba) {
-		Dprintk("GPT my_lba incorrect: %lld != %lld\n",
-			(unsigned long long)le64_to_cpu((*gpt)->my_lba),
-			(unsigned long long)lba);
+		pr_debug("GPT my_lba incorrect: %lld != %lld\n",
+			 (unsigned long long)le64_to_cpu((*gpt)->my_lba),
+			 (unsigned long long)lba);
 		goto fail;
 	}
 
@@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 	 */
 	lastlba = last_lba(bdev);
 	if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
-		Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n",
-			(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
-			(unsigned long long)lastlba);
+		pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
+			 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
+			 (unsigned long long)lastlba);
 		goto fail;
 	}
 	if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
-		Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n",
-			(unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
-			(unsigned long long)lastlba);
+		pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
+			 (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
+			 (unsigned long long)lastlba);
 		goto fail;
 	}
 
@@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
 			le32_to_cpu((*gpt)->sizeof_partition_entry));
 
 	if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
-		Dprintk("GUID Partitition Entry Array CRC check failed.\n");
+		pr_debug("GUID Partitition Entry Array CRC check failed.\n");
 		goto fail_ptes;
 	}
 
@@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 	}
 
-	Dprintk("GUID Partition Table is valid!  Yea!\n");
+	pr_debug("GUID Partition Table is valid!  Yea!\n");
 
 	for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
 		if (!is_pte_valid(&ptes[i], last_lba(bdev)))
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 0fdda2e8a4cc..8652fb99e962 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
 	bool is_vista = false;
 
 	BUG_ON(!data || !ph);
-	if (MAGIC_PRIVHEAD != BE64(data)) {
+	if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
 		ldm_error("Cannot find PRIVHEAD structure. LDM database is"
 			" corrupt. Aborting.");
 		return false;
 	}
-	ph->ver_major = BE16(data + 0x000C);
-	ph->ver_minor = BE16(data + 0x000E);
-	ph->logical_disk_start = BE64(data + 0x011B);
-	ph->logical_disk_size = BE64(data + 0x0123);
-	ph->config_start = BE64(data + 0x012B);
-	ph->config_size = BE64(data + 0x0133);
+	ph->ver_major = get_unaligned_be16(data + 0x000C);
+	ph->ver_minor = get_unaligned_be16(data + 0x000E);
+	ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
+	ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
+	ph->config_start = get_unaligned_be64(data + 0x012B);
+	ph->config_size = get_unaligned_be64(data + 0x0133);
 	/* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
 	if (ph->ver_major == 2 && ph->ver_minor == 12)
 		is_vista = true;
@@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
 {
 	BUG_ON (!data || !toc);
 
-	if (MAGIC_TOCBLOCK != BE64 (data)) {
+	if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
 		ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
 		return false;
 	}
 	strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
 	toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
-	toc->bitmap1_start = BE64 (data + 0x2E);
-	toc->bitmap1_size  = BE64 (data + 0x36);
+	toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
+	toc->bitmap1_size  = get_unaligned_be64(data + 0x36);
 
 	if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
 			sizeof (toc->bitmap1_name)) != 0) {
@@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
 	}
 	strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
 	toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
-	toc->bitmap2_start = BE64 (data + 0x50);
-	toc->bitmap2_size  = BE64 (data + 0x58);
+	toc->bitmap2_start = get_unaligned_be64(data + 0x50);
+	toc->bitmap2_size  = get_unaligned_be64(data + 0x58);
 	if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
 			sizeof (toc->bitmap2_name)) != 0) {
 		ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
@@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 {
 	BUG_ON (!data || !vm);
 
-	if (MAGIC_VMDB != BE32 (data)) {
+	if (MAGIC_VMDB != get_unaligned_be32(data)) {
 		ldm_crit ("Cannot find the VMDB, database may be corrupt.");
 		return false;
 	}
 
-	vm->ver_major = BE16 (data + 0x12);
-	vm->ver_minor = BE16 (data + 0x14);
+	vm->ver_major = get_unaligned_be16(data + 0x12);
+	vm->ver_minor = get_unaligned_be16(data + 0x14);
 	if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
 		ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
 			"Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
 		return false;
 	}
 
-	vm->vblk_size     = BE32 (data + 0x08);
-	vm->vblk_offset   = BE32 (data + 0x0C);
-	vm->last_vblk_seq = BE32 (data + 0x04);
+	vm->vblk_size     = get_unaligned_be32(data + 0x08);
+	vm->vblk_offset   = get_unaligned_be32(data + 0x0C);
+	vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
 
 	ldm_debug ("Parsed VMDB successfully.");
 	return true;
@@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 		goto out;				/* Already logged */
 
 	/* Are there uncommitted transactions? */
-	if (BE16(data + 0x10) != 0x01) {
+	if (get_unaligned_be16(data + 0x10) != 0x01) {
 		ldm_crit ("Database is not in a consistent state.  Aborting.");
 		goto out;
 	}
@@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	len += VBLK_SIZE_CMP3;
-	if (len != BE32 (buffer + 0x14))
+	if (len != get_unaligned_be32(buffer + 0x14))
 		return false;
 
 	comp = &vb->vblk.comp;
@@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	len += VBLK_SIZE_DGR3;
-	if (len != BE32 (buffer + 0x14))
+	if (len != get_unaligned_be32(buffer + 0x14))
 		return false;
 
 	dgrp = &vb->vblk.dgrp;
@@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	len += VBLK_SIZE_DGR4;
-	if (len != BE32 (buffer + 0x14))
+	if (len != get_unaligned_be32(buffer + 0x14))
 		return false;
 
 	dgrp = &vb->vblk.dgrp;
@@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	len += VBLK_SIZE_DSK3;
-	if (len != BE32 (buffer + 0x14))
+	if (len != get_unaligned_be32(buffer + 0x14))
 		return false;
 
 	disk = &vb->vblk.disk;
@@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	len += VBLK_SIZE_DSK4;
-	if (len != BE32 (buffer + 0x14))
+	if (len != get_unaligned_be32(buffer + 0x14))
 		return false;
 
 	disk = &vb->vblk.disk;
@@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 	}
 	len += VBLK_SIZE_PRT3;
-	if (len > BE32(buffer + 0x14)) {
+	if (len > get_unaligned_be32(buffer + 0x14)) {
 		ldm_error("len %d > BE32(buffer + 0x14) %d", len,
-				BE32(buffer + 0x14));
+				get_unaligned_be32(buffer + 0x14));
 		return false;
 	}
 	part = &vb->vblk.part;
-	part->start = BE64(buffer + 0x24 + r_name);
-	part->volume_offset = BE64(buffer + 0x2C + r_name);
+	part->start = get_unaligned_be64(buffer + 0x24 + r_name);
+	part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
 	part->size = ldm_get_vnum(buffer + 0x34 + r_name);
 	part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
 	part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
@@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 	}
 	len += VBLK_SIZE_VOL5;
-	if (len > BE32(buffer + 0x14)) {
+	if (len > get_unaligned_be32(buffer + 0x14)) {
 		ldm_error("len %d > BE32(buffer + 0x14) %d", len,
-				BE32(buffer + 0x14));
+				get_unaligned_be32(buffer + 0x14));
 		return false;
 	}
 	volu = &vb->vblk.volu;
@@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 
 	BUG_ON (!data || !frags);
 
-	group = BE32 (data + 0x08);
-	rec   = BE16 (data + 0x0C);
-	num   = BE16 (data + 0x0E);
+	group = get_unaligned_be32(data + 0x08);
+	rec   = get_unaligned_be16(data + 0x0C);
+	num   = get_unaligned_be16(data + 0x0E);
 	if ((num < 1) || (num > 4)) {
 		ldm_error ("A VBLK claims to have %d parts.", num);
 		return false;
@@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 		}
 
 		for (v = 0; v < perbuf; v++, data+=size) {  /* For each vblk */
-			if (MAGIC_VBLK != BE32 (data)) {
+			if (MAGIC_VBLK != get_unaligned_be32(data)) {
 				ldm_error ("Expected to find a VBLK.");
 				goto out;
 			}
 
-			recs = BE16 (data + 0x0E);	/* Number of records */
+			recs = get_unaligned_be16(data + 0x0E);	/* Number of records */
 			if (recs == 1) {
 				if (!ldm_ldmdb_add (data, size, ldb))
 					goto out;	/* Already logged */
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 80f63b5fdd9f..30e08e809c1d 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -98,11 +98,6 @@ struct parsed_partitions;
 #define TOC_BITMAP1		"config"	/* Names of the two defined */
 #define TOC_BITMAP2		"log"		/* bitmaps in the TOCBLOCK. */
 
-/* Most numbers we deal with are big-endian and won't be aligned. */
-#define BE16(x)			((u16)be16_to_cpu(get_unaligned((__be16*)(x))))
-#define BE32(x)			((u32)be32_to_cpu(get_unaligned((__be32*)(x))))
-#define BE64(x)			((u64)be64_to_cpu(get_unaligned((__be64*)(x))))
-
 /* Borrowed from msdos.c */
 #define SYS_IND(p)		(get_unaligned(&(p)->sys_ind))
 
diff --git a/fs/pipe.c b/fs/pipe.c
index 700f4e0d9572..fcba6542b8d0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,45 +777,10 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
 /*
  * The file_operations structs are not static because they
  * are also used in linux/fs/fifo.c to do operations on FIFOs.
+ *
+ * Pipes reuse fifos' file_operations structs.
  */
-const struct file_operations read_fifo_fops = {
-	.llseek		= no_llseek,
-	.read		= do_sync_read,
-	.aio_read	= pipe_read,
-	.write		= bad_pipe_w,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_read_open,
-	.release	= pipe_read_release,
-	.fasync		= pipe_read_fasync,
-};
-
-const struct file_operations write_fifo_fops = {
-	.llseek		= no_llseek,
-	.read		= bad_pipe_r,
-	.write		= do_sync_write,
-	.aio_write	= pipe_write,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_write_open,
-	.release	= pipe_write_release,
-	.fasync		= pipe_write_fasync,
-};
-
-const struct file_operations rdwr_fifo_fops = {
-	.llseek		= no_llseek,
-	.read		= do_sync_read,
-	.aio_read	= pipe_read,
-	.write		= do_sync_write,
-	.aio_write	= pipe_write,
-	.poll		= pipe_poll,
-	.unlocked_ioctl	= pipe_ioctl,
-	.open		= pipe_rdwr_open,
-	.release	= pipe_rdwr_release,
-	.fasync		= pipe_rdwr_fasync,
-};
-
-static const struct file_operations read_pipe_fops = {
+const struct file_operations read_pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read		= do_sync_read,
 	.aio_read	= pipe_read,
@@ -827,7 +792,7 @@ static const struct file_operations read_pipe_fops = {
 	.fasync		= pipe_read_fasync,
 };
 
-static const struct file_operations write_pipe_fops = {
+const struct file_operations write_pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
 	.write		= do_sync_write,
@@ -839,7 +804,7 @@ static const struct file_operations write_pipe_fops = {
 	.fasync		= pipe_write_fasync,
 };
 
-static const struct file_operations rdwr_pipe_fops = {
+const struct file_operations rdwr_pipefifo_fops = {
 	.llseek		= no_llseek,
 	.read		= do_sync_read,
 	.aio_read	= pipe_read,
@@ -927,7 +892,7 @@ static struct inode * get_pipe_inode(void)
 	inode->i_pipe = pipe;
 
 	pipe->readers = pipe->writers = 1;
-	inode->i_fop = &rdwr_pipe_fops;
+	inode->i_fop = &rdwr_pipefifo_fops;
 
 	/*
 	 * Mark the inode dirty from the very beginning,
@@ -950,7 +915,7 @@ fail_inode:
 	return NULL;
 }
 
-struct file *create_write_pipe(void)
+struct file *create_write_pipe(int flags)
 {
 	int err;
 	struct inode *inode;
@@ -978,12 +943,12 @@ struct file *create_write_pipe(void)
 	d_instantiate(dentry, inode);
 
 	err = -ENFILE;
-	f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops);
+	f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops);
 	if (!f)
 		goto err_dentry;
 	f->f_mapping = inode->i_mapping;
 
-	f->f_flags = O_WRONLY;
+	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
 	f->f_version = 0;
 
 	return f;
@@ -1007,7 +972,7 @@ void free_write_pipe(struct file *f)
 	put_filp(f);
 }
 
-struct file *create_read_pipe(struct file *wrf)
+struct file *create_read_pipe(struct file *wrf, int flags)
 {
 	struct file *f = get_empty_filp();
 	if (!f)
@@ -1019,34 +984,37 @@ struct file *create_read_pipe(struct file *wrf)
 	f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
 
 	f->f_pos = 0;
-	f->f_flags = O_RDONLY;
-	f->f_op = &read_pipe_fops;
+	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+	f->f_op = &read_pipefifo_fops;
 	f->f_mode = FMODE_READ;
 	f->f_version = 0;
 
 	return f;
 }
 
-int do_pipe(int *fd)
+int do_pipe_flags(int *fd, int flags)
 {
 	struct file *fw, *fr;
 	int error;
 	int fdw, fdr;
 
-	fw = create_write_pipe();
+	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
+		return -EINVAL;
+
+	fw = create_write_pipe(flags);
 	if (IS_ERR(fw))
 		return PTR_ERR(fw);
-	fr = create_read_pipe(fw);
+	fr = create_read_pipe(fw, flags);
 	error = PTR_ERR(fr);
 	if (IS_ERR(fr))
 		goto err_write_pipe;
 
-	error = get_unused_fd();
+	error = get_unused_fd_flags(flags);
 	if (error < 0)
 		goto err_read_pipe;
 	fdr = error;
 
-	error = get_unused_fd();
+	error = get_unused_fd_flags(flags);
 	if (error < 0)
 		goto err_fdr;
 	fdw = error;
@@ -1074,16 +1042,21 @@ int do_pipe(int *fd)
 	return error;
 }
 
+int do_pipe(int *fd)
+{
+	return do_pipe_flags(fd, 0);
+}
+
 /*
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
-asmlinkage long __weak sys_pipe(int __user *fildes)
+asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
 {
 	int fd[2];
 	int error;
 
-	error = do_pipe(fd);
+	error = do_pipe_flags(fd, flags);
 	if (!error) {
 		if (copy_to_user(fildes, fd, sizeof(fd))) {
 			sys_close(fd[0]);
@@ -1094,6 +1067,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes)
 	return error;
 }
 
+asmlinkage long __weak sys_pipe(int __user *fildes)
+{
+	return sys_pipe2(fildes, 0);
+}
+
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
  * no real gain from having the whole whorehouse mounted. So we don't need
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
new file mode 100644
index 000000000000..50f8f0600f06
--- /dev/null
+++ b/fs/proc/Kconfig
@@ -0,0 +1,69 @@
+config PROC_FS
+	bool "/proc file system support" if EMBEDDED
+	default y
+	help
+	  This is a virtual file system providing information about the status
+	  of the system. "Virtual" means that it doesn't take up any space on
+	  your hard disk: the files are created on the fly by the kernel when
+	  you try to access them. Also, you cannot read the files with older
+	  version of the program less: you need to use more or cat.
+
+	  It's totally cool; for example, "cat /proc/interrupts" gives
+	  information about what the different IRQs are used for at the moment
+	  (there is a small number of Interrupt ReQuest lines in your computer
+	  that are used by the attached devices to gain the CPU's attention --
+	  often a source of trouble if two devices are mistakenly configured
+	  to use the same IRQ). The program procinfo to display some
+	  information about your system gathered from the /proc file system.
+
+	  Before you can use the /proc file system, it has to be mounted,
+	  meaning it has to be given a location in the directory hierarchy.
+	  That location should be /proc. A command such as "mount -t proc proc
+	  /proc" or the equivalent line in /etc/fstab does the job.
+
+	  The /proc file system is explained in the file
+	  <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+	  ("man 5 proc").
+
+	  This option will enlarge your kernel by about 67 KB. Several
+	  programs depend on this, so everyone should say Y here.
+
+config PROC_KCORE
+	bool "/proc/kcore support" if !ARM
+	depends on PROC_FS && MMU
+
+config PROC_VMCORE
+        bool "/proc/vmcore support (EXPERIMENTAL)"
+        depends on PROC_FS && CRASH_DUMP
+	default y
+        help
+        Exports the dump image of crashed kernel in ELF format.
+
+config PROC_SYSCTL
+	bool "Sysctl support (/proc/sys)" if EMBEDDED
+	depends on PROC_FS
+	select SYSCTL
+	default y
+	---help---
+	  The sysctl interface provides a means of dynamically changing
+	  certain kernel parameters and variables on the fly without requiring
+	  a recompile of the kernel or reboot of the system.  The primary
+	  interface is through /proc/sys.  If you say Y here a tree of
+	  modifiable sysctl entries will be generated beneath the
+          /proc/sys directory. They are explained in the files
+	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  option will enlarge the kernel by at least 8 KB.
+
+	  As it is generally a good thing, you should say Y here unless
+	  building a kernel for install/rescue disks or your system is very
+	  limited in memory.
+
+config PROC_PAGE_MONITOR
+ 	default y
+	depends on PROC_FS && MMU
+	bool "Enable /proc page monitoring" if EMBEDDED
+ 	help
+	  Various /proc files exist to monitor process memory utilization:
+	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+	  /proc/kpagecount, and /proc/kpageflags. Disabling these
+          interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ebaba0213546..63d965193b22 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,11 +8,20 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o proc_misc.o
-
+		proc_tty.o
+proc-y	+= cmdline.o
+proc-y	+= cpuinfo.o
+proc-y	+= devices.o
+proc-y	+= interrupts.o
+proc-y	+= loadavg.o
+proc-y	+= meminfo.o
+proc-y	+= stat.o
+proc-y	+= uptime.o
+proc-y	+= version.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
+proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 797d775e0354..bb9f4b05703d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,16 +80,12 @@
 #include <linux/delayacct.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
+#include <linux/tracehook.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include "internal.h"
 
-/* Gcc optimizes away "strlen(x)" for constant x */
-#define ADDBUF(buffer, string) \
-do { memcpy(buffer, string, strlen(string)); \
-     buffer += strlen(string); } while (0)
-
 static inline void task_name(struct seq_file *m, struct task_struct *p)
 {
 	int i;
@@ -168,8 +164,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	rcu_read_lock();
 	ppid = pid_alive(p) ?
 		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
-	tpid = pid_alive(p) && p->ptrace ?
-		task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+	tpid = 0;
+	if (pid_alive(p)) {
+		struct task_struct *tracer = tracehook_tracer_task(p);
+		if (tracer)
+			tpid = task_pid_nr_ns(tracer, ns);
+	}
 	seq_printf(m,
 		"State:\t%s\n"
 		"Tgid:\t%d\n"
@@ -256,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	sigemptyset(&ignored);
 	sigemptyset(&caught);
 
-	rcu_read_lock();
 	if (lock_task_sighand(p, &flags)) {
 		pending = p->pending.signal;
 		shpending = p->signal->shared_pending.signal;
@@ -267,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
 		unlock_task_sighand(p, &flags);
 	}
-	rcu_read_unlock();
 
 	seq_printf(m, "Threads:\t%d\n", num_threads);
 	seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -332,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
-	return p->utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
-	clock_t utime = cputime_to_clock_t(p->utime),
-		total = utime + cputime_to_clock_t(p->stime);
-	u64 temp;
-
-	/*
-	 * Use CFS's precise accounting:
-	 */
-	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-
-	if (total) {
-		temp *= utime;
-		do_div(temp, total);
-	}
-	utime = (clock_t)temp;
-
-	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
-	return p->prev_utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
-	clock_t stime;
-
-	/*
-	 * Use CFS's precise accounting. (we subtract utime from
-	 * the total, to make sure the total observed by userspace
-	 * grows monotonically - apps rely on that):
-	 */
-	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
-			cputime_to_clock_t(task_utime(p));
-
-	if (stime >= 0)
-		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-
-	return p->prev_stime;
-}
-#endif
-
-static cputime_t task_gtime(struct task_struct *p)
-{
-	return p->gtime;
-}
-
 static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task, int whole)
 {
@@ -449,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 		/* add up live thread stats at the group level */
 		if (whole) {
+			struct task_cputime cputime;
 			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
-				utime = cputime_add(utime, task_utime(t));
-				stime = cputime_add(stime, task_stime(t));
 				gtime = cputime_add(gtime, task_gtime(t));
 				t = next_thread(t);
 			} while (t != task);
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			utime = cputime_add(utime, sig->utime);
-			stime = cputime_add(stime, sig->stime);
+			thread_group_cputime(task, &cputime);
+			utime = cputime.utime;
+			stime = cputime.stime;
 			gtime = cputime_add(gtime, sig->gtime);
 		}
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 58c3e6a8e15e..486cf3fe7139 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -53,6 +53,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/init.h>
 #include <linux/capability.h>
 #include <linux/file.h>
@@ -69,6 +70,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -146,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-int maps_protect;
-EXPORT_SYMBOL(maps_protect);
-
 static struct fs_struct *get_fs_struct(struct task_struct *task)
 {
 	struct fs_struct *fs;
@@ -162,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
 
 static int get_nr_threads(struct task_struct *tsk)
 {
-	/* Must be called with the rcu_read_lock held */
 	unsigned long flags;
 	int count = 0;
 
@@ -231,10 +229,14 @@ static int check_mem_permission(struct task_struct *task)
 	 * If current is actively ptrace'ing, and would also be
 	 * permitted to freshly attach with ptrace now, permit it.
 	 */
-	if (task->parent == current && (task->ptrace & PT_PTRACED) &&
-	    task_is_stopped_or_traced(task) &&
-	    ptrace_may_access(task, PTRACE_MODE_ATTACH))
-		return 0;
+	if (task_is_stopped_or_traced(task)) {
+		int match;
+		rcu_read_lock();
+		match = (tracehook_tracer_task(task) == current);
+		rcu_read_unlock();
+		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
+			return 0;
+	}
 
 	/*
 	 * Noone else is allowed.
@@ -465,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
 
 	struct rlimit rlim[RLIM_NLIMITS];
 
-	rcu_read_lock();
-	if (!lock_task_sighand(task,&flags)) {
-		rcu_read_unlock();
+	if (!lock_task_sighand(task, &flags))
 		return 0;
-	}
 	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
 	unlock_task_sighand(task, &flags);
-	rcu_read_unlock();
 
 	/*
 	 * print the file header
@@ -504,6 +502,26 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
 	return count;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+static int proc_pid_syscall(struct task_struct *task, char *buffer)
+{
+	long nr;
+	unsigned long args[6], sp, pc;
+
+	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+		return sprintf(buffer, "running\n");
+
+	if (nr < 0)
+		return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+
+	return sprintf(buffer,
+		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
+		       nr,
+		       args[0], args[1], args[2], args[3], args[4], args[5],
+		       sp, pc);
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -1694,9 +1712,9 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	file = fcheck_files(files, fd);
 	if (!file)
 		goto out_unlock;
-	if (file->f_mode & 1)
+	if (file->f_mode & FMODE_READ)
 		inode->i_mode |= S_IRUSR | S_IXUSR;
-	if (file->f_mode & 2)
+	if (file->f_mode & FMODE_WRITE)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
@@ -1834,8 +1852,7 @@ static const struct file_operations proc_fd_operations = {
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
-static int proc_fd_permission(struct inode *inode, int mask,
-				struct nameidata *nd)
+static int proc_fd_permission(struct inode *inode, int mask)
 {
 	int rv;
 
@@ -2376,29 +2393,54 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
 }
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
 {
+	struct task_io_accounting acct = task->ioac;
+	unsigned long flags;
+
+	if (whole && lock_task_sighand(task, &flags)) {
+		struct task_struct *t = task;
+
+		task_io_accounting_add(&acct, &task->signal->ioac);
+		while_each_thread(task, t)
+			task_io_accounting_add(&acct, &t->ioac);
+
+		unlock_task_sighand(task, &flags);
+	}
 	return sprintf(buffer,
-#ifdef CONFIG_TASK_XACCT
 			"rchar: %llu\n"
 			"wchar: %llu\n"
 			"syscr: %llu\n"
 			"syscw: %llu\n"
-#endif
 			"read_bytes: %llu\n"
 			"write_bytes: %llu\n"
 			"cancelled_write_bytes: %llu\n",
-#ifdef CONFIG_TASK_XACCT
-			(unsigned long long)task->rchar,
-			(unsigned long long)task->wchar,
-			(unsigned long long)task->syscr,
-			(unsigned long long)task->syscw,
-#endif
-			(unsigned long long)task->ioac.read_bytes,
-			(unsigned long long)task->ioac.write_bytes,
-			(unsigned long long)task->ioac.cancelled_write_bytes);
+			(unsigned long long)acct.rchar,
+			(unsigned long long)acct.wchar,
+			(unsigned long long)acct.syscr,
+			(unsigned long long)acct.syscw,
+			(unsigned long long)acct.read_bytes,
+			(unsigned long long)acct.write_bytes,
+			(unsigned long long)acct.cancelled_write_bytes);
+}
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 0);
+}
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+	return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
+				struct pid *pid, struct task_struct *task)
+{
+	seq_printf(m, "%08x\n", task->personality);
+	return 0;
 }
-#endif
 
 /*
  * Thread groups
@@ -2416,10 +2458,14 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("environ",    S_IRUSR, environ),
 	INF("auxv",       S_IRUSR, pid_auxv),
 	ONE("status",     S_IRUGO, pid_status),
+	ONE("personality", S_IRUSR, pid_personality),
 	INF("limits",	  S_IRUSR, pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
 #endif
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+	INF("syscall",    S_IRUSR, pid_syscall),
+#endif
 	INF("cmdline",    S_IRUGO, pid_cmdline),
 	ONE("stat",       S_IRUGO, tgid_stat),
 	ONE("statm",      S_IRUGO, pid_statm),
@@ -2470,7 +2516,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, pid_io_accounting),
+	INF("io",	S_IRUGO, tgid_io_accounting),
 #endif
 };
 
@@ -2748,10 +2794,14 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("environ",   S_IRUSR, environ),
 	INF("auxv",      S_IRUSR, pid_auxv),
 	ONE("status",    S_IRUGO, pid_status),
+	ONE("personality", S_IRUSR, pid_personality),
 	INF("limits",	 S_IRUSR, pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
 #endif
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+	INF("syscall",   S_IRUSR, pid_syscall),
+#endif
 	INF("cmdline",   S_IRUGO, pid_cmdline),
 	ONE("stat",      S_IRUGO, tid_stat),
 	ONE("statm",     S_IRUGO, pid_statm),
@@ -2797,6 +2847,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
 #endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	INF("io",	S_IRUGO, tid_io_accounting),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file * filp,
@@ -3036,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 	generic_fillattr(inode, stat);
 
 	if (p) {
-		rcu_read_lock();
 		stat->nlink += get_nr_threads(p);
-		rcu_read_unlock();
 		put_task_struct(p);
 	}
 
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
new file mode 100644
index 000000000000..82676e3fcd1d
--- /dev/null
+++ b/fs/proc/cmdline.c
@@ -0,0 +1,29 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int cmdline_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%s\n", saved_command_line);
+	return 0;
+}
+
+static int cmdline_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cmdline_proc_show, NULL);
+}
+
+static const struct file_operations cmdline_proc_fops = {
+	.open		= cmdline_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_cmdline_init(void)
+{
+	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	return 0;
+}
+module_init(proc_cmdline_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
new file mode 100644
index 000000000000..5a1e539a234b
--- /dev/null
+++ b/fs/proc/cpuinfo.c
@@ -0,0 +1,24 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern const struct seq_operations cpuinfo_op;
+static int cpuinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuinfo_op);
+}
+
+static const struct file_operations proc_cpuinfo_operations = {
+	.open		= cpuinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_cpuinfo_init(void)
+{
+	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	return 0;
+}
+module_init(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
new file mode 100644
index 000000000000..59ee7da959c9
--- /dev/null
+++ b/fs/proc/devices.c
@@ -0,0 +1,70 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+static int devinfo_show(struct seq_file *f, void *v)
+{
+	int i = *(loff_t *) v;
+
+	if (i < CHRDEV_MAJOR_HASH_SIZE) {
+		if (i == 0)
+			seq_printf(f, "Character devices:\n");
+		chrdev_show(f, i);
+	}
+#ifdef CONFIG_BLOCK
+	else {
+		i -= CHRDEV_MAJOR_HASH_SIZE;
+		if (i == 0)
+			seq_printf(f, "\nBlock devices:\n");
+		blkdev_show(f, i);
+	}
+#endif
+	return 0;
+}
+
+static void *devinfo_start(struct seq_file *f, loff_t *pos)
+{
+	if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return pos;
+	return NULL;
+}
+
+static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
+		return NULL;
+	return pos;
+}
+
+static void devinfo_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations devinfo_ops = {
+	.start = devinfo_start,
+	.next  = devinfo_next,
+	.stop  = devinfo_stop,
+	.show  = devinfo_show
+};
+
+static int devinfo_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &devinfo_ops);
+}
+
+static const struct file_operations proc_devinfo_operations = {
+	.open		= devinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_devices_init(void)
+{
+	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	return 0;
+}
+module_init(proc_devices_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43e54e86cefd..60a359b35582 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -300,10 +300,10 @@ out:
 	return rtn;
 }
 
-static DEFINE_IDR(proc_inum_idr);
+static DEFINE_IDA(proc_inum_ida);
 static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 
-#define PROC_DYNAMIC_FIRST 0xF0000000UL
+#define PROC_DYNAMIC_FIRST 0xF0000000U
 
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
@@ -311,36 +311,34 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
  */
 static unsigned int get_inode_number(void)
 {
-	int i, inum = 0;
+	unsigned int i;
 	int error;
 
 retry:
-	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
+	if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
 		return 0;
 
 	spin_lock(&proc_inum_lock);
-	error = idr_get_new(&proc_inum_idr, NULL, &i);
+	error = ida_get_new(&proc_inum_ida, &i);
 	spin_unlock(&proc_inum_lock);
 	if (error == -EAGAIN)
 		goto retry;
 	else if (error)
 		return 0;
 
-	inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;
-
-	/* inum will never be more than 0xf0ffffff, so no check
-	 * for overflow.
-	 */
-
-	return inum;
+	if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
+		spin_lock(&proc_inum_lock);
+		ida_remove(&proc_inum_ida, i);
+		spin_unlock(&proc_inum_lock);
+		return 0;
+	}
+	return PROC_DYNAMIC_FIRST + i;
 }
 
 static void release_inode_number(unsigned int inum)
 {
-	int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK;
-
 	spin_lock(&proc_inum_lock);
-	idr_remove(&proc_inum_idr, id);
+	ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
 	spin_unlock(&proc_inum_lock);
 }
 
@@ -549,9 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 
 	for (tmp = dir->subdir; tmp; tmp = tmp->next)
 		if (strcmp(tmp->name, dp->name) == 0) {
-			printk(KERN_WARNING "proc_dir_entry '%s' already "
-					"registered\n", dp->name);
-			dump_stack();
+			WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+				dir->name, dp->name);
 			break;
 		}
 
@@ -597,6 +594,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent->pde_users = 0;
 	spin_lock_init(&ent->pde_unload_lock);
 	ent->pde_unload_completion = NULL;
+	INIT_LIST_HEAD(&ent->pde_openers);
  out:
 	return ent;
 }
@@ -789,15 +787,25 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	spin_unlock(&de->pde_unload_lock);
 
 continue_removing:
+	spin_lock(&de->pde_unload_lock);
+	while (!list_empty(&de->pde_openers)) {
+		struct pde_opener *pdeo;
+
+		pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+		list_del(&pdeo->lh);
+		spin_unlock(&de->pde_unload_lock);
+		pdeo->release(pdeo->inode, pdeo->file);
+		kfree(pdeo);
+		spin_lock(&de->pde_unload_lock);
+	}
+	spin_unlock(&de->pde_unload_lock);
+
 	if (S_ISDIR(de->mode))
 		parent->nlink--;
 	de->nlink = 0;
-	if (de->subdir) {
-		printk(KERN_WARNING "%s: removing non-empty directory "
+	WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
 			"'%s/%s', leaking at least '%s'\n", __func__,
 			de->parent->name, de->name, de->subdir->name);
-		WARN_ON(1);
-	}
 	if (atomic_dec_and_test(&de->count))
 		free_proc_entry(de);
 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b08d10017911..2543fd00c658 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/sysctl.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -65,6 +66,8 @@ static void proc_delete_inode(struct inode *inode)
 			module_put(de->owner);
 		de_put(de);
 	}
+	if (PROC_I(inode)->sysctl)
+		sysctl_head_put(PROC_I(inode)->sysctl);
 	clear_inode(inode);
 }
 
@@ -84,6 +87,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
+	ei->sysctl = NULL;
+	ei->sysctl_entry = NULL;
 	inode = &ei->vfs_inode;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	return inode;
@@ -94,27 +99,20 @@ static void proc_destroy_inode(struct inode *inode)
 	kmem_cache_free(proc_inode_cachep, PROC_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct proc_inode *ei = (struct proc_inode *) foo;
 
 	inode_init_once(&ei->vfs_inode);
 }
 
-int __init proc_init_inodecache(void)
+void __init proc_init_inodecache(void)
 {
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD|SLAB_PANIC),
 					     init_once);
-	return 0;
-}
-
-static int proc_remount(struct super_block *sb, int *flags, char *data)
-{
-	*flags |= MS_NODIRATIME;
-	return 0;
 }
 
 static const struct super_operations proc_sops = {
@@ -123,15 +121,19 @@ static const struct super_operations proc_sops = {
 	.drop_inode	= generic_delete_inode,
 	.delete_inode	= proc_delete_inode,
 	.statfs		= simple_statfs,
-	.remount_fs	= proc_remount,
 };
 
-static void pde_users_dec(struct proc_dir_entry *pde)
+static void __pde_users_dec(struct proc_dir_entry *pde)
 {
-	spin_lock(&pde->pde_unload_lock);
 	pde->pde_users--;
 	if (pde->pde_unload_completion && pde->pde_users == 0)
 		complete(pde->pde_unload_completion);
+}
+
+static void pde_users_dec(struct proc_dir_entry *pde)
+{
+	spin_lock(&pde->pde_unload_lock);
+	__pde_users_dec(pde);
 	spin_unlock(&pde->pde_unload_lock);
 }
 
@@ -318,36 +320,97 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	struct proc_dir_entry *pde = PDE(inode);
 	int rv = 0;
 	int (*open)(struct inode *, struct file *);
+	int (*release)(struct inode *, struct file *);
+	struct pde_opener *pdeo;
+
+	/*
+	 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
+	 * sequence. ->release won't be called because ->proc_fops will be
+	 * cleared. Depending on complexity of ->release, consequences vary.
+	 *
+	 * We can't wait for mercy when close will be done for real, it's
+	 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
+	 * by hand in remove_proc_entry(). For this, save opener's credentials
+	 * for later.
+	 */
+	pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
+	if (!pdeo)
+		return -ENOMEM;
 
 	spin_lock(&pde->pde_unload_lock);
 	if (!pde->proc_fops) {
 		spin_unlock(&pde->pde_unload_lock);
-		return rv;
+		kfree(pdeo);
+		return -EINVAL;
 	}
 	pde->pde_users++;
 	open = pde->proc_fops->open;
+	release = pde->proc_fops->release;
 	spin_unlock(&pde->pde_unload_lock);
 
 	if (open)
 		rv = open(inode, file);
 
-	pde_users_dec(pde);
+	spin_lock(&pde->pde_unload_lock);
+	if (rv == 0 && release) {
+		/* To know what to release. */
+		pdeo->inode = inode;
+		pdeo->file = file;
+		/* Strictly for "too late" ->release in proc_reg_release(). */
+		pdeo->release = release;
+		list_add(&pdeo->lh, &pde->pde_openers);
+	} else
+		kfree(pdeo);
+	__pde_users_dec(pde);
+	spin_unlock(&pde->pde_unload_lock);
 	return rv;
 }
 
+static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
+					struct inode *inode, struct file *file)
+{
+	struct pde_opener *pdeo;
+
+	list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+		if (pdeo->inode == inode && pdeo->file == file)
+			return pdeo;
+	}
+	return NULL;
+}
+
 static int proc_reg_release(struct inode *inode, struct file *file)
 {
 	struct proc_dir_entry *pde = PDE(inode);
 	int rv = 0;
 	int (*release)(struct inode *, struct file *);
+	struct pde_opener *pdeo;
 
 	spin_lock(&pde->pde_unload_lock);
+	pdeo = find_pde_opener(pde, inode, file);
 	if (!pde->proc_fops) {
-		spin_unlock(&pde->pde_unload_lock);
+		/*
+		 * Can't simply exit, __fput() will think that everything is OK,
+		 * and move on to freeing struct file. remove_proc_entry() will
+		 * find slacker in opener's list and will try to do non-trivial
+		 * things with struct file. Therefore, remove opener from list.
+		 *
+		 * But if opener is removed from list, who will ->release it?
+		 */
+		if (pdeo) {
+			list_del(&pdeo->lh);
+			spin_unlock(&pde->pde_unload_lock);
+			rv = pdeo->release(inode, file);
+			kfree(pdeo);
+		} else
+			spin_unlock(&pde->pde_unload_lock);
 		return rv;
 	}
 	pde->pde_users++;
 	release = pde->proc_fops->release;
+	if (pdeo) {
+		list_del(&pdeo->lh);
+		kfree(pdeo);
+	}
 	spin_unlock(&pde->pde_unload_lock);
 
 	if (release)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 28cbca805905..3e8aeb8b61ce 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -45,8 +45,6 @@ do {						\
 extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 
-extern int maps_protect;
-
 extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task);
 extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
@@ -67,7 +65,7 @@ extern const struct inode_operations proc_net_inode_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
-int proc_init_inodecache(void);
+void proc_init_inodecache(void);
 
 static inline struct pid *proc_pid(struct inode *inode)
 {
@@ -88,3 +86,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
 		struct dentry *dentry);
 int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 		filldir_t filldir);
+
+struct pde_opener {
+	struct inode *inode;
+	struct file *file;
+	int (*release)(struct inode *, struct file *);
+	struct list_head lh;
+};
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
new file mode 100644
index 000000000000..05029c0e2f24
--- /dev/null
+++ b/fs/proc/interrupts.c
@@ -0,0 +1,53 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/irqnr.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/interrupts
+ */
+static void *int_seq_start(struct seq_file *f, loff_t *pos)
+{
+	return (*pos <= nr_irqs) ? pos : NULL;
+}
+
+static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos > nr_irqs)
+		return NULL;
+	return pos;
+}
+
+static void int_seq_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static const struct seq_operations int_seq_ops = {
+	.start = int_seq_start,
+	.next  = int_seq_next,
+	.stop  = int_seq_stop,
+	.show  = show_interrupts
+};
+
+static int interrupts_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &int_seq_ops);
+}
+
+static const struct file_operations proc_interrupts_operations = {
+	.open		= interrupts_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init proc_interrupts_init(void)
+{
+	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
+	return 0;
+}
+module_init(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e78c81fcf547..59b43a068872 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,6 +23,12 @@
 
 #define CORE_STR "CORE"
 
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS	0
+#endif
+
+static struct proc_dir_entry *proc_root_kcore;
+
 static int open_kcore(struct inode * inode, struct file * filp)
 {
 	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
@@ -30,7 +36,7 @@ static int open_kcore(struct inode * inode, struct file * filp)
 
 static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
 
-const struct file_operations proc_kcore_operations = {
+static const struct file_operations proc_kcore_operations = {
 	.read		= read_kcore,
 	.open		= open_kcore,
 };
@@ -164,11 +170,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
 	elf->e_entry	= 0;
 	elf->e_phoff	= sizeof(struct elfhdr);
 	elf->e_shoff	= 0;
-#if defined(CONFIG_H8300)
-	elf->e_flags	= ELF_FLAGS;
-#else
-	elf->e_flags	= 0;
-#endif
+	elf->e_flags	= ELF_CORE_EFLAGS;
 	elf->e_ehsize	= sizeof(struct elfhdr);
 	elf->e_phentsize= sizeof(struct elf_phdr);
 	elf->e_phnum	= nphdr;
@@ -399,3 +401,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 	return acc;
 }
+
+static int __init proc_kcore_init(void)
+{
+	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
+	if (proc_root_kcore)
+		proc_root_kcore->size =
+				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
+	return 0;
+}
+module_init(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index ff3b90b56e9d..7ca78346d3f0 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -10,6 +10,7 @@
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/poll.h>
+#include <linux/proc_fs.h>
 #include <linux/fs.h>
 
 #include <asm/uaccess.h>
@@ -47,9 +48,16 @@ static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 }
 
 
-const struct file_operations proc_kmsg_operations = {
+static const struct file_operations proc_kmsg_operations = {
 	.read		= kmsg_read,
 	.poll		= kmsg_poll,
 	.open		= kmsg_open,
 	.release	= kmsg_release,
 };
+
+static int __init proc_kmsg_init(void)
+{
+	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	return 0;
+}
+module_init(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
new file mode 100644
index 000000000000..9bca39cf99ee
--- /dev/null
+++ b/fs/proc/loadavg.c
@@ -0,0 +1,51 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/time.h>
+
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
+static int loadavg_proc_show(struct seq_file *m, void *v)
+{
+	int a, b, c;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		a = avenrun[0] + (FIXED_1/200);
+		b = avenrun[1] + (FIXED_1/200);
+		c = avenrun[2] + (FIXED_1/200);
+	} while (read_seqretry(&xtime_lock, seq));
+
+	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+		LOAD_INT(a), LOAD_FRAC(a),
+		LOAD_INT(b), LOAD_FRAC(b),
+		LOAD_INT(c), LOAD_FRAC(c),
+		nr_running(), nr_threads,
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+static int loadavg_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, loadavg_proc_show, NULL);
+}
+
+static const struct file_operations loadavg_proc_fops = {
+	.open		= loadavg_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_loadavg_init(void)
+{
+	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	return 0;
+}
+module_init(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
new file mode 100644
index 000000000000..b1675c4e66da
--- /dev/null
+++ b/fs/proc/meminfo.c
@@ -0,0 +1,168 @@
+#include <linux/fs.h>
+#include <linux/hugetlb.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/quicklist.h>
+#include <linux/seq_file.h>
+#include <linux/swap.h>
+#include <linux/vmstat.h>
+#include <asm/atomic.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include "internal.h"
+
+void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
+{
+}
+
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	struct sysinfo i;
+	unsigned long committed;
+	unsigned long allowed;
+	struct vmalloc_info vmi;
+	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
+
+/*
+ * display in kilobytes.
+ */
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+	si_meminfo(&i);
+	si_swapinfo(&i);
+	committed = atomic_long_read(&vm_committed_space);
+	allowed = ((totalram_pages - hugetlb_total_pages())
+		* sysctl_overcommit_ratio / 100) + total_swap_pages;
+
+	cached = global_page_state(NR_FILE_PAGES) -
+			total_swapcache_pages - i.bufferram;
+	if (cached < 0)
+		cached = 0;
+
+	get_vmalloc_info(&vmi);
+
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+	/*
+	 * Tagged format, for easy grepping and expansion.
+	 */
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+#endif
+#ifdef CONFIG_HIGHMEM
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
+#endif
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+		"Quicklists:     %8lu kB\n"
+#endif
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
+		K(i.totalram),
+		K(i.freeram),
+		K(i.bufferram),
+		K(cached),
+		K(total_swapcache_pages),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
+#endif
+#ifdef CONFIG_HIGHMEM
+		K(i.totalhigh),
+		K(i.freehigh),
+		K(i.totalram-i.totalhigh),
+		K(i.freeram-i.freehigh),
+#endif
+		K(i.totalswap),
+		K(i.freeswap),
+		K(global_page_state(NR_FILE_DIRTY)),
+		K(global_page_state(NR_WRITEBACK)),
+		K(global_page_state(NR_ANON_PAGES)),
+		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE) +
+				global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_SLAB_RECLAIMABLE)),
+		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+		K(quicklist_total_size()),
+#endif
+		K(global_page_state(NR_UNSTABLE_NFS)),
+		K(global_page_state(NR_BOUNCE)),
+		K(global_page_state(NR_WRITEBACK_TEMP)),
+		K(allowed),
+		K(committed),
+		(unsigned long)VMALLOC_TOTAL >> 10,
+		vmi.used >> 10,
+		vmi.largest_chunk >> 10
+		);
+
+	hugetlb_report_meminfo(m);
+
+	arch_report_meminfo(m);
+
+	return 0;
+#undef K
+}
+
+static int meminfo_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, meminfo_proc_show, NULL);
+}
+
+static const struct file_operations meminfo_proc_fops = {
+	.open		= meminfo_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_meminfo_init(void)
+{
+	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	return 0;
+}
+module_init(proc_meminfo_init);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 79ecd281d2cb..3f87d2632947 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 	}
 
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 		   vma->vm_start,
 		   vma->vm_end,
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   vma->vm_pgoff << PAGE_SHIFT,
+		   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
diff --git a/fs/proc/page.c b/fs/proc/page.c
new file mode 100644
index 000000000000..767d95a6d1b1
--- /dev/null
+++ b/fs/proc/page.c
@@ -0,0 +1,147 @@
+#include <linux/bootmem.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 pcount;
+
+	pfn = src / KPMSIZE;
+	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
+		if (!ppage)
+			pcount = 0;
+		else
+			pcount = page_mapcount(ppage);
+
+		if (put_user(pcount, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpagecount_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecount_read,
+};
+
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
+
+/* These macros are used to decouple internal flags from exported ones */
+
+#define KPF_LOCKED     0
+#define KPF_ERROR      1
+#define KPF_REFERENCED 2
+#define KPF_UPTODATE   3
+#define KPF_DIRTY      4
+#define KPF_LRU        5
+#define KPF_ACTIVE     6
+#define KPF_SLAB       7
+#define KPF_WRITEBACK  8
+#define KPF_RECLAIM    9
+#define KPF_BUDDY     10
+
+#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+
+static ssize_t kpageflags_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 kflags, uflags;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		ppage = NULL;
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		pfn++;
+		if (!ppage)
+			kflags = 0;
+		else
+			kflags = ppage->flags;
+
+		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
+			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
+			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
+			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
+			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
+			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
+			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
+			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
+			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
+			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
+
+		if (put_user(uflags, out++)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpageflags_operations = {
+	.llseek = mem_lseek,
+	.read = kpageflags_read,
+};
+
+static int __init proc_page_init(void)
+{
+	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
+	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+	return 0;
+}
+module_init(proc_page_init);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index eca471bc8512..d777789b7a89 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -4,6 +4,7 @@
  * Copyright 1997 Paul Mackerras
  */
 #include <linux/errno.h>
+#include <linux/init.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
@@ -214,7 +215,7 @@ void proc_device_tree_add_node(struct device_node *np,
 /*
  * Called on initialization to set up the /proc/device-tree subtree
  */
-void proc_device_tree_init(void)
+void __init proc_device_tree_init(void)
 {
 	struct device_node *root;
 	if ( !have_of )
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
deleted file mode 100644
index c652d469dc08..000000000000
--- a/fs/proc/proc_misc.c
+++ /dev/null
@@ -1,923 +0,0 @@
-/*
- *  linux/fs/proc/proc_misc.c
- *
- *  linux/fs/proc/array.c
- *  Copyright (C) 1992  by Linus Torvalds
- *  based on ideas by Darren Senn
- *
- *  This used to be the part of array.c. See the rest of history and credits
- *  there. I took this into a separate file and switched the thing to generic
- *  proc_file_inode_operations, leaving in array.c only per-process stuff.
- *  Inumbers allocation made dynamic (via create_proc_entry()).  AV, May 1999.
- *
- * Changes:
- * Fulton Green      :  Encapsulated position metric calculations.
- *			<kernel@FultonGreen.com>
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/kernel_stat.h>
-#include <linux/fs.h>
-#include <linux/tty.h>
-#include <linux/string.h>
-#include <linux/mman.h>
-#include <linux/proc_fs.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/pagemap.h>
-#include <linux/interrupt.h>
-#include <linux/swap.h>
-#include <linux/slab.h>
-#include <linux/genhd.h>
-#include <linux/smp.h>
-#include <linux/signal.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/times.h>
-#include <linux/profile.h>
-#include <linux/utsname.h>
-#include <linux/blkdev.h>
-#include <linux/hugetlb.h>
-#include <linux/jiffies.h>
-#include <linux/sysrq.h>
-#include <linux/vmalloc.h>
-#include <linux/crash_dump.h>
-#include <linux/pid_namespace.h>
-#include <linux/bootmem.h>
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-#include <asm/tlb.h>
-#include <asm/div64.h>
-#include "internal.h"
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-/*
- * Warning: stuff below (imported functions) assumes that its output will fit
- * into one page. For some of those functions it may be wrong. Moreover, we
- * have a way to deal with that gracefully. Right now I used straightforward
- * wrappers, but this needs further analysis wrt potential overflows.
- */
-extern int get_hardware_list(char *);
-extern int get_stram_list(char *);
-extern int get_exec_domain_list(char *);
-extern int get_dma_list(char *);
-
-static int proc_calc_metrics(char *page, char **start, off_t off,
-				 int count, int *eof, int len)
-{
-	if (len <= off+count) *eof = 1;
-	*start = page + off;
-	len -= off;
-	if (len>count) len = count;
-	if (len<0) len = 0;
-	return len;
-}
-
-static int loadavg_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int a, b, c;
-	int len;
-	unsigned long seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
-		nr_running(), nr_threads,
-		task_active_pid_ns(current)->last_pid);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-static int uptime_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct timespec uptime;
-	struct timespec idle;
-	int len;
-	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
-
-	do_posix_clock_monotonic_gettime(&uptime);
-	monotonic_to_bootbased(&uptime);
-	cputime_to_timespec(idletime, &idle);
-	len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
-			(unsigned long) uptime.tv_sec,
-			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
-			(unsigned long) idle.tv_sec,
-			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-int __attribute__((weak)) arch_report_meminfo(char *page)
-{
-	return 0;
-}
-
-static int meminfo_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	struct sysinfo i;
-	int len;
-	unsigned long committed;
-	unsigned long allowed;
-	struct vmalloc_info vmi;
-	long cached;
-
-/*
- * display in kilobytes.
- */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
-	committed = atomic_long_read(&vm_committed_space);
-	allowed = ((totalram_pages - hugetlb_total_pages())
-		* sysctl_overcommit_ratio / 100) + total_swap_pages;
-
-	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
-	if (cached < 0)
-		cached = 0;
-
-	get_vmalloc_info(&vmi);
-
-	/*
-	 * Tagged format, for easy grepping and expansion.
-	 */
-	len = sprintf(page,
-		"MemTotal:     %8lu kB\n"
-		"MemFree:      %8lu kB\n"
-		"Buffers:      %8lu kB\n"
-		"Cached:       %8lu kB\n"
-		"SwapCached:   %8lu kB\n"
-		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
-#ifdef CONFIG_HIGHMEM
-		"HighTotal:    %8lu kB\n"
-		"HighFree:     %8lu kB\n"
-		"LowTotal:     %8lu kB\n"
-		"LowFree:      %8lu kB\n"
-#endif
-		"SwapTotal:    %8lu kB\n"
-		"SwapFree:     %8lu kB\n"
-		"Dirty:        %8lu kB\n"
-		"Writeback:    %8lu kB\n"
-		"AnonPages:    %8lu kB\n"
-		"Mapped:       %8lu kB\n"
-		"Slab:         %8lu kB\n"
-		"SReclaimable: %8lu kB\n"
-		"SUnreclaim:   %8lu kB\n"
-		"PageTables:   %8lu kB\n"
-		"NFS_Unstable: %8lu kB\n"
-		"Bounce:       %8lu kB\n"
-		"WritebackTmp: %8lu kB\n"
-		"CommitLimit:  %8lu kB\n"
-		"Committed_AS: %8lu kB\n"
-		"VmallocTotal: %8lu kB\n"
-		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
-		K(i.totalram),
-		K(i.freeram),
-		K(i.bufferram),
-		K(cached),
-		K(total_swapcache_pages),
-		K(global_page_state(NR_ACTIVE)),
-		K(global_page_state(NR_INACTIVE)),
-#ifdef CONFIG_HIGHMEM
-		K(i.totalhigh),
-		K(i.freehigh),
-		K(i.totalram-i.totalhigh),
-		K(i.freeram-i.freehigh),
-#endif
-		K(i.totalswap),
-		K(i.freeswap),
-		K(global_page_state(NR_FILE_DIRTY)),
-		K(global_page_state(NR_WRITEBACK)),
-		K(global_page_state(NR_ANON_PAGES)),
-		K(global_page_state(NR_FILE_MAPPED)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE) +
-				global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_SLAB_RECLAIMABLE)),
-		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
-		K(global_page_state(NR_PAGETABLE)),
-		K(global_page_state(NR_UNSTABLE_NFS)),
-		K(global_page_state(NR_BOUNCE)),
-		K(global_page_state(NR_WRITEBACK_TEMP)),
-		K(allowed),
-		K(committed),
-		(unsigned long)VMALLOC_TOTAL >> 10,
-		vmi.used >> 10,
-		vmi.largest_chunk >> 10
-		);
-
-		len += hugetlb_report_meminfo(page + len);
-
-	len += arch_report_meminfo(page + len);
-
-	return proc_calc_metrics(page, start, off, count, eof, len);
-#undef K
-}
-
-extern const struct seq_operations fragmentation_op;
-static int fragmentation_open(struct inode *inode, struct file *file)
-{
-	(void)inode;
-	return seq_open(file, &fragmentation_op);
-}
-
-static const struct file_operations fragmentation_file_operations = {
-	.open		= fragmentation_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-extern const struct seq_operations pagetypeinfo_op;
-static int pagetypeinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &pagetypeinfo_op);
-}
-
-static const struct file_operations pagetypeinfo_file_ops = {
-	.open		= pagetypeinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-extern const struct seq_operations zoneinfo_op;
-static int zoneinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &zoneinfo_op);
-}
-
-static const struct file_operations proc_zoneinfo_file_operations = {
-	.open		= zoneinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int version_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len;
-
-	len = snprintf(page, PAGE_SIZE, linux_proc_banner,
-		utsname()->sysname,
-		utsname()->release,
-		utsname()->version);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-extern const struct seq_operations cpuinfo_op;
-static int cpuinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &cpuinfo_op);
-}
-
-static const struct file_operations proc_cpuinfo_operations = {
-	.open		= cpuinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int devinfo_show(struct seq_file *f, void *v)
-{
-	int i = *(loff_t *) v;
-
-	if (i < CHRDEV_MAJOR_HASH_SIZE) {
-		if (i == 0)
-			seq_printf(f, "Character devices:\n");
-		chrdev_show(f, i);
-	}
-#ifdef CONFIG_BLOCK
-	else {
-		i -= CHRDEV_MAJOR_HASH_SIZE;
-		if (i == 0)
-			seq_printf(f, "\nBlock devices:\n");
-		blkdev_show(f, i);
-	}
-#endif
-	return 0;
-}
-
-static void *devinfo_start(struct seq_file *f, loff_t *pos)
-{
-	if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
-		return pos;
-	return NULL;
-}
-
-static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
-{
-	(*pos)++;
-	if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
-		return NULL;
-	return pos;
-}
-
-static void devinfo_stop(struct seq_file *f, void *v)
-{
-	/* Nothing to do */
-}
-
-static const struct seq_operations devinfo_ops = {
-	.start = devinfo_start,
-	.next  = devinfo_next,
-	.stop  = devinfo_stop,
-	.show  = devinfo_show
-};
-
-static int devinfo_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &devinfo_ops);
-}
-
-static const struct file_operations proc_devinfo_operations = {
-	.open		= devinfo_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-extern const struct seq_operations vmstat_op;
-static int vmstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &vmstat_op);
-}
-static const struct file_operations proc_vmstat_file_operations = {
-	.open		= vmstat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-#ifdef CONFIG_PROC_HARDWARE
-static int hardware_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_hardware_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-#endif
-
-#ifdef CONFIG_STRAM_PROC
-static int stram_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_stram_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-#endif
-
-#ifdef CONFIG_BLOCK
-static int partitions_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &partitions_op);
-}
-static const struct file_operations proc_partitions_operations = {
-	.open		= partitions_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int diskstats_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &diskstats_op);
-}
-static const struct file_operations proc_diskstats_operations = {
-	.open		= diskstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
-#ifdef CONFIG_MODULES
-extern const struct seq_operations modules_op;
-static int modules_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &modules_op);
-}
-static const struct file_operations proc_modules_operations = {
-	.open		= modules_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
-#ifdef CONFIG_SLABINFO
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slabinfo_op);
-}
-static const struct file_operations proc_slabinfo_operations = {
-	.open		= slabinfo_open,
-	.read		= seq_read,
-	.write		= slabinfo_write,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-extern const struct seq_operations slabstats_op;
-static int slabstats_open(struct inode *inode, struct file *file)
-{
-	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	int ret = -ENOMEM;
-	if (n) {
-		ret = seq_open(file, &slabstats_op);
-		if (!ret) {
-			struct seq_file *m = file->private_data;
-			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
-			m->private = n;
-			n = NULL;
-		}
-		kfree(n);
-	}
-	return ret;
-}
-
-static const struct file_operations proc_slabstats_operations = {
-	.open		= slabstats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-#endif
-#endif
-
-#ifdef CONFIG_MMU
-static int vmalloc_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &vmalloc_op);
-}
-
-static const struct file_operations proc_vmalloc_operations = {
-	.open		= vmalloc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
-#ifndef arch_irq_stat_cpu
-#define arch_irq_stat_cpu(cpu) 0
-#endif
-#ifndef arch_irq_stat
-#define arch_irq_stat() 0
-#endif
-
-static int show_stat(struct seq_file *p, void *v)
-{
-	int i;
-	unsigned long jif;
-	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest;
-	u64 sum = 0;
-	struct timespec boottime;
-	unsigned int *per_irq_sum;
-
-	per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
-	if (!per_irq_sum)
-		return -ENOMEM;
-
-	user = nice = system = idle = iowait =
-		irq = softirq = steal = cputime64_zero;
-	guest = cputime64_zero;
-	getboottime(&boottime);
-	jif = boottime.tv_sec;
-
-	for_each_possible_cpu(i) {
-		int j;
-
-		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
-		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
-		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
-		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
-		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
-		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
-		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
-		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		for (j = 0; j < NR_IRQS; j++) {
-			unsigned int temp = kstat_cpu(i).irqs[j];
-			sum += temp;
-			per_irq_sum[j] += temp;
-		}
-		sum += arch_irq_stat_cpu(i);
-	}
-	sum += arch_irq_stat();
-
-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
-		(unsigned long long)cputime64_to_clock_t(user),
-		(unsigned long long)cputime64_to_clock_t(nice),
-		(unsigned long long)cputime64_to_clock_t(system),
-		(unsigned long long)cputime64_to_clock_t(idle),
-		(unsigned long long)cputime64_to_clock_t(iowait),
-		(unsigned long long)cputime64_to_clock_t(irq),
-		(unsigned long long)cputime64_to_clock_t(softirq),
-		(unsigned long long)cputime64_to_clock_t(steal),
-		(unsigned long long)cputime64_to_clock_t(guest));
-	for_each_online_cpu(i) {
-
-		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kstat_cpu(i).cpustat.user;
-		nice = kstat_cpu(i).cpustat.nice;
-		system = kstat_cpu(i).cpustat.system;
-		idle = kstat_cpu(i).cpustat.idle;
-		iowait = kstat_cpu(i).cpustat.iowait;
-		irq = kstat_cpu(i).cpustat.irq;
-		softirq = kstat_cpu(i).cpustat.softirq;
-		steal = kstat_cpu(i).cpustat.steal;
-		guest = kstat_cpu(i).cpustat.guest;
-		seq_printf(p,
-			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
-			i,
-			(unsigned long long)cputime64_to_clock_t(user),
-			(unsigned long long)cputime64_to_clock_t(nice),
-			(unsigned long long)cputime64_to_clock_t(system),
-			(unsigned long long)cputime64_to_clock_t(idle),
-			(unsigned long long)cputime64_to_clock_t(iowait),
-			(unsigned long long)cputime64_to_clock_t(irq),
-			(unsigned long long)cputime64_to_clock_t(softirq),
-			(unsigned long long)cputime64_to_clock_t(steal),
-			(unsigned long long)cputime64_to_clock_t(guest));
-	}
-	seq_printf(p, "intr %llu", (unsigned long long)sum);
-
-	for (i = 0; i < NR_IRQS; i++)
-		seq_printf(p, " %u", per_irq_sum[i]);
-
-	seq_printf(p,
-		"\nctxt %llu\n"
-		"btime %lu\n"
-		"processes %lu\n"
-		"procs_running %lu\n"
-		"procs_blocked %lu\n",
-		nr_context_switches(),
-		(unsigned long)jif,
-		total_forks,
-		nr_running(),
-		nr_iowait());
-
-	kfree(per_irq_sum);
-	return 0;
-}
-
-static int stat_open(struct inode *inode, struct file *file)
-{
-	unsigned size = 4096 * (1 + num_possible_cpus() / 32);
-	char *buf;
-	struct seq_file *m;
-	int res;
-
-	/* don't ask for more than the kmalloc() max size, currently 128 KB */
-	if (size > 128 * 1024)
-		size = 128 * 1024;
-	buf = kmalloc(size, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	res = single_open(file, show_stat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
-}
-static const struct file_operations proc_stat_operations = {
-	.open		= stat_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-/*
- * /proc/interrupts
- */
-static void *int_seq_start(struct seq_file *f, loff_t *pos)
-{
-	return (*pos <= NR_IRQS) ? pos : NULL;
-}
-
-static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
-{
-	(*pos)++;
-	if (*pos > NR_IRQS)
-		return NULL;
-	return pos;
-}
-
-static void int_seq_stop(struct seq_file *f, void *v)
-{
-	/* Nothing to do */
-}
-
-
-static const struct seq_operations int_seq_ops = {
-	.start = int_seq_start,
-	.next  = int_seq_next,
-	.stop  = int_seq_stop,
-	.show  = show_interrupts
-};
-
-static int interrupts_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &int_seq_ops);
-}
-
-static const struct file_operations proc_interrupts_operations = {
-	.open		= interrupts_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int filesystems_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_filesystem_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-static int cmdline_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len;
-
-	len = sprintf(page, "%s\n", saved_command_line);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-static int locks_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &locks_seq_operations);
-}
-
-static const struct file_operations proc_locks_operations = {
-	.open		= locks_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int execdomains_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
-{
-	int len = get_exec_domain_list(page);
-	return proc_calc_metrics(page, start, off, count, eof, len);
-}
-
-#ifdef CONFIG_MAGIC_SYSRQ
-/*
- * writing 'C' to /proc/sysrq-trigger is like sysrq-C
- */
-static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	if (count) {
-		char c;
-
-		if (get_user(c, buf))
-			return -EFAULT;
-		__handle_sysrq(c, NULL, 0);
-	}
-	return count;
-}
-
-static const struct file_operations proc_sysrq_trigger_operations = {
-	.write		= write_sysrq_trigger,
-};
-#endif
-
-#ifdef CONFIG_PROC_PAGE_MONITOR
-#define KPMSIZE sizeof(u64)
-#define KPMMASK (KPMSIZE - 1)
-/* /proc/kpagecount - an array exposing page counts
- *
- * Each entry is a u64 representing the corresponding
- * physical page count.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-	u64 pcount;
-
-	pfn = src / KPMSIZE;
-	count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		ppage = NULL;
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		pfn++;
-		if (!ppage)
-			pcount = 0;
-		else
-			pcount = page_mapcount(ppage);
-
-		if (put_user(pcount, out++)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		count -= KPMSIZE;
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
-}
-
-static struct file_operations proc_kpagecount_operations = {
-	.llseek = mem_lseek,
-	.read = kpagecount_read,
-};
-
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
-
-/* These macros are used to decouple internal flags from exported ones */
-
-#define KPF_LOCKED     0
-#define KPF_ERROR      1
-#define KPF_REFERENCED 2
-#define KPF_UPTODATE   3
-#define KPF_DIRTY      4
-#define KPF_LRU        5
-#define KPF_ACTIVE     6
-#define KPF_SLAB       7
-#define KPF_WRITEBACK  8
-#define KPF_RECLAIM    9
-#define KPF_BUDDY     10
-
-#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
-
-static ssize_t kpageflags_read(struct file *file, char __user *buf,
-			     size_t count, loff_t *ppos)
-{
-	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
-	unsigned long src = *ppos;
-	unsigned long pfn;
-	ssize_t ret = 0;
-	u64 kflags, uflags;
-
-	pfn = src / KPMSIZE;
-	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
-	if (src & KPMMASK || count & KPMMASK)
-		return -EINVAL;
-
-	while (count > 0) {
-		ppage = NULL;
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		pfn++;
-		if (!ppage)
-			kflags = 0;
-		else
-			kflags = ppage->flags;
-
-		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
-			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
-			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
-			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
-			kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) |
-			kpf_copy_bit(kflags, KPF_LRU, PG_lru) |
-			kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) |
-			kpf_copy_bit(kflags, KPF_SLAB, PG_slab) |
-			kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) |
-			kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) |
-			kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy);
-
-		if (put_user(uflags, out++)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		count -= KPMSIZE;
-	}
-
-	*ppos += (char __user *)out - buf;
-	if (!ret)
-		ret = (char __user *)out - buf;
-	return ret;
-}
-
-static struct file_operations proc_kpageflags_operations = {
-	.llseek = mem_lseek,
-	.read = kpageflags_read,
-};
-#endif /* CONFIG_PROC_PAGE_MONITOR */
-
-struct proc_dir_entry *proc_root_kcore;
-
-void __init proc_misc_init(void)
-{
-	static struct {
-		char *name;
-		int (*read_proc)(char*,char**,off_t,int,int*,void*);
-	} *p, simple_ones[] = {
-		{"loadavg",     loadavg_read_proc},
-		{"uptime",	uptime_read_proc},
-		{"meminfo",	meminfo_read_proc},
-		{"version",	version_read_proc},
-#ifdef CONFIG_PROC_HARDWARE
-		{"hardware",	hardware_read_proc},
-#endif
-#ifdef CONFIG_STRAM_PROC
-		{"stram",	stram_read_proc},
-#endif
-		{"filesystems",	filesystems_read_proc},
-		{"cmdline",	cmdline_read_proc},
-		{"execdomains",	execdomains_read_proc},
-		{NULL,}
-	};
-	for (p = simple_ones; p->name; p++)
-		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);
-
-	proc_symlink("mounts", NULL, "self/mounts");
-
-	/* And now for trickier ones */
-#ifdef CONFIG_PRINTK
-	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
-#endif
-	proc_create("locks", 0, NULL, &proc_locks_operations);
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
-#ifdef CONFIG_BLOCK
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
-#endif
-	proc_create("stat", 0, NULL, &proc_stat_operations);
-	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
-#ifdef CONFIG_SLABINFO
-	proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-	proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
-#endif
-#endif
-#ifdef CONFIG_MMU
-	proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
-#endif
-	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
-	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
-	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
-#ifdef CONFIG_BLOCK
-	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-#endif
-#ifdef CONFIG_MODULES
-	proc_create("modules", 0, NULL, &proc_modules_operations);
-#endif
-#ifdef CONFIG_SCHEDSTATS
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-#endif
-#ifdef CONFIG_PROC_KCORE
-	proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations);
-	if (proc_root_kcore)
-		proc_root_kcore->size =
-				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
-#endif
-#ifdef CONFIG_PROC_PAGE_MONITOR
-	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
-	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
-#endif
-#ifdef CONFIG_PROC_VMCORE
-	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
-#endif
-#ifdef CONFIG_MAGIC_SYSRQ
-	proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations);
-#endif
-}
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 83f357b30d71..7bc296f424ae 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -27,6 +27,11 @@
 #include "internal.h"
 
 
+static struct net *get_proc_net(const struct inode *inode)
+{
+	return maybe_get_net(PDE_NET(PDE(inode)));
+}
+
 int seq_open_net(struct inode *ino, struct file *f,
 		 const struct seq_operations *ops, int size)
 {
@@ -51,6 +56,30 @@ int seq_open_net(struct inode *ino, struct file *f,
 }
 EXPORT_SYMBOL_GPL(seq_open_net);
 
+int single_open_net(struct inode *inode, struct file *file,
+		int (*show)(struct seq_file *, void *))
+{
+	int err;
+	struct net *net;
+
+	err = -ENXIO;
+	net = get_proc_net(inode);
+	if (net == NULL)
+		goto err_net;
+
+	err = single_open(file, show, net);
+	if (err < 0)
+		goto err_open;
+
+	return 0;
+
+err_open:
+	put_net(net);
+err_net:
+	return err;
+}
+EXPORT_SYMBOL_GPL(single_open_net);
+
 int seq_release_net(struct inode *ino, struct file *f)
 {
 	struct seq_file *seq;
@@ -63,6 +92,14 @@ int seq_release_net(struct inode *ino, struct file *f)
 }
 EXPORT_SYMBOL_GPL(seq_release_net);
 
+int single_release_net(struct inode *ino, struct file *f)
+{
+	struct seq_file *seq = f->private_data;
+	put_net(seq->private);
+	return single_release(ino, f);
+}
+EXPORT_SYMBOL_GPL(single_release_net);
+
 static struct net *get_proc_task_net(struct inode *dir)
 {
 	struct task_struct *task;
@@ -153,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name)
 }
 EXPORT_SYMBOL_GPL(proc_net_remove);
 
-struct net *get_proc_net(const struct inode *inode)
-{
-	return maybe_get_net(PDE_NET(PDE(inode)));
-}
-EXPORT_SYMBOL_GPL(get_proc_net);
-
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *netd, *net_statd;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5acc001d49f6..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1,7 +1,7 @@
 /*
  * /proc/sys support
  */
-
+#include <linux/init.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
@@ -10,149 +10,110 @@
 static struct dentry_operations proc_sys_dentry_operations;
 static const struct file_operations proc_sys_file_operations;
 static const struct inode_operations proc_sys_inode_operations;
+static const struct file_operations proc_sys_dir_file_operations;
+static const struct inode_operations proc_sys_dir_operations;
 
-static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table)
-{
-	/* Refresh the cached information bits in the inode */
-	if (table) {
-		inode->i_uid = 0;
-		inode->i_gid = 0;
-		inode->i_mode = table->mode;
-		if (table->proc_handler) {
-			inode->i_mode |= S_IFREG;
-			inode->i_nlink = 1;
-		} else {
-			inode->i_mode |= S_IFDIR;
-			inode->i_nlink = 0;	/* It is too hard to figure out */
-		}
-	}
-}
-
-static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table)
+static struct inode *proc_sys_make_inode(struct super_block *sb,
+		struct ctl_table_header *head, struct ctl_table *table)
 {
 	struct inode *inode;
-	struct proc_inode *dir_ei, *ei;
-	int depth;
+	struct proc_inode *ei;
 
-	inode = new_inode(dir->i_sb);
+	inode = new_inode(sb);
 	if (!inode)
 		goto out;
 
-	/* A directory is always one deeper than it's parent */
-	dir_ei = PROC_I(dir);
-	depth = dir_ei->fd + 1;
-
+	sysctl_head_get(head);
 	ei = PROC_I(inode);
-	ei->fd = depth;
+	ei->sysctl = head;
+	ei->sysctl_entry = table;
+
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_op = &proc_sys_inode_operations;
-	inode->i_fop = &proc_sys_file_operations;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
-	proc_sys_refresh_inode(inode, table);
+	inode->i_mode = table->mode;
+	if (!table->child) {
+		inode->i_mode |= S_IFREG;
+		inode->i_op = &proc_sys_inode_operations;
+		inode->i_fop = &proc_sys_file_operations;
+	} else {
+		inode->i_mode |= S_IFDIR;
+		inode->i_nlink = 0;
+		inode->i_op = &proc_sys_dir_operations;
+		inode->i_fop = &proc_sys_dir_file_operations;
+	}
 out:
 	return inode;
 }
 
-static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth)
-{
-	for (;;) {
-		struct proc_inode *ei;
-
-		ei = PROC_I(dentry->d_inode);
-		if (ei->fd == depth)
-			break; /* found */
-
-		dentry = dentry->d_parent;
-	}
-	return dentry;
-}
-
-static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table,
-							struct qstr *name)
+static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
 {
 	int len;
-	for ( ; table->ctl_name || table->procname; table++) {
+	for ( ; p->ctl_name || p->procname; p++) {
 
-		if (!table->procname)
+		if (!p->procname)
 			continue;
 
-		len = strlen(table->procname);
+		len = strlen(p->procname);
 		if (len != name->len)
 			continue;
 
-		if (memcmp(table->procname, name->name, len) != 0)
+		if (memcmp(p->procname, name->name, len) != 0)
 			continue;
 
 		/* I have a match */
-		return table;
+		return p;
 	}
 	return NULL;
 }
 
-static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry,
-						struct ctl_table *table)
+static struct ctl_table_header *grab_header(struct inode *inode)
 {
-	struct dentry *ancestor;
-	struct proc_inode *ei;
-	int depth, i;
+	if (PROC_I(inode)->sysctl)
+		return sysctl_head_grab(PROC_I(inode)->sysctl);
+	else
+		return sysctl_head_next(NULL);
+}
 
-	ei = PROC_I(dentry->d_inode);
-	depth = ei->fd;
+static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
+					struct nameidata *nd)
+{
+	struct ctl_table_header *head = grab_header(dir);
+	struct ctl_table *table = PROC_I(dir)->sysctl_entry;
+	struct ctl_table_header *h = NULL;
+	struct qstr *name = &dentry->d_name;
+	struct ctl_table *p;
+	struct inode *inode;
+	struct dentry *err = ERR_PTR(-ENOENT);
 
-	if (depth == 0)
-		return table;
+	if (IS_ERR(head))
+		return ERR_CAST(head);
 
-	for (i = 1; table && (i <= depth); i++) {
-		ancestor = proc_sys_ancestor(dentry, i);
-		table = proc_sys_lookup_table_one(table, &ancestor->d_name);
-		if (table)
-			table = table->child;
+	if (table && !table->child) {
+		WARN_ON(1);
+		goto out;
 	}
-	return table;
 
-}
-static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent,
-						struct qstr *name,
-						struct ctl_table *table)
-{
-	table = proc_sys_lookup_table(dparent, table);
-	if (table)
-		table = proc_sys_lookup_table_one(table, name);
-	return table;
-}
+	table = table ? table->child : head->ctl_table;
 
-static struct ctl_table *do_proc_sys_lookup(struct dentry *parent,
-						struct qstr *name,
-						struct ctl_table_header **ptr)
-{
-	struct ctl_table_header *head;
-	struct ctl_table *table = NULL;
-
-	for (head = sysctl_head_next(NULL); head;
-			head = sysctl_head_next(head)) {
-		table = proc_sys_lookup_entry(parent, name, head->ctl_table);
-		if (table)
-			break;
+	p = find_in_table(table, name);
+	if (!p) {
+		for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
+			if (h->attached_to != table)
+				continue;
+			p = find_in_table(h->attached_by, name);
+			if (p)
+				break;
+		}
 	}
-	*ptr = head;
-	return table;
-}
 
-static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
-					struct nameidata *nd)
-{
-	struct ctl_table_header *head;
-	struct inode *inode;
-	struct dentry *err;
-	struct ctl_table *table;
-
-	err = ERR_PTR(-ENOENT);
-	table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
-	if (!table)
+	if (!p)
 		goto out;
 
 	err = ERR_PTR(-ENOMEM);
-	inode = proc_sys_make_inode(dir, table);
+	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
+	if (h)
+		sysctl_head_finish(h);
+
 	if (!inode)
 		goto out;
 
@@ -168,22 +129,14 @@ out:
 static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 		size_t count, loff_t *ppos, int write)
 {
-	struct dentry *dentry = filp->f_dentry;
-	struct ctl_table_header *head;
-	struct ctl_table *table;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
 	ssize_t error;
 	size_t res;
 
-	table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
-	/* Has the sysctl entry disappeared on us? */
-	error = -ENOENT;
-	if (!table)
-		goto out;
-
-	/* Has the sysctl entry been replaced by a directory? */
-	error = -EISDIR;
-	if (!table->proc_handler)
-		goto out;
+	if (IS_ERR(head))
+		return PTR_ERR(head);
 
 	/*
 	 * At this point we know that the sysctl was not unregistered
@@ -193,6 +146,11 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
 		goto out;
 
+	/* if that can happen at all, it should be -EINVAL, not -EISDIR */
+	error = -EINVAL;
+	if (!table->proc_handler)
+		goto out;
+
 	/* careful: calling conventions are nasty here */
 	res = count;
 	error = table->proc_handler(table, write, filp, buf, &res, ppos);
@@ -218,82 +176,86 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
 
 
 static int proc_sys_fill_cache(struct file *filp, void *dirent,
-				filldir_t filldir, struct ctl_table *table)
+				filldir_t filldir,
+				struct ctl_table_header *head,
+				struct ctl_table *table)
 {
-	struct ctl_table_header *head;
-	struct ctl_table *child_table = NULL;
 	struct dentry *child, *dir = filp->f_path.dentry;
 	struct inode *inode;
 	struct qstr qname;
 	ino_t ino = 0;
 	unsigned type = DT_UNKNOWN;
-	int ret;
 
 	qname.name = table->procname;
 	qname.len  = strlen(table->procname);
 	qname.hash = full_name_hash(qname.name, qname.len);
 
-	/* Suppress duplicates.
-	 * Only fill a directory entry if it is the value that
-	 * an ordinary lookup of that name returns.  Hide all
-	 * others.
-	 *
-	 * If we ever cache this translation in the dcache
-	 * I should do a dcache lookup first.  But for now
-	 * it is just simpler not to.
-	 */
-	ret = 0;
-	child_table = do_proc_sys_lookup(dir, &qname, &head);
-	sysctl_head_finish(head);
-	if (child_table != table)
-		return 0;
-
 	child = d_lookup(dir, &qname);
 	if (!child) {
-		struct dentry *new;
-		new = d_alloc(dir, &qname);
-		if (new) {
-			inode = proc_sys_make_inode(dir->d_inode, table);
-			if (!inode)
-				child = ERR_PTR(-ENOMEM);
-			else {
-				new->d_op = &proc_sys_dentry_operations;
-				d_add(new, inode);
+		child = d_alloc(dir, &qname);
+		if (child) {
+			inode = proc_sys_make_inode(dir->d_sb, head, table);
+			if (!inode) {
+				dput(child);
+				return -ENOMEM;
+			} else {
+				child->d_op = &proc_sys_dentry_operations;
+				d_add(child, inode);
 			}
-			if (child)
-				dput(new);
-			else
-				child = new;
+		} else {
+			return -ENOMEM;
 		}
 	}
-	if (!child || IS_ERR(child) || !child->d_inode)
-		goto end_instantiate;
 	inode = child->d_inode;
-	if (inode) {
-		ino  = inode->i_ino;
-		type = inode->i_mode >> 12;
-	}
+	ino  = inode->i_ino;
+	type = inode->i_mode >> 12;
 	dput(child);
-end_instantiate:
-	if (!ino)
-		ino= find_inode_number(dir, &qname);
-	if (!ino)
-		ino = 1;
-	return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+	return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+}
+
+static int scan(struct ctl_table_header *head, ctl_table *table,
+		unsigned long *pos, struct file *file,
+		void *dirent, filldir_t filldir)
+{
+
+	for (; table->ctl_name || table->procname; table++, (*pos)++) {
+		int res;
+
+		/* Can't do anything without a proc name */
+		if (!table->procname)
+			continue;
+
+		if (*pos < file->f_pos)
+			continue;
+
+		res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+		if (res)
+			return res;
+
+		file->f_pos = *pos + 1;
+	}
+	return 0;
 }
 
 static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
-	struct dentry *dentry = filp->f_dentry;
+	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	struct ctl_table_header *head = NULL;
-	struct ctl_table *table;
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table_header *h = NULL;
 	unsigned long pos;
-	int ret;
+	int ret = -EINVAL;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
 
-	ret = -ENOTDIR;
-	if (!S_ISDIR(inode->i_mode))
+	if (table && !table->child) {
+		WARN_ON(1);
 		goto out;
+	}
+
+	table = table ? table->child : head->ctl_table;
 
 	ret = 0;
 	/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -311,30 +273,17 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	}
 	pos = 2;
 
-	/* - Find each instance of the directory
-	 * - Read all entries in each instance
-	 * - Before returning an entry to user space lookup the entry
-	 *   by name and if I find a different entry don't return
-	 *   this one because it means it is a buried dup.
-	 * For sysctl this should only happen for directory entries.
-	 */
-	for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) {
-		table = proc_sys_lookup_table(dentry, head->ctl_table);
+	ret = scan(head, table, &pos, filp, dirent, filldir);
+	if (ret)
+		goto out;
 
-		if (!table)
+	for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
+		if (h->attached_to != table)
 			continue;
-
-		for (; table->ctl_name || table->procname; table++, pos++) {
-			/* Can't do anything without a proc name */
-			if (!table->procname)
-				continue;
-
-			if (pos < filp->f_pos)
-				continue;
-
-			if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0)
-				goto out;
-			filp->f_pos = pos + 1;
+		ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
+		if (ret) {
+			sysctl_head_finish(h);
+			break;
 		}
 	}
 	ret = 1;
@@ -343,7 +292,7 @@ out:
 	return ret;
 }
 
-static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int proc_sys_permission(struct inode *inode, int mask)
 {
 	/*
 	 * sysctl entries that are not writeable,
@@ -351,45 +300,22 @@ static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *
 	 */
 	struct ctl_table_header *head;
 	struct ctl_table *table;
-	struct dentry *dentry;
-	int mode;
-	int depth;
 	int error;
 
-	head = NULL;
-	depth = PROC_I(inode)->fd;
+	/* Executable files are not allowed under /proc/sys/ */
+	if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))
+		return -EACCES;
 
-	/* First check the cached permissions, in case we don't have
-	 * enough information to lookup the sysctl table entry.
-	 */
-	error = -EACCES;
-	mode = inode->i_mode;
-
-	if (current->euid == 0)
-		mode >>= 6;
-	else if (in_group_p(0))
-		mode >>= 3;
+	head = grab_header(inode);
+	if (IS_ERR(head))
+		return PTR_ERR(head);
 
-	if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)
-		error = 0;
+	table = PROC_I(inode)->sysctl_entry;
+	if (!table) /* global root - r-xr-xr-x */
+		error = mask & MAY_WRITE ? -EACCES : 0;
+	else /* Use the permissions on the sysctl table entry */
+		error = sysctl_perm(head->root, table, mask);
 
-	/* If we can't get a sysctl table entry the permission
-	 * checks on the cached mode will have to be enough.
-	 */
-	if (!nd || !depth)
-		goto out;
-
-	dentry = nd->path.dentry;
-	table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
-
-	/* If the entry does not exist deny permission */
-	error = -EACCES;
-	if (!table)
-		goto out;
-
-	/* Use the permissions on the sysctl table entry */
-	error = sysctl_perm(head->root, table, mask);
-out:
 	sysctl_head_finish(head);
 	return error;
 }
@@ -409,42 +335,80 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
 	return error;
 }
 
-/* I'm lazy and don't distinguish between files and directories,
- * until access time.
- */
+static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ctl_table_header *head = grab_header(inode);
+	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	generic_fillattr(inode, stat);
+	if (table)
+		stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+	sysctl_head_finish(head);
+	return 0;
+}
+
 static const struct file_operations proc_sys_file_operations = {
 	.read		= proc_sys_read,
 	.write		= proc_sys_write,
+};
+
+static const struct file_operations proc_sys_dir_file_operations = {
 	.readdir	= proc_sys_readdir,
+	.llseek		= generic_file_llseek,
 };
 
 static const struct inode_operations proc_sys_inode_operations = {
+	.permission	= proc_sys_permission,
+	.setattr	= proc_sys_setattr,
+	.getattr	= proc_sys_getattr,
+};
+
+static const struct inode_operations proc_sys_dir_operations = {
 	.lookup		= proc_sys_lookup,
 	.permission	= proc_sys_permission,
 	.setattr	= proc_sys_setattr,
+	.getattr	= proc_sys_getattr,
 };
 
 static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct ctl_table_header *head;
-	struct ctl_table *table;
-	table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
-	proc_sys_refresh_inode(dentry->d_inode, table);
-	sysctl_head_finish(head);
-	return !!table;
+	return !PROC_I(dentry->d_inode)->sysctl->unregistering;
+}
+
+static int proc_sys_delete(struct dentry *dentry)
+{
+	return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
+}
+
+static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
+			    struct qstr *name)
+{
+	struct dentry *dentry = container_of(qstr, struct dentry, d_name);
+	if (qstr->len != name->len)
+		return 1;
+	if (memcmp(qstr->name, name->name, name->len))
+		return 1;
+	return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
 }
 
 static struct dentry_operations proc_sys_dentry_operations = {
 	.d_revalidate	= proc_sys_revalidate,
+	.d_delete	= proc_sys_delete,
+	.d_compare	= proc_sys_compare,
 };
 
-static struct proc_dir_entry *proc_sys_root;
-
-int proc_sys_init(void)
+int __init proc_sys_init(void)
 {
+	struct proc_dir_entry *proc_sys_root;
+
 	proc_sys_root = proc_mkdir("sys", NULL);
-	proc_sys_root->proc_iops = &proc_sys_inode_operations;
-	proc_sys_root->proc_fops = &proc_sys_file_operations;
+	proc_sys_root->proc_iops = &proc_sys_dir_operations;
+	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
 	proc_sys_root->nlink = 0;
 	return 0;
 }
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 21f490f5d65c..d153946d6d15 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -136,54 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = {
 	.release	= seq_release,
 };
 
-static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
-{
-	return (*pos < NR_LDISCS) ? pos : NULL;
-}
-
-static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	(*pos)++;
-	return (*pos < NR_LDISCS) ? pos : NULL;
-}
-
-static void tty_ldiscs_seq_stop(struct seq_file *m, void *v)
-{
-}
-
-static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
-{
-	int i = *(loff_t *)v;
-	struct tty_ldisc *ld;
-	
-	ld = tty_ldisc_get(i);
-	if (ld == NULL)
-		return 0;
-	seq_printf(m, "%-10s %2d\n", ld->name ? ld->name : "???", i);
-	tty_ldisc_put(i);
-	return 0;
-}
-
-static const struct seq_operations tty_ldiscs_seq_ops = {
-	.start	= tty_ldiscs_seq_start,
-	.next	= tty_ldiscs_seq_next,
-	.stop	= tty_ldiscs_seq_stop,
-	.show	= tty_ldiscs_seq_show,
-};
-
-static int proc_tty_ldiscs_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &tty_ldiscs_seq_ops);
-}
-
-static const struct file_operations tty_ldiscs_proc_fops = {
-	.owner		= THIS_MODULE,
-	.open		= proc_tty_ldiscs_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 /*
  * This function is called by tty_register_driver() to handle
  * registering the driver's /proc handler into /proc/tty/driver/<foo>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 95117538a4f6..7761602af9de 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -104,9 +104,9 @@ static struct file_system_type proc_fs_type = {
 
 void __init proc_root_init(void)
 {
-	int err = proc_init_inodecache();
-	if (err)
-		return;
+	int err;
+
+	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
@@ -117,7 +117,7 @@ void __init proc_root_init(void)
 		return;
 	}
 
-	proc_misc_init();
+	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
 
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
new file mode 100644
index 000000000000..81904f07679d
--- /dev/null
+++ b/fs/proc/stat.c
@@ -0,0 +1,153 @@
+#include <linux/cpumask.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <asm/cputime.h>
+
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
+static int show_stat(struct seq_file *p, void *v)
+{
+	int i, j;
+	unsigned long jif;
+	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+	cputime64_t guest;
+	u64 sum = 0;
+	struct timespec boottime;
+	unsigned int per_irq_sum;
+
+	user = nice = system = idle = iowait =
+		irq = softirq = steal = cputime64_zero;
+	guest = cputime64_zero;
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	for_each_possible_cpu(i) {
+		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
+		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
+		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
+		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
+		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
+		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
+		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+
+		for_each_irq_nr(j)
+			sum += kstat_irqs_cpu(j, i);
+
+		sum += arch_irq_stat_cpu(i);
+	}
+	sum += arch_irq_stat();
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+		(unsigned long long)cputime64_to_clock_t(user),
+		(unsigned long long)cputime64_to_clock_t(nice),
+		(unsigned long long)cputime64_to_clock_t(system),
+		(unsigned long long)cputime64_to_clock_t(idle),
+		(unsigned long long)cputime64_to_clock_t(iowait),
+		(unsigned long long)cputime64_to_clock_t(irq),
+		(unsigned long long)cputime64_to_clock_t(softirq),
+		(unsigned long long)cputime64_to_clock_t(steal),
+		(unsigned long long)cputime64_to_clock_t(guest));
+	for_each_online_cpu(i) {
+
+		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+		user = kstat_cpu(i).cpustat.user;
+		nice = kstat_cpu(i).cpustat.nice;
+		system = kstat_cpu(i).cpustat.system;
+		idle = kstat_cpu(i).cpustat.idle;
+		iowait = kstat_cpu(i).cpustat.iowait;
+		irq = kstat_cpu(i).cpustat.irq;
+		softirq = kstat_cpu(i).cpustat.softirq;
+		steal = kstat_cpu(i).cpustat.steal;
+		guest = kstat_cpu(i).cpustat.guest;
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+			i,
+			(unsigned long long)cputime64_to_clock_t(user),
+			(unsigned long long)cputime64_to_clock_t(nice),
+			(unsigned long long)cputime64_to_clock_t(system),
+			(unsigned long long)cputime64_to_clock_t(idle),
+			(unsigned long long)cputime64_to_clock_t(iowait),
+			(unsigned long long)cputime64_to_clock_t(irq),
+			(unsigned long long)cputime64_to_clock_t(softirq),
+			(unsigned long long)cputime64_to_clock_t(steal),
+			(unsigned long long)cputime64_to_clock_t(guest));
+	}
+	seq_printf(p, "intr %llu", (unsigned long long)sum);
+
+	/* sum again ? it could be updated? */
+	for_each_irq_nr(j) {
+		per_irq_sum = 0;
+
+		for_each_possible_cpu(i)
+			per_irq_sum += kstat_irqs_cpu(j, i);
+
+		seq_printf(p, " %u", per_irq_sum);
+	}
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		nr_context_switches(),
+		(unsigned long)jif,
+		total_forks,
+		nr_running(),
+		nr_iowait());
+
+	return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	unsigned size = 4096 * (1 + num_possible_cpus() / 32);
+	char *buf;
+	struct seq_file *m;
+	int res;
+
+	/* don't ask for more than the kmalloc() max size, currently 128 KB */
+	if (size > 128 * 1024)
+		size = 128 * 1024;
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	res = single_open(file, show_stat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;
+		m->size = size;
+	} else
+		kfree(buf);
+	return res;
+}
+
+static const struct file_operations proc_stat_operations = {
+	.open		= stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_stat_init(void)
+{
+	proc_create("stat", 0, NULL, &proc_stat_operations);
+	return 0;
+}
+module_init(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 164bd9f9ede3..b770c095e45c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -198,11 +198,8 @@ static int do_maps_open(struct inode *inode, struct file *file,
 	return ret;
 }
 
-static int show_map(struct seq_file *m, void *v)
+static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 {
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
-	struct vm_area_struct *vma = v;
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
 	int flags = vma->vm_flags;
@@ -210,23 +207,20 @@ static int show_map(struct seq_file *m, void *v)
 	dev_t dev = 0;
 	int len;
 
-	if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
-		return -EACCES;
-
 	if (file) {
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
 
-	seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 			vma->vm_start,
 			vma->vm_end,
 			flags & VM_READ ? 'r' : '-',
 			flags & VM_WRITE ? 'w' : '-',
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
-			vma->vm_pgoff << PAGE_SHIFT,
+			((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
 			MAJOR(dev), MINOR(dev), ino, &len);
 
 	/*
@@ -257,6 +251,15 @@ static int show_map(struct seq_file *m, void *v)
 		}
 	}
 	seq_putc(m, '\n');
+}
+
+static int show_map(struct seq_file *m, void *v)
+{
+	struct vm_area_struct *vma = v;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
+
+	show_map_vma(m, vma);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
@@ -367,9 +370,10 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 static int show_smap(struct seq_file *m, void *v)
 {
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss;
-	int ret;
 	struct mm_walk smaps_walk = {
 		.pmd_entry = smaps_pte_range,
 		.mm = vma->vm_mm,
@@ -381,9 +385,7 @@ static int show_smap(struct seq_file *m, void *v)
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
-	ret = show_map(m, v);
-	if (ret)
-		return ret;
+	show_map_vma(m, vma);
 
 	seq_printf(m,
 		   "Size:           %8lu kB\n"
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.referenced >> 10,
 		   mss.swap >> 10);
 
-	return ret;
+	if (m->count < m->size)  /* vma is copied successfully */
+		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+	return 0;
 }
 
 static const struct seq_operations proc_pid_smaps_op = {
@@ -636,7 +640,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	struct pagemapread pm;
 	int pagecount;
 	int ret = -ESRCH;
-	struct mm_walk pagemap_walk;
+	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
@@ -742,22 +746,11 @@ const struct file_operations proc_pagemap_operations = {
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
-static int show_numa_map_checked(struct seq_file *m, void *v)
-{
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
-
-	if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
-		return -EACCES;
-
-	return show_numa_map(m, v);
-}
-
 static const struct seq_operations proc_pid_numa_maps_op = {
         .start  = m_start,
         .next   = m_next,
         .stop   = m_stop,
-        .show   = show_numa_map_checked
+        .show   = show_numa_map,
 };
 
 static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d84e7121df8..219bd79ea894 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 static int show_map(struct seq_file *m, void *_vml)
 {
 	struct vm_list_struct *vml = _vml;
-	struct proc_maps_private *priv = m->private;
-	struct task_struct *task = priv->task;
-
-	if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
-		return -EACCES;
 
 	return nommu_vma_show(m, vml->vma);
 }
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
new file mode 100644
index 000000000000..0c10a0b3f146
--- /dev/null
+++ b/fs/proc/uptime.c
@@ -0,0 +1,43 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
+#include <asm/cputime.h>
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+	cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	cputime_to_timespec(idletime, &idle);
+	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
+			(unsigned long) uptime.tv_sec,
+			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
+			(unsigned long) idle.tv_sec,
+			(idle.tv_nsec / (NSEC_PER_SEC / 100)));
+	return 0;
+}
+
+static int uptime_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, uptime_proc_show, NULL);
+}
+
+static const struct file_operations uptime_proc_fops = {
+	.open		= uptime_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_uptime_init(void)
+{
+	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	return 0;
+}
+module_init(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
new file mode 100644
index 000000000000..76817a60678c
--- /dev/null
+++ b/fs/proc/version.c
@@ -0,0 +1,34 @@
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/utsname.h>
+
+static int version_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, linux_proc_banner,
+		utsname()->sysname,
+		utsname()->release,
+		utsname()->version);
+	return 0;
+}
+
+static int version_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, version_proc_show, NULL);
+}
+
+static const struct file_operations version_proc_fops = {
+	.open		= version_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init proc_version_init(void)
+{
+	proc_create("version", 0, NULL, &version_proc_fops);
+	return 0;
+}
+module_init(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9ac0f5e064e0..03ec59504906 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,10 +32,7 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-struct proc_dir_entry *proc_vmcore = NULL;
+static struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
@@ -165,14 +162,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
 	return acc;
 }
 
-static int open_vmcore(struct inode *inode, struct file *filp)
-{
-	return 0;
-}
-
-const struct file_operations proc_vmcore_operations = {
+static const struct file_operations proc_vmcore_operations = {
 	.read		= read_vmcore,
-	.open		= open_vmcore,
 };
 
 static struct vmcore* __init get_new_element(void)
@@ -653,7 +644,7 @@ static int __init vmcore_init(void)
 	int rc = 0;
 
 	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
-	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+	if (!(is_vmcore_usable()))
 		return rc;
 	rc = parse_crash_elf_headers();
 	if (rc) {
@@ -661,7 +652,7 @@ static int __init vmcore_init(void)
 		return rc;
 	}
 
-	/* Initialize /proc/vmcore size if proc is already up. */
+	proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations);
 	if (proc_vmcore)
 		proc_vmcore->size = vmcore_size;
 	return 0;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b31ab78052b3..2aad1044b84c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode)
 	kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
 
diff --git a/fs/quota.c b/fs/quota.c
index db1cc9f3c7aa..7f4386ebc23a 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 
 void sync_dquots(struct super_block *sb, int type)
 {
-	int cnt, dirty;
+	int cnt;
 
 	if (sb) {
 		if (sb->s_qcop->quota_sync)
@@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type)
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		/* This test just improves performance so it needn't be reliable... */
-		for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++)
-			if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt)
-			    && info_any_dirty(&sb_dqopt(sb)->info[cnt]))
-				dirty = 1;
-		if (!dirty)
+		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+			if (type != -1 && type != cnt)
+				continue;
+			if (!sb_has_quota_enabled(sb, cnt))
+				continue;
+			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
+			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
+				continue;
+			break;
+		}
+		if (cnt == MAXQUOTAS)
 			continue;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index a6cf9269105c..5ae15b13eeb0 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -1,6 +1,7 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/quota.h>
+#include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
 #include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index 234ada903633..b53827dc02d9 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/quotaops.h>
 
 #include <asm/byteorder.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 52312ec93ff4..76acdbc34611 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
  * size 0 on the assumption that it's going to be used for an mmap of shared
  * memory
  */
-static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
 	struct pagevec lru_pvec;
 	unsigned long npages, xpages, loop, limit;
@@ -112,12 +112,12 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 			goto add_error;
 
 		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add(&lru_pvec);
+			__pagevec_lru_add_file(&lru_pvec);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b13123424e49..f031d1c925f0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/fs/read_write.c b/fs/read_write.c
index 9ba495d5a29b..969a6d9c020b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,39 +31,61 @@ const struct file_operations generic_ro_fops = {
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+/**
+ * generic_file_llseek_unlocked - lockless generic llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * Updates the file offset to the value specified by @offset and @origin.
+ * Locking must be provided by the caller.
+ */
 loff_t
 generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 {
-	loff_t retval;
 	struct inode *inode = file->f_mapping->host;
 
 	switch (origin) {
-		case SEEK_END:
-			offset += inode->i_size;
-			break;
-		case SEEK_CUR:
-			offset += file->f_pos;
+	case SEEK_END:
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		offset += file->f_pos;
+		break;
 	}
-	retval = -EINVAL;
-	if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
-		/* Special lock needed here? */
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-			file->f_version = 0;
-		}
-		retval = offset;
+
+	if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+		return -EINVAL;
+
+	/* Special lock needed here? */
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
 	}
-	return retval;
+
+	return offset;
 }
 EXPORT_SYMBOL(generic_file_llseek_unlocked);
 
+/**
+ * generic_file_llseek - generic llseek implementation for regular files
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * This is a generic implemenation of ->llseek useable for all normal local
+ * filesystems.  It just updates the file offset to the value specified by
+ * @offset and @origin under i_mutex.
+ */
 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 {
-	loff_t n;
+	loff_t rval;
+
 	mutex_lock(&file->f_dentry->d_inode->i_mutex);
-	n = generic_file_llseek_unlocked(file, offset, origin);
+	rval = generic_file_llseek_unlocked(file, offset, origin);
 	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-	return n;
+
+	return rval;
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
diff --git a/fs/readdir.c b/fs/readdir.c
index 4e026e5407fb..b318d9b5af2e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset
 	if (buf->result)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
@@ -115,7 +117,7 @@ asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * di
 	buf.dirent = dirent;
 
 	error = vfs_readdir(file, fillonedir, &buf);
-	if (error >= 0)
+	if (buf.result)
 		error = buf.result;
 
 	fput(file);
@@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
 	if (reclen > buf->count)
 		return -EINVAL;
 	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
 		return -EOVERFLOW;
+	}
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
@@ -205,9 +209,8 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren
 	buf.error = 0;
 
 	error = vfs_readdir(file, filldir, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		if (put_user(file->f_pos, &lastdirent->d_off))
@@ -215,8 +218,6 @@ asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * diren
 		else
 			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
@@ -289,19 +290,16 @@ asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * d
 	buf.error = 0;
 
 	error = vfs_readdir(file, filldir64, &buf);
-	if (error < 0)
-		goto out_putf;
-	error = buf.error;
+	if (error >= 0)
+		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
 		typeof(lastdirent->d_off) d_off = file->f_pos;
-		error = -EFAULT;
 		if (__put_user(d_off, &lastdirent->d_off))
-			goto out_putf;
-		error = count - buf.count;
+			error = -EFAULT;
+		else
+			error = count - buf.count;
 	}
-
-out_putf:
 	fput(file);
 out:
 	return error;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index a804903d31d1..33408417038c 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -296,6 +296,7 @@ const struct file_operations reiserfs_file_operations = {
 	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
 	.splice_write = generic_file_splice_write,
+	.llseek = generic_file_llseek,
 };
 
 const struct inode_operations reiserfs_file_inode_operations = {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 192269698a8a..6c4c2c69449f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1522,7 +1522,6 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
 
 {
 	struct cpu_key key;
-	struct dentry *result;
 	struct inode *inode;
 
 	key.on_disk_key.k_objectid = objectid;
@@ -1535,16 +1534,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
 		inode = NULL;
 	}
 	reiserfs_write_unlock(sb);
-	if (!inode)
-		inode = ERR_PTR(-ESTALE);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+
+	return d_obtain_alias(inode);
 }
 
 struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
@@ -2435,7 +2426,7 @@ static int reiserfs_write_full_page(struct page *page,
 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
 		} else {
-			if (test_set_buffer_locked(bh)) {
+			if (!trylock_buffer(bh)) {
 				redirty_page_for_writepage(wbc, page);
 				continue;
 			}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e396b2fa4743..9643c3bbeb3b 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -34,15 +34,10 @@
 **		        from within kupdate, it will ignore the immediate flag
 */
 
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
 #include <linux/time.h>
 #include <linux/semaphore.h>
-
 #include <linux/vmalloc.h>
 #include <linux/reiserfs_fs.h>
-
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/fcntl.h>
@@ -54,6 +49,9 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/uaccess.h>
+
+#include <asm/system.h>
 
 /* gets a struct reiserfs_journal_list * from a list head */
 #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -558,13 +556,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
 static inline void lock_journal(struct super_block *p_s_sb)
 {
 	PROC_INFO_INC(p_s_sb, journal.lock_journal);
-	down(&SB_JOURNAL(p_s_sb)->j_lock);
+	mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex);
 }
 
 /* unlock the current transaction */
 static inline void unlock_journal(struct super_block *p_s_sb)
 {
-	up(&SB_JOURNAL(p_s_sb)->j_lock);
+	mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex);
 }
 
 static inline void get_journal_list(struct reiserfs_journal_list *jl)
@@ -629,7 +627,7 @@ static int journal_list_still_alive(struct super_block *s,
 static void release_buffer_page(struct buffer_head *bh)
 {
 	struct page *page = bh->b_page;
-	if (!page->mapping && !TestSetPageLocked(page)) {
+	if (!page->mapping && trylock_page(page)) {
 		page_cache_get(page);
 		put_bh(bh);
 		if (!page->mapping)
@@ -857,7 +855,7 @@ static int write_ordered_buffers(spinlock_t * lock,
 		jh = JH_ENTRY(list->next);
 		bh = jh->bh;
 		get_bh(bh);
-		if (test_set_buffer_locked(bh)) {
+		if (!trylock_buffer(bh)) {
 			if (!buffer_dirty(bh)) {
 				list_move(&jh->list, &tmp);
 				goto loop_next;
@@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s,
 	}
 
 	/* make sure nobody is trying to flush this one at the same time */
-	down(&jl->j_commit_lock);
+	mutex_lock(&jl->j_commit_mutex);
 	if (!journal_list_still_alive(s, trans_id)) {
-		up(&jl->j_commit_lock);
+		mutex_unlock(&jl->j_commit_mutex);
 		goto put_jl;
 	}
 	BUG_ON(jl->j_trans_id == 0);
@@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s,
 		if (flushall) {
 			atomic_set(&(jl->j_older_commits_done), 1);
 		}
-		up(&jl->j_commit_lock);
+		mutex_unlock(&jl->j_commit_mutex);
 		goto put_jl;
 	}
 
@@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s,
 	if (flushall) {
 		atomic_set(&(jl->j_older_commits_done), 1);
 	}
-	up(&jl->j_commit_lock);
+	mutex_unlock(&jl->j_commit_mutex);
       put_jl:
 	put_journal_list(s, jl);
 
@@ -1411,8 +1409,8 @@ static int flush_journal_list(struct super_block *s,
 
 	/* if flushall == 0, the lock is already held */
 	if (flushall) {
-		down(&journal->j_flush_sem);
-	} else if (!down_trylock(&journal->j_flush_sem)) {
+		mutex_lock(&journal->j_flush_mutex);
+	} else if (mutex_trylock(&journal->j_flush_mutex)) {
 		BUG();
 	}
 
@@ -1642,7 +1640,7 @@ static int flush_journal_list(struct super_block *s,
 	jl->j_state = 0;
 	put_journal_list(s, jl);
 	if (flushall)
-		up(&journal->j_flush_sem);
+		mutex_unlock(&journal->j_flush_mutex);
 	put_fs_excl();
 	return err;
 }
@@ -1772,12 +1770,12 @@ static int kupdate_transactions(struct super_block *s,
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	chunk.nr = 0;
 
-	down(&journal->j_flush_sem);
+	mutex_lock(&journal->j_flush_mutex);
 	if (!journal_list_still_alive(s, orig_trans_id)) {
 		goto done;
 	}
 
-	/* we've got j_flush_sem held, nobody is going to delete any
+	/* we've got j_flush_mutex held, nobody is going to delete any
 	 * of these lists out from underneath us
 	 */
 	while ((num_trans && transactions_flushed < num_trans) ||
@@ -1812,7 +1810,7 @@ static int kupdate_transactions(struct super_block *s,
 	}
 
       done:
-	up(&journal->j_flush_sem);
+	mutex_unlock(&journal->j_flush_mutex);
 	return ret;
 }
 
@@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 	INIT_LIST_HEAD(&jl->j_working_list);
 	INIT_LIST_HEAD(&jl->j_tail_bh_list);
 	INIT_LIST_HEAD(&jl->j_bh_list);
-	sema_init(&jl->j_commit_lock, 1);
+	mutex_init(&jl->j_commit_mutex);
 	SB_JOURNAL(s)->j_num_lists++;
 	get_journal_list(jl);
 	return jl;
@@ -2577,7 +2575,7 @@ static int release_journal_dev(struct super_block *super,
 	if (journal->j_dev_bd != NULL) {
 		if (journal->j_dev_bd->bd_dev != super->s_dev)
 			bd_release(journal->j_dev_bd);
-		result = blkdev_put(journal->j_dev_bd);
+		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
 
@@ -2595,7 +2593,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	int blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
@@ -2610,6 +2608,7 @@ static int journal_init_dev(struct super_block *super,
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
 		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
 			journal->j_dev_bd = NULL;
@@ -2620,7 +2619,7 @@ static int journal_init_dev(struct super_block *super,
 		} else if (jdev != super->s_dev) {
 			result = bd_claim(journal->j_dev_bd, journal);
 			if (result) {
-				blkdev_put(journal->j_dev_bd);
+				blkdev_put(journal->j_dev_bd, blkdev_mode);
 				return result;
 			}
 
@@ -2630,7 +2629,9 @@ static int journal_init_dev(struct super_block *super,
 		return 0;
 	}
 
-	journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal);
+	journal->j_dev_mode = blkdev_mode;
+	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
+						blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
@@ -2837,8 +2838,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
 	journal->j_last = NULL;
 	journal->j_first = NULL;
 	init_waitqueue_head(&(journal->j_join_wait));
-	sema_init(&journal->j_lock, 1);
-	sema_init(&journal->j_flush_sem, 1);
+	mutex_init(&journal->j_mutex);
+	mutex_init(&journal->j_flush_mutex);
 
 	journal->j_trans_id = 10;
 	journal->j_mount_id = 10;
@@ -3873,7 +3874,7 @@ int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
 {
 	PROC_INFO_INC(p_s_sb, journal.prepare);
 
-	if (test_set_buffer_locked(bh)) {
+	if (!trylock_buffer(bh)) {
 		if (!wait)
 			return 0;
 		lock_buffer(bh);
@@ -4030,7 +4031,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	 * the new transaction is fully setup, and we've already flushed the
 	 * ordered bh list
 	 */
-	down(&jl->j_commit_lock);
+	mutex_lock(&jl->j_commit_mutex);
 
 	/* save the transaction id in case we need to commit it later */
 	commit_trans_id = jl->j_trans_id;
@@ -4196,7 +4197,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 		lock_kernel();
 	}
 	BUG_ON(!list_empty(&jl->j_tail_bh_list));
-	up(&jl->j_commit_lock);
+	mutex_unlock(&jl->j_commit_mutex);
 
 	/* honor the flush wishes from the caller, simple commits can
 	 ** be done outside the journal lock, they are done below
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c1add28dd45e..f89ebb943f3f 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -383,7 +383,6 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
 	struct inode *inode = NULL;
 	struct reiserfs_dir_entry de;
 	INITIALIZE_PATH(path_to_entry);
-	struct dentry *parent;
 	struct inode *dir = child->d_inode;
 
 	if (dir->i_nlink == 0) {
@@ -401,15 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
 	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
 	reiserfs_write_unlock(dir->i_sb);
 
-	if (!inode || IS_ERR(inode)) {
-		return ERR_PTR(-EACCES);
-	}
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(inode);
 }
 
 /* add entry to the directory (entry can be hidden). 
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index b9dbeeca7049..37173fa07d15 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -8,8 +8,6 @@
 
 /* proc info support a la one created by Sizif@Botik.RU for PGC */
 
-/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
-
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/seq_file.h>
@@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start,
 #endif
 
 /*
- * $Log: procfs.c,v $
  * Revision 1.1.8.2  2001/07/15 17:08:42  god
  *  . use get_super() in procfs.c
  *  . remove remove_save_link() from reiserfs_do_truncate()
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1d40f2bd1970..663a91f5dce8 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -22,11 +22,11 @@
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
+#include <linux/quotaops.h>
 #include <linux/vfs.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/quotaops.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -182,7 +182,7 @@ static int finish_unfinished(struct super_block *s)
 			int ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s,
-						 "reiserfs: cannot turn on journalled quota: error %d",
+						 "reiserfs: cannot turn on journaled quota: error %d",
 						 ret);
 		}
 	}
@@ -520,7 +520,7 @@ static void reiserfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
@@ -876,7 +876,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 				     mount options were selected. */
 				  unsigned long *blocks,	/* strtol-ed from NNN of resize=NNN */
 				  char **jdev_name,
-				  unsigned int *commit_max_age)
+				  unsigned int *commit_max_age,
+				  char **qf_names,
+				  unsigned int *qfmt)
 {
 	int c;
 	char *arg = NULL;
@@ -992,9 +994,11 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		if (c == 'u' || c == 'g') {
 			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
-			if (sb_any_quota_enabled(s)) {
+			if ((sb_any_quota_enabled(s) ||
+			     sb_any_quota_suspended(s)) &&
+			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 				reiserfs_warning(s,
-						 "reiserfs_parse_options: cannot change journalled quota options when quota turned on.");
+						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
 				return 0;
 			}
 			if (*arg) {	/* Some filename specified? */
@@ -1011,46 +1015,54 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 							 "reiserfs_parse_options: quotafile must be on filesystem root.");
 					return 0;
 				}
-				REISERFS_SB(s)->s_qf_names[qtype] =
+				qf_names[qtype] =
 				    kmalloc(strlen(arg) + 1, GFP_KERNEL);
-				if (!REISERFS_SB(s)->s_qf_names[qtype]) {
+				if (!qf_names[qtype]) {
 					reiserfs_warning(s,
 							 "reiserfs_parse_options: not enough memory for storing quotafile name.");
 					return 0;
 				}
-				strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
+				strcpy(qf_names[qtype], arg);
 				*mount_options |= 1 << REISERFS_QUOTA;
 			} else {
-				kfree(REISERFS_SB(s)->s_qf_names[qtype]);
-				REISERFS_SB(s)->s_qf_names[qtype] = NULL;
+				if (qf_names[qtype] !=
+				    REISERFS_SB(s)->s_qf_names[qtype])
+					kfree(qf_names[qtype]);
+				qf_names[qtype] = NULL;
 			}
 		}
 		if (c == 'f') {
 			if (!strcmp(arg, "vfsold"))
-				REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD;
+				*qfmt = QFMT_VFS_OLD;
 			else if (!strcmp(arg, "vfsv0"))
-				REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0;
+				*qfmt = QFMT_VFS_V0;
 			else {
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 			}
+			if ((sb_any_quota_enabled(s) ||
+			     sb_any_quota_suspended(s)) &&
+			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
+				reiserfs_warning(s,
+						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
+				return 0;
+			}
 		}
 #else
 		if (c == 'u' || c == 'g' || c == 'f') {
 			reiserfs_warning(s,
-					 "reiserfs_parse_options: journalled quota options not supported.");
+					 "reiserfs_parse_options: journaled quota options not supported.");
 			return 0;
 		}
 #endif
 	}
 
 #ifdef CONFIG_QUOTA
-	if (!REISERFS_SB(s)->s_jquota_fmt
-	    && (REISERFS_SB(s)->s_qf_names[USRQUOTA]
-		|| REISERFS_SB(s)->s_qf_names[GRPQUOTA])) {
+	if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
+	    && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
 		reiserfs_warning(s,
-				 "reiserfs_parse_options: journalled quota format not specified.");
+				 "reiserfs_parse_options: journaled quota format not specified.");
 		return 0;
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
@@ -1130,6 +1142,21 @@ static void handle_attrs(struct super_block *s)
 	}
 }
 
+#ifdef CONFIG_QUOTA
+static void handle_quota_files(struct super_block *s, char **qf_names,
+			       unsigned int *qfmt)
+{
+	int i;
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+			kfree(REISERFS_SB(s)->s_qf_names[i]);
+		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
+	}
+	REISERFS_SB(s)->s_jquota_fmt = *qfmt;
+}
+#endif
+
 static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 {
 	struct reiserfs_super_block *rs;
@@ -1141,23 +1168,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	char *new_opts = kstrdup(arg, GFP_KERNEL);
 	int err;
+	char *qf_names[MAXQUOTAS];
+	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
+
+	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
-	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
+	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
+	    qf_names, &qfmt)) {
 #ifdef CONFIG_QUOTA
-		for (i = 0; i < MAXQUOTAS; i++) {
-			kfree(REISERFS_SB(s)->s_qf_names[i]);
-			REISERFS_SB(s)->s_qf_names[i] = NULL;
-		}
+		for (i = 0; i < MAXQUOTAS; i++)
+			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+				kfree(qf_names[i]);
 #endif
 		err = -EINVAL;
 		goto out_err;
 	}
+#ifdef CONFIG_QUOTA
+	handle_quota_files(s, qf_names, &qfmt);
+#endif
 
 	handle_attrs(s);
 
@@ -1570,6 +1604,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	char *jdev_name;
 	struct reiserfs_sb_info *sbi;
 	int errval = -EINVAL;
+	char *qf_names[MAXQUOTAS] = {};
+	unsigned int qfmt = 0;
 
 	save_mount_options(s, data);
 
@@ -1597,9 +1633,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	jdev_name = NULL;
 	if (reiserfs_parse_options
 	    (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
-	     &commit_max_age) == 0) {
+	     &commit_max_age, qf_names, &qfmt) == 0) {
 		goto error;
 	}
+#ifdef CONFIG_QUOTA
+	handle_quota_files(s, qf_names, &qfmt);
+#endif
 
 	if (blocks) {
 		SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option "
@@ -1819,7 +1858,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 
 	return (0);
 
-      error:
+error:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
@@ -1830,10 +1869,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 #ifdef CONFIG_QUOTA
 	{
 		int j;
-		for (j = 0; j < MAXQUOTAS; j++) {
-			kfree(sbi->s_qf_names[j]);
-			sbi->s_qf_names[j] = NULL;
-		}
+		for (j = 0; j < MAXQUOTAS; j++)
+			kfree(qf_names[j]);
 	}
 #endif
 	kfree(sbi);
@@ -1980,7 +2017,7 @@ static int reiserfs_release_dquot(struct dquot *dquot)
 
 static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
 {
-	/* Are we journalling quotas? */
+	/* Are we journaling quotas? */
 	if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
 	    REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
 		dquot_mark_dquot_dirty(dquot);
@@ -2021,26 +2058,27 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type)
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *path, int remount)
+			     char *name, int remount)
 {
 	int err;
-	struct nameidata nd;
+	struct path path;
 	struct inode *inode;
+	struct reiserfs_transaction_handle th;
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
 	/* No more checks needed? Path and format_id are bogus anyway... */
 	if (remount)
-		return vfs_quota_on(sb, type, format_id, path, 1);
-	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+		return vfs_quota_on(sb, type, format_id, name, 1);
+	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
 	/* Quotafile not on the same filesystem? */
-	if (nd.path.mnt->mnt_sb != sb) {
-		path_put(&nd.path);
-		return -EXDEV;
+	if (path.mnt->mnt_sb != sb) {
+		err = -EXDEV;
+		goto out;
 	}
-	inode = nd.path.dentry->d_inode;
+	inode = path.dentry->d_inode;
 	/* We must not pack tails for quota files on reiserfs for quota IO to work */
 	if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
 		err = reiserfs_unpack(inode, NULL);
@@ -2048,24 +2086,37 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 			reiserfs_warning(sb,
 				"reiserfs: Unpacking tail of quota file failed"
 				" (%d). Cannot turn on quotas.", err);
-			path_put(&nd.path);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		mark_inode_dirty(inode);
 	}
-	/* Not journalling quota? No more tests needed... */
-	if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] &&
-	    !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) {
-		path_put(&nd.path);
-		return vfs_quota_on(sb, type, format_id, path, 0);
-	}
-	/* Quotafile not of fs root? */
-	if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
-		reiserfs_warning(sb,
+	/* Journaling quota? */
+	if (REISERFS_SB(sb)->s_qf_names[type]) {
+		/* Quotafile not of fs root? */
+		if (path.dentry->d_parent != sb->s_root)
+			reiserfs_warning(sb,
 				 "reiserfs: Quota file not on filesystem root. "
 				 "Journalled quota will not work.");
-	path_put(&nd.path);
-	return vfs_quota_on(sb, type, format_id, path, 0);
+	}
+
+	/*
+	 * When we journal data on quota file, we have to flush journal to see
+	 * all updates to the file when we bypass pagecache...
+	 */
+	if (reiserfs_file_data_log(inode)) {
+		/* Just start temporary transaction and finish it */
+		err = journal_begin(&th, sb, 1);
+		if (err)
+			goto out;
+		err = journal_end_sync(&th, sb, 1);
+		if (err)
+			goto out;
+	}
+	err = vfs_quota_on_path(sb, type, format_id, &path);
+out:
+	path_put(&path);
+	return err;
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d7c4935c1034..ad92461cbfc3 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode,
 	xadir = open_xa_dir(inode, flags);
 	if (IS_ERR(xadir)) {
 		return ERR_CAST(xadir);
-	} else if (xadir && !xadir->d_inode) {
+	} else if (!xadir->d_inode) {
 		dput(xadir);
 		return ERR_PTR(-ENODATA);
 	}
@@ -1250,7 +1250,7 @@ static int reiserfs_check_acl(struct inode *inode, int mask)
 	return error;
 }
 
-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int reiserfs_permission(struct inode *inode, int mask)
 {
 	/*
 	 * We don't do permission checks on the internal objects.
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 5e90a95ad60b..056008db1377 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -6,8 +6,6 @@
 #include <linux/reiserfs_xattr.h>
 #include <asm/uaccess.h>
 
-#define XATTR_SECURITY_PREFIX "security."
-
 static int
 security_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 024a938ca60f..60abe2bb1f98 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -7,8 +7,6 @@
 #include <linux/reiserfs_xattr.h>
 #include <asm/uaccess.h>
 
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static int
 trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 073f39364b11..1384efcb938e 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -10,8 +10,6 @@
 # include <linux/reiserfs_acl.h>
 #endif
 
-#define XATTR_USER_PREFIX "user."
-
 static int
 user_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 3f13d491c7c7..60d2f822e87b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -418,7 +418,8 @@ static int
 romfs_readpage(struct file *file, struct page * page)
 {
 	struct inode *inode = page->mapping->host;
-	loff_t offset, avail, readlen;
+	loff_t offset, size;
+	unsigned long filled;
 	void *buf;
 	int result = -EIO;
 
@@ -430,21 +431,29 @@ romfs_readpage(struct file *file, struct page * page)
 
 	/* 32 bit warning -- but not for us :) */
 	offset = page_offset(page);
-	if (offset < i_size_read(inode)) {
-		avail = inode->i_size-offset;
-		readlen = min_t(unsigned long, avail, PAGE_SIZE);
-		if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) {
-			if (readlen < PAGE_SIZE) {
-				memset(buf + readlen,0,PAGE_SIZE-readlen);
-			}
-			SetPageUptodate(page);
-			result = 0;
+	size = i_size_read(inode);
+	filled = 0;
+	result = 0;
+	if (offset < size) {
+		unsigned long readlen;
+
+		size -= offset;
+		readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+		filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
+
+		if (filled != readlen) {
+			SetPageError(page);
+			filled = 0;
+			result = -EIO;
 		}
 	}
-	if (result) {
-		memset(buf, 0, PAGE_SIZE);
-		SetPageError(page);
-	}
+
+	if (filled < PAGE_SIZE)
+		memset(buf + filled, 0, PAGE_SIZE-filled);
+
+	if (!result)
+		SetPageUptodate(page);
 	flush_dcache_page(page);
 
 	unlock_page(page);
@@ -577,7 +586,7 @@ static void romfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct romfs_inode_info *ei = foo;
 
diff --git a/fs/select.c b/fs/select.c
index da0e88201c3a..87df51eadcf2 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -24,9 +24,64 @@
 #include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/rcupdate.h>
+#include <linux/hrtimer.h>
 
 #include <asm/uaccess.h>
 
+
+/*
+ * Estimate expected accuracy in ns from a timeval.
+ *
+ * After quite a bit of churning around, we've settled on
+ * a simple thing of taking 0.1% of the timeout as the
+ * slack, with a cap of 100 msec.
+ * "nice" tasks get a 0.5% slack instead.
+ *
+ * Consider this comment an open invitation to come up with even
+ * better solutions..
+ */
+
+static long __estimate_accuracy(struct timespec *tv)
+{
+	long slack;
+	int divfactor = 1000;
+
+	if (task_nice(current) > 0)
+		divfactor = divfactor / 5;
+
+	slack = tv->tv_nsec / divfactor;
+	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
+
+	if (slack > 100 * NSEC_PER_MSEC)
+		slack =  100 * NSEC_PER_MSEC;
+
+	if (slack < 0)
+		slack = 0;
+	return slack;
+}
+
+static long estimate_accuracy(struct timespec *tv)
+{
+	unsigned long ret;
+	struct timespec now;
+
+	/*
+	 * Realtime tasks get a slack of 0 for obvious reasons.
+	 */
+
+	if (rt_task(current))
+		return 0;
+
+	ktime_get_ts(&now);
+	now = timespec_sub(*tv, now);
+	ret = __estimate_accuracy(&now);
+	if (ret < current->timer_slack_ns)
+		return current->timer_slack_ns;
+	return ret;
+}
+
+
+
 struct poll_table_page {
 	struct poll_table_page * next;
 	struct poll_table_entry * entry;
@@ -130,6 +185,79 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+/**
+ * poll_select_set_timeout - helper function to setup the timeout value
+ * @to:		pointer to timespec variable for the final timeout
+ * @sec:	seconds (from user space)
+ * @nsec:	nanoseconds (from user space)
+ *
+ * Note, we do not use a timespec for the user space value here, That
+ * way we can use the function for timeval and compat interfaces as well.
+ *
+ * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
+ */
+int poll_select_set_timeout(struct timespec *to, long sec, long nsec)
+{
+	struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec};
+
+	if (!timespec_valid(&ts))
+		return -EINVAL;
+
+	/* Optimize for the zero timeout value here */
+	if (!sec && !nsec) {
+		to->tv_sec = to->tv_nsec = 0;
+	} else {
+		ktime_get_ts(to);
+		*to = timespec_add_safe(*to, ts);
+	}
+	return 0;
+}
+
+static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec rts;
+	struct timeval rtv;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&rts);
+	rts = timespec_sub(*end_time, rts);
+	if (rts.tv_sec < 0)
+		rts.tv_sec = rts.tv_nsec = 0;
+
+	if (timeval) {
+		rtv.tv_sec = rts.tv_sec;
+		rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+
+	} else if (!copy_to_user(p, &rts, sizeof(rts)))
+		return ret;
+
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
@@ -182,11 +310,13 @@ get_max:
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, s64 *timeout)
+int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 {
+	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
 	poll_table *wait;
-	int retval, i;
+	int retval, i, timed_out = 0;
+	unsigned long slack = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -198,12 +328,17 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 
 	poll_initwait(&table);
 	wait = &table.pt;
-	if (!*timeout)
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		wait = NULL;
+		timed_out = 1;
+	}
+
+	if (end_time && !timed_out)
+		slack = estimate_accuracy(end_time);
+
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -259,27 +394,25 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 			cond_resched();
 		}
 		wait = NULL;
-		if (retval || !*timeout || signal_pending(current))
+		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
 			retval = table.error;
 			break;
 		}
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
-			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -300,7 +433,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, s64 *timeout)
+			   fd_set __user *exp, struct timespec *end_time)
 {
 	fd_set_bits fds;
 	void *bits;
@@ -351,7 +484,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	zero_fd_set(n, fds.res_out);
 	zero_fd_set(n, fds.res_ex);
 
-	ret = do_select(n, &fds, timeout);
+	ret = do_select(n, &fds, end_time);
 
 	if (ret < 0)
 		goto out;
@@ -377,7 +510,7 @@ out_nofds:
 asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			fd_set __user *exp, struct timeval __user *tvp)
 {
-	s64 timeout = -1;
+	struct timespec end_time, *to = NULL;
 	struct timeval tv;
 	int ret;
 
@@ -385,43 +518,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 		if (copy_from_user(&tv, tvp, sizeof(tv)))
 			return -EFAULT;
 
-		if (tv.tv_sec < 0 || tv.tv_usec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
-			timeout += tv.tv_sec * HZ;
-		}
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tvp) {
-		struct timeval rtv;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
-		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) >= 0)
-			rtv = tv;
-		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, to);
+	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
 
 	return ret;
 }
@@ -431,25 +536,17 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		fd_set __user *exp, struct timespec __user *tsp,
 		const sigset_t __user *sigmask, size_t sigsetsize)
 {
-	s64 timeout = MAX_SCHEDULE_TIMEOUT;
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		if (ts.tv_sec < 0 || ts.tv_nsec < 0)
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
 			return -EINVAL;
-
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
 	}
 
 	if (sigmask) {
@@ -463,32 +560,8 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &timeout);
-
-	if (tsp) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND)
-				ret = -EINTR;
-		}
-	}
+	ret = core_sys_select(n, inp, outp, exp, &end_time);
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
 		/*
@@ -574,18 +647,24 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
-		   struct poll_wqueues *wait, s64 *timeout)
+		   struct poll_wqueues *wait, struct timespec *end_time)
 {
-	int count = 0;
 	poll_table* pt = &wait->pt;
+	ktime_t expire, *to = NULL;
+	int timed_out = 0, count = 0;
+	unsigned long slack = 0;
 
 	/* Optimise the no-wait case */
-	if (!(*timeout))
+	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
 		pt = NULL;
+		timed_out = 1;
+	}
+
+	if (end_time && !timed_out)
+		slack = estimate_accuracy(end_time);
 
 	for (;;) {
 		struct poll_list *walk;
-		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
@@ -617,27 +696,21 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			if (signal_pending(current))
 				count = -EINTR;
 		}
-		if (count || !*timeout)
+		if (count || timed_out)
 			break;
 
-		if (*timeout < 0) {
-			/* Wait indefinitely */
-			__timeout = MAX_SCHEDULE_TIMEOUT;
-		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
-			/*
-			 * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
-			 * a loop
-			 */
-			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
-			*timeout -= __timeout;
-		} else {
-			__timeout = *timeout;
-			*timeout = 0;
+		/*
+		 * If this is the first loop and we have a timeout
+		 * given, then we convert to ktime_t and set the to
+		 * pointer to the expiry value.
+		 */
+		if (end_time && !to) {
+			expire = timespec_to_ktime(*end_time);
+			to = &expire;
 		}
 
-		__timeout = schedule_timeout(__timeout);
-		if (*timeout >= 0)
-			*timeout += __timeout;
+		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			timed_out = 1;
 	}
 	__set_current_state(TASK_RUNNING);
 	return count;
@@ -646,7 +719,8 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
 			sizeof(struct pollfd))
 
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
+int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+		struct timespec *end_time)
 {
 	struct poll_wqueues table;
  	int err = -EFAULT, fdcount, len, size;
@@ -686,7 +760,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	}
 
 	poll_initwait(&table);
-	fdcount = do_poll(nfds, head, &table, timeout);
+	fdcount = do_poll(nfds, head, &table, end_time);
 	poll_freewait(&table);
 
 	for (walk = head; walk; walk = walk->next) {
@@ -712,16 +786,21 @@ out_fds:
 
 static long do_restart_poll(struct restart_block *restart_block)
 {
-	struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0;
-	int nfds = restart_block->arg1;
-	s64 timeout = ((s64)restart_block->arg3<<32) | (s64)restart_block->arg2;
+	struct pollfd __user *ufds = restart_block->poll.ufds;
+	int nfds = restart_block->poll.nfds;
+	struct timespec *to = NULL, end_time;
 	int ret;
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	if (restart_block->poll.has_timeout) {
+		end_time.tv_sec = restart_block->poll.tv_sec;
+		end_time.tv_nsec = restart_block->poll.tv_nsec;
+		to = &end_time;
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		restart_block->fn = do_restart_poll;
-		restart_block->arg2 = timeout & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout >> 32;
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -730,31 +809,32 @@ static long do_restart_poll(struct restart_block *restart_block)
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies;
+	struct timespec end_time, *to = NULL;
 	int ret;
 
-	if (timeout_msecs > 0) {
-#if HZ > 1000
-		/* We can only overflow if HZ > 1000 */
-		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
-			timeout_jiffies = -1;
-		else
-#endif
-			timeout_jiffies = msecs_to_jiffies(timeout_msecs) + 1;
-	} else {
-		/* Infinite (< 0) or no (0) timeout */
-		timeout_jiffies = timeout_msecs;
+	if (timeout_msecs >= 0) {
+		to = &end_time;
+		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
+			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
+	ret = do_sys_poll(ufds, nfds, to);
+
 	if (ret == -EINTR) {
 		struct restart_block *restart_block;
+
 		restart_block = &current_thread_info()->restart_block;
 		restart_block->fn = do_restart_poll;
-		restart_block->arg0 = (unsigned long)ufds;
-		restart_block->arg1 = nfds;
-		restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
-		restart_block->arg3 = (u64)timeout_jiffies >> 32;
+		restart_block->poll.ufds = ufds;
+		restart_block->poll.nfds = nfds;
+
+		if (timeout_msecs >= 0) {
+			restart_block->poll.tv_sec = end_time.tv_sec;
+			restart_block->poll.tv_nsec = end_time.tv_nsec;
+			restart_block->poll.has_timeout = 1;
+		} else
+			restart_block->poll.has_timeout = 0;
+
 		ret = -ERESTART_RESTARTBLOCK;
 	}
 	return ret;
@@ -766,21 +846,16 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
-	struct timespec ts;
-	s64 timeout = -1;
+	struct timespec ts, end_time, *to = NULL;
 	int ret;
 
 	if (tsp) {
 		if (copy_from_user(&ts, tsp, sizeof(ts)))
 			return -EFAULT;
 
-		/* Cast to u64 to make GCC stop complaining */
-		if ((u64)ts.tv_sec >= (u64)MAX_INT64_SECONDS)
-			timeout = -1;	/* infinite */
-		else {
-			timeout = DIV_ROUND_UP(ts.tv_nsec, NSEC_PER_SEC/HZ);
-			timeout += ts.tv_sec * HZ;
-		}
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
 	}
 
 	if (sigmask) {
@@ -794,7 +869,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = do_sys_poll(ufds, nfds, &timeout);
+	ret = do_sys_poll(ufds, nfds, to);
 
 	/* We can restart this syscall, usually */
 	if (ret == -EINTR) {
@@ -812,31 +887,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 	} else if (sigmask)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	if (tsp && timeout >= 0) {
-		struct timespec rts;
-
-		if (current->personality & STICKY_TIMEOUTS)
-			goto sticky;
-		/* Yes, we know it's actually an s64, but it's also positive. */
-		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
-						1000;
-		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) >= 0)
-			rts = ts;
-		if (copy_to_user(tsp, &rts, sizeof(rts))) {
-		sticky:
-			/*
-			 * If an application puts its timeval in read-only
-			 * memory, we don't want the Linux-specific update to
-			 * the timeval to cause a fault after the select has
-			 * completed successfully. However, because we're not
-			 * updating the timeval, we can't restart the system
-			 * call.
-			 */
-			if (ret == -ERESTARTNOHAND && timeout >= 0)
-				ret = -EINTR;
-		}
-	}
+	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	return ret;
 }
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3f54dbd6c49b..eba2eabcd2b8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 			goto Done;
 	}
 	/* we need at least one record in buffer */
+	pos = m->index;
+	p = m->op->start(m, &pos);
 	while (1) {
-		pos = m->index;
-		p = m->op->start(m, &pos);
 		err = PTR_ERR(p);
 		if (!p || IS_ERR(p))
 			break;
@@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 			break;
 		if (unlikely(err))
 			m->count = 0;
+		if (unlikely(!m->count)) {
+			p = m->op->next(m, p, &pos);
+			m->index = pos;
+			continue;
+		}
 		if (m->count < m->size)
 			goto Fill;
 		m->op->stop(m, p);
@@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 			goto Enomem;
 		m->count = 0;
 		m->version = 0;
+		pos = m->index;
+		p = m->op->start(m, &pos);
 	}
 	m->op->stop(m, p);
 	m->count = 0;
@@ -443,6 +450,37 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 	return -1;
 }
 
+int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+{
+	if (m->count < m->size) {
+		int len = bitmap_scnprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_bitmap);
+
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits)
+{
+	if (m->count < m->size) {
+		int len = bitmap_scnlistprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_bitmap_list);
+
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
 	return NULL + (*pos == 0);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 619725644c75..9c39bc7f8431 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,11 +205,19 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };
 
-asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
+asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
+			      size_t sizemask, int flags)
 {
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
 
+	/* Check the SFD_* constants for consistency.  */
+	BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+		return -EINVAL;
+
 	if (sizemask != sizeof(sigset_t) ||
 	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
 		return -EINVAL;
@@ -227,7 +235,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 		 * When we call this, the initialization must be complete, since
 		 * anon_inode_getfd() will install the fd.
 		 */
-		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx);
+		ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+				       flags & (O_CLOEXEC | O_NONBLOCK));
 		if (ufd < 0)
 			kfree(ctx);
 	} else {
@@ -249,3 +258,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 
 	return ufd;
 }
+
+asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
+			     size_t sizemask)
+{
+	return sys_signalfd4(ufd, user_mask, sizemask, 0);
+}
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
index 8182f0542a21..8c177eb7e344 100644
--- a/fs/smbfs/cache.c
+++ b/fs/smbfs/cache.c
@@ -13,7 +13,6 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/dirent.h>
 #include <linux/smb_fs.h>
 #include <linux/pagemap.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 2294783320cb..e4f8d51a5553 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -408,7 +408,7 @@ smb_file_release(struct inode *inode, struct file * file)
  * privileges, so we need our own check for this.
  */
 static int
-smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
+smb_file_permission(struct inode *inode, int mask)
 {
 	int mode = inode->i_mode;
 	int error = 0;
@@ -417,7 +417,7 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	/* Look at user permissions */
 	mode >>= 6;
-	if ((mode & 7 & mask) != mask)
+	if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
 		error = -EACCES;
 	return error;
 }
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 376ef3ee6ed7..3528f40ffb0f 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode)
 	kmem_cache_free(smb_inode_cachep, SMB_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct smb_inode_info *ei = (struct smb_inode_info *) foo;
 
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index d517a27b7f4b..ee536e8a649a 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -16,7 +16,6 @@
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/dcache.h>
-#include <linux/dirent.h>
 #include <linux/nls.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/splice.c b/fs/splice.c
index 399442179d89..a1e701c27156 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -371,7 +371,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			 * for an in-flight io page
 			 */
 			if (flags & SPLICE_F_NONBLOCK) {
-				if (TestSetPageLocked(page)) {
+				if (!trylock_page(page)) {
 					error = -EAGAIN;
 					break;
 				}
@@ -772,7 +772,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
 	ssize_t ret;
 	int err;
 
-	err = remove_suid(out->f_path.dentry);
+	err = file_remove_suid(out);
 	if (unlikely(err))
 		return err;
 
@@ -830,7 +830,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	ssize_t ret;
 
 	inode_double_lock(inode, pipe->inode);
-	ret = remove_suid(out->f_path.dentry);
+	ret = file_remove_suid(out);
 	if (likely(!ret))
 		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
 	inode_double_unlock(inode, pipe->inode);
@@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
 	ret = rw_verify_area(WRITE, out, ppos, len);
 	if (unlikely(ret < 0))
 		return ret;
@@ -1161,36 +1164,6 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 }
 
 /*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
-	int partial;
-
-	if (!access_ok(VERIFY_READ, src, n))
-		return -EFAULT;
-
-	pagefault_disable();
-	partial = __copy_from_user_inatomic(dst, src, n);
-	pagefault_enable();
-
-	/*
-	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
-	 */
-	if (unlikely(partial)) {
-		up_read(&current->mm->mmap_sem);
-		partial = copy_from_user(dst, src, n);
-		down_read(&current->mm->mmap_sem);
-	}
-
-	return partial;
-}
-
-/*
  * Map an iov into an array of pages and offset/length tupples. With the
  * partial_page structure, we can map several non-contiguous ranges into
  * our ones pages[] map instead of splitting that operation into pieces.
@@ -1203,8 +1176,6 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 {
 	int buffers = 0, error = 0;
 
-	down_read(&current->mm->mmap_sem);
-
 	while (nr_vecs) {
 		unsigned long off, npages;
 		struct iovec entry;
@@ -1213,7 +1184,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		int i;
 
 		error = -EFAULT;
-		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+		if (copy_from_user(&entry, iov, sizeof(entry)))
 			break;
 
 		base = entry.iov_base;
@@ -1247,9 +1218,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
 
-		error = get_user_pages(current, current->mm,
-				       (unsigned long) base, npages, 0, 0,
-				       &pages[buffers], NULL);
+		error = get_user_pages_fast((unsigned long)base, npages,
+					0, &pages[buffers]);
 
 		if (unlikely(error <= 0))
 			break;
@@ -1288,8 +1258,6 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		iov++;
 	}
 
-	up_read(&current->mm->mmap_sem);
-
 	if (buffers)
 		return buffers;
 
diff --git a/fs/stat.c b/fs/stat.c
index 9cf41f719d50..7c46fbeb8b76 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);
 
 int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd);
+	error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
 	if (!error) {
-		error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat);
-		path_put(&nd.path);
+		error = vfs_getattr(path.mnt, path.dentry, stat);
+		path_put(&path);
 	}
 	return error;
 }
@@ -77,13 +77,13 @@ EXPORT_SYMBOL(vfs_stat);
 
 int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = __user_walk_fd(dfd, name, 0, &nd);
+	error = user_path_at(dfd, name, 0, &path);
 	if (!error) {
-		error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat);
-		path_put(&nd.path);
+		error = vfs_getattr(path.mnt, path.dentry, stat);
+		path_put(&path);
 	}
 	return error;
 }
@@ -291,29 +291,29 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
 	return error;
 }
 
-asmlinkage long sys_readlinkat(int dfd, const char __user *path,
+asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
 				char __user *buf, int bufsiz)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = __user_walk_fd(dfd, path, 0, &nd);
+	error = user_path_at(dfd, pathname, 0, &path);
 	if (!error) {
-		struct inode *inode = nd.path.dentry->d_inode;
+		struct inode *inode = path.dentry->d_inode;
 
 		error = -EINVAL;
 		if (inode->i_op && inode->i_op->readlink) {
-			error = security_inode_readlink(nd.path.dentry);
+			error = security_inode_readlink(path.dentry);
 			if (!error) {
-				touch_atime(nd.path.mnt, nd.path.dentry);
-				error = inode->i_op->readlink(nd.path.dentry,
+				touch_atime(path.mnt, path.dentry);
+				error = inode->i_op->readlink(path.dentry,
 							      buf, bufsiz);
 			}
 		}
-		path_put(&nd.path);
+		path_put(&path);
 	}
 	return error;
 }
diff --git a/fs/super.c b/fs/super.c
index 453877c5697b..400a7608f15e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
+		INIT_LIST_HEAD(&s->s_dentry_lru);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -681,7 +682,7 @@ void emergency_remount(void)
  * filesystems which don't use real block-devices.  -- jrs
  */
 
-static struct idr unnamed_dev_idr;
+static DEFINE_IDA(unnamed_dev_ida);
 static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
 
 int set_anon_super(struct super_block *s, void *data)
@@ -690,10 +691,10 @@ int set_anon_super(struct super_block *s, void *data)
 	int error;
 
  retry:
-	if (idr_pre_get(&unnamed_dev_idr, GFP_ATOMIC) == 0)
+	if (ida_pre_get(&unnamed_dev_ida, GFP_ATOMIC) == 0)
 		return -ENOMEM;
 	spin_lock(&unnamed_dev_lock);
-	error = idr_get_new(&unnamed_dev_idr, NULL, &dev);
+	error = ida_get_new(&unnamed_dev_ida, &dev);
 	spin_unlock(&unnamed_dev_lock);
 	if (error == -EAGAIN)
 		/* We raced and lost with another CPU. */
@@ -703,7 +704,7 @@ int set_anon_super(struct super_block *s, void *data)
 
 	if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
 		spin_lock(&unnamed_dev_lock);
-		idr_remove(&unnamed_dev_idr, dev);
+		ida_remove(&unnamed_dev_ida, dev);
 		spin_unlock(&unnamed_dev_lock);
 		return -EMFILE;
 	}
@@ -719,17 +720,12 @@ void kill_anon_super(struct super_block *sb)
 
 	generic_shutdown_super(sb);
 	spin_lock(&unnamed_dev_lock);
-	idr_remove(&unnamed_dev_idr, slot);
+	ida_remove(&unnamed_dev_ida, slot);
 	spin_unlock(&unnamed_dev_lock);
 }
 
 EXPORT_SYMBOL(kill_anon_super);
 
-void __init unnamed_dev_init(void)
-{
-	idr_init(&unnamed_dev_idr);
-}
-
 void kill_litter_super(struct super_block *sb)
 {
 	if (sb->s_root)
@@ -759,9 +755,13 @@ int get_sb_bdev(struct file_system_type *fs_type,
 {
 	struct block_device *bdev;
 	struct super_block *s;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
-	bdev = open_bdev_excl(dev_name, flags, fs_type);
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
@@ -784,11 +784,12 @@ int get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, mode);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
+		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -806,7 +807,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 error:
 	return error;
 }
@@ -816,10 +817,11 @@ EXPORT_SYMBOL(get_sb_bdev);
 void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
+	fmode_t mode = sb->s_mode;
 
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, mode);
 }
 
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/sync.c b/fs/sync.c
index 228e17b5e9ee..2967562d416f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
  * before performing the write.
  *
  * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
- * range which are not presently under writeback.
+ * range which are not presently under writeback. Note that this may block for
+ * significant periods due to exhaustion of disk request structures.
  *
  * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
  * after performing the write.
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 006fc64227dd..66f6e58a7e4b 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	int size = dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
+	char *temp;
 
 	if (size) {
 		if (offs > size)
@@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 			count = size - offs;
 	}
 
+	temp = kmalloc(count, GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
 	mutex_lock(&bb->mutex);
 
 	count = fill_read(dentry, bb->buffer, offs, count);
-	if (count < 0)
-		goto out_unlock;
+	if (count < 0) {
+		mutex_unlock(&bb->mutex);
+		goto out_free;
+	}
 
-	if (copy_to_user(userbuf, bb->buffer, count)) {
+	memcpy(temp, bb->buffer, count);
+
+	mutex_unlock(&bb->mutex);
+
+	if (copy_to_user(userbuf, temp, count)) {
 		count = -EFAULT;
-		goto out_unlock;
+		goto out_free;
 	}
 
 	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
 
 	*off = offs + count;
 
- out_unlock:
-	mutex_unlock(&bb->mutex);
+ out_free:
+	kfree(temp);
 	return count;
 }
 
@@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	int size = dentry->d_inode->i_size;
 	loff_t offs = *off;
 	int count = min_t(size_t, bytes, PAGE_SIZE);
+	char *temp;
 
 	if (size) {
 		if (offs > size)
@@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 			count = size - offs;
 	}
 
-	mutex_lock(&bb->mutex);
+	temp = kmalloc(count, GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
 
-	if (copy_from_user(bb->buffer, userbuf, count)) {
+	if (copy_from_user(temp, userbuf, count)) {
 		count = -EFAULT;
-		goto out_unlock;
+		goto out_free;
 	}
 
+	mutex_lock(&bb->mutex);
+
+	memcpy(bb->buffer, temp, count);
+
 	count = flush_write(dentry, bb->buffer, offs, count);
+	mutex_unlock(&bb->mutex);
+
 	if (count > 0)
 		*off = offs + count;
 
- out_unlock:
-	mutex_unlock(&bb->mutex);
+out_free:
+	kfree(temp);
 	return count;
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 8c0e4b92574f..82d3b79d0e08 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 	memset(acxt, 0, sizeof(*acxt));
 	acxt->parent_sd = parent_sd;
 
-	/* Lookup parent inode.  inode initialization and I_NEW
-	 * clearing are protected by sysfs_mutex.  By grabbing it and
-	 * looking up with _nowait variant, inode state can be
-	 * determined reliably.
+	/* Lookup parent inode.  inode initialization is protected by
+	 * sysfs_mutex, so inode existence can be determined by
+	 * looking up inode while holding sysfs_mutex.
 	 */
 	mutex_lock(&sysfs_mutex);
 
-	inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
-				parent_sd);
+	inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
+			 parent_sd);
+	if (inode) {
+		WARN_ON(inode->i_state & I_NEW);
 
-	if (inode && !(inode->i_state & I_NEW)) {
 		/* parent inode available */
 		acxt->parent_inode = inode;
 
@@ -393,12 +393,11 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 			mutex_lock(&inode->i_mutex);
 			mutex_lock(&sysfs_mutex);
 		}
-	} else
-		iput(inode);
+	}
 }
 
 /**
- *	sysfs_add_one - add sysfs_dirent to parent
+ *	__sysfs_add_one - add sysfs_dirent to parent without warning
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
  *
@@ -417,7 +416,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	0 on success, -EEXIST if entry with the given name already
  *	exists.
  */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
 		return -EEXIST;
@@ -435,6 +434,36 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 }
 
 /**
+ *	sysfs_add_one - add sysfs_dirent to parent
+ *	@acxt: addrm context to use
+ *	@sd: sysfs_dirent to be added
+ *
+ *	Get @acxt->parent_sd and set sd->s_parent to it and increment
+ *	nlink of parent inode if @sd is a directory and link into the
+ *	children list of the parent.
+ *
+ *	This function should be called between calls to
+ *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
+ *	passed the same @acxt as passed to sysfs_addrm_start().
+ *
+ *	LOCKING:
+ *	Determined by sysfs_addrm_start().
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if entry with the given name already
+ *	exists.
+ */
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+{
+	int ret;
+
+	ret = __sysfs_add_one(acxt, sd);
+	WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' "
+		       "can not be created\n", sd->s_name);
+	return ret;
+}
+
+/**
  *	sysfs_remove_one - remove sysfs_dirent from parent
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be removed
@@ -606,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
 
 	return sd;
 }
+EXPORT_SYMBOL_GPL(sysfs_get_dirent);
 
 static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 		      const char *name, struct sysfs_dirent **p_sd)
@@ -799,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	if (!new_dentry)
 		goto out_unlock;
 
-	/* rename kobject and sysfs_dirent */
+	/* rename sysfs_dirent */
 	error = -ENOMEM;
 	new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
 	if (!new_name)
 		goto out_unlock;
 
-	error = kobject_set_name(kobj, "%s", new_name);
-	if (error)
-		goto out_unlock;
-
 	dup_name = sd->s_name;
 	sd->s_name = new_name;
 
@@ -957,4 +983,5 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 const struct file_operations sysfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
+	.llseek		= generic_file_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e7735f643cd1..1f4a3f877262 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,14 +14,23 @@
 #include <linux/kobject.h>
 #include <linux/kallsyms.h>
 #include <linux/slab.h>
+#include <linux/fsnotify.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/limits.h>
 #include <asm/uaccess.h>
 
 #include "sysfs.h"
 
+/* used in crash dumps to help with debugging */
+static char last_sysfs_file[PATH_MAX];
+void sysfs_printk_last_file(void)
+{
+	printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
+}
+
 /*
  * There's one sysfs_buffer for each open file and one
  * sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -327,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	struct sysfs_buffer *buffer;
 	struct sysfs_ops *ops;
 	int error = -EACCES;
+	char *p;
+
+	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
+	if (p)
+		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
 	if (!sysfs_get_active_two(attr_sd))
@@ -336,9 +350,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	if (kobj->ktype && kobj->ktype->sysfs_ops)
 		ops = kobj->ktype->sysfs_ops;
 	else {
-		printk(KERN_ERR "missing sysfs attribute operations for "
+		WARN(1, KERN_ERR "missing sysfs attribute operations for "
 		       "kobject: %s\n", kobject_name(kobj));
-		WARN_ON(1);
 		goto err_out;
 	}
 
@@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 	return POLLERR|POLLPRI;
 }
 
-void sysfs_notify(struct kobject *k, char *dir, char *attr)
+void sysfs_notify_dirent(struct sysfs_dirent *sd)
+{
+	struct sysfs_open_dirent *od;
+
+	spin_lock(&sysfs_open_dirent_lock);
+
+	od = sd->s_attr.open;
+	if (od) {
+		atomic_inc(&od->event);
+		wake_up_interruptible(&od->poll);
+	}
+
+	spin_unlock(&sysfs_open_dirent_lock);
+}
+EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
+
+void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 {
 	struct sysfs_dirent *sd = k->sd;
 
@@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
 		sd = sysfs_find_dirent(sd, dir);
 	if (sd && attr)
 		sd = sysfs_find_dirent(sd, attr);
-	if (sd) {
-		struct sysfs_open_dirent *od;
-
-		spin_lock(&sysfs_open_dirent_lock);
-
-		od = sd->s_attr.open;
-		if (od) {
-			atomic_inc(&od->event);
-			wake_up_interruptible(&od->poll);
-		}
-
-		spin_unlock(&sysfs_open_dirent_lock);
-	}
+	if (sd)
+		sysfs_notify_dirent(sd);
 
 	mutex_unlock(&sysfs_mutex);
 }
@@ -585,9 +603,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-	rc = notify_change(victim, &newattrs);
+	newattrs.ia_ctime = current_fs_time(inode->i_sb);
+	rc = sysfs_setattr(victim, &newattrs);
 
 	if (rc == 0) {
+		fsnotify_change(victim, newattrs.ia_valid);
 		mutex_lock(&sysfs_mutex);
 		victim_sd->s_mode = newattrs.ia_mode;
 		mutex_unlock(&sysfs_mutex);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index eeba38417b1d..fe611949a7f7 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -134,9 +134,8 @@ void sysfs_remove_group(struct kobject * kobj,
 	if (grp->name) {
 		sd = sysfs_get_dirent(dir_sd, grp->name);
 		if (!sd) {
-			printk(KERN_WARNING "sysfs group %p not found for "
+			WARN(!sd, KERN_WARNING "sysfs group %p not found for "
 				"kobject '%s'\n", grp, kobject_name(kobj));
-			WARN_ON(!sd);
 			return;
 		}
 	} else
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 14f0023984d7..ab343e371d64 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -16,6 +16,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 #include "sysfs.h"
 
@@ -115,3 +116,17 @@ out_err:
 	sysfs_dir_cachep = NULL;
 	goto out;
 }
+
+#undef sysfs_get
+struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+{
+	return __sysfs_get(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_get);
+
+#undef sysfs_put
+void sysfs_put(struct sysfs_dirent *sd)
+{
+	__sysfs_put(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 817f5966edca..a3ba217fbe74 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,13 +19,8 @@
 
 #include "sysfs.h"
 
-/**
- *	sysfs_create_link - create symlink between two objects.
- *	@kobj:	object whose directory we're creating the link in.
- *	@target:	object we're pointing to.
- *	@name:		name of the symlink.
- */
-int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
+static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
+				const char *name, int warn)
 {
 	struct sysfs_dirent *parent_sd = NULL;
 	struct sysfs_dirent *target_sd = NULL;
@@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt, parent_sd);
-	error = sysfs_add_one(&acxt, sd);
+	if (warn)
+		error = sysfs_add_one(&acxt, sd);
+	else
+		error = __sysfs_add_one(&acxt, sd);
 	sysfs_addrm_finish(&acxt);
 
 	if (error)
@@ -80,6 +78,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 }
 
 /**
+ *	sysfs_create_link - create symlink between two objects.
+ *	@kobj:	object whose directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ */
+int sysfs_create_link(struct kobject *kobj, struct kobject *target,
+		      const char *name)
+{
+	return sysfs_do_create_link(kobj, target, name, 1);
+}
+
+/**
+ *	sysfs_create_link_nowarn - create symlink between two objects.
+ *	@kobj:	object whose directory we're creating the link in.
+ *	@target:	object we're pointing to.
+ *	@name:		name of the symlink.
+ *
+ *	This function does the same as sysf_create_link(), but it
+ *	doesn't warn if the link already exists.
+ */
+int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
+			     const char *name)
+{
+	return sysfs_do_create_link(kobj, target, name, 0);
+}
+
+/**
  *	sysfs_remove_link - remove symlink in object's directory.
  *	@kobj:	object we're acting for.
  *	@name:	name of the symlink to remove.
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ce4e15f8aaeb..93c6d6b27c4d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
 void sysfs_put_active_two(struct sysfs_dirent *sd);
 void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 		       struct sysfs_dirent *parent_sd);
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
@@ -123,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 			struct sysfs_dirent **p_sd);
 void sysfs_remove_subdir(struct sysfs_dirent *sd);
 
-static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
 {
 	if (sd) {
 		WARN_ON(!atomic_read(&sd->s_count));
@@ -131,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
 	}
 	return sd;
 }
+#define sysfs_get(sd) __sysfs_get(sd)
 
-static inline void sysfs_put(struct sysfs_dirent *sd)
+static inline void __sysfs_put(struct sysfs_dirent *sd)
 {
 	if (sd && atomic_dec_and_test(&sd->s_count))
 		release_sysfs_dirent(sd);
 }
+#define sysfs_put(sd) __sysfs_put(sd)
 
 /*
  * inode.c
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c5d60de0658f..df0d435baa48 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode)
 	kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *p)
+static void init_once(void *p)
 {
 	struct sysv_inode_info *si = (struct sysv_inode_info *)p;
 
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d87d354ec424..0862f0e49d0c 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -52,11 +52,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
-	ktime_t now, remaining;
-
-	now = ctx->tmr.base->get_time();
-	remaining = ktime_sub(ctx->tmr.expires, now);
+	ktime_t remaining;
 
+	remaining = hrtimer_expires_remaining(&ctx->tmr);
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -74,7 +72,7 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
 	hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
-	ctx->tmr.expires = texp;
+	hrtimer_set_expires(&ctx->tmr, texp);
 	ctx->tmr.function = timerfd_tmrproc;
 	if (texp.tv64 != 0)
 		hrtimer_start(&ctx->tmr, texp, htmode);
@@ -184,7 +182,11 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
 	int ufd;
 	struct timerfd_ctx *ctx;
 
-	if (flags)
+	/* Check the TFD_* constants for consistency.  */
+	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
 		return -EINVAL;
 	if (clockid != CLOCK_MONOTONIC &&
 	    clockid != CLOCK_REALTIME)
@@ -198,7 +200,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
 	ctx->clockid = clockid;
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 
-	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx);
+	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
+			       flags & (O_CLOEXEC | O_NONBLOCK));
 	if (ufd < 0)
 		kfree(ctx);
 
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index d81fb9ed2b8e..1a4973e10664 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
 
-	/* And make sure we have twice the index size of space reserved */
-	idx_size <<= 1;
+	/* And make sure we have thrice the index size of space reserved */
+	idx_size = idx_size + (idx_size << 1);
 
 	/*
 	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
@@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
 	int subtract_lebs;
 	long long available;
 
-	/*
-	 * Force the amount available to the total size reported if the used
-	 * space is zero.
-	 */
-	if (c->lst.total_used <= UBIFS_INO_NODE_SZ &&
-	    c->budg_data_growth + c->budg_dd_growth == 0) {
-		/* Do the same calculation as for c->block_cnt */
-		available = c->main_lebs - 2;
-		available *= c->leb_size - c->dark_wm;
-		return available;
-	}
-
 	available = c->main_bytes - c->lst.total_used;
 
 	/*
@@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c)
  * This function makes sure UBIFS has enough free eraseblocks for index growth
  * and data.
  *
- * When budgeting index space, UBIFS reserves twice as more LEBs as the index
+ * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
  * would take if it was consolidated and written to the flash. This guarantees
  * that the "in-the-gaps" commit method always succeeds and UBIFS will always
  * be able to commit dirty index. So this function basically adds amount of
- * budgeted index space to the size of the current index, multiplies this by 2,
+ * budgeted index space to the size of the current index, multiplies this by 3,
  * and makes sure this does not exceed the amount of free eraseblocks.
  *
  * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
@@ -426,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c)
 	 *    @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
 	 *    @c->lst.taken_empty_lebs
 	 *
-	 * @empty_lebs are available because they are empty. @freeable_cnt are
-	 * available because they contain only free and dirty space and the
-	 * index allocation always occurs after wbufs are synch'ed.
-	 * @idx_gc_cnt are available because they are index LEBs that have been
-	 * garbage collected (including trivial GC) and are awaiting the commit
-	 * before they can be unmapped - note that the in-the-gaps method will
-	 * grab these if it needs them. @taken_empty_lebs are empty_lebs that
-	 * have already been allocated for some purpose (also includes those
-	 * LEBs on the @idx_gc list).
+	 * @c->lst.empty_lebs are available because they are empty.
+	 * @c->freeable_cnt are available because they contain only free and
+	 * dirty space, @c->idx_gc_cnt are available because they are index
+	 * LEBs that have been garbage collected and are awaiting the commit
+	 * before they can be used. And the in-the-gaps method will grab these
+	 * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have
+	 * already been allocated for some purpose.
 	 *
-	 * Note, @taken_empty_lebs may temporarily be higher by one because of
-	 * the way we serialize LEB allocations and budgeting. See a comment in
-	 * 'ubifs_find_free_space()'.
+	 * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because
+	 * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they
+	 * are taken until after the commit).
+	 *
+	 * Note, @c->lst.taken_empty_lebs may temporarily be higher by one
+	 * because of the way we serialize LEB allocations and budgeting. See a
+	 * comment in 'ubifs_find_free_space()'.
 	 */
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
@@ -543,8 +533,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 	int err, idx_growth, data_growth, dd_growth;
 	struct retries_info ri;
 
+	ubifs_assert(req->new_page <= 1);
+	ubifs_assert(req->dirtied_page <= 1);
+	ubifs_assert(req->new_dent <= 1);
+	ubifs_assert(req->mod_dent <= 1);
+	ubifs_assert(req->new_ino <= 1);
+	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
 	ubifs_assert(req->dirtied_ino <= 4);
 	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+	ubifs_assert(!(req->new_ino_d & 7));
+	ubifs_assert(!(req->dirtied_ino_d & 7));
 
 	data_growth = calc_data_growth(c, req);
 	dd_growth = calc_dd_growth(c, req);
@@ -618,8 +616,16 @@ again:
  */
 void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
+	ubifs_assert(req->new_page <= 1);
+	ubifs_assert(req->dirtied_page <= 1);
+	ubifs_assert(req->new_dent <= 1);
+	ubifs_assert(req->mod_dent <= 1);
+	ubifs_assert(req->new_ino <= 1);
+	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
 	ubifs_assert(req->dirtied_ino <= 4);
 	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+	ubifs_assert(!(req->new_ino_d & 7));
+	ubifs_assert(!(req->dirtied_ino_d & 7));
 	if (!req->recalculate) {
 		ubifs_assert(req->idx_growth >= 0);
 		ubifs_assert(req->data_growth >= 0);
@@ -647,7 +653,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
 
 	ubifs_assert(c->budg_idx_growth >= 0);
 	ubifs_assert(c->budg_data_growth >= 0);
+	ubifs_assert(c->budg_dd_growth >= 0);
 	ubifs_assert(c->min_idx_lebs < c->main_lebs);
+	ubifs_assert(!(c->budg_idx_growth & 7));
+	ubifs_assert(!(c->budg_data_growth & 7));
+	ubifs_assert(!(c->budg_dd_growth & 7));
 	spin_unlock(&c->space_lock);
 }
 
@@ -686,41 +696,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
 void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
 				      struct ubifs_inode *ui)
 {
-	struct ubifs_budget_req req = {.dd_growth = c->inode_budget,
-				       .dirtied_ino_d = ui->data_len};
+	struct ubifs_budget_req req;
 
+	memset(&req, 0, sizeof(struct ubifs_budget_req));
+	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
 	ubifs_release_budget(c, &req);
 }
 
 /**
- * ubifs_budg_get_free_space - return amount of free space.
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space application tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexind nodes as well. This introduces additional
+ * overhead, and UBIFS it has to report sligtly less free space to meet the
+ * above expectetion.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+{
+	int divisor, factor, f;
+
+	/*
+	 * Reported space size is @free * X, where X is UBIFS block size
+	 * divided by UBIFS block size + all overhead one data block
+	 * introduces. The overhead is the node header + indexing overhead.
+	 *
+	 * Indexing overhead calculations are based on the following formula:
+	 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+	 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
+	 * as less than maximum fanout, we assume that each data node
+	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+	 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+	 * for the index.
+	 */
+	f = c->fanout > 3 ? c->fanout >> 1 : 2;
+	factor = UBIFS_BLOCK_SIZE;
+	divisor = UBIFS_MAX_DATA_NODE_SZ;
+	divisor += (c->max_idx_node_sz * 3) / (f - 1);
+	free *= factor;
+	do_div(free, divisor);
+	return free;
+}
+
+/**
+ * ubifs_get_free_space - return amount of free space.
  * @c: UBIFS file-system description object
  *
- * This function returns amount of free space on the file-system.
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * amount of free flash space it has (well, because not all dirty space is
+ * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would bread user expectetion about what free space is. Users seem to
+ * accustomed to assume that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
-long long ubifs_budg_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space(struct ubifs_info *c)
 {
-	int min_idx_lebs, rsvd_idx_lebs;
+	int min_idx_lebs, rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	/* Do exactly the same calculations as in 'do_budget_space()' */
 	spin_lock(&c->space_lock);
 	min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	outstanding = c->budg_data_growth + c->budg_dd_growth;
 
-	if (min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
-	else
-		rsvd_idx_lebs = 0;
-
-	if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt
-				- c->lst.taken_empty_lebs) {
+	/*
+	 * Force the amount available to the total size reported if the used
+	 * space is zero.
+	 */
+	if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
 		spin_unlock(&c->space_lock);
-		return 0;
+		return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
 	}
 
 	available = ubifs_calc_available(c, min_idx_lebs);
-	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	c->min_idx_lebs = min_idx_lebs;
+
+	/*
+	 * When reporting free space to user-space, UBIFS guarantees that it is
+	 * possible to write a file of free space size. This means that for
+	 * empty LEBs we may use more precise calculations than
+	 * 'ubifs_calc_available()' is using. Namely, we know that in empty
+	 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+	 * Thus, amend the available space.
+	 *
+	 * Note, the calculations below are similar to what we have in
+	 * 'do_budget_space()', so refer there for comments.
+	 */
+	if (min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	lebs -= rsvd_idx_lebs;
+	available += lebs * (c->dark_wm - c->leb_overhead);
 	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 3b516316c9b3..0a6aa2cc78f0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c)
 			goto out_up;
 	}
 
+	c->cmt_no += 1;
 	err = ubifs_gc_start_commit(c);
 	if (err)
 		goto out_up;
@@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c)
 		goto out;
 
 	mutex_lock(&c->mst_mutex);
-	c->mst_node->cmt_no      = cpu_to_le64(++c->cmt_no);
+	c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);
 	c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);
 	c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);
 	c->mst_node->root_offs   = cpu_to_le32(zroot.offs);
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 5bb51dac3c16..a0ada596b17c 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
  *
  * Note, if the input buffer was not compressed, it is copied to the output
  * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
- *
- * This functions returns %0 on success or a negative error code on failure.
  */
 void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
 		    int *compr_type)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4e3aaeba4eca..7186400750e7 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
 {
 	const struct ubifs_inode *ui = ubifs_inode(inode);
 
-	printk(KERN_DEBUG "inode      %lu\n", inode->i_ino);
-	printk(KERN_DEBUG "size       %llu\n",
+	printk(KERN_DEBUG "Dump in-memory inode:");
+	printk(KERN_DEBUG "\tinode          %lu\n", inode->i_ino);
+	printk(KERN_DEBUG "\tsize           %llu\n",
 	       (unsigned long long)i_size_read(inode));
-	printk(KERN_DEBUG "nlink      %u\n", inode->i_nlink);
-	printk(KERN_DEBUG "uid        %u\n", (unsigned int)inode->i_uid);
-	printk(KERN_DEBUG "gid        %u\n", (unsigned int)inode->i_gid);
-	printk(KERN_DEBUG "atime      %u.%u\n",
+	printk(KERN_DEBUG "\tnlink          %u\n", inode->i_nlink);
+	printk(KERN_DEBUG "\tuid            %u\n", (unsigned int)inode->i_uid);
+	printk(KERN_DEBUG "\tgid            %u\n", (unsigned int)inode->i_gid);
+	printk(KERN_DEBUG "\tatime          %u.%u\n",
 	       (unsigned int)inode->i_atime.tv_sec,
 	       (unsigned int)inode->i_atime.tv_nsec);
-	printk(KERN_DEBUG "mtime      %u.%u\n",
+	printk(KERN_DEBUG "\tmtime          %u.%u\n",
 	       (unsigned int)inode->i_mtime.tv_sec,
 	       (unsigned int)inode->i_mtime.tv_nsec);
-	printk(KERN_DEBUG "ctime       %u.%u\n",
+	printk(KERN_DEBUG "\tctime          %u.%u\n",
 	       (unsigned int)inode->i_ctime.tv_sec,
 	       (unsigned int)inode->i_ctime.tv_nsec);
-	printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
-	printk(KERN_DEBUG "xattr_size  %u\n", ui->xattr_size);
-	printk(KERN_DEBUG "xattr_cnt   %u\n", ui->xattr_cnt);
-	printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
-	printk(KERN_DEBUG "dirty       %u\n", ui->dirty);
-	printk(KERN_DEBUG "xattr       %u\n", ui->xattr);
-	printk(KERN_DEBUG "flags       %d\n", ui->flags);
-	printk(KERN_DEBUG "compr_type  %d\n", ui->compr_type);
-	printk(KERN_DEBUG "data_len    %d\n", ui->data_len);
+	printk(KERN_DEBUG "\tcreat_sqnum    %llu\n", ui->creat_sqnum);
+	printk(KERN_DEBUG "\txattr_size     %u\n", ui->xattr_size);
+	printk(KERN_DEBUG "\txattr_cnt      %u\n", ui->xattr_cnt);
+	printk(KERN_DEBUG "\txattr_names    %u\n", ui->xattr_names);
+	printk(KERN_DEBUG "\tdirty          %u\n", ui->dirty);
+	printk(KERN_DEBUG "\txattr          %u\n", ui->xattr);
+	printk(KERN_DEBUG "\tbulk_read      %u\n", ui->xattr);
+	printk(KERN_DEBUG "\tsynced_i_size  %llu\n",
+	       (unsigned long long)ui->synced_i_size);
+	printk(KERN_DEBUG "\tui_size        %llu\n",
+	       (unsigned long long)ui->ui_size);
+	printk(KERN_DEBUG "\tflags          %d\n", ui->flags);
+	printk(KERN_DEBUG "\tcompr_type     %d\n", ui->compr_type);
+	printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
+	printk(KERN_DEBUG "\tread_in_a_row  %lu\n", ui->read_in_a_row);
+	printk(KERN_DEBUG "\tdata_len       %d\n", ui->data_len);
 }
 
 void dbg_dump_node(const struct ubifs_info *c, const void *node)
@@ -538,7 +546,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
 		for (i = 0; i < n; i++)
 			printk(KERN_DEBUG "\t  ino %llu\n",
-			       le64_to_cpu(orph->inos[i]));
+			       (unsigned long long)le64_to_cpu(orph->inos[i]));
 		break;
 	}
 	default:
@@ -568,8 +576,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
 {
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs  %d\n",
-	       lst->empty_lebs, lst->idx_lebs);
+	printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
+	       "idx_lebs  %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
 	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
 	       "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
 	       lst->total_dirty);
@@ -587,8 +595,8 @@ void dbg_dump_budg(struct ubifs_info *c)
 	struct ubifs_gced_idx_leb *idx_gc;
 
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, "
-	       "budg_dd_growth %lld, budg_idx_growth %lld\n",
+	printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
+	       "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
 	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
 	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
 	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
@@ -634,7 +642,7 @@ void dbg_dump_lprops(struct ubifs_info *c)
 	struct ubifs_lprops lp;
 	struct ubifs_lp_stats lst;
 
-	printk(KERN_DEBUG "Dumping LEB properties\n");
+	printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
 	ubifs_get_lp_stats(c, &lst);
 	dbg_dump_lstats(&lst);
 
@@ -647,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c)
 	}
 }
 
+void dbg_dump_lpt_info(struct ubifs_info *c)
+{
+	int i;
+
+	spin_lock(&dbg_lock);
+	printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz);
+	printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz);
+	printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz);
+	printk(KERN_DEBUG "\tltab_sz:       %d\n", c->ltab_sz);
+	printk(KERN_DEBUG "\tlsave_sz:      %d\n", c->lsave_sz);
+	printk(KERN_DEBUG "\tbig_lpt:       %d\n", c->big_lpt);
+	printk(KERN_DEBUG "\tlpt_hght:      %d\n", c->lpt_hght);
+	printk(KERN_DEBUG "\tpnode_cnt:     %d\n", c->pnode_cnt);
+	printk(KERN_DEBUG "\tnnode_cnt:     %d\n", c->nnode_cnt);
+	printk(KERN_DEBUG "\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt);
+	printk(KERN_DEBUG "\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt);
+	printk(KERN_DEBUG "\tlsave_cnt:     %d\n", c->lsave_cnt);
+	printk(KERN_DEBUG "\tspace_bits:    %d\n", c->space_bits);
+	printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+	printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+	printk(KERN_DEBUG "\tlpt_spc_bits:  %d\n", c->lpt_spc_bits);
+	printk(KERN_DEBUG "\tpcnt_bits:     %d\n", c->pcnt_bits);
+	printk(KERN_DEBUG "\tlnum_bits:     %d\n", c->lnum_bits);
+	printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+	printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
+	       c->nhead_lnum, c->nhead_offs);
+	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
+		       c->lsave_lnum, c->lsave_offs);
+	for (i = 0; i < c->lpt_lebs; i++)
+		printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d "
+		       "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
+		       c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
+	spin_unlock(&dbg_lock);
+}
+
 void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 {
 	struct ubifs_scan_leb *sleb;
@@ -655,7 +700,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 	if (dbg_failure_mode)
 		return;
 
-	printk(KERN_DEBUG "Dumping LEB %d\n", lnum);
+	printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
 
 	sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
 	if (IS_ERR(sleb)) {
@@ -720,8 +765,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 {
 	int i;
 
-	printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n",
-	       cat, heap->cnt);
+	printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
+	       current->pid, cat, heap->cnt);
 	for (i = 0; i < heap->cnt; i++) {
 		struct ubifs_lprops *lprops = heap->arr[i];
 
@@ -736,7 +781,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 {
 	int i;
 
-	printk(KERN_DEBUG "Dumping pnode:\n");
+	printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
 	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
 	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
 	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -755,7 +800,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
 	int level;
 
 	printk(KERN_DEBUG "\n");
-	printk(KERN_DEBUG "Dumping the TNC tree\n");
+	printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
 	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
 	level = znode->level;
 	printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -2208,16 +2253,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
 		  int offset, int len, int dtype)
 {
-	int err;
+	int err, failing;
 
 	if (in_failure_mode(desc))
 		return -EIO;
-	if (do_fail(desc, lnum, 1))
+	failing = do_fail(desc, lnum, 1);
+	if (failing)
 		cut_data(buf, len);
 	err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
 	if (err)
 		return err;
-	if (in_failure_mode(desc))
+	if (failing)
 		return -EIO;
 	return 0;
 }
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 3c4f1e93c9e0..33d6b95071e4 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,7 @@
 
 #define UBIFS_DBG(op) op
 
-#define ubifs_assert(expr)  do {                                               \
+#define ubifs_assert(expr) do {                                                \
 	if (unlikely(!(expr))) {                                               \
 		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
 		       __func__, __LINE__, current->pid);                      \
@@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c,
 			 const union ubifs_key *key);
 
 /*
- * DBGKEY macros require dbg_lock to be held, which it is in the dbg message
+ * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
  * macros.
  */
 #define DBGKEY(key) dbg_key_str0(c, (key))
 #define DBGKEY1(key) dbg_key_str1(c, (key))
 
 /* General messages */
-#define dbg_gen(fmt, ...)        dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...)   dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
 
 /* Additional journal messages */
-#define dbg_jnl(fmt, ...)        dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
 
 /* Additional TNC messages */
-#define dbg_tnc(fmt, ...)        dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
 
 /* Additional lprops messages */
-#define dbg_lp(fmt, ...)         dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
 
 /* Additional LEB find messages */
-#define dbg_find(fmt, ...)       dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
 
 /* Additional mount messages */
-#define dbg_mnt(fmt, ...)        dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
 
 /* Additional I/O messages */
-#define dbg_io(fmt, ...)         dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
 
 /* Additional commit messages */
-#define dbg_cmt(fmt, ...)        dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
 
 /* Additional budgeting messages */
-#define dbg_budg(fmt, ...)       dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
 
 /* Additional log messages */
-#define dbg_log(fmt, ...)        dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
 
 /* Additional gc messages */
-#define dbg_gc(fmt, ...)         dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
 
 /* Additional scan messages */
-#define dbg_scan(fmt, ...)       dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
 
 /* Additional recovery messages */
-#define dbg_rcvry(fmt, ...)      dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
 
 /*
  * Debugging message type flags (must match msg_type_names in debug.c).
@@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
 void dbg_dump_budg(struct ubifs_info *c);
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
 void dbg_dump_lprops(struct ubifs_info *c);
+void dbg_dump_lpt_info(struct ubifs_info *c);
 void dbg_dump_leb(const struct ubifs_info *c, int lnum);
 void dbg_dump_znode(const struct ubifs_info *c,
 		    const struct ubifs_znode *znode);
@@ -239,34 +240,25 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
 				 struct ubifs_zbranch *zbr, void *priv);
 typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 				  struct ubifs_znode *znode, void *priv);
-
 int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
 
 /* Checking functions */
 
 int dbg_check_lprops(struct ubifs_info *c);
-
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
-
 int dbg_check_cats(struct ubifs_info *c);
-
 int dbg_check_ltab(struct ubifs_info *c);
-
+int dbg_chk_lpt_free_spc(struct ubifs_info *c);
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len);
 int dbg_check_synced_i_size(struct inode *inode);
-
 int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
-
 int dbg_check_tnc(struct ubifs_info *c, int extra);
-
 int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
-
 int dbg_check_filesystem(struct ubifs_info *c);
-
 void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
 		    int add_pos);
-
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
 			int row, int col);
@@ -329,71 +321,80 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #else /* !CONFIG_UBIFS_FS_DEBUG */
 
 #define UBIFS_DBG(op)
-#define ubifs_assert(expr)                         ({})
-#define ubifs_assert_cmt_locked(c)
+
+/* Use "if (0)" to make compiler check arguments even if debugging is off */
+#define ubifs_assert(expr)  do {                                               \
+	if (0 && (expr))                                                       \
+		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+		       __func__, __LINE__, current->pid);                      \
+} while (0)
+
+#define dbg_err(fmt, ...)   do {                                               \
+	if (0)                                                                 \
+		ubifs_err(fmt, ##__VA_ARGS__);                                 \
+} while (0)
+
+#define dbg_msg(fmt, ...) do {                                                 \
+	if (0)                                                                 \
+		printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n",         \
+		       current->pid, __func__, ##__VA_ARGS__);                 \
+} while (0)
+
 #define dbg_dump_stack()
-#define dbg_err(fmt, ...)                          ({})
-#define dbg_msg(fmt, ...)                          ({})
-#define dbg_key(c, key, fmt, ...)                  ({})
-
-#define dbg_gen(fmt, ...)                          ({})
-#define dbg_jnl(fmt, ...)                          ({})
-#define dbg_tnc(fmt, ...)                          ({})
-#define dbg_lp(fmt, ...)                           ({})
-#define dbg_find(fmt, ...)                         ({})
-#define dbg_mnt(fmt, ...)                          ({})
-#define dbg_io(fmt, ...)                           ({})
-#define dbg_cmt(fmt, ...)                          ({})
-#define dbg_budg(fmt, ...)                         ({})
-#define dbg_log(fmt, ...)                          ({})
-#define dbg_gc(fmt, ...)                           ({})
-#define dbg_scan(fmt, ...)                         ({})
-#define dbg_rcvry(fmt, ...)                        ({})
-
-#define dbg_ntype(type)                            ""
-#define dbg_cstate(cmt_state)                      ""
-#define dbg_get_key_dump(c, key)                   ({})
-#define dbg_dump_inode(c, inode)                   ({})
-#define dbg_dump_node(c, node)                     ({})
-#define dbg_dump_budget_req(req)                   ({})
-#define dbg_dump_lstats(lst)                       ({})
-#define dbg_dump_budg(c)                           ({})
-#define dbg_dump_lprop(c, lp)                      ({})
-#define dbg_dump_lprops(c)                         ({})
-#define dbg_dump_leb(c, lnum)                      ({})
-#define dbg_dump_znode(c, znode)                   ({})
-#define dbg_dump_heap(c, heap, cat)                ({})
-#define dbg_dump_pnode(c, pnode, parent, iip)      ({})
-#define dbg_dump_tnc(c)                            ({})
-#define dbg_dump_index(c)                          ({})
+#define ubifs_assert_cmt_locked(c)
 
-#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
+#define dbg_gen(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+
+#define DBGKEY(key)  ((char *)(key))
+#define DBGKEY1(key) ((char *)(key))
+
+#define dbg_ntype(type)                       ""
+#define dbg_cstate(cmt_state)                 ""
+#define dbg_get_key_dump(c, key)              ({})
+#define dbg_dump_inode(c, inode)              ({})
+#define dbg_dump_node(c, node)                ({})
+#define dbg_dump_budget_req(req)              ({})
+#define dbg_dump_lstats(lst)                  ({})
+#define dbg_dump_budg(c)                      ({})
+#define dbg_dump_lprop(c, lp)                 ({})
+#define dbg_dump_lprops(c)                    ({})
+#define dbg_dump_lpt_info(c)                  ({})
+#define dbg_dump_leb(c, lnum)                 ({})
+#define dbg_dump_znode(c, znode)              ({})
+#define dbg_dump_heap(c, heap, cat)           ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c)                       ({})
+#define dbg_dump_index(c)                     ({})
 
+#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
 #define dbg_check_old_index(c, zroot)              0
-
 #define dbg_check_cats(c)                          0
-
 #define dbg_check_ltab(c)                          0
-
+#define dbg_chk_lpt_free_spc(c)                    0
+#define dbg_chk_lpt_sz(c, action, len)             0
 #define dbg_check_synced_i_size(inode)             0
-
 #define dbg_check_dir_size(c, dir)                 0
-
 #define dbg_check_tnc(c, x)                        0
-
 #define dbg_check_idx_size(c, idx_size)            0
-
 #define dbg_check_filesystem(c)                    0
-
 #define dbg_check_heap(c, heap, cat, add_pos)      ({})
-
 #define dbg_check_lprops(c)                        0
 #define dbg_check_lpt_nodes(c, cnode, row, col)    0
-
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
-
 #define dbg_failure_mode                           0
 #define dbg_failure_mode_registration(c)           ({})
 #define dbg_failure_mode_deregistration(c)         ({})
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e90374be7d3b..526c01ec8003 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
 	}
 
 	inode->i_ino = ++c->highest_inum;
-	inode->i_generation = ++c->vfs_gen;
 	/*
 	 * The creation sequence number remains with this inode for its
 	 * lifetime. All nodes for this inode have a greater sequence number,
@@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 
 	err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
 	if (err) {
-		/*
-		 * Do not hash the direntry if parent 'i_nlink' is zero, because
-		 * this has side-effects - '->delete_inode()' call will not be
-		 * called for the parent orphan inode, because 'd_count' of its
-		 * direntry will stay 1 (it'll be negative direntry I guess)
-		 * and prevent 'iput_final()' until the dentry is destroyed due
-		 * to unmount or memory pressure.
-		 */
-		if (err == -ENOENT && dir->i_nlink != 0) {
+		if (err == -ENOENT) {
 			dbg_gen("not found");
 			goto done;
 		}
@@ -435,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 	while (1) {
 		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
-			dent->name, le64_to_cpu(dent->inum),
+			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
 			key_hash_flash(c, &dent->key));
 		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
 
@@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
 	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
 	struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
-					.dirtied_ino_d = ui->data_len };
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
 
 	/*
 	 * Budget request settings: new direntry, changing the target inode,
@@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	if (err) {
 		if (err != -ENOSPC)
 			return err;
-		err = 0;
 		budgeted = 0;
 	}
 
@@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct ubifs_inode *dir_ui = ubifs_inode(dir);
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
-	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-					.dirtied_ino_d = 1 };
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
 
 	/*
 	 * Budget request settings: new inode, new direntry and changing parent
@@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
 	int err, devlen = 0;
 	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-					.new_ino_d = devlen, .dirtied_ino = 1 };
+					.new_ino_d = ALIGN(devlen, 8),
+					.dirtied_ino = 1 };
 
 	/*
 	 * Budget request settings: new inode, new direntry and changing parent
@@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 	int err, len = strlen(symname);
 	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
 	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-					.new_ino_d = len, .dirtied_ino = 1 };
+					.new_ino_d = ALIGN(len, 8),
+					.dirtied_ino = 1 };
 
 	/*
 	 * Budget request settings: new inode, new direntry and changing parent
@@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
 					.dirtied_ino = 3 };
 	struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
-				.dirtied_ino_d = old_inode_ui->data_len };
+			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
 	struct timespec time;
 
 	/*
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 005a3b854d96..51cf511d44d9 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -53,6 +53,7 @@
 
 #include "ubifs.h"
 #include <linux/mount.h>
+#include <linux/namei.h>
 
 static int read_block(struct inode *inode, void *addr, unsigned int block,
 		      struct ubifs_data_node *dn)
@@ -146,6 +147,12 @@ static int do_readpage(struct page *page)
 				err = ret;
 				if (err != -ENOENT)
 					break;
+			} else if (block + 1 == beyond) {
+				int dlen = le32_to_cpu(dn->size);
+				int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
+
+				if (ilen && ilen < dlen)
+					memset(addr + ilen, 0, dlen - ilen);
 			}
 		}
 		if (++i >= UBIFS_BLOCKS_PER_PAGE)
@@ -576,8 +583,262 @@ out:
 	return copied;
 }
 
+/**
+ * populate_page - copy data nodes into a page for bulk-read.
+ * @c: UBIFS file-system description object
+ * @page: page
+ * @bu: bulk-read information
+ * @n: next zbranch slot
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int populate_page(struct ubifs_info *c, struct page *page,
+			 struct bu_info *bu, int *n)
+{
+	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
+	struct inode *inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	unsigned int page_block;
+	void *addr, *zaddr;
+	pgoff_t end_index;
+
+	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+		inode->i_ino, page->index, i_size, page->flags);
+
+	addr = zaddr = kmap(page);
+
+	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	if (!i_size || page->index > end_index) {
+		hole = 1;
+		memset(addr, 0, PAGE_CACHE_SIZE);
+		goto out_hole;
+	}
+
+	page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	while (1) {
+		int err, len, out_len, dlen;
+
+		if (nn >= bu->cnt) {
+			hole = 1;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		} else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
+			struct ubifs_data_node *dn;
+
+			dn = bu->buf + (bu->zbranch[nn].offs - offs);
+
+			ubifs_assert(dn->ch.sqnum >
+				     ubifs_inode(inode)->creat_sqnum);
+
+			len = le32_to_cpu(dn->size);
+			if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+				goto out_err;
+
+			dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+			out_len = UBIFS_BLOCK_SIZE;
+			err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+					       le16_to_cpu(dn->compr_type));
+			if (err || len != out_len)
+				goto out_err;
+
+			if (len < UBIFS_BLOCK_SIZE)
+				memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+			nn += 1;
+			read = (i << UBIFS_BLOCK_SHIFT) + len;
+		} else if (key_block(c, &bu->zbranch[nn].key) < page_block) {
+			nn += 1;
+			continue;
+		} else {
+			hole = 1;
+			memset(addr, 0, UBIFS_BLOCK_SIZE);
+		}
+		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+			break;
+		addr += UBIFS_BLOCK_SIZE;
+		page_block += 1;
+	}
+
+	if (end_index == page->index) {
+		int len = i_size & (PAGE_CACHE_SIZE - 1);
+
+		if (len && len < read)
+			memset(zaddr + len, 0, read - len);
+	}
+
+out_hole:
+	if (hole) {
+		SetPageChecked(page);
+		dbg_gen("hole");
+	}
+
+	SetPageUptodate(page);
+	ClearPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	*n = nn;
+	return 0;
+
+out_err:
+	ClearPageUptodate(page);
+	SetPageError(page);
+	flush_dcache_page(page);
+	kunmap(page);
+	ubifs_err("bad data node (block %u, inode %lu)",
+		  page_block, inode->i_ino);
+	return -EINVAL;
+}
+
+/**
+ * ubifs_do_bulk_read - do bulk-read.
+ * @c: UBIFS file-system description object
+ * @page1: first page
+ *
+ * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ */
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
+{
+	pgoff_t offset = page1->index, end_index;
+	struct address_space *mapping = page1->mapping;
+	struct inode *inode = mapping->host;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct bu_info *bu;
+	int err, page_idx, page_cnt, ret = 0, n = 0;
+	loff_t isize;
+
+	bu = kmalloc(sizeof(struct bu_info), GFP_NOFS);
+	if (!bu)
+		return 0;
+
+	bu->buf_len = c->bulk_read_buf_size;
+	bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
+	if (!bu->buf)
+		goto out_free;
+
+	data_key_init(c, &bu->key, inode->i_ino,
+		      offset << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+
+	err = ubifs_tnc_get_bu_keys(c, bu);
+	if (err)
+		goto out_warn;
+
+	if (bu->eof) {
+		/* Turn off bulk-read at the end of the file */
+		ui->read_in_a_row = 1;
+		ui->bulk_read = 0;
+	}
+
+	page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	if (!page_cnt) {
+		/*
+		 * This happens when there are multiple blocks per page and the
+		 * blocks for the first page we are looking for, are not
+		 * together. If all the pages were like this, bulk-read would
+		 * reduce performance, so we turn it off for a while.
+		 */
+		ui->read_in_a_row = 0;
+		ui->bulk_read = 0;
+		goto out_free;
+	}
+
+	if (bu->cnt) {
+		err = ubifs_tnc_bulk_read(c, bu);
+		if (err)
+			goto out_warn;
+	}
+
+	err = populate_page(c, page1, bu, &n);
+	if (err)
+		goto out_warn;
+
+	unlock_page(page1);
+	ret = 1;
+
+	isize = i_size_read(inode);
+	if (isize == 0)
+		goto out_free;
+	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+	for (page_idx = 1; page_idx < page_cnt; page_idx++) {
+		pgoff_t page_offset = offset + page_idx;
+		struct page *page;
+
+		if (page_offset > end_index)
+			break;
+		page = find_or_create_page(mapping, page_offset,
+					   GFP_NOFS | __GFP_COLD);
+		if (!page)
+			break;
+		if (!PageUptodate(page))
+			err = populate_page(c, page, bu, &n);
+		unlock_page(page);
+		page_cache_release(page);
+		if (err)
+			break;
+	}
+
+	ui->last_page_read = offset + page_idx - 1;
+
+out_free:
+	kfree(bu->buf);
+	kfree(bu);
+	return ret;
+
+out_warn:
+	ubifs_warn("ignoring error %d and skipping bulk-read", err);
+	goto out_free;
+}
+
+/**
+ * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
+ * @page: page from which to start bulk-read.
+ *
+ * Some flash media are capable of reading sequentially at faster rates. UBIFS
+ * bulk-read facility is designed to take advantage of that, by reading in one
+ * go consecutive data nodes that are also located consecutively in the same
+ * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ */
+static int ubifs_bulk_read(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	pgoff_t index = page->index, last_page_read = ui->last_page_read;
+	int ret = 0;
+
+	ui->last_page_read = index;
+
+	if (!c->bulk_read)
+		return 0;
+	/*
+	 * Bulk-read is protected by ui_mutex, but it is an optimization, so
+	 * don't bother if we cannot lock the mutex.
+	 */
+	if (!mutex_trylock(&ui->ui_mutex))
+		return 0;
+	if (index != last_page_read + 1) {
+		/* Turn off bulk-read if we stop reading sequentially */
+		ui->read_in_a_row = 1;
+		if (ui->bulk_read)
+			ui->bulk_read = 0;
+		goto out_unlock;
+	}
+	if (!ui->bulk_read) {
+		ui->read_in_a_row += 1;
+		if (ui->read_in_a_row < 3)
+			goto out_unlock;
+		/* Three reads in a row, so switch on bulk-read */
+		ui->bulk_read = 1;
+	}
+	ret = ubifs_do_bulk_read(c, page);
+out_unlock:
+	mutex_unlock(&ui->ui_mutex);
+	return ret;
+}
+
 static int ubifs_readpage(struct file *file, struct page *page)
 {
+	if (ubifs_bulk_read(page))
+		return 0;
 	do_readpage(page);
 	unlock_page(page);
 	return 0;
@@ -792,7 +1053,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	int err;
 	struct ubifs_budget_req req;
 	loff_t old_size = inode->i_size, new_size = attr->ia_size;
-	int offset = new_size & (UBIFS_BLOCK_SIZE - 1);
+	int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
@@ -810,8 +1071,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	/* A funny way to budget for truncation node */
 	req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
 	err = ubifs_budget_space(c, &req);
-	if (err)
-		return err;
+	if (err) {
+		/*
+		 * Treat truncations to zero as deletion and always allow them,
+		 * just like we do for '->unlink()'.
+		 */
+		if (new_size || err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
 
 	err = vmtruncate(inode, new_size);
 	if (err)
@@ -868,7 +1136,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
 	mutex_unlock(&ui->ui_mutex);
 out_budg:
-	ubifs_release_budget(c, &req);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		c->nospace = c->nospace_rp = 0;
+		smp_wmb();
+	}
 	return err;
 }
 
@@ -889,7 +1162,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
 	loff_t new_size = attr->ia_size;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	struct ubifs_budget_req req = { .dirtied_ino = 1,
-					.dirtied_ino_d = ui->data_len };
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
 
 	err = ubifs_budget_space(c, &req);
 	if (err)
@@ -940,7 +1213,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
-	dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid);
+	dbg_gen("ino %lu, mode %#x, ia_valid %#x",
+		inode->i_ino, inode->i_mode, attr->ia_valid);
 	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;
@@ -1050,7 +1324,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
 	if (mctime_update_needed(inode, &now)) {
 		int err, release;
 		struct ubifs_budget_req req = { .dirtied_ino = 1,
-						.dirtied_ino_d = ui->data_len };
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
 
 		err = ubifs_budget_space(c, &req);
 		if (err)
@@ -1269,6 +1543,7 @@ struct file_operations ubifs_file_operations = {
 	.fsync          = ubifs_fsync,
 	.unlocked_ioctl = ubifs_ioctl,
 	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ubifs_compat_ioctl,
 #endif
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 10394c548367..717d79c97c5e 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
  * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
  * or do not have an LEB which satisfies the @min_space criteria.
  *
- * Note:
- *   o LEBs which have less than dead watermark of dirty space are never picked
- *   by this function;
- *
- * Returns zero and the LEB properties of
- * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a
- * negative error code in case of other failures. The returned LEB is marked as
- * "taken".
+ * Note, LEBs which have less than dead watermark of free + dirty space are
+ * never picked by this function.
  *
  * The additional @pick_free argument controls if this function has to return a
  * free or freeable LEB if one is present. For example, GC must to set it to %1,
@@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
  *
  * In addition @pick_free is set to %2 by the recovery process in order to
  * recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
  */
 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 			 int min_space, int pick_free)
@@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		int lebs, rsvd_idx_lebs = 0;
 
 		spin_lock(&c->space_lock);
-		lebs = c->lst.empty_lebs;
+		lebs = c->lst.empty_lebs + c->idx_gc_cnt;
 		lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
 
 		/*
@@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		idx_lp = idx_heap->arr[0];
 		sum = idx_lp->free + idx_lp->dirty;
 		/*
-		 * Since we reserve twice as more space for the index than it
+		 * Since we reserve thrice as much space for the index than it
 		 * actually takes, it does not make sense to pick indexing LEBs
-		 * with less than half LEB of dirty space.
+		 * with less than, say, half LEB of dirty space. May be half is
+		 * not the optimal boundary - this should be tested and
+		 * checked. This boundary should determine how much we use
+		 * in-the-gaps to consolidate the index comparing to how much
+		 * we use garbage collector to consolidate it. The "half"
+		 * criteria just feels to be fine.
 		 */
 		if (sum < min_space || sum < c->half_leb_size)
 			idx_lp = NULL;
@@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
 		lp = idx_lp;
 
 	if (lp) {
-		ubifs_assert(lp->dirty >= c->dead_wm);
+		ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
 		goto found;
 	}
 
@@ -504,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
-	ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);
 	if (rsvd_idx_lebs < lebs)
 		/*
 		 * OK to allocate an empty LEB, but we still don't want to go
@@ -899,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c)
 	 * it is needed now for this commit.
 	 */
 	lp = ubifs_lpt_lookup_dirty(c, lnum);
-	if (unlikely(IS_ERR(lp)))
+	if (IS_ERR(lp))
 		return PTR_ERR(lp);
 	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
 			     lp->flags | LPROPS_INDEX, -1);
-	if (unlikely(IS_ERR(lp)))
+	if (IS_ERR(lp))
 		return PTR_ERR(lp);
 	dbg_find("LEB %d, dirty %d and free %d flags %#x",
 		 lp->lnum, lp->dirty, lp->free, lp->flags);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index d0f3dac29081..0bef6501d58a 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -96,6 +96,48 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 
 /**
+ * joinup - bring data nodes for an inode together.
+ * @c: UBIFS file-system description object
+ * @sleb: describes scanned LEB
+ * @inum: inode number
+ * @blk: block number
+ * @data: list to which to add data nodes
+ *
+ * This function looks at the first few nodes in the scanned LEB @sleb and adds
+ * them to @data if they are data nodes from @inum and have a larger block
+ * number than @blk. This function returns %0 on success and a negative error
+ * code on failure.
+ */
+static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
+		  unsigned int blk, struct list_head *data)
+{
+	int err, cnt = 6, lnum = sleb->lnum, offs;
+	struct ubifs_scan_node *snod, *tmp;
+	union ubifs_key *key;
+
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		key = &snod->key;
+		if (key_inum(c, key) == inum &&
+		    key_type(c, key) == UBIFS_DATA_KEY &&
+		    key_block(c, key) > blk) {
+			offs = snod->offs;
+			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
+			if (err < 0)
+				return err;
+			list_del(&snod->list);
+			if (err) {
+				list_add_tail(&snod->list, data);
+				blk = key_block(c, key);
+			} else
+				kfree(snod);
+			cnt = 6;
+		} else if (--cnt == 0)
+			break;
+	}
+	return 0;
+}
+
+/**
  * move_nodes - move nodes.
  * @c: UBIFS file-system description object
  * @sleb: describes nodes to move
@@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c)
 static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head large, medium, small;
+	struct list_head data, large, medium, small;
 	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 	int avail, err, min = INT_MAX;
+	unsigned int blk = 0;
+	ino_t inum = 0;
 
+	INIT_LIST_HEAD(&data);
 	INIT_LIST_HEAD(&large);
 	INIT_LIST_HEAD(&medium);
 	INIT_LIST_HEAD(&small);
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		struct list_head *lst;
+	while (!list_empty(&sleb->nodes)) {
+		struct list_head *lst = sleb->nodes.next;
+
+		snod = list_entry(lst, struct ubifs_scan_node, list);
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		if (err < 0)
 			goto out;
 
-		lst = &snod->list;
 		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
@@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		}
 
 		/*
-		 * Sort the list of nodes so that large nodes go first, and
-		 * small nodes go last.
+		 * Sort the list of nodes so that data nodes go first, large
+		 * nodes go second, and small nodes go last.
 		 */
-		if (snod->len > MEDIUM_NODE_WM)
-			list_add(lst, &large);
+		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
+			if (inum != key_inum(c, &snod->key)) {
+				if (inum) {
+					/*
+					 * Try to move data nodes from the same
+					 * inode together.
+					 */
+					err = joinup(c, sleb, inum, blk, &data);
+					if (err)
+						goto out;
+				}
+				inum = key_inum(c, &snod->key);
+				blk = key_block(c, &snod->key);
+			}
+			list_add_tail(lst, &data);
+		} else if (snod->len > MEDIUM_NODE_WM)
+			list_add_tail(lst, &large);
 		else if (snod->len > SMALL_NODE_WM)
-			list_add(lst, &medium);
+			list_add_tail(lst, &medium);
 		else
-			list_add(lst, &small);
+			list_add_tail(lst, &small);
 
 		/* And find the smallest node */
 		if (snod->len < min)
@@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 	 * Join the tree lists so that we'd have one roughly sorted list
 	 * ('large' will be the head of the joined list).
 	 */
+	list_splice(&data, &large);
 	list_splice(&medium, large.prev);
 	list_splice(&small, large.prev);
 
@@ -334,15 +396,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 
 		err = move_nodes(c, sleb);
 		if (err)
-			goto out;
+			goto out_inc_seq;
 
 		err = gc_sync_wbufs(c);
 		if (err)
-			goto out;
+			goto out_inc_seq;
 
 		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
 		if (err)
-			goto out;
+			goto out_inc_seq;
+
+		/* Allow for races with TNC */
+		c->gced_lnum = lnum;
+		smp_wmb();
+		c->gc_seq += 1;
+		smp_wmb();
 
 		if (c->gc_lnum == -1) {
 			c->gc_lnum = lnum;
@@ -363,6 +431,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 out:
 	ubifs_scan_destroy(sleb);
 	return err;
+
+out_inc_seq:
+	/* We may have moved at least some nodes so allow for races with TNC */
+	c->gced_lnum = lnum;
+	smp_wmb();
+	c->gc_seq += 1;
+	smp_wmb();
+	goto out;
 }
 
 /**
@@ -639,7 +715,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 	 */
 	while (1) {
 		lp = ubifs_fast_find_freeable(c);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -651,7 +727,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 		if (err)
 			goto out;
 		lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -666,7 +742,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 	/* Record index freeable LEBs for unmapping after commit */
 	while (1) {
 		lp = ubifs_fast_find_frdi_idx(c);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			goto out;
 		}
@@ -682,7 +758,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
 		/* Don't release the LEB until after the next commit */
 		flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
 		lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
-		if (unlikely(IS_ERR(lp))) {
+		if (IS_ERR(lp)) {
 			err = PTR_ERR(lp);
 			kfree(idx_gc);
 			goto out;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 3374f91b6709..01682713af69 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -54,12 +54,28 @@
 #include "ubifs.h"
 
 /**
+ * ubifs_ro_mode - switch UBIFS to read read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+	if (!c->ro_media) {
+		c->ro_media = 1;
+		c->no_chk_data_crc = 0;
+		ubifs_warn("switched to read-only mode, error %d", err);
+		dbg_dump_stack();
+	}
+}
+
+/**
  * ubifs_check_node - check node.
  * @c: UBIFS file-system description object
  * @buf: node to check
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
  * @quiet: print no messages
+ * @chk_crc: indicates whether to always check the CRC
  *
  * This function checks node magic number and CRC checksum. This function also
  * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -71,7 +87,7 @@
  * or magic.
  */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet)
+		     int offs, int quiet, int chk_crc)
 {
 	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
@@ -107,6 +123,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
+	if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
+		if (c->no_chk_data_crc)
+			return 0;
+
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
 	if (crc != node_crc) {
@@ -708,7 +728,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 		goto out;
 	}
 
-	err = ubifs_check_node(c, buf, lnum, offs, 0);
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
 	if (err) {
 		ubifs_err("expected node type %d", type);
 		return err;
@@ -767,7 +787,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
 		goto out;
 	}
 
-	err = ubifs_check_node(c, buf, lnum, offs, 0);
+	err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
 	if (err) {
 		ubifs_err("expected node type %d", type);
 		return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 283155abe5f5..22993f867d19 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -447,13 +447,11 @@ static int get_dent_type(int mode)
  * @ino: buffer in which to pack inode node
  * @inode: inode to pack
  * @last: indicates the last node of the group
- * @last_reference: non-zero if this is a deletion inode
  */
 static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
-		       const struct inode *inode, int last,
-		       int last_reference)
+		       const struct inode *inode, int last)
 {
-	int data_len = 0;
+	int data_len = 0, last_reference = !inode->i_nlink;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	ino->ch.node_type = UBIFS_INO_NODE;
@@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 	ubifs_prep_grp_node(c, dent, dlen, 0);
 
 	ino = (void *)dent + aligned_dlen;
-	pack_inode(c, ino, inode, 0, last_reference);
+	pack_inode(c, ino, inode, 0);
 	ino = (void *)ino + aligned_ilen;
-	pack_inode(c, ino, dir, 1, 0);
+	pack_inode(c, ino, dir, 1);
 
 	if (last_reference) {
 		err = ubifs_add_orphan(c, inode->i_ino);
@@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 			release_head(c, BASEHD);
 			goto out_finish;
 		}
+		ui->del_cmtno = c->cmt_no;
 	}
 
 	err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -750,30 +749,25 @@ out_free:
  * ubifs_jnl_write_inode - flush inode to the journal.
  * @c: UBIFS file-system description object
  * @inode: inode to flush
- * @deletion: inode has been deleted
  *
  * This function writes inode @inode to the journal. If the inode is
  * synchronous, it also synchronizes the write-buffer. Returns zero in case of
  * success and a negative error code in case of failure.
  */
-int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
-			  int deletion)
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
 {
-	int err, len, lnum, offs, sync = 0;
+	int err, lnum, offs;
 	struct ubifs_ino_node *ino;
 	struct ubifs_inode *ui = ubifs_inode(inode);
+	int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
 
-	dbg_jnl("ino %lu%s", inode->i_ino,
-		deletion ? " (last reference)" : "");
-	if (deletion)
-		ubifs_assert(inode->i_nlink == 0);
+	dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
 
-	len = UBIFS_INO_NODE_SZ;
 	/*
 	 * If the inode is being deleted, do not write the attached data. No
 	 * need to synchronize the write-buffer either.
 	 */
-	if (!deletion) {
+	if (!last_reference) {
 		len += ui->data_len;
 		sync = IS_SYNC(inode);
 	}
@@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
 	if (err)
 		goto out_free;
 
-	pack_inode(c, ino, inode, 1, deletion);
+	pack_inode(c, ino, inode, 1);
 	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
 	if (err)
 		goto out_release;
@@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
 					  inode->i_ino);
 	release_head(c, BASEHD);
 
-	if (deletion) {
+	if (last_reference) {
 		err = ubifs_tnc_remove_ino(c, inode->i_ino);
 		if (err)
 			goto out_ro;
@@ -828,6 +822,65 @@ out_free:
 }
 
 /**
+ * ubifs_jnl_delete_inode - delete an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to delete
+ *
+ * This function deletes inode @inode which includes removing it from orphans,
+ * deleting it from TNC and, in some cases, writing a deletion inode to the
+ * journal.
+ *
+ * When regular file inodes are unlinked or a directory inode is removed, the
+ * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
+ * direntry to the media, and adds the inode to orphans. After this, when the
+ * last reference to this inode has been dropped, this function is called. In
+ * general, it has to write one more deletion inode to the media, because if
+ * a commit happened between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
+ * anymore, and in fact it might not be on the flash anymore, because it might
+ * have been garbage-collected already. And for optimization reasons UBIFS does
+ * not read the orphan area if it has been unmounted cleanly, so it would have
+ * no indication in the journal that there is a deleted inode which has to be
+ * removed from TNC.
+ *
+ * However, if there was no commit between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
+ * inode to the media for the second time. And this is quite a typical case.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
+{
+	int err;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+
+	ubifs_assert(inode->i_nlink == 0);
+
+	if (ui->del_cmtno != c->cmt_no)
+		/* A commit happened for sure */
+		return ubifs_jnl_write_inode(c, inode);
+
+	down_read(&c->commit_sem);
+	/*
+	 * Check commit number again, because the first test has been done
+	 * without @c->commit_sem, so a commit might have happened.
+	 */
+	if (ui->del_cmtno != c->cmt_no) {
+		up_read(&c->commit_sem);
+		return ubifs_jnl_write_inode(c, inode);
+	}
+
+	err = ubifs_tnc_remove_ino(c, inode->i_ino);
+	if (err)
+		ubifs_ro_mode(c, err);
+	else
+		ubifs_delete_orphan(c, inode->i_ino);
+	up_read(&c->commit_sem);
+	return err;
+}
+
+/**
  * ubifs_jnl_rename - rename a directory entry.
  * @c: UBIFS file-system description object
  * @old_dir: parent inode of directory entry to rename
@@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 
 	p = (void *)dent2 + aligned_dlen2;
 	if (new_inode) {
-		pack_inode(c, p, new_inode, 0, last_reference);
+		pack_inode(c, p, new_inode, 0);
 		p += ALIGN(ilen, 8);
 	}
 
 	if (!move)
-		pack_inode(c, p, old_dir, 1, 0);
+		pack_inode(c, p, old_dir, 1);
 	else {
-		pack_inode(c, p, old_dir, 0, 0);
+		pack_inode(c, p, old_dir, 0);
 		p += ALIGN(plen, 8);
-		pack_inode(c, p, new_dir, 1, 0);
+		pack_inode(c, p, new_dir, 1);
 	}
 
 	if (last_reference) {
@@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 			release_head(c, BASEHD);
 			goto out_finish;
 		}
+		new_ui->del_cmtno = c->cmt_no;
 	}
 
 	err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	if (err)
 		goto out_free;
 
-	pack_inode(c, ino, inode, 0, 0);
+	pack_inode(c, ino, inode, 0);
 	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
 	if (dlen)
 		ubifs_prep_grp_node(c, dn, dlen, 1);
@@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
 	ubifs_prep_grp_node(c, xent, xlen, 0);
 
 	ino = (void *)xent + aligned_xlen;
-	pack_inode(c, ino, inode, 0, 1);
+	pack_inode(c, ino, inode, 0);
 	ino = (void *)ino + UBIFS_INO_NODE_SZ;
-	pack_inode(c, ino, host, 1, 0);
+	pack_inode(c, ino, host, 1);
 
 	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
 	if (!sync && !err)
@@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
 			   const struct inode *host)
 {
 	int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
-	struct ubifs_inode *host_ui = ubifs_inode(inode);
+	struct ubifs_inode *host_ui = ubifs_inode(host);
 	struct ubifs_ino_node *ino;
 	union ubifs_key key;
 	int sync = IS_DIRSYNC(host);
@@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
 	if (err)
 		goto out_free;
 
-	pack_inode(c, ino, host, 0, 0);
-	pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0);
+	pack_inode(c, ino, host, 0);
+	pack_inode(c, (void *)ino + aligned_len1, inode, 1);
 
 	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
 	if (!sync && !err) {
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 8f7476007549..9ee65086f627 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c,
  * @key2: the second key to compare
  *
  * This function compares 2 keys and returns %-1 if @key1 is less than
- * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
+ * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2.
  */
 static inline int keys_cmp(const struct ubifs_info *c,
 			   const union ubifs_key *key1,
@@ -503,6 +503,26 @@ static inline int keys_cmp(const struct ubifs_info *c,
 }
 
 /**
+ * keys_eq - determine if keys are equivalent.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and
+ * %0 if not.
+ */
+static inline int keys_eq(const struct ubifs_info *c,
+			  const union ubifs_key *key1,
+			  const union ubifs_key *key2)
+{
+	if (key1->u32[0] != key2->u32[0])
+		return 0;
+	if (key1->u32[1] != key2->u32[1])
+		return 0;
+	return 1;
+}
+
+/**
  * is_hash_key - is a key vulnerable to hash collisions.
  * @c: UBIFS file-system description object
  * @key: key
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36857b9ed59e..3e0aa7367556 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	return 0;
 
 out_unlock:
+	if (err != -EAGAIN)
+		ubifs_ro_mode(c, err);
 	mutex_unlock(&c->log_mutex);
 	kfree(ref);
 	kfree(bud);
@@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
 		return -ENOMEM;
 
 	cs->ch.node_type = UBIFS_CS_NODE;
-	cs->cmt_no = cpu_to_le64(c->cmt_no + 1);
+	cs->cmt_no = cpu_to_le64(c->cmt_no);
 	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
 
 	/*
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 2ba93da71b65..f27176e9b70d 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
 			}
 		}
 	}
+
 	/* Not greater than parent, so compare to children */
 	while (1) {
 		/* Compare to left child */
@@ -460,18 +461,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
 }
 
 /**
- * ubifs_get_lprops - get reference to LEB properties.
- * @c: the UBIFS file-system description object
- *
- * This function locks lprops. Lprops have to be unlocked by
- * 'ubifs_release_lprops()'.
- */
-void ubifs_get_lprops(struct ubifs_info *c)
-{
-	mutex_lock(&c->lp_mutex);
-}
-
-/**
  * calc_dark - calculate LEB dark space size.
  * @c: the UBIFS file-system description object
  * @spc: amount of free and dirty space in the LEB
@@ -576,7 +565,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 	ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
 
 	spin_lock(&c->space_lock);
-
 	if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
 		c->lst.taken_empty_lebs -= 1;
 
@@ -637,31 +625,12 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 		c->lst.taken_empty_lebs += 1;
 
 	change_category(c, lprops);
-
 	c->idx_gc_cnt += idx_gc_cnt;
-
 	spin_unlock(&c->space_lock);
-
 	return lprops;
 }
 
 /**
- * ubifs_release_lprops - release lprops lock.
- * @c: the UBIFS file-system description object
- *
- * This function has to be called after each 'ubifs_get_lprops()' call to
- * unlock lprops.
- */
-void ubifs_release_lprops(struct ubifs_info *c)
-{
-	ubifs_assert(mutex_is_locked(&c->lp_mutex));
-	ubifs_assert(c->lst.empty_lebs >= 0 &&
-		     c->lst.empty_lebs <= c->main_lebs);
-
-	mutex_unlock(&c->lp_mutex);
-}
-
-/**
  * ubifs_get_lp_stats - get lprops statistics.
  * @c: UBIFS file-system description object
  * @st: return statistics
@@ -1262,7 +1231,6 @@ static int scan_check_cb(struct ubifs_info *c,
 	}
 
 	ubifs_scan_destroy(sleb);
-
 	return LPT_SCAN_CONTINUE;
 
 out_print:
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 9ff2463177e5..db8bd0e518b2 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
 	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
 	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
 	c->lpt_sz += c->ltab_sz;
-	c->lpt_sz += c->lsave_sz;
+	if (c->big_lpt)
+		c->lpt_sz += c->lsave_sz;
 
 	/* Add wastage */
 	sz = c->lpt_sz;
@@ -287,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
 	const int k = 32 - nrbits;
 	uint8_t *p = *addr;
 	int b = *pos;
-	uint32_t val;
+	uint32_t uninitialized_var(val);
+	const int bytes = (nrbits + b + 7) >> 3;
 
 	ubifs_assert(nrbits > 0);
 	ubifs_assert(nrbits <= 32);
 	ubifs_assert(*pos >= 0);
 	ubifs_assert(*pos < 8);
 	if (b) {
-		val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) |
-		      ((uint32_t)p[4] << 24);
+		switch (bytes) {
+		case 2:
+			val = p[1];
+			break;
+		case 3:
+			val = p[1] | ((uint32_t)p[2] << 8);
+			break;
+		case 4:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16);
+			break;
+		case 5:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16) |
+				     ((uint32_t)p[4] << 24);
+		}
 		val <<= (8 - b);
 		val |= *p >> b;
 		nrbits += b;
-	} else
-		val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) |
-		      ((uint32_t)p[3] << 24);
+	} else {
+		switch (bytes) {
+		case 1:
+			val = p[0];
+			break;
+		case 2:
+			val = p[0] | ((uint32_t)p[1] << 8);
+			break;
+		case 3:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16);
+			break;
+		case 4:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16) |
+				     ((uint32_t)p[3] << 24);
+			break;
+		}
+	}
 	val <<= k;
 	val >>= k;
 	b = nrbits & 7;
-	p += nrbits / 8;
+	p += nrbits >> 3;
 	*addr = p;
 	*pos = b;
 	ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5f0b83e20af6..eed5a0025d63 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
 			return 0;
 		}
 	}
-	dbg_err("last LEB %d", *lnum);
-	dump_stack();
 	return -ENOSPC;
 }
 
@@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c)
 	int lnum, offs, len, alen, done_lsave, done_ltab, err;
 	struct ubifs_cnode *cnode;
 
+	err = dbg_chk_lpt_sz(c, 0, 0);
+	if (err)
+		return err;
 	cnode = c->lpt_cnext;
 	if (!cnode)
 		return 0;
@@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->lsave_lnum = lnum;
 		c->lsave_offs = offs;
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	if (offs + c->ltab_sz <= c->leb_size) {
@@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->ltab_lnum = lnum;
 		c->ltab_offs = offs;
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	do {
@@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		while (offs + len > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c)
 				c->lsave_lnum = lnum;
 				c->lsave_offs = offs;
 				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 				continue;
 			}
 			if (!done_ltab) {
@@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info *c)
 				c->ltab_lnum = lnum;
 				c->ltab_offs = offs;
 				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 				continue;
 			}
 			break;
@@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c)
 			c->lpt_offs = offs;
 		}
 		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
 		cnode = cnode->cnext;
 	} while (cnode && cnode != c->lpt_cnext);
 
@@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->lsave_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->lsave_lnum = lnum;
 		c->lsave_offs = offs;
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	/* Make sure to place LPT's own lprops table */
@@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->ltab_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, alen - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c)
 		c->ltab_lnum = lnum;
 		c->ltab_offs = offs;
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	alen = ALIGN(offs, c->min_io_size);
 	upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+	dbg_chk_lpt_sz(c, 4, alen - offs);
+	err = dbg_chk_lpt_sz(c, 3, alen);
+	if (err)
+		return err;
 	return 0;
+
+no_space:
+	ubifs_err("LPT out of space");
+	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
+		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	return err;
 }
 
 /**
@@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
 			*lnum = i + c->lpt_first;
 			return 0;
 		}
-	dbg_err("last LEB %d", *lnum);
-	dump_stack();
 	return -ENOSPC;
 }
 
@@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c)
 		done_lsave = 1;
 		ubifs_pack_lsave(c, buf + offs, c->lsave);
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	if (offs + c->ltab_sz <= c->leb_size) {
 		done_ltab = 1;
 		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	/* Loop for each cnode */
@@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c)
 						       alen, UBI_SHORTTERM);
 				if (err)
 					return err;
+				dbg_chk_lpt_sz(c, 4, alen - wlen);
 			}
+			dbg_chk_lpt_sz(c, 2, 0);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
@@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c)
 				done_lsave = 1;
 				ubifs_pack_lsave(c, buf + offs, c->lsave);
 				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 				continue;
 			}
 			if (!done_ltab) {
 				done_ltab = 1;
 				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 				continue;
 			}
 			break;
@@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c)
 		clear_bit(COW_ZNODE, &cnode->flags);
 		smp_mb__after_clear_bit();
 		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
 		cnode = cnode->cnext;
 	} while (cnode && cnode != c->lpt_cnext);
 
@@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
+			dbg_chk_lpt_sz(c, 2, alen - wlen);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c)
 		done_lsave = 1;
 		ubifs_pack_lsave(c, buf + offs, c->lsave);
 		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
 	}
 
 	/* Make sure to place LPT's own lprops table */
@@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
+			dbg_chk_lpt_sz(c, 2, alen - wlen);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
-				return err;
+				goto no_space;
 			offs = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
@@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c)
 		done_ltab = 1;
 		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
 		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
 	}
 
 	/* Write remaining data in buffer */
@@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c)
 	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
 	if (err)
 		return err;
+
+	dbg_chk_lpt_sz(c, 4, alen - wlen);
+	err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size));
+	if (err)
+		return err;
+
 	c->nhead_lnum = lnum;
 	c->nhead_offs = ALIGN(offs, c->min_io_size);
 
@@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c)
 	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
 	if (c->big_lpt)
 		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
 	return 0;
+
+no_space:
+	ubifs_err("LPT out of space mismatch");
+	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
+	        "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	return err;
 }
 
 /**
@@ -1044,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
 	int pos = 0, node_type, node_len;
 	uint16_t crc, calc_crc;
 
+	if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8)
+		return 0;
 	node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
 	if (node_type == UBIFS_LPT_NOT_A_NODE)
 		return 0;
@@ -1156,6 +1203,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c)
 	dbg_lp("");
 
 	mutex_lock(&c->lp_mutex);
+	err = dbg_chk_lpt_free_spc(c);
+	if (err)
+		goto out;
 	err = dbg_check_ltab(c);
 	if (err)
 		goto out;
@@ -1645,4 +1695,121 @@ int dbg_check_ltab(struct ubifs_info *c)
 	return 0;
 }
 
+/**
+ * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_free_spc(struct ubifs_info *c)
+{
+	long long free = 0;
+	int i;
+
+	for (i = 0; i < c->lpt_lebs; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (i + c->lpt_first == c->nhead_lnum)
+			free += c->leb_size - c->nhead_offs;
+		else if (c->ltab[i].free == c->leb_size)
+			free += c->leb_size;
+	}
+	if (free < c->lpt_sz) {
+		dbg_err("LPT space error: free %lld lpt_sz %lld",
+			free, c->lpt_sz);
+		dbg_dump_lpt_info(c);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
+ * @c: the UBIFS file-system description object
+ * @action: action
+ * @len: length written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
+{
+	long long chk_lpt_sz, lpt_sz;
+	int err = 0;
+
+	switch (action) {
+	case 0:
+		c->chk_lpt_sz = 0;
+		c->chk_lpt_sz2 = 0;
+		c->chk_lpt_lebs = 0;
+		c->chk_lpt_wastage = 0;
+		if (c->dirty_pn_cnt > c->pnode_cnt) {
+			dbg_err("dirty pnodes %d exceed max %d",
+				c->dirty_pn_cnt, c->pnode_cnt);
+			err = -EINVAL;
+		}
+		if (c->dirty_nn_cnt > c->nnode_cnt) {
+			dbg_err("dirty nnodes %d exceed max %d",
+				c->dirty_nn_cnt, c->nnode_cnt);
+			err = -EINVAL;
+		}
+		return err;
+	case 1:
+		c->chk_lpt_sz += len;
+		return 0;
+	case 2:
+		c->chk_lpt_sz += len;
+		c->chk_lpt_wastage += len;
+		c->chk_lpt_lebs += 1;
+		return 0;
+	case 3:
+		chk_lpt_sz = c->leb_size;
+		chk_lpt_sz *= c->chk_lpt_lebs;
+		chk_lpt_sz += len - c->nhead_offs;
+		if (c->chk_lpt_sz != chk_lpt_sz) {
+			dbg_err("LPT wrote %lld but space used was %lld",
+				c->chk_lpt_sz, chk_lpt_sz);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz > c->lpt_sz) {
+			dbg_err("LPT wrote %lld but lpt_sz is %lld",
+				c->chk_lpt_sz, c->lpt_sz);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+			dbg_err("LPT layout size %lld but wrote %lld",
+				c->chk_lpt_sz, c->chk_lpt_sz2);
+			err = -EINVAL;
+		}
+		if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+			dbg_err("LPT new nhead offs: expected %d was %d",
+				c->new_nhead_offs, len);
+			err = -EINVAL;
+		}
+		lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+		lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+		lpt_sz += c->ltab_sz;
+		if (c->big_lpt)
+			lpt_sz += c->lsave_sz;
+		if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+			dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+				c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+			err = -EINVAL;
+		}
+		if (err)
+			dbg_dump_lpt_info(c);
+		c->chk_lpt_sz2 = c->chk_lpt_sz;
+		c->chk_lpt_sz = 0;
+		c->chk_lpt_wastage = 0;
+		c->chk_lpt_lebs = 0;
+		c->new_nhead_offs = len;
+		return err;
+	case 4:
+		c->chk_lpt_sz += len;
+		c->chk_lpt_wastage += len;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4beccfc256d2..4fa81d867e41 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
 }
 
 /**
- * ubifs_ro_mode - switch UBIFS to read read-only mode.
- * @c: UBIFS file-system description object
- * @err: error code which is the reason of switching to R/O mode
- */
-static inline void ubifs_ro_mode(struct ubifs_info *c, int err)
-{
-	if (!c->ro_media) {
-		c->ro_media = 1;
-		ubifs_warn("switched to read-only mode, error %d", err);
-		dbg_dump_stack();
-	}
-}
-
-/**
  * ubifs_compr_present - check if compressor was compiled in.
  * @compr_type: compressor type to check
  *
@@ -298,45 +284,57 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,
 }
 
 /**
- * ubifs_reported_space - calculate reported free space.
- * @c: the UBIFS file-system description object
- * @free: amount of free space
- *
- * This function calculates amount of free space which will be reported to
- * user-space. User-space application tend to expect that if the file-system
- * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
- * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
- *
- * This function assumes free space is made up of uncompressed data nodes and
- * full index nodes (one per data node, doubled because we always allow enough
- * space to write the index twice).
- *
- * Note, the calculation is pessimistic, which means that most of the time
- * UBIFS reports less space than it actually has.
+ * ubifs_current_time - round current time to time granularity.
+ * @inode: inode
  */
-static inline long long ubifs_reported_space(const struct ubifs_info *c,
-					     uint64_t free)
+static inline struct timespec ubifs_current_time(struct inode *inode)
 {
-	int divisor, factor;
+	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
 
-	divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1);
-	factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ;
-	do_div(free, divisor);
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+				   const union ubifs_key *key, void *node)
+{
+	return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
 
-	return free * factor;
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+static inline void ubifs_get_lprops(struct ubifs_info *c)
+{
+	mutex_lock(&c->lp_mutex);
 }
 
 /**
- * ubifs_current_time - round current time to time granularity.
- * @inode: inode
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops.
  */
-static inline struct timespec ubifs_current_time(struct inode *inode)
+static inline void ubifs_release_lprops(struct ubifs_info *c)
 {
-	return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
-		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+	ubifs_assert(mutex_is_locked(&c->lp_mutex));
+	ubifs_assert(c->lst.empty_lebs >= 0 &&
+		     c->lst.empty_lebs <= c->main_lebs);
+	mutex_unlock(&c->lp_mutex);
 }
 
 #endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 3afeb9242c6a..02d3462f4d3e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 	c->cmt_orphans -= cnt;
 	spin_unlock(&c->orphan_lock);
 	if (c->cmt_orphans)
-		orph->cmt_no = cpu_to_le64(c->cmt_no + 1);
+		orph->cmt_no = cpu_to_le64(c->cmt_no);
 	else
 		/* Mark the last node of the commit */
-		orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63));
+		orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
 	ubifs_assert(c->ohead_offs + len <= c->leb_size);
 	ubifs_assert(c->ohead_lnum >= c->orph_first);
 	ubifs_assert(c->ohead_lnum <= c->orph_last);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index acf5c5fffc60..0ed82479b44b 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
 
 	dbg_scan("scanning %s", dbg_ntype(ch->node_type));
 
-	if (ubifs_check_node(c, buf, lnum, offs, quiet))
+	if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
 		return SCANNED_A_CORRUPT_NODE;
 
 	if (ch->node_type == UBIFS_PAD_NODE) {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 00eb9c68ad03..8780efbf40ac 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -30,7 +30,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
-#include <linux/random.h>
 #include <linux/kthread.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
@@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
 	if (err)
 		goto out_invalid;
 
-	/* Disable readahead */
+	/* Disable read-ahead */
 	inode->i_mapping->backing_dev_info = &c->bdi;
 
 	switch (inode->i_mode & S_IFMT) {
@@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode)
  */
 static int ubifs_write_inode(struct inode *inode, int wait)
 {
-	int err;
+	int err = 0;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
@@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait)
 		return 0;
 	}
 
-	dbg_gen("inode %lu", inode->i_ino);
-	err = ubifs_jnl_write_inode(c, inode, 0);
-	if (err)
-		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
+	/*
+	 * As an optimization, do not write orphan inodes to the media just
+	 * because this is not needed.
+	 */
+	dbg_gen("inode %lu, mode %#x, nlink %u",
+		inode->i_ino, (int)inode->i_mode, inode->i_nlink);
+	if (inode->i_nlink) {
+		err = ubifs_jnl_write_inode(c, inode);
+		if (err)
+			ubifs_err("can't write inode %lu, error %d",
+				  inode->i_ino, err);
+	}
 
 	ui->dirty = 0;
 	mutex_unlock(&ui->ui_mutex);
@@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode)
 {
 	int err;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
+	struct ubifs_inode *ui = ubifs_inode(inode);
 
-	if (ubifs_inode(inode)->xattr)
+	if (ui->xattr)
 		/*
 		 * Extended attribute inode deletions are fully handled in
 		 * 'ubifs_removexattr()'. These inodes are special and have
@@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode)
 		 */
 		goto out;
 
-	dbg_gen("inode %lu", inode->i_ino);
+	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
 	ubifs_assert(inode->i_nlink == 0);
 
@@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode)
 	if (is_bad_inode(inode))
 		goto out;
 
-	ubifs_inode(inode)->ui_size = inode->i_size = 0;
-	err = ubifs_jnl_write_inode(c, inode, 1);
+	ui->ui_size = inode->i_size = 0;
+	err = ubifs_jnl_delete_inode(c, inode);
 	if (err)
 		/*
 		 * Worst case we have a lost orphan inode wasting space, so a
-		 * simple error message is ok here.
+		 * simple error message is OK here.
 		 */
-		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err);
+		ubifs_err("can't delete inode %lu, error %d",
+			  inode->i_ino, err);
+
 out:
+	if (ui->dirty)
+		ubifs_release_dirty_inode_budget(c, ui);
 	clear_inode(inode);
 }
 
@@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ubifs_info *c = dentry->d_sb->s_fs_info;
 	unsigned long long free;
+	__le32 *uuid = (__le32 *)c->uuid;
 
-	free = ubifs_budg_get_free_space(c);
+	free = ubifs_get_free_space(c);
 	dbg_gen("free space %lld bytes (%lld blocks)",
 		free, free >> UBIFS_BLOCK_SHIFT);
 
@@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = UBIFS_MAX_NLEN;
-
+	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
 	return 0;
 }
 
@@ -387,6 +401,16 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else if (c->mount_opts.unmount_mode == 1)
 		seq_printf(s, ",norm_unmount");
 
+	if (c->mount_opts.bulk_read == 2)
+		seq_printf(s, ",bulk_read");
+	else if (c->mount_opts.bulk_read == 1)
+		seq_printf(s, ",no_bulk_read");
+
+	if (c->mount_opts.chk_data_crc == 2)
+		seq_printf(s, ",chk_data_crc");
+	else if (c->mount_opts.chk_data_crc == 1)
+		seq_printf(s, ",no_chk_data_crc");
+
 	return 0;
 }
 
@@ -394,13 +418,26 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
 	struct ubifs_info *c = sb->s_fs_info;
 	int i, ret = 0, err;
+	long long bud_bytes;
 
-	if (c->jheads)
+	if (c->jheads) {
 		for (i = 0; i < c->jhead_cnt; i++) {
 			err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
 			if (err && !ret)
 				ret = err;
 		}
+
+		/* Commit the journal unless it has too little data */
+		spin_lock(&c->buds_lock);
+		bud_bytes = c->bud_bytes;
+		spin_unlock(&c->buds_lock);
+		if (bud_bytes > c->leb_size) {
+			err = ubifs_run_commit(c);
+			if (err)
+				return err;
+		}
+	}
+
 	/*
 	 * We ought to call sync for c->ubi but it does not have one. If it had
 	 * it would in turn call mtd->sync, however mtd operations are
@@ -518,6 +555,24 @@ static int init_constants_early(struct ubifs_info *c)
 	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
 	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
 
+	/*
+	 * Calculate how many bytes would be wasted at the end of LEB if it was
+	 * fully filled with data nodes of maximum size. This is used in
+	 * calculations when reporting free space.
+	 */
+	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+	/* Buffer size for bulk-reads */
+	c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+	if (c->bulk_read_buf_size > c->leb_size)
+		c->bulk_read_buf_size = c->leb_size;
+	if (c->bulk_read_buf_size > 128 * 1024) {
+		/* Check if we can kmalloc more than 128KiB */
+		void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL);
+
+		kfree(try);
+		if (!try)
+			c->bulk_read_buf_size = 128 * 1024;
+	}
 	return 0;
 }
 
@@ -635,13 +690,11 @@ static int init_constants_late(struct ubifs_info *c)
 	 * internally because it does not make much sense for UBIFS, but it is
 	 * necessary to report something for the 'statfs()' call.
 	 *
-	 * Subtract the LEB reserved for GC and the LEB which is reserved for
-	 * deletions.
-	 *
-	 * Review 'ubifs_calc_available()' if changing this calculation.
+	 * Subtract the LEB reserved for GC, the LEB which is reserved for
+	 * deletions, and assume only one journal head is available.
 	 */
-	tmp64 = c->main_lebs - 2;
-	tmp64 *= (uint64_t)c->leb_size - c->dark_wm;
+	tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+	tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
 	tmp64 = ubifs_reported_space(c, tmp64);
 	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
 
@@ -822,17 +875,29 @@ static int check_volume_empty(struct ubifs_info *c)
  *
  * Opt_fast_unmount: do not run a journal commit before un-mounting
  * Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_bulk_read: enable bulk-reads
+ * Opt_no_bulk_read: disable bulk-reads
+ * Opt_chk_data_crc: check CRCs when reading data nodes
+ * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
  * Opt_err: just end of array marker
  */
 enum {
 	Opt_fast_unmount,
 	Opt_norm_unmount,
+	Opt_bulk_read,
+	Opt_no_bulk_read,
+	Opt_chk_data_crc,
+	Opt_no_chk_data_crc,
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_fast_unmount, "fast_unmount"},
 	{Opt_norm_unmount, "norm_unmount"},
+	{Opt_bulk_read, "bulk_read"},
+	{Opt_no_bulk_read, "no_bulk_read"},
+	{Opt_chk_data_crc, "chk_data_crc"},
+	{Opt_no_chk_data_crc, "no_chk_data_crc"},
 	{Opt_err, NULL},
 };
 
@@ -870,6 +935,22 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 			c->mount_opts.unmount_mode = 1;
 			c->fast_unmount = 0;
 			break;
+		case Opt_bulk_read:
+			c->mount_opts.bulk_read = 2;
+			c->bulk_read = 1;
+			break;
+		case Opt_no_bulk_read:
+			c->mount_opts.bulk_read = 1;
+			c->bulk_read = 0;
+			break;
+		case Opt_chk_data_crc:
+			c->mount_opts.chk_data_crc = 2;
+			c->no_chk_data_crc = 0;
+			break;
+		case Opt_no_chk_data_crc:
+			c->mount_opts.chk_data_crc = 1;
+			c->no_chk_data_crc = 1;
+			break;
 		default:
 			ubifs_err("unrecognized mount option \"%s\" "
 				  "or missing value", p);
@@ -978,6 +1059,8 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_free;
 	}
 
+	c->always_chk_crc = 1;
+
 	err = ubifs_read_superblock(c);
 	if (err)
 		goto out_free;
@@ -1006,17 +1089,14 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_dereg;
 	}
 
+	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
 	if (!mounted_read_only) {
 		err = alloc_wbufs(c);
 		if (err)
 			goto out_cbuf;
 
 		/* Create background thread */
-		sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num,
-			c->vi.vol_id);
 		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
-		if (!c->bgt)
-			c->bgt = ERR_PTR(-EINVAL);
 		if (IS_ERR(c->bgt)) {
 			err = PTR_ERR(c->bgt);
 			c->bgt = NULL;
@@ -1122,24 +1202,28 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_infos;
 
-	ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num,
-		  c->vi.vol_id);
+	c->always_chk_crc = 0;
+
+	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
+		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
 	if (mounted_read_only)
 		ubifs_msg("mounted read-only");
 	x = (long long)c->main_lebs * c->leb_size;
-	ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
-		  x, x >> 10, x >> 20, c->main_lebs);
+	ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
+		  "LEBs)", x, x >> 10, x >> 20, c->main_lebs);
 	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
-	ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
-		  x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
-	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
-	ubifs_msg("media format %d, latest format %d",
+	ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "
+		  "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
+	ubifs_msg("media format:       %d (latest is %d)",
 		  c->fmt_version, UBIFS_FORMAT_VERSION);
+	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
+	ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",
+		c->report_rp_size, c->report_rp_size >> 10);
 
 	dbg_msg("compiled on:         " __DATE__ " at " __TIME__);
 	dbg_msg("min. I/O unit size:  %d bytes", c->min_io_size);
 	dbg_msg("LEB size:            %d bytes (%d KiB)",
-		c->leb_size, c->leb_size / 1024);
+		c->leb_size, c->leb_size >> 10);
 	dbg_msg("data journal heads:  %d",
 		c->jhead_cnt - NONDATA_JHEADS_CNT);
 	dbg_msg("UUID:                %02X%02X%02X%02X-%02X%02X"
@@ -1265,6 +1349,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 
 	mutex_lock(&c->umount_mutex);
 	c->remounting_rw = 1;
+	c->always_chk_crc = 1;
 
 	/* Check for enough free space */
 	if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
@@ -1328,20 +1413,20 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 
 	/* Create background thread */
 	c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
-	if (!c->bgt)
-		c->bgt = ERR_PTR(-EINVAL);
 	if (IS_ERR(c->bgt)) {
 		err = PTR_ERR(c->bgt);
 		c->bgt = NULL;
 		ubifs_err("cannot spawn \"%s\", error %d",
 			  c->bgt_name, err);
-		return err;
+		goto out;
 	}
 	wake_up_process(c->bgt);
 
 	c->orph_buf = vmalloc(c->leb_size);
-	if (!c->orph_buf)
-		return -ENOMEM;
+	if (!c->orph_buf) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	/* Check for enough log space */
 	lnum = c->lhead_lnum + 1;
@@ -1368,6 +1453,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	dbg_gen("re-mounted read-write");
 	c->vfs_sb->s_flags &= ~MS_RDONLY;
 	c->remounting_rw = 0;
+	c->always_chk_crc = 0;
 	mutex_unlock(&c->umount_mutex);
 	return 0;
 
@@ -1383,6 +1469,7 @@ out:
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
 	c->remounting_rw = 0;
+	c->always_chk_crc = 0;
 	mutex_unlock(&c->umount_mutex);
 	return err;
 }
@@ -1391,12 +1478,9 @@ out:
  * commit_on_unmount - commit the journal when un-mounting.
  * @c: UBIFS file-system description object
  *
- * This function is called during un-mounting and it commits the journal unless
- * the "fast unmount" mode is enabled. It also avoids committing the journal if
- * it contains too few data.
- *
- * Sometimes recovery requires the journal to be committed at least once, and
- * this function takes care about this.
+ * This function is called during un-mounting and re-mounting, and it commits
+ * the journal unless the "fast unmount" mode is enabled. It also avoids
+ * committing the journal if it contains too few data.
  */
 static void commit_on_unmount(struct ubifs_info *c)
 {
@@ -1469,6 +1553,7 @@ static void ubifs_put_super(struct super_block *sb)
 	 */
 	ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
 	ubifs_assert(c->budg_idx_growth == 0);
+	ubifs_assert(c->budg_dd_growth == 0);
 	ubifs_assert(c->budg_data_growth == 0);
 
 	/*
@@ -1657,7 +1742,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	INIT_LIST_HEAD(&c->orph_new);
 
 	c->highest_inum = UBIFS_FIRST_INO;
-	get_random_bytes(&c->vfs_gen, sizeof(int));
 	c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
 
 	ubi_get_volume_info(ubi, &c->vi);
@@ -1671,10 +1755,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/*
-	 * UBIFS provids 'backing_dev_info' in order to disable readahead. For
+	 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
 	 * UBIFS, I/O is not deferred, it is done immediately in readpage,
 	 * which means the user would have to wait not just for their own I/O
-	 * but the readahead I/O as well i.e. completely pointless.
+	 * but the read-ahead I/O as well i.e. completely pointless.
 	 *
 	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
 	 */
@@ -1841,7 +1925,7 @@ static struct file_system_type ubifs_fs_type = {
 /*
  * Inode slab cache constructor.
  */
-static void inode_slab_ctor(struct kmem_cache *cachep, void *obj)
+static void inode_slab_ctor(void *obj)
 {
 	struct ubifs_inode *ui = obj;
 	inode_init_once(&ui->vfs_inode);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e909f4a96443..d27fd918b9c9 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
 	}
 
 	zn = copy_znode(c, znode);
-	if (unlikely(IS_ERR(zn)))
+	if (IS_ERR(zn))
 		return zn;
 
 	if (zbr->len) {
@@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
+	if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
+		if (c->no_chk_data_crc)
+			return 0;
+
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
 	if (crc != node_crc)
@@ -506,7 +510,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
 		if (keys_cmp(c, key, &node_key) != 0)
 			ret = 0;
 	}
-	if (ret == 0)
+	if (ret == 0 && c->replaying)
 		dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
 			zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
 	return ret;
@@ -1128,7 +1132,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
 			ubifs_assert(znode == c->zroot.znode);
 			znode = dirty_cow_znode(c, &c->zroot);
 		}
-		if (unlikely(IS_ERR(znode)) || !p)
+		if (IS_ERR(znode) || !p)
 			break;
 		ubifs_assert(path[p - 1] >= 0);
 		ubifs_assert(path[p - 1] < znode->child_cnt);
@@ -1382,23 +1386,62 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
 }
 
 /**
- * ubifs_tnc_lookup - look up a file-system node.
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @gc_seq1: garbage collection sequence number
+ *
+ * This function determines if @lnum may have been garbage collected since
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
+ * %0 is returned.
+ */
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
+{
+	int gc_seq2, gced_lnum;
+
+	gced_lnum = c->gced_lnum;
+	smp_rmb();
+	gc_seq2 = c->gc_seq;
+	/* Same seq means no GC */
+	if (gc_seq1 == gc_seq2)
+		return 0;
+	/* Different by more than 1 means we don't know */
+	if (gc_seq1 + 1 != gc_seq2)
+		return 1;
+	/*
+	 * We have seen the sequence number has increased by 1. Now we need to
+	 * be sure we read the right LEB number, so read it again.
+	 */
+	smp_rmb();
+	if (gced_lnum != c->gced_lnum)
+		return 1;
+	/* Finally we can check lnum */
+	if (gced_lnum == lnum)
+		return 1;
+	return 0;
+}
+
+/**
+ * ubifs_tnc_locate - look up a file-system node and return it and its location.
  * @c: UBIFS file-system description object
  * @key: node key to lookup
  * @node: the node is returned here
+ * @lnum: LEB number is returned here
+ * @offs: offset is returned here
  *
  * This function look up and reads node with key @key. The caller has to make
  * sure the @node buffer is large enough to fit the node. Returns zero in case
  * of success, %-ENOENT if the node was not found, and a negative error code in
- * case of failure.
+ * case of failure. The node location can be returned in @lnum and @offs.
  */
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
-		     void *node)
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+		     void *node, int *lnum, int *offs)
 {
-	int found, n, err;
+	int found, n, err, safely = 0, gc_seq1;
 	struct ubifs_znode *znode;
 	struct ubifs_zbranch zbr, *zt;
 
+again:
 	mutex_lock(&c->tnc_mutex);
 	found = ubifs_lookup_level0(c, key, &znode, &n);
 	if (!found) {
@@ -1409,6 +1452,10 @@ int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
 		goto out;
 	}
 	zt = &znode->zbranch[n];
+	if (lnum) {
+		*lnum = zt->lnum;
+		*offs = zt->offs;
+	}
 	if (is_hash_key(c, key)) {
 		/*
 		 * In this case the leaf node cache gets used, so we pass the
@@ -1417,11 +1464,31 @@ int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
 		err = tnc_read_node_nm(c, zt, node);
 		goto out;
 	}
+	if (safely) {
+		err = ubifs_tnc_read_node(c, zt, node);
+		goto out;
+	}
+	/* Drop the TNC mutex prematurely and race with garbage collection */
 	zbr = znode->zbranch[n];
+	gc_seq1 = c->gc_seq;
 	mutex_unlock(&c->tnc_mutex);
 
-	err = ubifs_tnc_read_node(c, &zbr, node);
-	return err;
+	if (ubifs_get_wbuf(c, zbr.lnum)) {
+		/* We do not GC journal heads */
+		err = ubifs_tnc_read_node(c, &zbr, node);
+		return err;
+	}
+
+	err = fallible_read_node(c, key, &zbr, node);
+	if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+		/*
+		 * The node may have been GC'ed out from under us so try again
+		 * while keeping the TNC mutex locked.
+		 */
+		safely = 1;
+		goto again;
+	}
+	return 0;
 
 out:
 	mutex_unlock(&c->tnc_mutex);
@@ -1429,58 +1496,289 @@ out:
 }
 
 /**
- * ubifs_tnc_locate - look up a file-system node and return it and its location.
+ * ubifs_tnc_get_bu_keys - lookup keys for bulk-read.
  * @c: UBIFS file-system description object
- * @key: node key to lookup
- * @node: the node is returned here
- * @lnum: LEB number is returned here
- * @offs: offset is returned here
+ * @bu: bulk-read parameters and results
  *
- * This function is the same as 'ubifs_tnc_lookup()' but it returns the node
- * location also. See 'ubifs_tnc_lookup()'.
+ * Lookup consecutive data node keys for the same inode that reside
+ * consecutively in the same LEB.
  */
-int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
-		     void *node, int *lnum, int *offs)
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
-	int found, n, err;
+	int n, err = 0, lnum = -1, uninitialized_var(offs);
+	int uninitialized_var(len);
+	unsigned int block = key_block(c, &bu->key);
 	struct ubifs_znode *znode;
-	struct ubifs_zbranch zbr, *zt;
+
+	bu->cnt = 0;
+	bu->blk_cnt = 0;
+	bu->eof = 0;
 
 	mutex_lock(&c->tnc_mutex);
-	found = ubifs_lookup_level0(c, key, &znode, &n);
-	if (!found) {
-		err = -ENOENT;
-		goto out;
-	} else if (found < 0) {
-		err = found;
+	/* Find first key */
+	err = ubifs_lookup_level0(c, &bu->key, &znode, &n);
+	if (err < 0)
 		goto out;
+	if (err) {
+		/* Key found */
+		len = znode->zbranch[n].len;
+		/* The buffer must be big enough for at least 1 node */
+		if (len > bu->buf_len) {
+			err = -EINVAL;
+			goto out;
+		}
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = znode->zbranch[n];
+		bu->blk_cnt += 1;
+		lnum = znode->zbranch[n].lnum;
+		offs = ALIGN(znode->zbranch[n].offs + len, 8);
 	}
-	zt = &znode->zbranch[n];
-	if (is_hash_key(c, key)) {
-		/*
-		 * In this case the leaf node cache gets used, so we pass the
-		 * address of the zbranch and keep the mutex locked
-		 */
-		*lnum = zt->lnum;
-		*offs = zt->offs;
-		err = tnc_read_node_nm(c, zt, node);
-		goto out;
+	while (1) {
+		struct ubifs_zbranch *zbr;
+		union ubifs_key *key;
+		unsigned int next_block;
+
+		/* Find next key */
+		err = tnc_next(c, &znode, &n);
+		if (err)
+			goto out;
+		zbr = &znode->zbranch[n];
+		key = &zbr->key;
+		/* See if there is another data key for this file */
+		if (key_inum(c, key) != key_inum(c, &bu->key) ||
+		    key_type(c, key) != UBIFS_DATA_KEY) {
+			err = -ENOENT;
+			goto out;
+		}
+		if (lnum < 0) {
+			/* First key found */
+			lnum = zbr->lnum;
+			offs = ALIGN(zbr->offs + zbr->len, 8);
+			len = zbr->len;
+			if (len > bu->buf_len) {
+				err = -EINVAL;
+				goto out;
+			}
+		} else {
+			/*
+			 * The data nodes must be in consecutive positions in
+			 * the same LEB.
+			 */
+			if (zbr->lnum != lnum || zbr->offs != offs)
+				goto out;
+			offs += ALIGN(zbr->len, 8);
+			len = ALIGN(len, 8) + zbr->len;
+			/* Must not exceed buffer length */
+			if (len > bu->buf_len)
+				goto out;
+		}
+		/* Allow for holes */
+		next_block = key_block(c, key);
+		bu->blk_cnt += (next_block - block - 1);
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		block = next_block;
+		/* Add this key */
+		bu->zbranch[bu->cnt++] = *zbr;
+		bu->blk_cnt += 1;
+		/* See if we have room for more */
+		if (bu->cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
+		if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+			goto out;
 	}
-	zbr = znode->zbranch[n];
+out:
+	if (err == -ENOENT) {
+		bu->eof = 1;
+		err = 0;
+	}
+	bu->gc_seq = c->gc_seq;
 	mutex_unlock(&c->tnc_mutex);
+	if (err)
+		return err;
+	/*
+	 * An enormous hole could cause bulk-read to encompass too many
+	 * page cache pages, so limit the number here.
+	 */
+	if (bu->blk_cnt > UBIFS_MAX_BULK_READ)
+		bu->blk_cnt = UBIFS_MAX_BULK_READ;
+	/*
+	 * Ensure that bulk-read covers a whole number of page cache
+	 * pages.
+	 */
+	if (UBIFS_BLOCKS_PER_PAGE == 1 ||
+	    !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1)))
+		return 0;
+	if (bu->eof) {
+		/* At the end of file we can round up */
+		bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1;
+		return 0;
+	}
+	/* Exclude data nodes that do not make up a whole page cache page */
+	block = key_block(c, &bu->key) + bu->blk_cnt;
+	block &= ~(UBIFS_BLOCKS_PER_PAGE - 1);
+	while (bu->cnt) {
+		if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block)
+			break;
+		bu->cnt -= 1;
+	}
+	return 0;
+}
 
-	*lnum = zbr.lnum;
-	*offs = zbr.offs;
+/**
+ * read_wbuf - bulk-read from a LEB with a wbuf.
+ * @wbuf: wbuf that may overlap the read
+ * @buf: buffer into which to read
+ * @len: read length
+ * @lnum: LEB number from which to read
+ * @offs: offset from which to read
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
+		     int offs)
+{
+	const struct ubifs_info *c = wbuf->c;
+	int rlen, overlap;
 
-	err = ubifs_tnc_read_node(c, &zbr, node);
-	return err;
+	dbg_io("LEB %d:%d, length %d", lnum, offs, len);
+	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+	ubifs_assert(!(offs & 7) && offs < c->leb_size);
+	ubifs_assert(offs + len <= c->leb_size);
+
+	spin_lock(&wbuf->lock);
+	overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+	if (!overlap) {
+		/* We may safely unlock the write-buffer and read the data */
+		spin_unlock(&wbuf->lock);
+		return ubi_read(c->ubi, lnum, buf, offs, len);
+	}
+
+	/* Don't read under wbuf */
+	rlen = wbuf->offs - offs;
+	if (rlen < 0)
+		rlen = 0;
+
+	/* Copy the rest from the write-buffer */
+	memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+	spin_unlock(&wbuf->lock);
+
+	if (rlen > 0)
+		/* Read everything that goes before write-buffer */
+		return ubi_read(c->ubi, lnum, buf, offs, rlen);
+
+	return 0;
+}
+
+/**
+ * validate_data_node - validate data nodes for bulk-read.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing data node to validate
+ * @zbr: zbranch of data node to validate
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int validate_data_node(struct ubifs_info *c, void *buf,
+			      struct ubifs_zbranch *zbr)
+{
+	union ubifs_key key1;
+	struct ubifs_ch *ch = buf;
+	int err, len;
+
+	if (ch->node_type != UBIFS_DATA_NODE) {
+		ubifs_err("bad node type (%d but expected %d)",
+			  ch->node_type, UBIFS_DATA_NODE);
+		goto out_err;
+	}
+
+	err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0);
+	if (err) {
+		ubifs_err("expected node type %d", UBIFS_DATA_NODE);
+		goto out;
+	}
+
+	len = le32_to_cpu(ch->len);
+	if (len != zbr->len) {
+		ubifs_err("bad node length %d, expected %d", len, zbr->len);
+		goto out_err;
+	}
+
+	/* Make sure the key of the read node is correct */
+	key_read(c, buf + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, &zbr->key, &key1)) {
+		ubifs_err("bad key in node at LEB %d:%d",
+			  zbr->lnum, zbr->offs);
+		dbg_tnc("looked for key %s found node's key %s",
+			DBGKEY(&zbr->key), DBGKEY1(&key1));
+		goto out_err;
+	}
 
+	return 0;
+
+out_err:
+	err = -EINVAL;
 out:
-	mutex_unlock(&c->tnc_mutex);
+	ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
+	dbg_dump_node(c, buf);
+	dbg_dump_stack();
 	return err;
 }
 
 /**
+ * ubifs_tnc_bulk_read - read a number of data nodes in one go.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * This functions reads and validates the data nodes that were identified by the
+ * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success,
+ * -EAGAIN to indicate a race with GC, or another negative error code on
+ * failure.
+ */
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
+{
+	int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i;
+	struct ubifs_wbuf *wbuf;
+	void *buf;
+
+	len = bu->zbranch[bu->cnt - 1].offs;
+	len += bu->zbranch[bu->cnt - 1].len - offs;
+	if (len > bu->buf_len) {
+		ubifs_err("buffer too small %d vs %d", bu->buf_len, len);
+		return -EINVAL;
+	}
+
+	/* Do the read */
+	wbuf = ubifs_get_wbuf(c, lnum);
+	if (wbuf)
+		err = read_wbuf(wbuf, bu->buf, len, lnum, offs);
+	else
+		err = ubi_read(c->ubi, lnum, bu->buf, offs, len);
+
+	/* Check for a race with GC */
+	if (maybe_leb_gced(c, lnum, bu->gc_seq))
+		return -EAGAIN;
+
+	if (err && err != -EBADMSG) {
+		ubifs_err("failed to read from LEB %d:%d, error %d",
+			  lnum, offs, err);
+		dbg_dump_stack();
+		dbg_tnc("key %s", DBGKEY(&bu->key));
+		return err;
+	}
+
+	/* Validate the nodes read */
+	buf = bu->buf;
+	for (i = 0; i < bu->cnt; i++) {
+		err = validate_data_node(c, buf, &bu->zbranch[i]);
+		if (err)
+			return err;
+		buf = buf + ALIGN(bu->zbranch[i].len, 8);
+	}
+
+	return 0;
+}
+
+/**
  * do_lookup_nm- look up a "hashed" node.
  * @c: UBIFS file-system description object
  * @key: node key to lookup
@@ -1498,7 +1796,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 {
 	int found, n, err;
 	struct ubifs_znode *znode;
-	struct ubifs_zbranch zbr;
 
 	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
 	mutex_lock(&c->tnc_mutex);
@@ -1522,11 +1819,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 		goto out_unlock;
 	}
 
-	zbr = znode->zbranch[n];
-	mutex_unlock(&c->tnc_mutex);
-
-	err = tnc_read_node_nm(c, &zbr, node);
-	return err;
+	err = tnc_read_node_nm(c, &znode->zbranch[n], node);
 
 out_unlock:
 	mutex_unlock(&c->tnc_mutex);
@@ -1669,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
 {
 	struct ubifs_znode *zn, *zi, *zp;
 	int i, keep, move, appending = 0;
-	union ubifs_key *key = &zbr->key;
+	union ubifs_key *key = &zbr->key, *key1;
 
 	ubifs_assert(n >= 0 && n <= c->fanout);
 
@@ -1710,20 +2003,33 @@ again:
 	zn->level = znode->level;
 
 	/* Decide where to split */
-	if (znode->level == 0 && n == c->fanout &&
-	    key_type(c, key) == UBIFS_DATA_KEY) {
-		union ubifs_key *key1;
-
-		/*
-		 * If this is an inode which is being appended - do not split
-		 * it because no other zbranches can be inserted between
-		 * zbranches of consecutive data nodes anyway.
-		 */
-		key1 = &znode->zbranch[n - 1].key;
-		if (key_inum(c, key1) == key_inum(c, key) &&
-		    key_type(c, key1) == UBIFS_DATA_KEY &&
-		    key_block(c, key1) == key_block(c, key) - 1)
-			appending = 1;
+	if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) {
+		/* Try not to split consecutive data keys */
+		if (n == c->fanout) {
+			key1 = &znode->zbranch[n - 1].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY)
+				appending = 1;
+		} else
+			goto check_split;
+	} else if (appending && n != c->fanout) {
+		/* Try not to split consecutive data keys */
+		appending = 0;
+check_split:
+		if (n >= (c->fanout + 1) / 2) {
+			key1 = &znode->zbranch[0].key;
+			if (key_inum(c, key1) == key_inum(c, key) &&
+			    key_type(c, key1) == UBIFS_DATA_KEY) {
+				key1 = &znode->zbranch[n].key;
+				if (key_inum(c, key1) != key_inum(c, key) ||
+				    key_type(c, key1) != UBIFS_DATA_KEY) {
+					keep = n;
+					move = c->fanout - keep;
+					zi = znode;
+					goto do_split;
+				}
+			}
+		}
 	}
 
 	if (appending) {
@@ -1753,6 +2059,8 @@ again:
 			zbr->znode->parent = zn;
 	}
 
+do_split:
+
 	__set_bit(DIRTY_ZNODE, &zn->flags);
 	atomic_long_inc(&c->dirty_zn_cnt);
 
@@ -1779,14 +2087,11 @@ again:
 
 	/* Insert new znode (produced by spitting) into the parent */
 	if (zp) {
-		i = n;
+		if (n == 0 && zi == znode && znode->iip == 0)
+			correct_parent_keys(c, znode);
+
 		/* Locate insertion point */
 		n = znode->iip + 1;
-		if (appending && n != c->fanout)
-			appending = 0;
-
-		if (i == 0 && zi == znode && znode->iip == 0)
-			correct_parent_keys(c, znode);
 
 		/* Tail recursion */
 		zbr->key = zn->zbranch[0].key;
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8117e65ba2e9..8ac76b1c2d55 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
 		written = layout_leb_in_gaps(c, p);
 		if (written < 0) {
 			err = written;
-			if (err == -ENOSPC) {
-				if (!dbg_force_in_the_gaps_enabled) {
-					/*
-					 * Do not print scary warnings if the
-					 * debugging option which forces
-					 * in-the-gaps is enabled.
-					 */
-					ubifs_err("out of space");
-					spin_lock(&c->space_lock);
-					dbg_dump_budg(c);
-					spin_unlock(&c->space_lock);
-					dbg_dump_lprops(c);
-				}
-				/* Try to commit anyway */
-				err = 0;
-				break;
+			if (err != -ENOSPC) {
+				kfree(c->gap_lebs);
+				c->gap_lebs = NULL;
+				return err;
 			}
-			kfree(c->gap_lebs);
-			c->gap_lebs = NULL;
-			return err;
+			if (!dbg_force_in_the_gaps_enabled) {
+				/*
+				 * Do not print scary warnings if the debugging
+				 * option which forces in-the-gaps is enabled.
+				 */
+				ubifs_err("out of space");
+				spin_lock(&c->space_lock);
+				dbg_dump_budg(c);
+				spin_unlock(&c->space_lock);
+				dbg_dump_lprops(c);
+			}
+			/* Try to commit anyway */
+			err = 0;
+			break;
 		}
 		p++;
 		cnt -= written;
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index a25c1cc1f8d9..b48db999903e 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 	}
 
 	/* Make sure the key of the read node is correct */
-	key_read(c, key, &key1);
-	if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
+	key_read(c, node + UBIFS_KEY_OFFSET, &key1);
+	if (!keys_eq(c, key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d",
 			  zbr->lnum, zbr->offs);
 		dbg_tnc("looked for key %s found node's key %s",
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0cc7da9bed47..0b378042a3a2 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -75,7 +75,6 @@
  */
 #define UBIFS_BLOCK_SIZE  4096
 #define UBIFS_BLOCK_SHIFT 12
-#define UBIFS_BLOCK_MASK  0x00000FFF
 
 /* UBIFS padding byte pattern (must not be first or last byte of node magic) */
 #define UBIFS_PADDING_BYTE 0xCE
@@ -87,7 +86,7 @@
 #define UBIFS_SK_LEN 8
 
 /* Minimum index tree fanout */
-#define UBIFS_MIN_FANOUT 2
+#define UBIFS_MIN_FANOUT 3
 
 /* Maximum number of levels in UBIFS indexing B-tree */
 #define UBIFS_MAX_LEVELS 512
@@ -228,10 +227,10 @@ enum {
 /* Minimum number of orphan area logical eraseblocks */
 #define UBIFS_MIN_ORPH_LEBS 1
 /*
- * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1
+ * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
  * for GC, 1 for deletions, and at least 1 for committed data).
  */
-#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5)
+#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
 
 /* Minimum number of logical eraseblocks */
 #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e4f89f271827..a7bd32fa15b9 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -20,8 +20,6 @@
  *          Adrian Hunter
  */
 
-/* Implementation version 0.7 */
-
 #ifndef __UBIFS_H__
 #define __UBIFS_H__
 
@@ -144,6 +142,9 @@
 /* Maximum expected tree height for use by bottom_up_buf */
 #define BOTTOM_UP_HEIGHT 64
 
+/* Maximum number of data nodes to bulk-read */
+#define UBIFS_MAX_BULK_READ 32
+
 /*
  * Lockdep classes for UBIFS inode @ui_mutex.
  */
@@ -322,15 +323,18 @@ struct ubifs_gced_idx_leb {
  * struct ubifs_inode - UBIFS in-memory inode description.
  * @vfs_inode: VFS inode description object
  * @creat_sqnum: sequence number at time of creation
+ * @del_cmtno: commit number corresponding to the time the inode was deleted,
+ *             protected by @c->commit_sem;
  * @xattr_size: summarized size of all extended attributes in bytes
  * @xattr_cnt: count of extended attributes this inode has
  * @xattr_names: sum of lengths of all extended attribute names belonging to
  *               this inode
  * @dirty: non-zero if the inode is dirty
  * @xattr: non-zero if this is an extended attribute inode
+ * @bulk_read: non-zero if bulk-read should be used
  * @ui_mutex: serializes inode write-back with the rest of VFS operations,
- *            serializes "clean <-> dirty" state changes, protects @dirty,
- *            @ui_size, and @xattr_size
+ *            serializes "clean <-> dirty" state changes, serializes bulk-read,
+ *            protects @dirty, @bulk_read, @ui_size, and @xattr_size
  * @ui_lock: protects @synced_i_size
  * @synced_i_size: synchronized size of inode, i.e. the value of inode size
  *                 currently stored on the flash; used only for regular file
@@ -338,6 +342,8 @@ struct ubifs_gced_idx_leb {
  * @ui_size: inode size used by UBIFS when writing to flash
  * @flags: inode flags (@UBIFS_COMPR_FL, etc)
  * @compr_type: default compression type used for this inode
+ * @last_page_read: page number of last page read (for bulk read)
+ * @read_in_a_row: number of consecutive pages read in a row (for bulk read)
  * @data_len: length of the data attached to the inode
  * @data: inode's data
  *
@@ -373,17 +379,21 @@ struct ubifs_gced_idx_leb {
 struct ubifs_inode {
 	struct inode vfs_inode;
 	unsigned long long creat_sqnum;
+	unsigned long long del_cmtno;
 	unsigned int xattr_size;
 	unsigned int xattr_cnt;
 	unsigned int xattr_names;
 	unsigned int dirty:1;
 	unsigned int xattr:1;
+	unsigned int bulk_read:1;
 	struct mutex ui_mutex;
 	spinlock_t ui_lock;
 	loff_t synced_i_size;
 	loff_t ui_size;
 	int flags;
 	int compr_type;
+	pgoff_t last_page_read;
+	pgoff_t read_in_a_row;
 	int data_len;
 	void *data;
 };
@@ -697,8 +707,8 @@ struct ubifs_jhead {
  * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
  * @key: key
  * @znode: znode address in memory
- * @lnum: LEB number of the indexing node
- * @offs: offset of the indexing node within @lnum
+ * @lnum: LEB number of the target node (indexing node or data node)
+ * @offs: target node offset within @lnum
  * @len: target node length
  */
 struct ubifs_zbranch {
@@ -743,6 +753,28 @@ struct ubifs_znode {
 };
 
 /**
+ * struct bu_info - bulk-read information
+ * @key: first data node key
+ * @zbranch: zbranches of data nodes to bulk read
+ * @buf: buffer to read into
+ * @buf_len: buffer length
+ * @gc_seq: GC sequence number to detect races with GC
+ * @cnt: number of data nodes for bulk read
+ * @blk_cnt: number of data blocks including holes
+ * @oef: end of file reached
+ */
+struct bu_info {
+	union ubifs_key key;
+	struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ];
+	void *buf;
+	int buf_len;
+	int gc_seq;
+	int cnt;
+	int blk_cnt;
+	int eof;
+};
+
+/**
  * struct ubifs_node_range - node length range description data structure.
  * @len: fixed node length
  * @min_len: minimum possible node length
@@ -779,7 +811,7 @@ struct ubifs_compressor {
 /**
  * struct ubifs_budget_req - budget requirements of an operation.
  *
- * @fast: non-zero if the budgeting should try to aquire budget quickly and
+ * @fast: non-zero if the budgeting should try to acquire budget quickly and
  *        should not try to call write-back
  * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
  *               have to be re-calculated
@@ -805,21 +837,31 @@ struct ubifs_compressor {
  * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d
  * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
  * dirty by the re-name operation.
+ *
+ * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to
+ * make sure the amount of inode data which contribute to @new_ino_d and
+ * @dirtied_ino_d fields are aligned.
  */
 struct ubifs_budget_req {
 	unsigned int fast:1;
 	unsigned int recalculate:1;
+#ifndef UBIFS_DEBUG
 	unsigned int new_page:1;
 	unsigned int dirtied_page:1;
 	unsigned int new_dent:1;
 	unsigned int mod_dent:1;
 	unsigned int new_ino:1;
 	unsigned int new_ino_d:13;
-#ifndef UBIFS_DEBUG
 	unsigned int dirtied_ino:4;
 	unsigned int dirtied_ino_d:15;
 #else
 	/* Not bit-fields to check for overflows */
+	unsigned int new_page;
+	unsigned int dirtied_page;
+	unsigned int new_dent;
+	unsigned int mod_dent;
+	unsigned int new_ino;
+	unsigned int new_ino_d;
 	unsigned int dirtied_ino;
 	unsigned int dirtied_ino_d;
 #endif
@@ -851,22 +893,26 @@ struct ubifs_orphan {
 /**
  * struct ubifs_mount_opts - UBIFS-specific mount options information.
  * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
+ * @bulk_read: enable bulk-reads
+ * @chk_data_crc: check CRCs when reading data nodes
  */
 struct ubifs_mount_opts {
 	unsigned int unmount_mode:2;
+	unsigned int bulk_read:2;
+	unsigned int chk_data_crc:2;
 };
 
 /**
  * struct ubifs_info - UBIFS file-system description data structure
  * (per-superblock).
  * @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable readahead
+ * @bdi: backing device info object to make VFS happy and disable read-ahead
  *
  * @highest_inum: highest used inode number
- * @vfs_gen: VFS inode generation counter
  * @max_sqnum: current global sequence number
- * @cmt_no: commit number (last successfully completed commit)
- * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters
+ * @cmt_no: commit number of the last successfully completed commit, protected
+ *          by @commit_sem
+ * @cnt_lock: protects @highest_inum and @max_sqnum counters
  * @fmt_version: UBIFS on-flash format version
  * @uuid: UUID from super block
  *
@@ -894,13 +940,12 @@ struct ubifs_mount_opts {
  * @cmt_state: commit state
  * @cs_lock: commit state lock
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ *
  * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
- * @check_lpt_free: flag that indicates LPT GC may be needed
- * @nospace: non-zero if the file-system does not have flash space (used as
- *           optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- *              pool is full
+ * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
+ *                   recovery)
+ * @bulk_read: enable bulk-reads
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -924,6 +969,7 @@ struct ubifs_mount_opts {
  * @mst_node: master node
  * @mst_offs: offset of valid master node
  * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ * @bulk_read_buf_size: buffer size for bulk-reads
  *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
@@ -966,12 +1012,17 @@ struct ubifs_mount_opts {
  *                        but which still have to be taken into account because
  *                        the index has not been committed so far
  * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst;
+ *              @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
+ *              @nospace, and @nospace_rp;
  * @min_idx_lebs: minimum number of LEBs required for the index
  * @old_idx_sz: size of index on flash
  * @calc_idx_sz: temporary variable which is used to calculate new index size
  *               (contains accurate new index size at end of TNC commit start)
  * @lst: lprops statistics
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ *           optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ *              pool is full
  *
  * @page_budget: budget for a page
  * @inode_budget: budget for an inode
@@ -984,6 +1035,9 @@ struct ubifs_mount_opts {
  * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
  * @max_inode_sz: maximum possible inode size in bytes
  * @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ *                data nodes of maximum size - used in free space reporting
  * @dead_wm: LEB dead space watermark
  * @dark_wm: LEB dark space watermark
  * @block_cnt: count of 4KiB blocks on the FS
@@ -1017,6 +1071,8 @@ struct ubifs_mount_opts {
  * @sbuf: a buffer of LEB size used by GC and replay for scanning
  * @idx_gc: list of index LEBs that have been garbage collected
  * @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
  *
  * @infos_list: links all 'ubifs_info' objects
  * @umount_mutex: serializes shrinker and un-mount
@@ -1045,6 +1101,7 @@ struct ubifs_mount_opts {
  * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
  * @dirty_nn_cnt: number of dirty nnodes
  * @dirty_pn_cnt: number of dirty pnodes
+ * @check_lpt_free: flag that indicates LPT GC may be needed
  * @lpt_sz: LPT size
  * @lpt_nod_buf: buffer for an on-flash nnode or pnode
  * @lpt_buf: buffer of LEB size used by LPT
@@ -1086,6 +1143,7 @@ struct ubifs_mount_opts {
  * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
  * @size_tree: inode size information for recovery
  * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @always_chk_crc: always check CRCs (while mounting and remounting rw)
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg_buf: a buffer of LEB size used for debugging purposes
@@ -1103,7 +1161,6 @@ struct ubifs_info {
 	struct backing_dev_info bdi;
 
 	ino_t highest_inum;
-	unsigned int vfs_gen;
 	unsigned long long max_sqnum;
 	unsigned long long cmt_no;
 	spinlock_t cnt_lock;
@@ -1131,11 +1188,11 @@ struct ubifs_info {
 	int cmt_state;
 	spinlock_t cs_lock;
 	wait_queue_head_t cmt_wq;
+
 	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
-	unsigned int check_lpt_free:1;
-	unsigned int nospace:1;
-	unsigned int nospace_rp:1;
+	unsigned int no_chk_data_crc:1;
+	unsigned int bulk_read:1;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1160,6 +1217,7 @@ struct ubifs_info {
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
 	struct mutex mst_mutex;
+	int bulk_read_buf_size;
 
 	int log_lebs;
 	long long log_bytes;
@@ -1203,6 +1261,8 @@ struct ubifs_info {
 	unsigned long long old_idx_sz;
 	unsigned long long calc_idx_sz;
 	struct ubifs_lp_stats lst;
+	unsigned int nospace:1;
+	unsigned int nospace_rp:1;
 
 	int page_budget;
 	int inode_budget;
@@ -1214,6 +1274,8 @@ struct ubifs_info {
 	int max_idx_node_sz;
 	long long max_inode_sz;
 	int max_znode_sz;
+
+	int leb_overhead;
 	int dead_wm;
 	int dark_wm;
 	int block_cnt;
@@ -1247,6 +1309,8 @@ struct ubifs_info {
 	void *sbuf;
 	struct list_head idx_gc;
 	int idx_gc_cnt;
+	volatile int gc_seq;
+	volatile int gced_lnum;
 
 	struct list_head infos_list;
 	struct mutex umount_mutex;
@@ -1275,6 +1339,7 @@ struct ubifs_info {
 	int lpt_drty_flgs;
 	int dirty_nn_cnt;
 	int dirty_pn_cnt;
+	int check_lpt_free;
 	long long lpt_sz;
 	void *lpt_nod_buf;
 	void *lpt_buf;
@@ -1316,6 +1381,7 @@ struct ubifs_info {
 	struct ubifs_mst_node *rcvrd_mst_node;
 	struct rb_root size_tree;
 	int remounting_rw;
+	int always_chk_crc;
 	struct ubifs_mount_opts mount_opts;
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
@@ -1328,6 +1394,12 @@ struct ubifs_info {
 	unsigned long fail_timeout;
 	unsigned int fail_cnt;
 	unsigned int fail_cnt_max;
+	long long chk_lpt_sz;
+	long long chk_lpt_sz2;
+	long long chk_lpt_wastage;
+	int chk_lpt_lebs;
+	int new_nhead_lnum;
+	int new_nhead_offs;
 #endif
 };
 
@@ -1346,6 +1418,7 @@ extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
 /* io.c */
+void ubifs_ro_mode(struct ubifs_info *c, int err);
 int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
 int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 			   int dtype);
@@ -1357,7 +1430,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet);
+		     int offs, int quiet, int chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
@@ -1399,8 +1472,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 		     int deletion, int xent);
 int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 			 const union ubifs_key *key, const void *buf, int len);
-int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,
-			  int last_reference);
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
 int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		     const struct dentry *old_dentry,
 		     const struct inode *new_dir,
@@ -1423,9 +1496,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 				struct ubifs_budget_req *req);
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 			 struct ubifs_budget_req *req);
-long long ubifs_budg_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 
 /* find.c */
@@ -1440,8 +1514,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
 /* tnc.c */
 int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 			struct ubifs_znode **zn, int *n);
-int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key,
-		     void *node);
 int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
 			void *node, const struct qstr *nm);
 int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
@@ -1471,6 +1543,8 @@ void destroy_old_idx(struct ubifs_info *c);
 int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
 		       int lnum, int offs);
 int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu);
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu);
 
 /* tnc_misc.c */
 struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
@@ -1567,12 +1641,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c);
 void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
 
 /* lprops.c */
-void ubifs_get_lprops(struct ubifs_info *c);
 const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
 					   int free, int dirty, int flags,
 					   int idx_gc_cnt);
-void ubifs_release_lprops(struct ubifs_info *c);
 void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 		      int cat);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 1388a078e1a9..cfd31e229c89 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -61,7 +61,7 @@
 
 /*
  * Limit the number of extended attributes per inode so that the total size
- * (xattr_size) is guaranteeded to fit in an 'unsigned int'.
+ * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.
  */
 #define MAX_XATTRS_PER_INODE 65535
 
@@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 	struct inode *inode;
 	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
 	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
-					.new_ino_d = size, .dirtied_ino = 1,
-					.dirtied_ino_d = host_ui->data_len};
+				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
 
 	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
 		return -ENOSPC;
 	/*
 	 * Linux limits the maximum size of the extended attribute names list
-	 * to %XATTR_LIST_MAX. This means we should not allow creating more*
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
 	 * extended attributes if the name list becomes larger. This limitation
 	 * is artificial for UBIFS, though.
 	 */
@@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 		goto out_budg;
 	}
 
-	mutex_lock(&host_ui->ui_mutex);
 	/* Re-define all operations to be "nothing" */
 	inode->i_mapping->a_ops = &none_address_operations;
 	inode->i_op = &none_inode_operations;
@@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 	ui->data = kmalloc(size, GFP_NOFS);
 	if (!ui->data) {
 		err = -ENOMEM;
-		goto out_unlock;
+		goto out_free;
 	}
-
 	memcpy(ui->data, value, size);
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+
+	mutex_lock(&host_ui->ui_mutex);
 	host->i_ctime = ubifs_current_time(host);
 	host_ui->xattr_cnt += 1;
 	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
 	host_ui->xattr_size += CALC_XATTR_BYTES(size);
 	host_ui->xattr_names += nm->len;
 
-	/*
-	 * We do not use i_size_write() because nobody can race with us as we
-	 * are holding host @host->i_mutex - every xattr operation for this
-	 * inode is serialized by it.
-	 */
-	inode->i_size = ui->ui_size = size;
-	ui->data_len = size;
 	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
 	if (err)
 		goto out_cancel;
@@ -172,8 +167,8 @@ out_cancel:
 	host_ui->xattr_cnt -= 1;
 	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
 	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
-out_unlock:
 	mutex_unlock(&host_ui->ui_mutex);
+out_free:
 	make_bad_inode(inode);
 	iput(inode);
 out_budg:
@@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 	struct ubifs_inode *host_ui = ubifs_inode(host);
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	struct ubifs_budget_req req = { .dirtied_ino = 2,
-				.dirtied_ino_d = size + host_ui->data_len };
+		.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
 
 	ubifs_assert(ui->data_len == inode->i_size);
 	err = ubifs_budget_space(c, &req);
 	if (err)
 		return err;
 
-	mutex_lock(&host_ui->ui_mutex);
-	host->i_ctime = ubifs_current_time(host);
-	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
-	host_ui->xattr_size += CALC_XATTR_BYTES(size);
-
 	kfree(ui->data);
 	ui->data = kmalloc(size, GFP_NOFS);
 	if (!ui->data) {
 		err = -ENOMEM;
-		goto out_unlock;
+		goto out_free;
 	}
-
 	memcpy(ui->data, value, size);
 	inode->i_size = ui->ui_size = size;
 	ui->data_len = size;
 
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+
 	/*
 	 * It is important to write the host inode after the xattr inode
 	 * because if the host inode gets synchronized (via 'fsync()'), then
@@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
 out_cancel:
 	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
 	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
-	make_bad_inode(inode);
-out_unlock:
 	mutex_unlock(&host_ui->ui_mutex);
+	make_bad_inode(inode);
+out_free:
 	ubifs_release_budget(c, &req);
 	return err;
 }
@@ -312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,
 
 	dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
 		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
 
 	if (size > UBIFS_MAX_INO_DATA)
 		return -ERANGE;
@@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 	if (!xent)
 		return -ENOMEM;
 
-	mutex_lock(&host->i_mutex);
 	xent_key_init(c, &key, host->i_ino, &nm);
 	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
 	if (err) {
@@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
 out_iput:
 	iput(inode);
 out_unlock:
-	mutex_unlock(&host->i_mutex);
 	kfree(xent);
 	return err;
 }
@@ -449,13 +442,11 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		return -ERANGE;
 
 	lowest_xent_key(c, &key, host->i_ino);
-
-	mutex_lock(&host->i_mutex);
 	while (1) {
 		int type;
 
 		xent = ubifs_tnc_next_ent(c, &key, &nm);
-		if (unlikely(IS_ERR(xent))) {
+		if (IS_ERR(xent)) {
 			err = PTR_ERR(xent);
 			break;
 		}
@@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		pxent = xent;
 		key_read(c, &xent->key, &key);
 	}
-	mutex_unlock(&host->i_mutex);
 
 	kfree(pxent);
 	if (err != -ENOENT) {
@@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
 	int err;
 	struct ubifs_inode *host_ui = ubifs_inode(host);
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1,
-					.dirtied_ino_d = host_ui->data_len };
+	struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
 
 	ubifs_assert(ui->data_len == inode->i_size);
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d9..eb91f3b70320 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
 	.release		= udf_release_file,
 	.fsync			= udf_fsync_file,
 	.splice_read		= generic_file_splice_read,
+	.llseek			= generic_file_llseek,
 };
 
 const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3d..a4f2b3ce45b0 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	*err = -ENOSPC;
 
 	iinfo = UDF_I(inode);
-	iinfo->i_unique = 0;
-	iinfo->i_lenExtents = 0;
-	iinfo->i_next_alloc_block = 0;
-	iinfo->i_next_alloc_goal = 0;
-	iinfo->i_strat4096 = 0;
+	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
+		iinfo->i_efe = 1;
+		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
+			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct extendedFileEntry),
+					    GFP_KERNEL);
+	} else {
+		iinfo->i_efe = 0;
+		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+					    sizeof(struct fileEntry),
+					    GFP_KERNEL);
+	}
+	if (!iinfo->i_ext.i_data) {
+		iput(inode);
+		*err = -ENOMEM;
+		return NULL;
+	}
 
 	block = udf_new_block(dir->i_sb, NULL,
 			      dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		lvhd->uniqueID = cpu_to_le64(uniqueID);
 		mark_buffer_dirty(sbi->s_lvid_bh);
 	}
+	mutex_unlock(&sbi->s_alloc_mutex);
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenAlloc = 0;
 	iinfo->i_use = 0;
-	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
-		iinfo->i_efe = 1;
-		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
-			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
-		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-					    sizeof(struct extendedFileEntry),
-					    GFP_KERNEL);
-	} else {
-		iinfo->i_efe = 0;
-		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
-					    sizeof(struct fileEntry),
-					    GFP_KERNEL);
-	}
-	if (!iinfo->i_ext.i_data) {
-		iput(inode);
-		*err = -ENOMEM;
-		mutex_unlock(&sbi->s_alloc_mutex);
-		return NULL;
-	}
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
 	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 		iinfo->i_crtime = current_fs_time(inode->i_sb);
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
-	mutex_unlock(&sbi->s_alloc_mutex);
 
 	if (DQUOT_ALLOC_INODE(inode)) {
 		DQUOT_DROP(inode);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d3231947db19..082409cd4b8a 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -142,7 +142,7 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
 }
 
 static struct fileIdentDesc *udf_find_entry(struct inode *dir,
-					    struct dentry *dentry,
+					    struct qstr *child,
 					    struct udf_fileident_bh *fibh,
 					    struct fileIdentDesc *cfi)
 {
@@ -159,8 +159,8 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 	sector_t offset;
 	struct extent_position epos = {};
 	struct udf_inode_info *dinfo = UDF_I(dir);
-	int isdotdot = dentry->d_name.len == 2 &&
-		dentry->d_name.name[0] == '.' && dentry->d_name.name[1] == '.';
+	int isdotdot = child->len == 2 &&
+		child->name[0] == '.' && child->name[1] == '.';
 
 	size = udf_ext0_offset(dir) + dir->i_size;
 	f_pos = udf_ext0_offset(dir);
@@ -238,8 +238,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
 			continue;
 
 		flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
-		if (flen && udf_match(flen, fname, dentry->d_name.len,
-				      dentry->d_name.name))
+		if (flen && udf_match(flen, fname, child->len, child->name))
 			goto out_ok;
 	}
 
@@ -283,7 +282,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 	} else
 #endif /* UDF_RECOVERY */
 
-	if (udf_find_entry(dir, dentry, &fibh, &cfi)) {
+	if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) {
 		if (fibh.sbh != fibh.ebh)
 			brelse(fibh.ebh);
 		brelse(fibh.sbh);
@@ -783,7 +782,7 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	lock_kernel();
-	fi = udf_find_entry(dir, dentry, &fibh, &cfi);
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
 		goto out;
 
@@ -829,7 +828,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
 
 	retval = -ENOENT;
 	lock_kernel();
-	fi = udf_find_entry(dir, dentry, &fibh, &cfi);
+	fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
 	if (!fi)
 		goto out;
 
@@ -1113,7 +1112,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct udf_inode_info *old_iinfo = UDF_I(old_inode);
 
 	lock_kernel();
-	ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi);
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	if (ofi) {
 		if (ofibh.sbh != ofibh.ebh)
 			brelse(ofibh.ebh);
@@ -1124,7 +1123,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    != old_inode->i_ino)
 		goto end_rename;
 
-	nfi = udf_find_entry(new_dir, new_dentry, &nfibh, &ncfi);
+	nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
 	if (nfi) {
 		if (!new_inode) {
 			if (nfibh.sbh != nfibh.ebh)
@@ -1192,7 +1191,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 	udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL);
 
 	/* The old fid may have moved - find it again */
-	ofi = udf_find_entry(old_dir, old_dentry, &ofibh, &ocfi);
+	ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
 	udf_delete_entry(old_dir, ofi, &ofibh, &ocfi);
 
 	if (new_inode) {
@@ -1243,15 +1242,11 @@ end_rename:
 
 static struct dentry *udf_get_parent(struct dentry *child)
 {
-	struct dentry *parent;
 	struct inode *inode = NULL;
-	struct dentry dotdot;
+	struct qstr dotdot = {.name = "..", .len = 2};
 	struct fileIdentDesc cfi;
 	struct udf_fileident_bh fibh;
 
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-
 	lock_kernel();
 	if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
 		goto out_unlock;
@@ -1266,13 +1261,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 		goto out_unlock;
 	unlock_kernel();
 
-	parent = d_alloc_anon(inode);
-	if (!parent) {
-		iput(inode);
-		parent = ERR_PTR(-ENOMEM);
-	}
-
-	return parent;
+	return d_obtain_alias(inode);
 out_unlock:
 	unlock_kernel();
 	return ERR_PTR(-EACCES);
@@ -1283,7 +1272,6 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 					u16 partref, __u32 generation)
 {
 	struct inode *inode;
-	struct dentry *result;
 	kernel_lb_addr loc;
 
 	if (block == 0)
@@ -1300,12 +1288,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 		iput(inode);
 		return ERR_PTR(-ESTALE);
 	}
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 static struct dentry *udf_fh_to_dentry(struct super_block *sb,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 44cc702f96cc..e25e7010627b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode)
 	kmem_cache_free(udf_inode_cachep, UDF_I(inode));
 }
 
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct udf_inode_info *ei = (struct udf_inode_info *)foo;
 
@@ -369,7 +369,7 @@ enum {
 	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_novrs,	"novrs"},
 	{Opt_nostrict,	"nostrict"},
 	{Opt_bs,	"bs=%u"},
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index df0bef18742d..dbbbc4668769 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -667,4 +667,5 @@ const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
 	.fsync		= file_fsync,
+	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 85b22b5977fa..e65212dfb60e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -76,6 +76,7 @@
 
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/stat.h>
@@ -308,7 +309,7 @@ enum {
        Opt_err
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_type_old, "ufstype=old"},
 	{Opt_type_sunx86, "ufstype=sunx86"},
 	{Opt_type_sun, "ufstype=sun"},
@@ -1232,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
 	unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
-	struct match_token *tp = tokens;
+	const struct match_token *tp = tokens;
 
 	while (tp->token != Opt_onerror_panic && tp->token != mval)
 		++tp;
@@ -1301,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode)
 	kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
 }
 
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
 {
 	struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
 
diff --git a/fs/utimes.c b/fs/utimes.c
index b6b664e7145e..6929e3e91d05 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -48,66 +48,22 @@ static bool nsec_valid(long nsec)
 	return nsec >= 0 && nsec <= 999999999;
 }
 
-/* If times==NULL, set access and modification to current time,
- * must be owner or have write permission.
- * Else, update from *times, must be owner or super user.
- */
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+static int utimes_common(struct path *path, struct timespec *times)
 {
 	int error;
-	struct nameidata nd;
-	struct dentry *dentry;
-	struct inode *inode;
 	struct iattr newattrs;
-	struct file *f = NULL;
-	struct vfsmount *mnt;
-
-	error = -EINVAL;
-	if (times && (!nsec_valid(times[0].tv_nsec) ||
-		      !nsec_valid(times[1].tv_nsec))) {
-		goto out;
-	}
-
-	if (flags & ~AT_SYMLINK_NOFOLLOW)
-		goto out;
-
-	if (filename == NULL && dfd != AT_FDCWD) {
-		error = -EINVAL;
-		if (flags & AT_SYMLINK_NOFOLLOW)
-			goto out;
+	struct inode *inode = path->dentry->d_inode;
 
-		error = -EBADF;
-		f = fget(dfd);
-		if (!f)
-			goto out;
-		dentry = f->f_path.dentry;
-		mnt = f->f_path.mnt;
-	} else {
-		error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
-		if (error)
-			goto out;
-
-		dentry = nd.path.dentry;
-		mnt = nd.path.mnt;
-	}
-
-	inode = dentry->d_inode;
-
-	error = mnt_want_write(mnt);
+	error = mnt_want_write(path->mnt);
 	if (error)
-		goto dput_and_out;
+		goto out;
 
 	if (times && times[0].tv_nsec == UTIME_NOW &&
 		     times[1].tv_nsec == UTIME_NOW)
 		times = NULL;
 
-	/* In most cases, the checks are done in inode_change_ok() */
 	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
 	if (times) {
-		error = -EPERM;
-                if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-			goto mnt_drop_write_and_out;
-
 		if (times[0].tv_nsec == UTIME_OMIT)
 			newattrs.ia_valid &= ~ATTR_ATIME;
 		else if (times[0].tv_nsec != UTIME_NOW) {
@@ -123,21 +79,13 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 			newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
 			newattrs.ia_valid |= ATTR_MTIME_SET;
 		}
-
 		/*
-		 * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT
-		 * cases, we need to make an extra check that is not done by
-		 * inode_change_ok().
+		 * Tell inode_change_ok(), that this is an explicit time
+		 * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET
+		 * were used.
 		 */
-		if (((times[0].tv_nsec == UTIME_NOW &&
-			    times[1].tv_nsec == UTIME_OMIT)
-		     ||
-		     (times[0].tv_nsec == UTIME_OMIT &&
-			    times[1].tv_nsec == UTIME_NOW))
-		    && !is_owner_or_cap(inode))
-			goto mnt_drop_write_and_out;
+		newattrs.ia_valid |= ATTR_TIMES_SET;
 	} else {
-
 		/*
 		 * If times is NULL (or both times are UTIME_NOW),
 		 * then we need to check permissions, because
@@ -148,21 +96,76 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
 			goto mnt_drop_write_and_out;
 
 		if (!is_owner_or_cap(inode)) {
-			error = permission(inode, MAY_WRITE, NULL);
+			error = inode_permission(inode, MAY_WRITE);
 			if (error)
 				goto mnt_drop_write_and_out;
 		}
 	}
 	mutex_lock(&inode->i_mutex);
-	error = notify_change(dentry, &newattrs);
+	error = notify_change(path->dentry, &newattrs);
 	mutex_unlock(&inode->i_mutex);
+
 mnt_drop_write_and_out:
-	mnt_drop_write(mnt);
-dput_and_out:
-	if (f)
-		fput(f);
-	else
-		path_put(&nd.path);
+	mnt_drop_write(path->mnt);
+out:
+	return error;
+}
+
+/*
+ * do_utimes - change times on filename or file descriptor
+ * @dfd: open file descriptor, -1 or AT_FDCWD
+ * @filename: path name or NULL
+ * @times: new times or NULL
+ * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment)
+ *
+ * If filename is NULL and dfd refers to an open file, then operate on
+ * the file.  Otherwise look up filename, possibly using dfd as a
+ * starting point.
+ *
+ * If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+{
+	int error = -EINVAL;
+
+	if (times && (!nsec_valid(times[0].tv_nsec) ||
+		      !nsec_valid(times[1].tv_nsec))) {
+		goto out;
+	}
+
+	if (flags & ~AT_SYMLINK_NOFOLLOW)
+		goto out;
+
+	if (filename == NULL && dfd != AT_FDCWD) {
+		struct file *file;
+
+		if (flags & AT_SYMLINK_NOFOLLOW)
+			goto out;
+
+		file = fget(dfd);
+		error = -EBADF;
+		if (!file)
+			goto out;
+
+		error = utimes_common(&file->f_path, times);
+		fput(file);
+	} else {
+		struct path path;
+		int lookup_flags = 0;
+
+		if (!(flags & AT_SYMLINK_NOFOLLOW))
+			lookup_flags |= LOOKUP_FOLLOW;
+
+		error = user_path_at(dfd, filename, lookup_flags, &path);
+		if (error)
+			goto out;
+
+		error = utimes_common(&path, times);
+		path_put(&path);
+	}
+
 out:
 	return error;
 }
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index b546ba69be82..155c10b4adbd 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -621,7 +621,7 @@ shortname:
 	memcpy(de->name, msdos_name, MSDOS_NAME);
 	de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
 	de->lcase = lcase;
-	fat_date_unix2dos(ts->tv_sec, &time, &date);
+	fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
 	de->time = de->ctime = time;
 	de->date = de->cdate = de->adate = date;
 	de->ctime_cs = 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4706a8b1f495..468377e66531 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 			return -EPERM;
 	}
 
-	return permission(inode, mask, NULL);
+	return inode_permission(inode, mask);
 }
 
 int
@@ -252,40 +252,40 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 }
 
 asmlinkage long
-sys_setxattr(const char __user *path, const char __user *name,
+sys_setxattr(const char __user *pathname, const char __user *name,
 	     const void __user *value, size_t size, int flags)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (error)
 		return error;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = setxattr(nd.path.dentry, name, value, size, flags);
-		mnt_drop_write(nd.path.mnt);
+		error = setxattr(path.dentry, name, value, size, flags);
+		mnt_drop_write(path.mnt);
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
 asmlinkage long
-sys_lsetxattr(const char __user *path, const char __user *name,
+sys_lsetxattr(const char __user *pathname, const char __user *name,
 	      const void __user *value, size_t size, int flags)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk_link(path, &nd);
+	error = user_lpath(pathname, &path);
 	if (error)
 		return error;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = setxattr(nd.path.dentry, name, value, size, flags);
-		mnt_drop_write(nd.path.mnt);
+		error = setxattr(path.dentry, name, value, size, flags);
+		mnt_drop_write(path.mnt);
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
@@ -350,32 +350,32 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 }
 
 asmlinkage ssize_t
-sys_getxattr(const char __user *path, const char __user *name,
+sys_getxattr(const char __user *pathname, const char __user *name,
 	     void __user *value, size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	ssize_t error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (error)
 		return error;
-	error = getxattr(nd.path.dentry, name, value, size);
-	path_put(&nd.path);
+	error = getxattr(path.dentry, name, value, size);
+	path_put(&path);
 	return error;
 }
 
 asmlinkage ssize_t
-sys_lgetxattr(const char __user *path, const char __user *name, void __user *value,
+sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
 	      size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	ssize_t error;
 
-	error = user_path_walk_link(path, &nd);
+	error = user_lpath(pathname, &path);
 	if (error)
 		return error;
-	error = getxattr(nd.path.dentry, name, value, size);
-	path_put(&nd.path);
+	error = getxattr(path.dentry, name, value, size);
+	path_put(&path);
 	return error;
 }
 
@@ -425,30 +425,30 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 }
 
 asmlinkage ssize_t
-sys_listxattr(const char __user *path, char __user *list, size_t size)
+sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	ssize_t error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (error)
 		return error;
-	error = listxattr(nd.path.dentry, list, size);
-	path_put(&nd.path);
+	error = listxattr(path.dentry, list, size);
+	path_put(&path);
 	return error;
 }
 
 asmlinkage ssize_t
-sys_llistxattr(const char __user *path, char __user *list, size_t size)
+sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 {
-	struct nameidata nd;
+	struct path path;
 	ssize_t error;
 
-	error = user_path_walk_link(path, &nd);
+	error = user_lpath(pathname, &path);
 	if (error)
 		return error;
-	error = listxattr(nd.path.dentry, list, size);
-	path_put(&nd.path);
+	error = listxattr(path.dentry, list, size);
+	path_put(&path);
 	return error;
 }
 
@@ -486,38 +486,38 @@ removexattr(struct dentry *d, const char __user *name)
 }
 
 asmlinkage long
-sys_removexattr(const char __user *path, const char __user *name)
+sys_removexattr(const char __user *pathname, const char __user *name)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk(path, &nd);
+	error = user_path(pathname, &path);
 	if (error)
 		return error;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = removexattr(nd.path.dentry, name);
-		mnt_drop_write(nd.path.mnt);
+		error = removexattr(path.dentry, name);
+		mnt_drop_write(path.mnt);
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
 asmlinkage long
-sys_lremovexattr(const char __user *path, const char __user *name)
+sys_lremovexattr(const char __user *pathname, const char __user *name)
 {
-	struct nameidata nd;
+	struct path path;
 	int error;
 
-	error = user_path_walk_link(path, &nd);
+	error = user_lpath(pathname, &path);
 	if (error)
 		return error;
-	error = mnt_want_write(nd.path.mnt);
+	error = mnt_want_write(path.mnt);
 	if (!error) {
-		error = removexattr(nd.path.dentry, name);
-		mnt_drop_write(nd.path.mnt);
+		error = removexattr(path.dentry, name);
+		mnt_drop_write(path.mnt);
 	}
-	path_put(&nd.path);
+	path_put(&path);
 	return error;
 }
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 36ec614e699a..737c9a425361 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,7 +106,8 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   xfs_iops.o \
 				   xfs_lrw.o \
 				   xfs_super.o \
-				   xfs_vnode.o)
+				   xfs_vnode.o \
+				   xfs_xattr.o)
 
 # Objects in support/
 xfs-y				+= $(addprefix support/, \
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 9b1bb17a0501..1cd3b55ee3d2 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -90,7 +90,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
 }
 
 void
-kmem_free(void *ptr, size_t size)
+kmem_free(const void *ptr)
 {
 	if (!is_vmalloc_addr(ptr)) {
 		kfree(ptr);
@@ -100,7 +100,7 @@ kmem_free(void *ptr, size_t size)
 }
 
 void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
 	     unsigned int __nocast flags)
 {
 	void	*new;
@@ -110,7 +110,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
 		if (new)
 			memcpy(new, ptr,
 				((oldsize < newsize) ? oldsize : newsize));
-		kmem_free(ptr, oldsize);
+		kmem_free(ptr);
 	}
 	return new;
 }
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 5e9564902976..af6843c7ee4b 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -57,8 +57,8 @@ kmem_flags_convert(unsigned int __nocast flags)
 extern void *kmem_alloc(size_t, unsigned int __nocast);
 extern void *kmem_zalloc(size_t, unsigned int __nocast);
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
-extern void  kmem_free(void *, size_t);
+extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void  kmem_free(const void *);
 
 /*
  * Zone interfaces
@@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name)
 
 static inline kmem_zone_t *
 kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
-		     void (*construct)(kmem_zone_t *, void *))
+		     void (*construct)(void *))
 {
 	return kmem_cache_create(zone_name, size, 0, flags, construct);
 }
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 3abe7e9ceb33..000000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <linux/semaphore.h>
-#include <asm/atomic.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define initnsema(sp, val, name)	sema_init(sp, val)
-#define psema(sp, b)			down(sp)
-#define vsema(sp)			up(sp)
-#define freesema(sema)			do { } while (0)
-
-static inline int issemalocked(sema_t *sp)
-{
-	return down_trylock(sp) || (up(sp), 0);
-}
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-static inline int cpsema(sema_t *sp)
-{
-	return down_trylock(sp) ? 0 : 1;
-}
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a55c3b26d840..a44d68eb50b5 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -73,7 +73,6 @@ xfs_page_trace(
 	unsigned long	pgoff)
 {
 	xfs_inode_t	*ip;
-	bhv_vnode_t	*vp = vn_from_inode(inode);
 	loff_t		isize = i_size_read(inode);
 	loff_t		offset = page_offset(page);
 	int		delalloc = -1, unmapped = -1, unwritten = -1;
@@ -81,7 +80,7 @@ xfs_page_trace(
 	if (page_has_buffers(page))
 		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
 
-	ip = xfs_vtoi(vp);
+	ip = XFS_I(inode);
 	if (!ip->i_rwtrace)
 		return;
 
@@ -409,7 +408,6 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	struct writeback_control *wbc,
 	int			clear_dirty,
 	int			buffers)
 {
@@ -676,7 +674,7 @@ xfs_probe_cluster(
 			} else
 				pg_offset = PAGE_CACHE_SIZE;
 
-			if (page->index == tindex && !TestSetPageLocked(page)) {
+			if (page->index == tindex && trylock_page(page)) {
 				pg_len = xfs_probe_page(page, pg_offset, mapped);
 				unlock_page(page);
 			}
@@ -760,7 +758,7 @@ xfs_convert_page(
 
 	if (page->index != tindex)
 		goto fail;
-	if (TestSetPageLocked(page))
+	if (!trylock_page(page))
 		goto fail;
 	if (PageWriteback(page))
 		goto fail_unlock_page;
@@ -858,7 +856,7 @@ xfs_convert_page(
 				done = 1;
 			}
 		}
-		xfs_start_page_writeback(page, wbc, !page_dirty, count);
+		xfs_start_page_writeback(page, !page_dirty, count);
 	}
 
 	return done;
@@ -1105,7 +1103,7 @@ xfs_page_state_convert(
 			 * that we are writing into for the first time.
 			 */
 			type = IOMAP_NEW;
-			if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
+			if (trylock_buffer(bh)) {
 				ASSERT(buffer_mapped(bh));
 				if (iomap_valid)
 					all_bh = 1;
@@ -1130,7 +1128,7 @@ xfs_page_state_convert(
 		SetPageUptodate(page);
 
 	if (startio)
-		xfs_start_page_writeback(page, wbc, 1, count);
+		xfs_start_page_writeback(page, 1, count);
 
 	if (ioend && iomap_valid) {
 		offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
@@ -1340,6 +1338,10 @@ __xfs_get_blocks(
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
+
+	if (!create && direct && offset >= i_size_read(inode))
+		return 0;
+
 	error = xfs_iomap(XFS_I(inode), offset, size,
 			     create ? flags : BMAPI_READ, &iomap, &niomap);
 	if (error)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 98e0e86093b4..36d5fcd3f593 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -58,7 +58,7 @@ xfs_buf_trace(
 		bp, id,
 		(void *)(unsigned long)bp->b_flags,
 		(void *)(unsigned long)bp->b_hold.counter,
-		(void *)(unsigned long)bp->b_sema.count.counter,
+		(void *)(unsigned long)bp->b_sema.count,
 		(void *)current,
 		data, ra,
 		(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
@@ -253,7 +253,7 @@ _xfs_buf_initialize(
 
 	memset(bp, 0, sizeof(xfs_buf_t));
 	atomic_set(&bp->b_hold, 1);
-	init_MUTEX_LOCKED(&bp->b_iodonesema);
+	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_hash_list);
 	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
@@ -310,8 +310,7 @@ _xfs_buf_free_pages(
 	xfs_buf_t	*bp)
 {
 	if (bp->b_pages != bp->b_page_array) {
-		kmem_free(bp->b_pages,
-			  bp->b_page_count * sizeof(struct page *));
+		kmem_free(bp->b_pages);
 	}
 }
 
@@ -839,6 +838,7 @@ xfs_buf_rele(
 		return;
 	}
 
+	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
@@ -852,11 +852,6 @@ xfs_buf_rele(
 			spin_unlock(&hash->bh_lock);
 			xfs_buf_free(bp);
 		}
-	} else {
-		/*
-		 * Catch reference count leaks
-		 */
-		ASSERT(atomic_read(&bp->b_hold) >= 0);
 	}
 }
 
@@ -1006,12 +1001,13 @@ xfs_buf_iodone_work(
 	 * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
 	 * ordered flag and reissue them.  Because we can't tell the higher
 	 * layers directly that they should not issue ordered I/O anymore, they
-	 * need to check if the ordered flag was cleared during I/O completion.
+	 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
 	 */
 	if ((bp->b_error == EOPNOTSUPP) &&
 	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
 		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
 		bp->b_flags &= ~XBF_ORDERED;
+		bp->b_flags |= _XFS_BARRIER_FAILED;
 		xfs_buf_iorequest(bp);
 	} else if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
@@ -1038,7 +1034,7 @@ xfs_buf_ioend(
 			xfs_buf_iodone_work(&bp->b_iodone_work);
 		}
 	} else {
-		up(&bp->b_iodonesema);
+		complete(&bp->b_iowait);
 	}
 }
 
@@ -1276,7 +1272,7 @@ xfs_buf_iowait(
 	XB_TRACE(bp, "iowait", 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
-	down(&bp->b_iodonesema);
+	wait_for_completion(&bp->b_iowait);
 	XB_TRACE(bp, "iowaited", (long)bp->b_error);
 	return bp->b_error;
 }
@@ -1398,7 +1394,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
+	kmem_free(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
@@ -1428,13 +1424,10 @@ xfs_unregister_buftarg(
 
 void
 xfs_free_buftarg(
-	xfs_buftarg_t		*btp,
-	int			external)
+	xfs_buftarg_t		*btp)
 {
 	xfs_flush_buftarg(btp, 1);
 	xfs_blkdev_issue_flush(btp);
-	if (external)
-		xfs_blkdev_put(btp->bt_bdev);
 	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
@@ -1444,7 +1437,7 @@ xfs_free_buftarg(
 	xfs_unregister_buftarg(btp);
 	kthread_stop(btp->bt_task);
 
-	kmem_free(btp, sizeof(*btp));
+	kmem_free(btp);
 }
 
 STATIC int
@@ -1575,7 +1568,7 @@ xfs_alloc_buftarg(
 	return btp;
 
 error:
-	kmem_free(btp, sizeof(*btp));
+	kmem_free(btp);
 	return NULL;
 }
 
@@ -1803,7 +1796,7 @@ int __init
 xfs_buf_init(void)
 {
 #ifdef XFS_BUF_TRACE
-	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
+	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
 #endif
 
 	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index f948ec7ba9a4..456519a088c7 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
 	 * modifications being lost.
 	 */
 	_XBF_PAGE_LOCKED = (1 << 22),
+
+	/*
+	 * If we try a barrier write, but it fails we have to communicate
+	 * this to the upper layers.  Unfortunately b_error gets overwritten
+	 * when the buffer is re-issued so we have to add another flag to
+	 * keep this information.
+	 */
+	_XFS_BARRIER_FAILED = (1 << 23),
 } xfs_buf_flags_t;
 
 typedef enum {
@@ -157,7 +165,7 @@ typedef struct xfs_buf {
 	xfs_buf_iodone_t	b_iodone;	/* I/O completion function */
 	xfs_buf_relse_t		b_relse;	/* releasing function */
 	xfs_buf_bdstrat_t	b_strat;	/* pre-write function */
-	struct semaphore	b_iodonesema;	/* Semaphore for I/O waiters */
+	struct completion	b_iowait;	/* queue for I/O waiters */
 	void			*b_fspriv;
 	void			*b_fspriv2;
 	void			*b_fspriv3;
@@ -352,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_CPSEMA(bp)	(xfs_buf_cond_lock(bp) == 0)
 #define XFS_BUF_VSEMA(bp)	xfs_buf_unlock(bp)
 #define XFS_BUF_PSEMA(bp,x)	xfs_buf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
+#define XFS_BUF_FINISH_IOWAIT(bp)	complete(&bp->b_iowait);
 
 #define XFS_BUF_SET_TARGET(bp, target)	((bp)->b_target = (target))
 #define XFS_BUF_TARGET(bp)		((bp)->b_target)
@@ -429,7 +437,7 @@ static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
  *	Handling of buftargs.
  */
 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *, int);
+extern void xfs_free_buftarg(xfs_buftarg_t *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index c672b3238b14..7f7abec25e14 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -139,7 +139,7 @@ xfs_nfs_get_inode(
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	return ip->i_vnode;
+	return VFS_I(ip);
 }
 
 STATIC struct dentry *
@@ -148,7 +148,6 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 {
 	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
 	struct inode		*inode = NULL;
-	struct dentry		*result;
 
 	if (fh_len < xfs_fileid_length(fileid_type))
 		return NULL;
@@ -164,16 +163,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_PTR(PTR_ERR(inode));
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 STATIC struct dentry *
@@ -182,7 +172,6 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
 {
 	struct xfs_fid64	*fid64 = (struct xfs_fid64 *)fid;
 	struct inode		*inode = NULL;
-	struct dentry		*result;
 
 	switch (fileid_type) {
 	case FILEID_INO32_GEN_PARENT:
@@ -195,16 +184,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
 		break;
 	}
 
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_PTR(PTR_ERR(inode));
-	result = d_alloc_anon(inode);
-	if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
+	return d_obtain_alias(inode);
 }
 
 STATIC struct dentry *
@@ -213,18 +193,12 @@ xfs_fs_get_parent(
 {
 	int			error;
 	struct xfs_inode	*cip;
-	struct dentry		*parent;
 
-	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
+	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
-	parent = d_alloc_anon(cip->i_vnode);
-	if (unlikely(!parent)) {
-		iput(cip->i_vnode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return parent;
+	return d_obtain_alias(VFS_I(cip));
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5f60363b9343..3fee790f138b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -204,15 +204,6 @@ xfs_file_fsync(
 	return -xfs_fsync(XFS_I(dentry->d_inode));
 }
 
-/*
- * Unfortunately we can't just use the clean and simple readdir implementation
- * below, because nfs might call back into ->lookup from the filldir callback
- * and that will deadlock the low-level btree code.
- *
- * Hopefully we'll find a better workaround that allows to use the optimal
- * version at least for local readdirs for 2.6.25.
- */
-#if 0
 STATIC int
 xfs_file_readdir(
 	struct file	*filp,
@@ -244,125 +235,6 @@ xfs_file_readdir(
 		return -error;
 	return 0;
 }
-#else
-
-struct hack_dirent {
-	u64		ino;
-	loff_t		offset;
-	int		namlen;
-	unsigned int	d_type;
-	char		name[];
-};
-
-struct hack_callback {
-	char		*dirent;
-	size_t		len;
-	size_t		used;
-};
-
-STATIC int
-xfs_hack_filldir(
-	void		*__buf,
-	const char	*name,
-	int		namlen,
-	loff_t		offset,
-	u64		ino,
-	unsigned int	d_type)
-{
-	struct hack_callback *buf = __buf;
-	struct hack_dirent *de = (struct hack_dirent *)(buf->dirent + buf->used);
-	unsigned int reclen;
-
-	reclen = ALIGN(sizeof(struct hack_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > buf->len)
-		return -EINVAL;
-
-	de->namlen = namlen;
-	de->offset = offset;
-	de->ino = ino;
-	de->d_type = d_type;
-	memcpy(de->name, name, namlen);
-	buf->used += reclen;
-	return 0;
-}
-
-STATIC int
-xfs_file_readdir(
-	struct file	*filp,
-	void		*dirent,
-	filldir_t	filldir)
-{
-	struct inode	*inode = filp->f_path.dentry->d_inode;
-	xfs_inode_t	*ip = XFS_I(inode);
-	struct hack_callback buf;
-	struct hack_dirent *de;
-	int		error;
-	loff_t		size;
-	int		eof = 0;
-	xfs_off_t       start_offset, curr_offset, offset;
-
-	/*
-	 * Try fairly hard to get memory
-	 */
-	buf.len = PAGE_CACHE_SIZE;
-	do {
-		buf.dirent = kmalloc(buf.len, GFP_KERNEL);
-		if (buf.dirent)
-			break;
-		buf.len >>= 1;
-	} while (buf.len >= 1024);
-
-	if (!buf.dirent)
-		return -ENOMEM;
-
-	curr_offset = filp->f_pos;
-	if (curr_offset == 0x7fffffff)
-		offset = 0xffffffff;
-	else
-		offset = filp->f_pos;
-
-	while (!eof) {
-		unsigned int reclen;
-
-		start_offset = offset;
-
-		buf.used = 0;
-		error = -xfs_readdir(ip, &buf, buf.len, &offset,
-				     xfs_hack_filldir);
-		if (error || offset == start_offset) {
-			size = 0;
-			break;
-		}
-
-		size = buf.used;
-		de = (struct hack_dirent *)buf.dirent;
-		while (size > 0) {
-			curr_offset = de->offset /* & 0x7fffffff */;
-			if (filldir(dirent, de->name, de->namlen,
-					curr_offset & 0x7fffffff,
-					de->ino, de->d_type)) {
-				goto done;
-			}
-
-			reclen = ALIGN(sizeof(struct hack_dirent) + de->namlen,
-				       sizeof(u64));
-			size -= reclen;
-			de = (struct hack_dirent *)((char *)de + reclen);
-		}
-	}
-
- done:
-	if (!error) {
-		if (size == 0)
-			filp->f_pos = offset & 0x7fffffff;
-		else if (de)
-			filp->f_pos = curr_offset;
-	}
-
-	kfree(buf.dirent);
-	return error;
-}
-#endif
 
 STATIC int
 xfs_file_mmap(
@@ -475,6 +347,7 @@ const struct file_operations xfs_invis_file_operations = {
 const struct file_operations xfs_dir_file_operations = {
 	.read		= generic_read_dir,
 	.readdir	= xfs_file_readdir,
+	.llseek		= generic_file_llseek,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1eefe61f0e10..36caa6d957df 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -31,7 +31,7 @@ xfs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	struct address_space *mapping = ip->i_vnode->i_mapping;
+	struct address_space *mapping = VFS_I(ip)->i_mapping;
 
 	if (mapping->nrpages)
 		truncate_inode_pages(mapping, first);
@@ -44,7 +44,7 @@ xfs_flushinval_pages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	struct address_space *mapping = ip->i_vnode->i_mapping;
+	struct address_space *mapping = VFS_I(ip)->i_mapping;
 	int		ret = 0;
 
 	if (mapping->nrpages) {
@@ -64,7 +64,7 @@ xfs_flush_pages(
 	uint64_t	flags,
 	int		fiopt)
 {
-	struct address_space *mapping = ip->i_vnode->i_mapping;
+	struct address_space *mapping = VFS_I(ip)->i_mapping;
 	int		ret = 0;
 	int		ret2;
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a42ba9d71156..d3438c72dcaf 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -48,6 +48,8 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
+#include "xfs_quota.h"
+#include "xfs_inode_item.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -84,17 +86,15 @@ xfs_find_handle(
 	switch (cmd) {
 	case XFS_IOC_PATH_TO_FSHANDLE:
 	case XFS_IOC_PATH_TO_HANDLE: {
-		struct nameidata	nd;
-		int			error;
-
-		error = user_path_walk_link((const char __user *)hreq.path, &nd);
+		struct path path;
+		int error = user_lpath((const char __user *)hreq.path, &path);
 		if (error)
 			return error;
 
-		ASSERT(nd.path.dentry);
-		ASSERT(nd.path.dentry->d_inode);
-		inode = igrab(nd.path.dentry->d_inode);
-		path_put(&nd.path);
+		ASSERT(path.dentry);
+		ASSERT(path.dentry->d_inode);
+		inode = igrab(path.dentry->d_inode);
+		path_put(&path);
 		break;
 	}
 
@@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq(
 
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-	*inode = XFS_ITOV(ip);
+	*inode = VFS_I(ip);
 	return 0;
 }
 
@@ -311,11 +311,10 @@ xfs_open_by_handle(
 		return new_fd;
 	}
 
-	dentry = d_alloc_anon(inode);
-	if (dentry == NULL) {
-		iput(inode);
+	dentry = d_obtain_alias(inode);
+	if (IS_ERR(dentry)) {
 		put_unused_fd(new_fd);
-		return -XFS_ERROR(ENOMEM);
+		return PTR_ERR(dentry);
 	}
 
 	/* Ensure umount returns EBUSY on umounts while this file is open. */
@@ -470,6 +469,12 @@ xfs_attrlist_by_handle(
 	if (al_hreq.buflen > XATTR_LIST_MAX)
 		return -XFS_ERROR(EINVAL);
 
+	/*
+	 * Reject flags, only allow namespaces.
+	 */
+	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+		return -XFS_ERROR(EINVAL);
+
 	error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode);
 	if (error)
 		goto out;
@@ -589,7 +594,7 @@ xfs_attrmulti_by_handle(
 		goto out;
 
 	error = E2BIG;
-	size = am_hreq.opcount * sizeof(attr_multiop_t);
+	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_vn_rele;
 
@@ -682,9 +687,9 @@ xfs_ioc_space(
 		return -XFS_ERROR(EFAULT);
 
 	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		attr_flags |= ATTR_NONBLOCK;
+		attr_flags |= XFS_ATTR_NONBLOCK;
 	if (ioflags & IO_INVIS)
-		attr_flags |= ATTR_DMI;
+		attr_flags |= XFS_ATTR_DMI;
 
 	error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos,
 					      NULL, attr_flags);
@@ -875,6 +880,322 @@ xfs_ioc_fsgetxattr(
 	return 0;
 }
 
+STATIC void
+xfs_set_diflags(
+	struct xfs_inode	*ip,
+	unsigned int		xflags)
+{
+	unsigned int		di_flags;
+
+	/* can't set PREALLOC this way, just preserve it */
+	di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+	if (xflags & XFS_XFLAG_IMMUTABLE)
+		di_flags |= XFS_DIFLAG_IMMUTABLE;
+	if (xflags & XFS_XFLAG_APPEND)
+		di_flags |= XFS_DIFLAG_APPEND;
+	if (xflags & XFS_XFLAG_SYNC)
+		di_flags |= XFS_DIFLAG_SYNC;
+	if (xflags & XFS_XFLAG_NOATIME)
+		di_flags |= XFS_DIFLAG_NOATIME;
+	if (xflags & XFS_XFLAG_NODUMP)
+		di_flags |= XFS_DIFLAG_NODUMP;
+	if (xflags & XFS_XFLAG_PROJINHERIT)
+		di_flags |= XFS_DIFLAG_PROJINHERIT;
+	if (xflags & XFS_XFLAG_NODEFRAG)
+		di_flags |= XFS_DIFLAG_NODEFRAG;
+	if (xflags & XFS_XFLAG_FILESTREAM)
+		di_flags |= XFS_DIFLAG_FILESTREAM;
+	if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+		if (xflags & XFS_XFLAG_RTINHERIT)
+			di_flags |= XFS_DIFLAG_RTINHERIT;
+		if (xflags & XFS_XFLAG_NOSYMLINKS)
+			di_flags |= XFS_DIFLAG_NOSYMLINKS;
+		if (xflags & XFS_XFLAG_EXTSZINHERIT)
+			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+		if (xflags & XFS_XFLAG_REALTIME)
+			di_flags |= XFS_DIFLAG_REALTIME;
+		if (xflags & XFS_XFLAG_EXTSIZE)
+			di_flags |= XFS_DIFLAG_EXTSIZE;
+	}
+
+	ip->i_d.di_flags = di_flags;
+}
+
+STATIC void
+xfs_diflags_to_linux(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+	unsigned int		xflags = xfs_ip2xflags(ip);
+
+	if (xflags & XFS_XFLAG_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (xflags & XFS_XFLAG_APPEND)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+	if (xflags & XFS_XFLAG_SYNC)
+		inode->i_flags |= S_SYNC;
+	else
+		inode->i_flags &= ~S_SYNC;
+	if (xflags & XFS_XFLAG_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	else
+		inode->i_flags &= ~S_NOATIME;
+}
+
+#define FSX_PROJID	1
+#define FSX_EXTSIZE	2
+#define FSX_XFLAGS	4
+#define FSX_NONBLOCK	8
+
+STATIC int
+xfs_ioctl_setattr(
+	xfs_inode_t		*ip,
+	struct fsxattr		*fa,
+	int			mask)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	unsigned int		lock_flags = 0;
+	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
+	struct xfs_dquot	*olddquot = NULL;
+	int			code;
+
+	xfs_itrace_entry(ip);
+
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return XFS_ERROR(EROFS);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * If disk quotas is on, we make sure that the dquots do exist on disk,
+	 * before we start any other transactions. Trying to do this later
+	 * is messy. We don't care to take a readlock to look at the ids
+	 * in inode here, because we can't hold it across the trans_reserve.
+	 * If the IDs do change before we take the ilock, we're covered
+	 * because the i_*dquot fields will get updated anyway.
+	 */
+	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+		code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid,
+					 ip->i_d.di_gid, fa->fsx_projid,
+					 XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+		if (code)
+			return code;
+	}
+
+	/*
+	 * For the other attributes, we acquire the inode lock and
+	 * first do an error checking pass.
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	if (code)
+		goto error_return;
+
+	lock_flags = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lock_flags);
+
+	/*
+	 * CAP_FOWNER overrides the following restrictions:
+	 *
+	 * The user ID of the calling process must be equal
+	 * to the file owner ID, except in cases where the
+	 * CAP_FSETID capability is applicable.
+	 */
+	if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+		code = XFS_ERROR(EPERM);
+		goto error_return;
+	}
+
+	/*
+	 * Do a quota reservation only if projid is actually going to change.
+	 */
+	if (mask & FSX_PROJID) {
+		if (XFS_IS_PQUOTA_ON(mp) &&
+		    ip->i_d.di_projid != fa->fsx_projid) {
+			ASSERT(tp);
+			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+						capable(CAP_FOWNER) ?
+						XFS_QMOPT_FORCE_RES : 0);
+			if (code)	/* out of quota */
+				goto error_return;
+		}
+	}
+
+	if (mask & FSX_EXTSIZE) {
+		/*
+		 * Can't change extent size if any extents are allocated.
+		 */
+		if (ip->i_d.di_nextents &&
+		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+		     fa->fsx_extsize)) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+
+		/*
+		 * Extent size must be a multiple of the appropriate block
+		 * size, if set at all.
+		 */
+		if (fa->fsx_extsize != 0) {
+			xfs_extlen_t	size;
+
+			if (XFS_IS_REALTIME_INODE(ip) ||
+			    ((mask & FSX_XFLAGS) &&
+			    (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
+				size = mp->m_sb.sb_rextsize <<
+				       mp->m_sb.sb_blocklog;
+			} else {
+				size = mp->m_sb.sb_blocksize;
+			}
+
+			if (fa->fsx_extsize % size) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
+	}
+
+
+	if (mask & FSX_XFLAGS) {
+		/*
+		 * Can't change realtime flag if any extents are allocated.
+		 */
+		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+		    (XFS_IS_REALTIME_INODE(ip)) !=
+		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+
+		/*
+		 * If realtime flag is set then must have realtime data.
+		 */
+		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+			if ((mp->m_sb.sb_rblocks == 0) ||
+			    (mp->m_sb.sb_rextsize == 0) ||
+			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
+
+		/*
+		 * Can't modify an immutable/append-only file unless
+		 * we have appropriate permission.
+		 */
+		if ((ip->i_d.di_flags &
+				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
+		     (fa->fsx_xflags &
+				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+		    !capable(CAP_LINUX_IMMUTABLE)) {
+			code = XFS_ERROR(EPERM);
+			goto error_return;
+		}
+	}
+
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ihold(tp, ip);
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 * If the system was configured with the "restricted_chown"
+	 * option, the owner is not permitted to give away the file,
+	 * and can change the group id only to a group of which he
+	 * or she is a member.
+	 */
+	if (mask & FSX_PROJID) {
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The set-user-ID and set-group-ID bits of a file will be
+		 * cleared upon successful return from chown()
+		 */
+		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+		    !capable(CAP_FSETID))
+			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+		/*
+		 * Change the ownerships and register quota modifications
+		 * in the transaction.
+		 */
+		if (ip->i_d.di_projid != fa->fsx_projid) {
+			if (XFS_IS_PQUOTA_ON(mp)) {
+				olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+							&ip->i_gdquot, gdqp);
+			}
+			ip->i_d.di_projid = fa->fsx_projid;
+
+			/*
+			 * We may have to rev the inode as well as
+			 * the superblock version number since projids didn't
+			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+			 */
+			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+				xfs_bump_ino_vers2(tp, ip);
+		}
+
+	}
+
+	if (mask & FSX_EXTSIZE)
+		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+	if (mask & FSX_XFLAGS) {
+		xfs_set_diflags(ip, fa->fsx_xflags);
+		xfs_diflags_to_linux(ip);
+	}
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+	XFS_STATS_INC(xs_ig_attrchg);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 * This is slightly sub-optimal in that truncates require
+	 * two sync transactions instead of one for wsync filesystems.
+	 * One for the truncate and one for the timestamps since we
+	 * don't want to change the timestamps unless we're sure the
+	 * truncate worked.  Truncates are less than 1% of the laddis
+	 * mix so this probably isn't worth the trouble to optimize.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+	code = xfs_trans_commit(tp, 0);
+	xfs_iunlock(ip, lock_flags);
+
+	/*
+	 * Release any dquot(s) the inode had kept before chown.
+	 */
+	XFS_QM_DQRELE(mp, olddquot);
+	XFS_QM_DQRELE(mp, udqp);
+	XFS_QM_DQRELE(mp, gdqp);
+
+	if (code)
+		return code;
+
+	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
+		XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
+				(mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
+	}
+
+	return 0;
+
+ error_return:
+	XFS_QM_DQRELE(mp, udqp);
+	XFS_QM_DQRELE(mp, gdqp);
+	xfs_trans_cancel(tp, 0);
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+	return code;
+}
+
 STATIC int
 xfs_ioc_fssetxattr(
 	xfs_inode_t		*ip,
@@ -882,31 +1203,16 @@ xfs_ioc_fssetxattr(
 	void			__user *arg)
 {
 	struct fsxattr		fa;
-	struct bhv_vattr	*vattr;
-	int			error;
-	int			attr_flags;
+	unsigned int		mask;
 
 	if (copy_from_user(&fa, arg, sizeof(fa)))
 		return -EFAULT;
 
-	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
-	if (unlikely(!vattr))
-		return -ENOMEM;
-
-	attr_flags = 0;
+	mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
 	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		attr_flags |= ATTR_NONBLOCK;
-
-	vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
-	vattr->va_xflags  = fa.fsx_xflags;
-	vattr->va_extsize = fa.fsx_extsize;
-	vattr->va_projid  = fa.fsx_projid;
+		mask |= FSX_NONBLOCK;
 
-	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
-	if (!error)
-		vn_revalidate(XFS_ITOV(ip));	/* update flags */
-	kfree(vattr);
-	return 0;
+	return -xfs_ioctl_setattr(ip, &fa, mask);
 }
 
 STATIC int
@@ -928,10 +1234,9 @@ xfs_ioc_setxflags(
 	struct file		*filp,
 	void			__user *arg)
 {
-	struct bhv_vattr	*vattr;
+	struct fsxattr		fa;
 	unsigned int		flags;
-	int			attr_flags;
-	int			error;
+	unsigned int		mask;
 
 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
@@ -941,22 +1246,12 @@ xfs_ioc_setxflags(
 		      FS_SYNC_FL))
 		return -EOPNOTSUPP;
 
-	vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
-	if (unlikely(!vattr))
-		return -ENOMEM;
-
-	attr_flags = 0;
+	mask = FSX_XFLAGS;
 	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		attr_flags |= ATTR_NONBLOCK;
-
-	vattr->va_mask = XFS_AT_XFLAGS;
-	vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+		mask |= FSX_NONBLOCK;
+	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
 
-	error = -xfs_setattr(ip, vattr, attr_flags, NULL);
-	if (likely(!error))
-		vn_revalidate(XFS_ITOV(ip));	/* update flags */
-	kfree(vattr);
-	return error;
+	return -xfs_ioctl_setattr(ip, &fa, mask);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 2bf287ef5489..095d271f3434 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,7 +62,7 @@ void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
-	struct inode	*inode = ip->i_vnode;
+	struct inode	*inode = VFS_I(ip);
 
 	if (inode) {
 		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
@@ -79,7 +79,7 @@ void
 xfs_mark_inode_dirty_sync(
 	xfs_inode_t	*ip)
 {
-	struct inode	*inode = ip->i_vnode;
+	struct inode	*inode = VFS_I(ip);
 
 	if (inode)
 		mark_inode_dirty_sync(inode);
@@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync(
  * Change the requested timestamp in the given inode.
  * We don't lock across timestamp updates, and we don't log them but
  * we do record the fact that there is dirty information in core.
- *
- * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
- *		with XFS_ICHGTIME_ACC to be sure that access time
- *		update will take.  Calling first with XFS_ICHGTIME_ACC
- *		and then XFS_ICHGTIME_MOD may fail to modify the access
- *		timestamp if the filesystem is mounted noacctm.
  */
 void
 xfs_ichgtime(
 	xfs_inode_t	*ip,
 	int		flags)
 {
-	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
+	struct inode	*inode = VFS_I(ip);
 	timespec_t	tv;
+	int		sync_it = 0;
+
+	tv = current_fs_time(inode->i_sb);
 
-	nanotime(&tv);
-	if (flags & XFS_ICHGTIME_MOD) {
+	if ((flags & XFS_ICHGTIME_MOD) &&
+	    !timespec_equal(&inode->i_mtime, &tv)) {
 		inode->i_mtime = tv;
 		ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+		sync_it = 1;
 	}
-	if (flags & XFS_ICHGTIME_ACC) {
-		inode->i_atime = tv;
-		ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
-	}
-	if (flags & XFS_ICHGTIME_CHG) {
+	if ((flags & XFS_ICHGTIME_CHG) &&
+	    !timespec_equal(&inode->i_ctime, &tv)) {
 		inode->i_ctime = tv;
 		ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
+		sync_it = 1;
 	}
 
 	/*
@@ -130,72 +125,11 @@ xfs_ichgtime(
 	 * ensure that the compiler does not reorder the update
 	 * of i_update_core above the timestamp updates above.
 	 */
-	SYNCHRONIZE();
-	ip->i_update_core = 1;
-	if (!(inode->i_state & I_NEW))
+	if (sync_it) {
+		SYNCHRONIZE();
+		ip->i_update_core = 1;
 		mark_inode_dirty_sync(inode);
-}
-
-/*
- * Variant on the above which avoids querying the system clock
- * in situations where we know the Linux inode timestamps have
- * just been updated (and so we can update our inode cheaply).
- */
-void
-xfs_ichgtime_fast(
-	xfs_inode_t	*ip,
-	struct inode	*inode,
-	int		flags)
-{
-	timespec_t	*tvp;
-
-	/*
-	 * Atime updates for read() & friends are handled lazily now, and
-	 * explicit updates must go through xfs_ichgtime()
-	 */
-	ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-
-	if (flags & XFS_ICHGTIME_MOD) {
-		tvp = &inode->i_mtime;
-		ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
-	}
-	if (flags & XFS_ICHGTIME_CHG) {
-		tvp = &inode->i_ctime;
-		ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
 	}
-
-	/*
-	 * We update the i_update_core field _after_ changing
-	 * the timestamps in order to coordinate properly with
-	 * xfs_iflush() so that we don't lose timestamp updates.
-	 * This keeps us from having to hold the inode lock
-	 * while doing this.  We use the SYNCHRONIZE macro to
-	 * ensure that the compiler does not reorder the update
-	 * of i_update_core above the timestamp updates above.
-	 */
-	SYNCHRONIZE();
-	ip->i_update_core = 1;
-	if (!(inode->i_state & I_NEW))
-		mark_inode_dirty_sync(inode);
-}
-
-
-/*
- * Pull the link count and size up from the xfs inode to the linux inode
- */
-STATIC void
-xfs_validate_fields(
-	struct inode		*inode)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	loff_t size;
-
-	/* we're under i_sem so i_size can't change under us */
-	size = XFS_ISIZE(ip);
-	if (i_size_read(inode) != size)
-		i_size_write(inode, size);
 }
 
 /*
@@ -245,8 +179,7 @@ STATIC void
 xfs_cleanup_inode(
 	struct inode	*dir,
 	struct inode	*inode,
-	struct dentry	*dentry,
-	int		mode)
+	struct dentry	*dentry)
 {
 	struct xfs_name	teardown;
 
@@ -257,10 +190,7 @@ xfs_cleanup_inode(
 	 */
 	xfs_dentry_to_name(&teardown, dentry);
 
-	if (S_ISDIR(mode))
-		xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
-	else
-		xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
 	iput(inode);
 }
 
@@ -275,7 +205,7 @@ xfs_vn_mknod(
 	struct xfs_inode *ip = NULL;
 	xfs_acl_t	*default_acl = NULL;
 	struct xfs_name	name;
-	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
+	int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
 	int		error;
 
 	/*
@@ -320,7 +250,7 @@ xfs_vn_mknod(
 	if (unlikely(error))
 		goto out_free_acl;
 
-	inode = ip->i_vnode;
+	inode = VFS_I(ip);
 
 	error = xfs_init_security(inode, dir);
 	if (unlikely(error))
@@ -335,14 +265,11 @@ xfs_vn_mknod(
 	}
 
 
-	if (S_ISDIR(mode))
-		xfs_validate_fields(inode);
 	d_instantiate(dentry, inode);
-	xfs_validate_fields(dir);
 	return -error;
 
  out_cleanup_inode:
-	xfs_cleanup_inode(dir, inode, dentry, mode);
+	xfs_cleanup_inode(dir, inode, dentry);
  out_free_acl:
 	if (default_acl)
 		_ACL_FREE(default_acl);
@@ -382,7 +309,7 @@ xfs_vn_lookup(
 		return ERR_PTR(-ENAMETOOLONG);
 
 	xfs_dentry_to_name(&name, dentry);
-	error = xfs_lookup(XFS_I(dir), &name, &cip);
+	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
 	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
@@ -390,7 +317,47 @@ xfs_vn_lookup(
 		return NULL;
 	}
 
-	return d_splice_alias(cip->i_vnode, dentry);
+	return d_splice_alias(VFS_I(cip), dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	struct nameidata *nd)
+{
+	struct xfs_inode *ip;
+	struct xfs_name	xname;
+	struct xfs_name ci_name;
+	struct qstr	dname;
+	int		error;
+
+	if (dentry->d_name.len >= MAXNAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	xfs_dentry_to_name(&xname, dentry);
+	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+	if (unlikely(error)) {
+		if (unlikely(error != ENOENT))
+			return ERR_PTR(-error);
+		/*
+		 * call d_add(dentry, NULL) here when d_drop_negative_children
+		 * is called in xfs_vn_mknod (ie. allow negative dentries
+		 * with CI filesystems).
+		 */
+		return NULL;
+	}
+
+	/* if exact match, just splice and exit */
+	if (!ci_name.name)
+		return d_splice_alias(VFS_I(ip), dentry);
+
+	/* else case-insensitive match... */
+	dname.name = ci_name.name;
+	dname.len = ci_name.len;
+	dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+	kmem_free(ci_name.name);
+	return dentry;
 }
 
 STATIC int
@@ -414,7 +381,6 @@ xfs_vn_link(
 	}
 
 	xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
-	xfs_validate_fields(inode);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -424,19 +390,23 @@ xfs_vn_unlink(
 	struct inode	*dir,
 	struct dentry	*dentry)
 {
-	struct inode	*inode;
 	struct xfs_name	name;
 	int		error;
 
-	inode = dentry->d_inode;
 	xfs_dentry_to_name(&name, dentry);
 
-	error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
-	if (likely(!error)) {
-		xfs_validate_fields(dir);	/* size needs update */
-		xfs_validate_fields(inode);
-	}
-	return -error;
+	error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+	if (error)
+		return error;
+
+	/*
+	 * With unlink, the VFS makes the dentry "negative": no inode,
+	 * but still hashed. This is incompatible with case-insensitive
+	 * mode, so invalidate (unhash) the dentry in CI-mode.
+	 */
+	if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+		d_invalidate(dentry);
+	return 0;
 }
 
 STATIC int
@@ -459,43 +429,22 @@ xfs_vn_symlink(
 	if (unlikely(error))
 		goto out;
 
-	inode = cip->i_vnode;
+	inode = VFS_I(cip);
 
 	error = xfs_init_security(inode, dir);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
 	d_instantiate(dentry, inode);
-	xfs_validate_fields(dir);
-	xfs_validate_fields(inode);
 	return 0;
 
  out_cleanup_inode:
-	xfs_cleanup_inode(dir, inode, dentry, 0);
+	xfs_cleanup_inode(dir, inode, dentry);
  out:
 	return -error;
 }
 
 STATIC int
-xfs_vn_rmdir(
-	struct inode	*dir,
-	struct dentry	*dentry)
-{
-	struct inode	*inode = dentry->d_inode;
-	struct xfs_name	name;
-	int		error;
-
-	xfs_dentry_to_name(&name, dentry);
-
-	error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
-	if (likely(!error)) {
-		xfs_validate_fields(inode);
-		xfs_validate_fields(dir);
-	}
-	return -error;
-}
-
-STATIC int
 xfs_vn_rename(
 	struct inode	*odir,
 	struct dentry	*odentry,
@@ -505,22 +454,13 @@ xfs_vn_rename(
 	struct inode	*new_inode = ndentry->d_inode;
 	struct xfs_name	oname;
 	struct xfs_name	nname;
-	int		error;
 
 	xfs_dentry_to_name(&oname, odentry);
 	xfs_dentry_to_name(&nname, ndentry);
 
-	error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+	return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
 			   XFS_I(ndir), &nname, new_inode ?
 			   			XFS_I(new_inode) : NULL);
-	if (likely(!error)) {
-		if (new_inode)
-			xfs_validate_fields(new_inode);
-		xfs_validate_fields(odir);
-		if (ndir != odir)
-			xfs_validate_fields(ndir);
-	}
-	return -error;
 }
 
 /*
@@ -589,8 +529,7 @@ xfs_check_acl(
 STATIC int
 xfs_vn_permission(
 	struct inode		*inode,
-	int			mask,
-	struct nameidata	*nd)
+	int			mask)
 {
 	return generic_permission(inode, mask, xfs_check_acl);
 }
@@ -660,57 +599,9 @@ xfs_vn_getattr(
 STATIC int
 xfs_vn_setattr(
 	struct dentry	*dentry,
-	struct iattr	*attr)
+	struct iattr	*iattr)
 {
-	struct inode	*inode = dentry->d_inode;
-	unsigned int	ia_valid = attr->ia_valid;
-	bhv_vattr_t	vattr = { 0 };
-	int		flags = 0;
-	int		error;
-
-	if (ia_valid & ATTR_UID) {
-		vattr.va_mask |= XFS_AT_UID;
-		vattr.va_uid = attr->ia_uid;
-	}
-	if (ia_valid & ATTR_GID) {
-		vattr.va_mask |= XFS_AT_GID;
-		vattr.va_gid = attr->ia_gid;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		vattr.va_mask |= XFS_AT_SIZE;
-		vattr.va_size = attr->ia_size;
-	}
-	if (ia_valid & ATTR_ATIME) {
-		vattr.va_mask |= XFS_AT_ATIME;
-		vattr.va_atime = attr->ia_atime;
-		inode->i_atime = attr->ia_atime;
-	}
-	if (ia_valid & ATTR_MTIME) {
-		vattr.va_mask |= XFS_AT_MTIME;
-		vattr.va_mtime = attr->ia_mtime;
-	}
-	if (ia_valid & ATTR_CTIME) {
-		vattr.va_mask |= XFS_AT_CTIME;
-		vattr.va_ctime = attr->ia_ctime;
-	}
-	if (ia_valid & ATTR_MODE) {
-		vattr.va_mask |= XFS_AT_MODE;
-		vattr.va_mode = attr->ia_mode;
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-			inode->i_mode &= ~S_ISGID;
-	}
-
-	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
-		flags |= ATTR_UTIME;
-#ifdef ATTR_NO_BLOCK
-	if ((ia_valid & ATTR_NO_BLOCK))
-		flags |= ATTR_NONBLOCK;
-#endif
-
-	error = xfs_setattr(XFS_I(inode), &vattr, flags, NULL);
-	if (likely(!error))
-		vn_revalidate(vn_from_inode(inode));
-	return -error;
+	return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
 }
 
 /*
@@ -728,109 +619,6 @@ xfs_vn_truncate(
 	WARN_ON(error);
 }
 
-STATIC int
-xfs_vn_setxattr(
-	struct dentry	*dentry,
-	const char	*name,
-	const void	*data,
-	size_t		size,
-	int		flags)
-{
-	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	int		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-
-	/* Convert Linux syscall to XFS internal ATTR flags */
-	if (flags & XATTR_CREATE)
-		xflags |= ATTR_CREATE;
-	if (flags & XATTR_REPLACE)
-		xflags |= ATTR_REPLACE;
-	xflags |= namesp->attr_flag;
-	return namesp->attr_set(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-xfs_vn_getxattr(
-	struct dentry	*dentry,
-	const char	*name,
-	void		*data,
-	size_t		size)
-{
-	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	ssize_t		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-
-	/* Convert Linux syscall to XFS internal ATTR flags */
-	if (!size) {
-		xflags |= ATTR_KERNOVAL;
-		data = NULL;
-	}
-	xflags |= namesp->attr_flag;
-	return namesp->attr_get(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-xfs_vn_listxattr(
-	struct dentry		*dentry,
-	char			*data,
-	size_t			size)
-{
-	bhv_vnode_t		*vp = vn_from_inode(dentry->d_inode);
-	int			error, xflags = ATTR_KERNAMELS;
-	ssize_t			result;
-
-	if (!size)
-		xflags |= ATTR_KERNOVAL;
-	xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
-
-	error = attr_generic_list(vp, data, size, xflags, &result);
-	if (error < 0)
-		return error;
-	return result;
-}
-
-STATIC int
-xfs_vn_removexattr(
-	struct dentry	*dentry,
-	const char	*name)
-{
-	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
-	char		*attr = (char *)name;
-	attrnames_t	*namesp;
-	int		xflags = 0;
-	int		error;
-
-	namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	attr += namesp->attr_namelen;
-	error = namesp->attr_capable(vp, NULL);
-	if (error)
-		return error;
-	xflags |= namesp->attr_flag;
-	return namesp->attr_remove(vp, attr, xflags);
-}
-
 STATIC long
 xfs_vn_fallocate(
 	struct inode	*inode,
@@ -854,18 +642,18 @@ xfs_vn_fallocate(
 
 	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 	error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
-						0, NULL, ATTR_NOLOCK);
+				      0, NULL, XFS_ATTR_NOLOCK);
 	if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
 	    offset + len > i_size_read(inode))
 		new_size = offset + len;
 
 	/* Change file size if needed */
 	if (new_size) {
-		bhv_vattr_t	va;
+		struct iattr iattr;
 
-		va.va_mask = XFS_AT_SIZE;
-		va.va_size = new_size;
-		error = xfs_setattr(ip, &va, ATTR_NOLOCK, NULL);
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = new_size;
+		error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL);
 	}
 
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -873,46 +661,172 @@ out_error:
 	return error;
 }
 
-const struct inode_operations xfs_inode_operations = {
+static const struct inode_operations xfs_inode_operations = {
 	.permission		= xfs_vn_permission,
 	.truncate		= xfs_vn_truncate,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
-	.setxattr		= xfs_vn_setxattr,
-	.getxattr		= xfs_vn_getxattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
-	.removexattr		= xfs_vn_removexattr,
 	.fallocate		= xfs_vn_fallocate,
 };
 
-const struct inode_operations xfs_dir_inode_operations = {
+static const struct inode_operations xfs_dir_inode_operations = {
 	.create			= xfs_vn_create,
 	.lookup			= xfs_vn_lookup,
 	.link			= xfs_vn_link,
 	.unlink			= xfs_vn_unlink,
 	.symlink		= xfs_vn_symlink,
 	.mkdir			= xfs_vn_mkdir,
-	.rmdir			= xfs_vn_rmdir,
+	/*
+	 * Yes, XFS uses the same method for rmdir and unlink.
+	 *
+	 * There are some subtile differences deeper in the code,
+	 * but we use S_ISDIR to check for those.
+	 */
+	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
 	.rename			= xfs_vn_rename,
 	.permission		= xfs_vn_permission,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
-	.setxattr		= xfs_vn_setxattr,
-	.getxattr		= xfs_vn_getxattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
-	.removexattr		= xfs_vn_removexattr,
 };
 
-const struct inode_operations xfs_symlink_inode_operations = {
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+	.create			= xfs_vn_create,
+	.lookup			= xfs_vn_ci_lookup,
+	.link			= xfs_vn_link,
+	.unlink			= xfs_vn_unlink,
+	.symlink		= xfs_vn_symlink,
+	.mkdir			= xfs_vn_mkdir,
+	/*
+	 * Yes, XFS uses the same method for rmdir and unlink.
+	 *
+	 * There are some subtile differences deeper in the code,
+	 * but we use S_ISDIR to check for those.
+	 */
+	.rmdir			= xfs_vn_unlink,
+	.mknod			= xfs_vn_mknod,
+	.rename			= xfs_vn_rename,
+	.permission		= xfs_vn_permission,
+	.getattr		= xfs_vn_getattr,
+	.setattr		= xfs_vn_setattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
+	.listxattr		= xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
 	.readlink		= generic_readlink,
 	.follow_link		= xfs_vn_follow_link,
 	.put_link		= xfs_vn_put_link,
 	.permission		= xfs_vn_permission,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
-	.setxattr		= xfs_vn_setxattr,
-	.getxattr		= xfs_vn_getxattr,
+	.setxattr		= generic_setxattr,
+	.getxattr		= generic_getxattr,
+	.removexattr		= generic_removexattr,
 	.listxattr		= xfs_vn_listxattr,
-	.removexattr		= xfs_vn_removexattr,
 };
+
+STATIC void
+xfs_diflags_to_iflags(
+	struct inode		*inode,
+	struct xfs_inode	*ip)
+{
+	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+		inode->i_flags |= S_SYNC;
+	else
+		inode->i_flags &= ~S_SYNC;
+	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	else
+		inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Initialize the Linux inode, set up the operation vectors and
+ * unlock the inode.
+ *
+ * When reading existing inodes from disk this is called directly
+ * from xfs_iget, when creating a new inode it is called from
+ * xfs_ialloc after setting up the inode.
+ */
+void
+xfs_setup_inode(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = ip->i_vnode;
+
+	inode->i_mode	= ip->i_d.di_mode;
+	inode->i_nlink	= ip->i_d.di_nlink;
+	inode->i_uid	= ip->i_d.di_uid;
+	inode->i_gid	= ip->i_d.di_gid;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_rdev =
+			MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+			      sysv_minor(ip->i_df.if_u2.if_rdev));
+		break;
+	default:
+		inode->i_rdev = 0;
+		break;
+	}
+
+	inode->i_generation = ip->i_d.di_gen;
+	i_size_write(inode, ip->i_d.di_size);
+	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
+	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
+	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
+	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
+	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
+	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
+	xfs_diflags_to_iflags(inode, ip);
+	xfs_iflags_clear(ip, XFS_IMODIFIED);
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &xfs_inode_operations;
+		inode->i_fop = &xfs_file_operations;
+		inode->i_mapping->a_ops = &xfs_address_space_operations;
+		break;
+	case S_IFDIR:
+		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+			inode->i_op = &xfs_dir_ci_inode_operations;
+		else
+			inode->i_op = &xfs_dir_inode_operations;
+		inode->i_fop = &xfs_dir_file_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &xfs_symlink_inode_operations;
+		if (!(ip->i_df.if_flags & XFS_IFINLINE))
+			inode->i_mapping->a_ops = &xfs_address_space_operations;
+		break;
+	default:
+		inode->i_op = &xfs_inode_operations;
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	}
+
+	xfs_iflags_clear(ip, XFS_INEW);
+	barrier();
+
+	unlock_new_inode(inode);
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 14d0deb7afff..8b1a1e31dc21 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -18,23 +18,14 @@
 #ifndef __XFS_IOPS_H__
 #define __XFS_IOPS_H__
 
-extern const struct inode_operations xfs_inode_operations;
-extern const struct inode_operations xfs_dir_inode_operations;
-extern const struct inode_operations xfs_symlink_inode_operations;
+struct xfs_inode;
 
 extern const struct file_operations xfs_file_operations;
 extern const struct file_operations xfs_dir_file_operations;
 extern const struct file_operations xfs_invis_file_operations;
 
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
 
-struct xfs_inode;
-extern void xfs_ichgtime(struct xfs_inode *, int);
-extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int);
-
-#define xfs_vtoi(vp) \
-	((struct xfs_inode *)vn_to_inode(vp)->i_private)
-
-#define XFS_I(inode) \
-	((struct xfs_inode *)(inode)->i_private)
+extern void xfs_setup_inode(struct xfs_inode *);
 
 #endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 4edc46915b57..cc0f7b3a9795 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -45,13 +45,13 @@
 #include <mrlock.h>
 #include <sv.h>
 #include <mutex.h>
-#include <sema.h>
 #include <time.h>
 
 #include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/uuid.h>
 
+#include <linux/semaphore.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
@@ -76,6 +76,7 @@
 #include <linux/log2.h>
 #include <linux/spinlock.h>
 #include <linux/random.h>
+#include <linux/ctype.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
@@ -125,8 +126,6 @@
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
-#define current_fsuid(cred)	(current->fsuid)
-#define current_fsgid(cred)	(current->fsgid)
 #define current_test_flags(f)	(current->flags & (f))
 #define current_set_flags_nested(sp, f)		\
 		(*(sp) = current->flags, current->flags |= (f))
@@ -179,7 +178,7 @@
 #define xfs_sort(a,n,s,fn)	sort(a,n,s,fn,NULL)
 #define xfs_stack_trace()	dump_stack()
 #define xfs_itruncate_data(ip, off)	\
-	(-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off)))
+	(-vmtruncate(VFS_I(ip), (off)))
 
 
 /* Move the kernel do_div definition off to one side */
@@ -299,4 +298,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 	return x;
 }
 
+/* ARM old ABI has some weird alignment/padding */
+#if defined(__arm__) && !defined(__ARM_EABI__)
+#define __arch_pack __attribute__((packed))
+#else
+#define __arch_pack
+#endif
+
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5e3b57516ec7..1957e5357d04 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -137,7 +137,7 @@ xfs_iozero(
 	struct address_space	*mapping;
 	int			status;
 
-	mapping = ip->i_vnode->i_mapping;
+	mapping = VFS_I(ip)->i_mapping;
 	do {
 		unsigned offset, bytes;
 		void *fsdata;
@@ -674,9 +674,7 @@ start:
 	 */
 	if (likely(!(ioflags & IO_INVIS) &&
 		   !mnt_want_write(file->f_path.mnt))) {
-		file_update_time(file);
-		xfs_ichgtime_fast(xip, inode,
-				  XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 		mnt_drop_write(file->f_path.mnt);
 	}
 
@@ -711,7 +709,7 @@ start:
 	     !capable(CAP_FSETID)) {
 		error = xfs_write_clear_setuid(xip);
 		if (likely(!error))
-			error = -remove_suid(file->f_path.dentry);
+			error = -file_remove_suid(file);
 		if (unlikely(error)) {
 			goto out_unlock_internal;
 		}
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index e480b6102051..3d5b67c075c7 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -98,12 +98,21 @@ xfs_read_xfsstats(
 	return len;
 }
 
-void
+int
 xfs_init_procfs(void)
 {
 	if (!proc_mkdir("fs/xfs", NULL))
-		return;
-	create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
+		goto out;
+
+	if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
+			xfs_read_xfsstats, NULL))
+		goto out_remove_entry;
+	return 0;
+
+ out_remove_entry:
+	remove_proc_entry("fs/xfs", NULL);
+ out:
+	return -ENOMEM;
 }
 
 void
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index afd0b0d5fdb2..e83820febc9f 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -134,7 +134,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats);
 #define XFS_STATS_DEC(v)	(per_cpu(xfsstats, current_cpu()).v--)
 #define XFS_STATS_ADD(v, inc)	(per_cpu(xfsstats, current_cpu()).v += (inc))
 
-extern void xfs_init_procfs(void);
+extern int xfs_init_procfs(void);
 extern void xfs_cleanup_procfs(void);
 
 
@@ -144,8 +144,14 @@ extern void xfs_cleanup_procfs(void);
 # define XFS_STATS_DEC(count)
 # define XFS_STATS_ADD(count, inc)
 
-static inline void xfs_init_procfs(void) { };
-static inline void xfs_cleanup_procfs(void) { };
+static inline int xfs_init_procfs(void)
+{
+	return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
 
 #endif	/* !CONFIG_PROC_FS */
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 742b2c7852c1..37ebe36056eb 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -52,6 +52,12 @@
 #include "xfs_version.h"
 #include "xfs_log_priv.h"
 #include "xfs_trans_priv.h"
+#include "xfs_filestream.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_trace.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -60,6 +66,7 @@
 #include <linux/writeback.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/parser.h>
 
 static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
@@ -74,7 +81,10 @@ xfs_args_allocate(
 {
 	struct xfs_mount_args	*args;
 
-	args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
+	args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return NULL;
+
 	args->logbufs = args->logbufsize = -1;
 	strncpy(args->fsname, sb->s_id, MAXNAMELEN);
 
@@ -138,6 +148,23 @@ xfs_args_allocate(
 #define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DMI	"dmi"		/* DMI enabled (DMAPI / XDSM) */
 
+/*
+ * Table driven mount option parser.
+ *
+ * Currently only used for remount, but it will be used for mount
+ * in the future, too.
+ */
+enum {
+	Opt_barrier, Opt_nobarrier, Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_barrier, "barrier"},
+	{Opt_nobarrier, "nobarrier"},
+	{Opt_err, NULL}
+};
+
+
 STATIC unsigned long
 suffix_strtoul(char *s, char **endp, unsigned int base)
 {
@@ -314,6 +341,7 @@ xfs_parseargs(
 			args->flags |= XFSMNT_ATTR2;
 		} else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
 			args->flags &= ~XFSMNT_ATTR2;
+			args->flags |= XFSMNT_NOATTR2;
 		} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
 			args->flags2 |= XFSMNT2_FILESTREAMS;
 		} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
@@ -553,115 +581,6 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
-STATIC_INLINE void
-xfs_set_inodeops(
-	struct inode		*inode)
-{
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-		inode->i_op = &xfs_inode_operations;
-		inode->i_fop = &xfs_file_operations;
-		inode->i_mapping->a_ops = &xfs_address_space_operations;
-		break;
-	case S_IFDIR:
-		inode->i_op = &xfs_dir_inode_operations;
-		inode->i_fop = &xfs_dir_file_operations;
-		break;
-	case S_IFLNK:
-		inode->i_op = &xfs_symlink_inode_operations;
-		if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE))
-			inode->i_mapping->a_ops = &xfs_address_space_operations;
-		break;
-	default:
-		inode->i_op = &xfs_inode_operations;
-		init_special_inode(inode, inode->i_mode, inode->i_rdev);
-		break;
-	}
-}
-
-STATIC_INLINE void
-xfs_revalidate_inode(
-	xfs_mount_t		*mp,
-	bhv_vnode_t		*vp,
-	xfs_inode_t		*ip)
-{
-	struct inode		*inode = vn_to_inode(vp);
-
-	inode->i_mode	= ip->i_d.di_mode;
-	inode->i_nlink	= ip->i_d.di_nlink;
-	inode->i_uid	= ip->i_d.di_uid;
-	inode->i_gid	= ip->i_d.di_gid;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFBLK:
-	case S_IFCHR:
-		inode->i_rdev =
-			MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
-			      sysv_minor(ip->i_df.if_u2.if_rdev));
-		break;
-	default:
-		inode->i_rdev = 0;
-		break;
-	}
-
-	inode->i_generation = ip->i_d.di_gen;
-	i_size_write(inode, ip->i_d.di_size);
-	inode->i_atime.tv_sec	= ip->i_d.di_atime.t_sec;
-	inode->i_atime.tv_nsec	= ip->i_d.di_atime.t_nsec;
-	inode->i_mtime.tv_sec	= ip->i_d.di_mtime.t_sec;
-	inode->i_mtime.tv_nsec	= ip->i_d.di_mtime.t_nsec;
-	inode->i_ctime.tv_sec	= ip->i_d.di_ctime.t_sec;
-	inode->i_ctime.tv_nsec	= ip->i_d.di_ctime.t_nsec;
-	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
-		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
-		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
-	xfs_iflags_clear(ip, XFS_IMODIFIED);
-}
-
-void
-xfs_initialize_vnode(
-	struct xfs_mount	*mp,
-	bhv_vnode_t		*vp,
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = vn_to_inode(vp);
-
-	if (!ip->i_vnode) {
-		ip->i_vnode = vp;
-		inode->i_private = ip;
-	}
-
-	/*
-	 * We need to set the ops vectors, and unlock the inode, but if
-	 * we have been called during the new inode create process, it is
-	 * too early to fill in the Linux inode.  We will get called a
-	 * second time once the inode is properly set up, and then we can
-	 * finish our work.
-	 */
-	if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) {
-		xfs_revalidate_inode(mp, vp, ip);
-		xfs_set_inodeops(inode);
-
-		xfs_iflags_clear(ip, XFS_INEW);
-		barrier();
-
-		unlock_new_inode(inode);
-	}
-}
-
 int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
@@ -670,7 +589,7 @@ xfs_blkdev_get(
 {
 	int			error = 0;
 
-	*bdevp = open_bdev_excl(name, 0, mp);
+	*bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
 		printk("XFS: Invalid device [%s], error=%d\n", name, error);
@@ -684,7 +603,7 @@ xfs_blkdev_put(
 	struct block_device	*bdev)
 {
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
 }
 
 /*
@@ -733,14 +652,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
 		return;
 	}
 
-	if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
-					QUEUE_ORDERED_NONE) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, not supported by the underlying device");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
 	if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
 		xfs_fs_cmn_err(CE_NOTE, mp,
 		  "Disabling barriers, underlying device is readonly");
@@ -764,6 +675,139 @@ xfs_blkdev_issue_flush(
 	blkdev_issue_flush(buftarg->bt_bdev, NULL);
 }
 
+STATIC void
+xfs_close_devices(
+	struct xfs_mount	*mp)
+{
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+		xfs_free_buftarg(mp->m_logdev_targp);
+		xfs_blkdev_put(logdev);
+	}
+	if (mp->m_rtdev_targp) {
+		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+		xfs_free_buftarg(mp->m_rtdev_targp);
+		xfs_blkdev_put(rtdev);
+	}
+	xfs_free_buftarg(mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ *	(1) device (partition) with data and internal log
+ *	(2) logical volume with data and log subvolumes.
+ *	(3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present.  The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+	struct xfs_mount	*mp,
+	struct xfs_mount_args	*args)
+{
+	struct block_device	*ddev = mp->m_super->s_bdev;
+	struct block_device	*logdev = NULL, *rtdev = NULL;
+	int			error;
+
+	/*
+	 * Open real time and log devices - order is important.
+	 */
+	if (args->logname[0]) {
+		error = xfs_blkdev_get(mp, args->logname, &logdev);
+		if (error)
+			goto out;
+	}
+
+	if (args->rtname[0]) {
+		error = xfs_blkdev_get(mp, args->rtname, &rtdev);
+		if (error)
+			goto out_close_logdev;
+
+		if (rtdev == ddev || rtdev == logdev) {
+			cmn_err(CE_WARN,
+	"XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
+			error = EINVAL;
+			goto out_close_rtdev;
+		}
+	}
+
+	/*
+	 * Setup xfs_mount buffer target pointers
+	 */
+	error = ENOMEM;
+	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
+	if (!mp->m_ddev_targp)
+		goto out_close_rtdev;
+
+	if (rtdev) {
+		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
+		if (!mp->m_rtdev_targp)
+			goto out_free_ddev_targ;
+	}
+
+	if (logdev && logdev != ddev) {
+		mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1);
+		if (!mp->m_logdev_targp)
+			goto out_free_rtdev_targ;
+	} else {
+		mp->m_logdev_targp = mp->m_ddev_targp;
+	}
+
+	return 0;
+
+ out_free_rtdev_targ:
+	if (mp->m_rtdev_targp)
+		xfs_free_buftarg(mp->m_rtdev_targp);
+ out_free_ddev_targ:
+	xfs_free_buftarg(mp->m_ddev_targp);
+ out_close_rtdev:
+	if (rtdev)
+		xfs_blkdev_put(rtdev);
+ out_close_logdev:
+	if (logdev && logdev != ddev)
+		xfs_blkdev_put(logdev);
+ out:
+	return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
+				    mp->m_sb.sb_sectsize);
+	if (error)
+		return error;
+
+	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+		unsigned int	log_sector_size = BBSIZE;
+
+		if (xfs_sb_version_hassector(&mp->m_sb))
+			log_sector_size = mp->m_sb.sb_logsectsize;
+		error = xfs_setsize_buftarg(mp->m_logdev_targp,
+					    mp->m_sb.sb_blocksize,
+					    log_sector_size);
+		if (error)
+			return error;
+	}
+	if (mp->m_rtdev_targp) {
+		error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+					    mp->m_sb.sb_blocksize,
+					    mp->m_sb.sb_sectsize);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
 /*
  * XFS AIL push thread support
  */
@@ -826,63 +870,21 @@ STATIC struct inode *
 xfs_fs_alloc_inode(
 	struct super_block	*sb)
 {
-	bhv_vnode_t		*vp;
-
-	vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
-	if (unlikely(!vp))
-		return NULL;
-	return vn_to_inode(vp);
+	return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
 }
 
 STATIC void
 xfs_fs_destroy_inode(
 	struct inode		*inode)
 {
-	kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode));
+	kmem_zone_free(xfs_vnode_zone, inode);
 }
 
 STATIC void
 xfs_fs_inode_init_once(
-	kmem_zone_t		*zonep,
 	void			*vnode)
 {
-	inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
-}
-
-STATIC int __init
-xfs_init_zones(void)
-{
-	xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
-					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-					KM_ZONE_SPREAD,
-					xfs_fs_inode_init_once);
-	if (!xfs_vnode_zone)
-		goto out;
-
-	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-	if (!xfs_ioend_zone)
-		goto out_destroy_vnode_zone;
-
-	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-						  xfs_ioend_zone);
-	if (!xfs_ioend_pool)
-		goto out_free_ioend_zone;
-	return 0;
-
- out_free_ioend_zone:
-	kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
-	kmem_zone_destroy(xfs_vnode_zone);
- out:
-	return -ENOMEM;
-}
-
-STATIC void
-xfs_destroy_zones(void)
-{
-	mempool_destroy(xfs_ioend_pool);
-	kmem_zone_destroy(xfs_vnode_zone);
-	kmem_zone_destroy(xfs_ioend_zone);
+	inode_init_once((struct inode *)vnode);
 }
 
 /*
@@ -987,7 +989,7 @@ void
 xfs_flush_inode(
 	xfs_inode_t	*ip)
 {
-	struct inode	*inode = ip->i_vnode;
+	struct inode	*inode = VFS_I(ip);
 
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
@@ -1012,7 +1014,7 @@ void
 xfs_flush_device(
 	xfs_inode_t	*ip)
 {
-	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
+	struct inode	*inode = VFS_I(ip);
 
 	igrab(inode);
 	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
@@ -1074,7 +1076,7 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &mp->m_sync_work)
 				continue;
-			kmem_free(work, sizeof(struct bhv_vfs_sync_work));
+			kmem_free(work);
 		}
 	}
 
@@ -1082,18 +1084,76 @@ xfssyncd(
 }
 
 STATIC void
+xfs_free_fsname(
+	struct xfs_mount	*mp)
+{
+	kfree(mp->m_fsname);
+	kfree(mp->m_rtname);
+	kfree(mp->m_logname);
+}
+
+STATIC void
 xfs_fs_put_super(
 	struct super_block	*sb)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
+	struct xfs_inode	*rip = mp->m_rootip;
+	int			unmount_event_flags = 0;
 	int			error;
 
 	kthread_stop(mp->m_sync_task);
 
 	xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
-	error = xfs_unmount(mp, 0, NULL);
-	if (error)
-		printk("XFS: unmount got error=%d\n", error);
+
+#ifdef HAVE_DMAPI
+	if (mp->m_flags & XFS_MOUNT_DMAPI) {
+		unmount_event_flags =
+			(mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ?
+				0 : DM_FLAGS_UNWANTED;
+		/*
+		 * Ignore error from dmapi here, first unmount is not allowed
+		 * to fail anyway, and second we wouldn't want to fail a
+		 * unmount because of dmapi.
+		 */
+		XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
+				NULL, NULL, 0, 0, unmount_event_flags);
+	}
+#endif
+
+	/*
+	 * Blow away any referenced inode in the filestreams cache.
+	 * This can and will cause log traffic as inodes go inactive
+	 * here.
+	 */
+	xfs_filestream_unmount(mp);
+
+	XFS_bflush(mp->m_ddev_targp);
+	error = xfs_unmount_flush(mp, 0);
+	WARN_ON(error);
+
+	/*
+	 * If we're forcing a shutdown, typically because of a media error,
+	 * we want to make sure we invalidate dirty pages that belong to
+	 * referenced vnodes as well.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
+		ASSERT(error != EFSCORRUPTED);
+	}
+
+	if (mp->m_flags & XFS_MOUNT_DMAPI) {
+		XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
+				unmount_event_flags);
+	}
+
+	xfs_unmountfs(mp);
+	xfs_freesb(mp);
+	xfs_icsb_destroy_counters(mp);
+	xfs_close_devices(mp);
+	xfs_qmops_put(mp);
+	xfs_dmops_put(mp);
+	xfs_free_fsname(mp);
+	kfree(mp);
 }
 
 STATIC void
@@ -1216,14 +1276,74 @@ xfs_fs_remount(
 	char			*options)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
-	struct xfs_mount_args	*args = xfs_args_allocate(sb, 0);
-	int			error;
+	substring_t		args[MAX_OPT_ARGS];
+	char			*p;
 
-	error = xfs_parseargs(mp, options, args, 1);
-	if (!error)
-		error = xfs_mntupdate(mp, flags, args);
-	kmem_free(args, sizeof(*args));
-	return -error;
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_barrier:
+			mp->m_flags |= XFS_MOUNT_BARRIER;
+
+			/*
+			 * Test if barriers are actually working if we can,
+			 * else delay this check until the filesystem is
+			 * marked writeable.
+			 */
+			if (!(mp->m_flags & XFS_MOUNT_RDONLY))
+				xfs_mountfs_check_barriers(mp);
+			break;
+		case Opt_nobarrier:
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+			break;
+		default:
+			/*
+			 * Logically we would return an error here to prevent
+			 * users from believing they might have changed
+			 * mount options using remount which can't be changed.
+			 *
+			 * But unfortunately mount(8) adds all options from
+			 * mtab and fstab to the mount arguments in some cases
+			 * so we can't blindly reject options, but have to
+			 * check for each specified option if it actually
+			 * differs from the currently set option and only
+			 * reject it if that's the case.
+			 *
+			 * Until that is implemented we return success for
+			 * every remount request, and silently ignore all
+			 * options that we can't actually change.
+			 */
+#if 0
+			printk(KERN_INFO
+	"XFS: mount option \"%s\" not supported for remount\n", p);
+			return -EINVAL;
+#else
+			break;
+#endif
+		}
+	}
+
+	/* rw/ro -> rw */
+	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+		mp->m_flags &= ~XFS_MOUNT_RDONLY;
+		if (mp->m_flags & XFS_MOUNT_BARRIER)
+			xfs_mountfs_check_barriers(mp);
+	}
+
+	/* rw -> ro */
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		xfs_filestream_flush(mp);
+		xfs_sync(mp, SYNC_DATA_QUIESCE);
+		xfs_attr_quiesce(mp);
+		mp->m_flags |= XFS_MOUNT_RDONLY;
+	}
+
+	return 0;
 }
 
 /*
@@ -1300,6 +1420,245 @@ xfs_fs_setxquota(
 				   Q_XSETPQLIM), id, (caddr_t)fdq);
 }
 
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ */
+STATIC int
+xfs_start_flags(
+	struct xfs_mount_args	*ap,
+	struct xfs_mount	*mp)
+{
+	int			error;
+
+	/* Values are in BBs */
+	if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		/*
+		 * At this point the superblock has not been read
+		 * in, therefore we do not know the block size.
+		 * Before the mount call ends we will convert
+		 * these to FSBs.
+		 */
+		mp->m_dalign = ap->sunit;
+		mp->m_swidth = ap->swidth;
+	}
+
+	if (ap->logbufs != -1 &&
+	    ap->logbufs != 0 &&
+	    (ap->logbufs < XLOG_MIN_ICLOGS ||
+	     ap->logbufs > XLOG_MAX_ICLOGS)) {
+		cmn_err(CE_WARN,
+			"XFS: invalid logbufs value: %d [not %d-%d]",
+			ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+		return XFS_ERROR(EINVAL);
+	}
+	mp->m_logbufs = ap->logbufs;
+	if (ap->logbufsize != -1 &&
+	    ap->logbufsize !=  0 &&
+	    (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
+	     ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
+	     !is_power_of_2(ap->logbufsize))) {
+		cmn_err(CE_WARN,
+	"XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+			ap->logbufsize);
+		return XFS_ERROR(EINVAL);
+	}
+
+	error = ENOMEM;
+
+	mp->m_logbsize = ap->logbufsize;
+	mp->m_fsname_len = strlen(ap->fsname) + 1;
+
+	mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
+	if (!mp->m_fsname)
+		goto out;
+
+	if (ap->rtname[0]) {
+		mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
+		if (!mp->m_rtname)
+			goto out_free_fsname;
+
+	}
+
+	if (ap->logname[0]) {
+		mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
+		if (!mp->m_logname)
+			goto out_free_rtname;
+	}
+
+	if (ap->flags & XFSMNT_WSYNC)
+		mp->m_flags |= XFS_MOUNT_WSYNC;
+#if XFS_BIG_INUMS
+	if (ap->flags & XFSMNT_INO64) {
+		mp->m_flags |= XFS_MOUNT_INO64;
+		mp->m_inoadd = XFS_INO64_OFFSET;
+	}
+#endif
+	if (ap->flags & XFSMNT_RETERR)
+		mp->m_flags |= XFS_MOUNT_RETERR;
+	if (ap->flags & XFSMNT_NOALIGN)
+		mp->m_flags |= XFS_MOUNT_NOALIGN;
+	if (ap->flags & XFSMNT_SWALLOC)
+		mp->m_flags |= XFS_MOUNT_SWALLOC;
+	if (ap->flags & XFSMNT_OSYNCISOSYNC)
+		mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
+	if (ap->flags & XFSMNT_32BITINODES)
+		mp->m_flags |= XFS_MOUNT_32BITINODES;
+
+	if (ap->flags & XFSMNT_IOSIZE) {
+		if (ap->iosizelog > XFS_MAX_IO_LOG ||
+		    ap->iosizelog < XFS_MIN_IO_LOG) {
+			cmn_err(CE_WARN,
+		"XFS: invalid log iosize: %d [not %d-%d]",
+				ap->iosizelog, XFS_MIN_IO_LOG,
+				XFS_MAX_IO_LOG);
+			return XFS_ERROR(EINVAL);
+		}
+
+		mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+		mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
+	}
+
+	if (ap->flags & XFSMNT_IKEEP)
+		mp->m_flags |= XFS_MOUNT_IKEEP;
+	if (ap->flags & XFSMNT_DIRSYNC)
+		mp->m_flags |= XFS_MOUNT_DIRSYNC;
+	if (ap->flags & XFSMNT_ATTR2)
+		mp->m_flags |= XFS_MOUNT_ATTR2;
+	if (ap->flags & XFSMNT_NOATTR2)
+		mp->m_flags |= XFS_MOUNT_NOATTR2;
+
+	if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
+		mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+
+	/*
+	 * no recovery flag requires a read-only mount
+	 */
+	if (ap->flags & XFSMNT_NORECOVERY) {
+		if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+			cmn_err(CE_WARN,
+	"XFS: tried to mount a FS read-write without recovery!");
+			return XFS_ERROR(EINVAL);
+		}
+		mp->m_flags |= XFS_MOUNT_NORECOVERY;
+	}
+
+	if (ap->flags & XFSMNT_NOUUID)
+		mp->m_flags |= XFS_MOUNT_NOUUID;
+	if (ap->flags & XFSMNT_BARRIER)
+		mp->m_flags |= XFS_MOUNT_BARRIER;
+	else
+		mp->m_flags &= ~XFS_MOUNT_BARRIER;
+
+	if (ap->flags2 & XFSMNT2_FILESTREAMS)
+		mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+
+	if (ap->flags & XFSMNT_DMAPI)
+		mp->m_flags |= XFS_MOUNT_DMAPI;
+	return 0;
+
+
+ out_free_rtname:
+	kfree(mp->m_rtname);
+ out_free_fsname:
+	kfree(mp->m_fsname);
+ out:
+	return error;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+	struct xfs_mount_args	*ap,
+	struct xfs_mount	*mp)
+{
+	int			ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+	/* Fail a mount where the logbuf is smaller then the log stripe */
+	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+		if ((ap->logbufsize <= 0) &&
+		    (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
+			mp->m_logbsize = mp->m_sb.sb_logsunit;
+		} else if (ap->logbufsize > 0 &&
+			   ap->logbufsize < mp->m_sb.sb_logsunit) {
+			cmn_err(CE_WARN,
+	"XFS: logbuf size must be greater than or equal to log stripe size");
+			return XFS_ERROR(EINVAL);
+		}
+	} else {
+		/* Fail a mount if the logbuf is larger than 32K */
+		if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+			cmn_err(CE_WARN,
+	"XFS: logbuf size for version 1 logs must be 16K or 32K");
+			return XFS_ERROR(EINVAL);
+		}
+	}
+
+	/*
+	 * mkfs'ed attr2 will turn on attr2 mount unless explicitly
+	 * told by noattr2 to turn it off
+	 */
+	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+	    !(ap->flags & XFSMNT_NOATTR2))
+		mp->m_flags |= XFS_MOUNT_ATTR2;
+
+	/*
+	 * prohibit r/w mounts of read-only filesystems
+	 */
+	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+		cmn_err(CE_WARN,
+	"XFS: cannot mount a read-only filesystem as read-write");
+		return XFS_ERROR(EROFS);
+	}
+
+	/*
+	 * check for shared mount.
+	 */
+	if (ap->flags & XFSMNT_SHARED) {
+		if (!xfs_sb_version_hasshared(&mp->m_sb))
+			return XFS_ERROR(EINVAL);
+
+		/*
+		 * For IRIX 6.5, shared mounts must have the shared
+		 * version bit set, have the persistent readonly
+		 * field set, must be version 0 and can only be mounted
+		 * read-only.
+		 */
+		if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
+		     (mp->m_sb.sb_shared_vn != 0))
+			return XFS_ERROR(EINVAL);
+
+		mp->m_flags |= XFS_MOUNT_SHARED;
+
+		/*
+		 * Shared XFS V0 can't deal with DMI.  Return EINVAL.
+		 */
+		if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
+			return XFS_ERROR(EINVAL);
+	}
+
+	if (ap->flags & XFSMNT_UQUOTA) {
+		mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+		if (ap->flags & XFSMNT_UQUOTAENF)
+			mp->m_qflags |= XFS_UQUOTA_ENFD;
+	}
+
+	if (ap->flags & XFSMNT_GQUOTA) {
+		mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+		if (ap->flags & XFSMNT_GQUOTAENF)
+			mp->m_qflags |= XFS_OQUOTA_ENFD;
+	} else if (ap->flags & XFSMNT_PQUOTA) {
+		mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+		if (ap->flags & XFSMNT_PQUOTAENF)
+			mp->m_qflags |= XFS_OQUOTA_ENFD;
+	}
+
+	return 0;
+}
+
 STATIC int
 xfs_fs_fill_super(
 	struct super_block	*sb,
@@ -1308,11 +1667,21 @@ xfs_fs_fill_super(
 {
 	struct inode		*root;
 	struct xfs_mount	*mp = NULL;
-	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
-	int			error;
+	struct xfs_mount_args	*args;
+	int			flags = 0, error = ENOMEM;
 
-	mp = xfs_mount_init();
+	args = xfs_args_allocate(sb, silent);
+	if (!args)
+		return -ENOMEM;
 
+	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+	if (!mp)
+		goto out_free_args;
+
+	spin_lock_init(&mp->m_sb_lock);
+	mutex_init(&mp->m_ilock);
+	mutex_init(&mp->m_growlock);
+	atomic_set(&mp->m_active_trans, 0);
 	INIT_LIST_HEAD(&mp->m_sync_list);
 	spin_lock_init(&mp->m_sync_lock);
 	init_waitqueue_head(&mp->m_wait_single_sync_task);
@@ -1325,16 +1694,60 @@ xfs_fs_fill_super(
 
 	error = xfs_parseargs(mp, (char *)data, args, 0);
 	if (error)
-		goto fail_vfsop;
+		goto out_free_mp;
 
 	sb_min_blocksize(sb, BBSIZE);
+	sb->s_xattr = xfs_xattr_handlers;
 	sb->s_export_op = &xfs_export_operations;
 	sb->s_qcop = &xfs_quotactl_operations;
 	sb->s_op = &xfs_super_operations;
 
-	error = xfs_mount(mp, args, NULL);
+	error = xfs_dmops_get(mp, args);
+	if (error)
+		goto out_free_mp;
+	error = xfs_qmops_get(mp, args);
+	if (error)
+		goto out_put_dmops;
+
+	if (args->flags & XFSMNT_QUIET)
+		flags |= XFS_MFSI_QUIET;
+
+	error = xfs_open_devices(mp, args);
+	if (error)
+		goto out_put_qmops;
+
+	if (xfs_icsb_init_counters(mp))
+		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+
+	/*
+	 * Setup flags based on mount(2) options and then the superblock
+	 */
+	error = xfs_start_flags(args, mp);
+	if (error)
+		goto out_free_fsname;
+	error = xfs_readsb(mp, flags);
+	if (error)
+		goto out_free_fsname;
+	error = xfs_finish_flags(args, mp);
 	if (error)
-		goto fail_vfsop;
+		goto out_free_sb;
+
+	error = xfs_setup_devices(mp);
+	if (error)
+		goto out_free_sb;
+
+	if (mp->m_flags & XFS_MOUNT_BARRIER)
+		xfs_mountfs_check_barriers(mp);
+
+	error = xfs_filestream_mount(mp);
+	if (error)
+		goto out_free_sb;
+
+	error = xfs_mountfs(mp);
+	if (error)
+		goto out_filestream_unmount;
+
+	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
 
 	sb->s_dirt = 1;
 	sb->s_magic = XFS_SB_MAGIC;
@@ -1344,7 +1757,7 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	root = igrab(mp->m_rootip->i_vnode);
+	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
 		error = ENOENT;
 		goto fail_unmount;
@@ -1369,10 +1782,28 @@ xfs_fs_fill_super(
 
 	xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
 
-	kmem_free(args, sizeof(*args));
+	kfree(args);
 	return 0;
 
-fail_vnrele:
+ out_filestream_unmount:
+	xfs_filestream_unmount(mp);
+ out_free_sb:
+	xfs_freesb(mp);
+ out_free_fsname:
+	xfs_free_fsname(mp);
+	xfs_icsb_destroy_counters(mp);
+	xfs_close_devices(mp);
+ out_put_qmops:
+	xfs_qmops_put(mp);
+ out_put_dmops:
+	xfs_dmops_put(mp);
+ out_free_mp:
+	kfree(mp);
+ out_free_args:
+	kfree(args);
+	return -error;
+
+ fail_vnrele:
 	if (sb->s_root) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
@@ -1380,12 +1811,20 @@ fail_vnrele:
 		iput(root);
 	}
 
-fail_unmount:
-	xfs_unmount(mp, 0, NULL);
+ fail_unmount:
+	/*
+	 * Blow away any referenced inode in the filestreams cache.
+	 * This can and will cause log traffic as inodes go inactive
+	 * here.
+	 */
+	xfs_filestream_unmount(mp);
 
-fail_vfsop:
-	kmem_free(args, sizeof(*args));
-	return -error;
+	XFS_bflush(mp->m_ddev_targp);
+	error = xfs_unmount_flush(mp, 0);
+	WARN_ON(error);
+
+	xfs_unmountfs(mp);
+	goto out_free_sb;
 }
 
 STATIC int
@@ -1430,9 +1869,235 @@ static struct file_system_type xfs_fs_type = {
 	.fs_flags		= FS_REQUIRES_DEV,
 };
 
+STATIC int __init
+xfs_alloc_trace_bufs(void)
+{
+#ifdef XFS_ALLOC_TRACE
+	xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL);
+	if (!xfs_alloc_trace_buf)
+		goto out;
+#endif
+#ifdef XFS_BMAP_TRACE
+	xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL);
+	if (!xfs_bmap_trace_buf)
+		goto out_free_alloc_trace;
+#endif
+#ifdef XFS_BMBT_TRACE
+	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
+	if (!xfs_bmbt_trace_buf)
+		goto out_free_bmap_trace;
+#endif
+#ifdef XFS_ATTR_TRACE
+	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
+	if (!xfs_attr_trace_buf)
+		goto out_free_bmbt_trace;
+#endif
+#ifdef XFS_DIR2_TRACE
+	xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL);
+	if (!xfs_dir2_trace_buf)
+		goto out_free_attr_trace;
+#endif
+
+	return 0;
+
+#ifdef XFS_DIR2_TRACE
+ out_free_attr_trace:
+#endif
+#ifdef XFS_ATTR_TRACE
+	ktrace_free(xfs_attr_trace_buf);
+ out_free_bmbt_trace:
+#endif
+#ifdef XFS_BMBT_TRACE
+	ktrace_free(xfs_bmbt_trace_buf);
+ out_free_bmap_trace:
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(xfs_bmap_trace_buf);
+ out_free_alloc_trace:
+#endif
+#ifdef XFS_ALLOC_TRACE
+	ktrace_free(xfs_alloc_trace_buf);
+ out:
+#endif
+	return -ENOMEM;
+}
+
+STATIC void
+xfs_free_trace_bufs(void)
+{
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(xfs_dir2_trace_buf);
+#endif
+#ifdef XFS_ATTR_TRACE
+	ktrace_free(xfs_attr_trace_buf);
+#endif
+#ifdef XFS_BMBT_TRACE
+	ktrace_free(xfs_bmbt_trace_buf);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(xfs_bmap_trace_buf);
+#endif
+#ifdef XFS_ALLOC_TRACE
+	ktrace_free(xfs_alloc_trace_buf);
+#endif
+}
+
+STATIC int __init
+xfs_init_zones(void)
+{
+	xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
+					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
+					KM_ZONE_SPREAD,
+					xfs_fs_inode_init_once);
+	if (!xfs_vnode_zone)
+		goto out;
+
+	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+	if (!xfs_ioend_zone)
+		goto out_destroy_vnode_zone;
+
+	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+						  xfs_ioend_zone);
+	if (!xfs_ioend_pool)
+		goto out_destroy_ioend_zone;
+
+	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+						"xfs_log_ticket");
+	if (!xfs_log_ticket_zone)
+		goto out_destroy_ioend_pool;
+
+	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+						"xfs_bmap_free_item");
+	if (!xfs_bmap_free_item_zone)
+		goto out_destroy_log_ticket_zone;
+	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+						"xfs_btree_cur");
+	if (!xfs_btree_cur_zone)
+		goto out_destroy_bmap_free_item_zone;
+
+	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+						"xfs_da_state");
+	if (!xfs_da_state_zone)
+		goto out_destroy_btree_cur_zone;
+
+	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+	if (!xfs_dabuf_zone)
+		goto out_destroy_da_state_zone;
+
+	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	if (!xfs_ifork_zone)
+		goto out_destroy_dabuf_zone;
+
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+	if (!xfs_trans_zone)
+		goto out_destroy_ifork_zone;
+
+	/*
+	 * The size of the zone allocated buf log item is the maximum
+	 * size possible under XFS.  This wastes a little bit of memory,
+	 * but it is much faster.
+	 */
+	xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				  NBWORD) * sizeof(int))), "xfs_buf_item");
+	if (!xfs_buf_item_zone)
+		goto out_destroy_trans_zone;
+
+	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+			((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+				 sizeof(xfs_extent_t))), "xfs_efd_item");
+	if (!xfs_efd_zone)
+		goto out_destroy_buf_item_zone;
+
+	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+			((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+				sizeof(xfs_extent_t))), "xfs_efi_item");
+	if (!xfs_efi_zone)
+		goto out_destroy_efd_zone;
+
+	xfs_inode_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
+					KM_ZONE_SPREAD, NULL);
+	if (!xfs_inode_zone)
+		goto out_destroy_efi_zone;
+
+	xfs_ili_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+					KM_ZONE_SPREAD, NULL);
+	if (!xfs_ili_zone)
+		goto out_destroy_inode_zone;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+	xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
+	if (!xfs_acl_zone)
+		goto out_destroy_ili_zone;
+#endif
+
+	return 0;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+ out_destroy_ili_zone:
+#endif
+	kmem_zone_destroy(xfs_ili_zone);
+ out_destroy_inode_zone:
+	kmem_zone_destroy(xfs_inode_zone);
+ out_destroy_efi_zone:
+	kmem_zone_destroy(xfs_efi_zone);
+ out_destroy_efd_zone:
+	kmem_zone_destroy(xfs_efd_zone);
+ out_destroy_buf_item_zone:
+	kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_trans_zone:
+	kmem_zone_destroy(xfs_trans_zone);
+ out_destroy_ifork_zone:
+	kmem_zone_destroy(xfs_ifork_zone);
+ out_destroy_dabuf_zone:
+	kmem_zone_destroy(xfs_dabuf_zone);
+ out_destroy_da_state_zone:
+	kmem_zone_destroy(xfs_da_state_zone);
+ out_destroy_btree_cur_zone:
+	kmem_zone_destroy(xfs_btree_cur_zone);
+ out_destroy_bmap_free_item_zone:
+	kmem_zone_destroy(xfs_bmap_free_item_zone);
+ out_destroy_log_ticket_zone:
+	kmem_zone_destroy(xfs_log_ticket_zone);
+ out_destroy_ioend_pool:
+	mempool_destroy(xfs_ioend_pool);
+ out_destroy_ioend_zone:
+	kmem_zone_destroy(xfs_ioend_zone);
+ out_destroy_vnode_zone:
+	kmem_zone_destroy(xfs_vnode_zone);
+ out:
+	return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_zones(void)
+{
+#ifdef CONFIG_XFS_POSIX_ACL
+	kmem_zone_destroy(xfs_acl_zone);
+#endif
+	kmem_zone_destroy(xfs_ili_zone);
+	kmem_zone_destroy(xfs_inode_zone);
+	kmem_zone_destroy(xfs_efi_zone);
+	kmem_zone_destroy(xfs_efd_zone);
+	kmem_zone_destroy(xfs_buf_item_zone);
+	kmem_zone_destroy(xfs_trans_zone);
+	kmem_zone_destroy(xfs_ifork_zone);
+	kmem_zone_destroy(xfs_dabuf_zone);
+	kmem_zone_destroy(xfs_da_state_zone);
+	kmem_zone_destroy(xfs_btree_cur_zone);
+	kmem_zone_destroy(xfs_bmap_free_item_zone);
+	kmem_zone_destroy(xfs_log_ticket_zone);
+	mempool_destroy(xfs_ioend_pool);
+	kmem_zone_destroy(xfs_ioend_zone);
+	kmem_zone_destroy(xfs_vnode_zone);
+
+}
 
 STATIC int __init
-init_xfs_fs( void )
+init_xfs_fs(void)
 {
 	int			error;
 	static char		message[] __initdata = KERN_INFO \
@@ -1441,42 +2106,73 @@ init_xfs_fs( void )
 	printk(message);
 
 	ktrace_init(64);
+	vn_init();
+	xfs_dir_startup();
 
 	error = xfs_init_zones();
-	if (error < 0)
-		goto undo_zones;
+	if (error)
+		goto out;
+
+	error = xfs_alloc_trace_bufs();
+	if (error)
+		goto out_destroy_zones;
+
+	error = xfs_mru_cache_init();
+	if (error)
+		goto out_free_trace_buffers;
+
+	error = xfs_filestream_init();
+	if (error)
+		goto out_mru_cache_uninit;
 
 	error = xfs_buf_init();
-	if (error < 0)
-		goto undo_buffers;
+	if (error)
+		goto out_filestream_uninit;
+
+	error = xfs_init_procfs();
+	if (error)
+		goto out_buf_terminate;
+
+	error = xfs_sysctl_register();
+	if (error)
+		goto out_cleanup_procfs;
 
-	vn_init();
-	xfs_init();
-	uuid_init();
 	vfs_initquota();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
-		goto undo_register;
+		goto out_sysctl_unregister;
 	return 0;
 
-undo_register:
+ out_sysctl_unregister:
+	xfs_sysctl_unregister();
+ out_cleanup_procfs:
+	xfs_cleanup_procfs();
+ out_buf_terminate:
 	xfs_buf_terminate();
-
-undo_buffers:
+ out_filestream_uninit:
+	xfs_filestream_uninit();
+ out_mru_cache_uninit:
+	xfs_mru_cache_uninit();
+ out_free_trace_buffers:
+	xfs_free_trace_bufs();
+ out_destroy_zones:
 	xfs_destroy_zones();
-
-undo_zones:
+ out:
 	return error;
 }
 
 STATIC void __exit
-exit_xfs_fs( void )
+exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
-	xfs_cleanup();
+	xfs_sysctl_unregister();
+	xfs_cleanup_procfs();
 	xfs_buf_terminate();
+	xfs_filestream_uninit();
+	xfs_mru_cache_uninit();
+	xfs_free_trace_bufs();
 	xfs_destroy_zones();
 	ktrace_uninit();
 }
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efb7c6d3303..fe2ef4e6a0f9 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -101,18 +101,13 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
-extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp,
-		struct xfs_inode *ip);
-
 extern void xfs_flush_inode(struct xfs_inode *);
 extern void xfs_flush_device(struct xfs_inode *);
 
-extern int  xfs_blkdev_get(struct xfs_mount *, const char *,
-				struct block_device **);
-extern void xfs_blkdev_put(struct block_device *);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
+extern struct xattr_handler *xfs_xattr_handlers[];
 
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index bb997d75c05c..7dacb5bbde3f 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -259,15 +259,17 @@ static ctl_table xfs_root_table[] = {
 	{}
 };
 
-void
+int
 xfs_sysctl_register(void)
 {
 	xfs_table_header = register_sysctl_table(xfs_root_table);
+	if (!xfs_table_header)
+		return -ENOMEM;
+	return 0;
 }
 
 void
 xfs_sysctl_unregister(void)
 {
-	if (xfs_table_header)
-		unregister_sysctl_table(xfs_table_header);
+	unregister_sysctl_table(xfs_table_header);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 98b97e399d6f..4aadb8056c37 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -93,10 +93,10 @@ enum {
 extern xfs_param_t	xfs_params;
 
 #ifdef CONFIG_SYSCTL
-extern void xfs_sysctl_register(void);
+extern int xfs_sysctl_register(void);
 extern void xfs_sysctl_unregister(void);
 #else
-# define xfs_sysctl_register()		do { } while (0)
+# define xfs_sysctl_register()		(0)
 # define xfs_sysctl_unregister()	do { } while (0)
 #endif /* CONFIG_SYSCTL */
 
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index bc7afe007338..b52528bbbfff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -33,7 +33,7 @@
 
 
 /*
- * Dedicated vnode inactive/reclaim sync semaphores.
+ * Dedicated vnode inactive/reclaim sync wait queues.
  * Prime number of hash buckets since address is used as the key.
  */
 #define NVSYNC                  37
@@ -82,74 +82,6 @@ vn_ioerror(
 		xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
 }
 
-/*
- * Revalidate the Linux inode from the XFS inode.
- * Note: i_size _not_ updated; we must hold the inode
- * semaphore when doing that - callers responsibility.
- */
-int
-vn_revalidate(
-	bhv_vnode_t		*vp)
-{
-	struct inode		*inode = vn_to_inode(vp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	unsigned long		xflags;
-
-	xfs_itrace_entry(ip);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	inode->i_mode	    = ip->i_d.di_mode;
-	inode->i_uid	    = ip->i_d.di_uid;
-	inode->i_gid	    = ip->i_d.di_gid;
-	inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
-	inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
-	inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
-	inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
-
-	xflags = xfs_ip2xflags(ip);
-	if (xflags & XFS_XFLAG_IMMUTABLE)
-		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (xflags & XFS_XFLAG_APPEND)
-		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (xflags & XFS_XFLAG_SYNC)
-		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (xflags & XFS_XFLAG_NOATIME)
-		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	xfs_iflags_clear(ip, XFS_IMODIFIED);
-	return 0;
-}
-
-/*
- * Add a reference to a referenced vnode.
- */
-bhv_vnode_t *
-vn_hold(
-	bhv_vnode_t	*vp)
-{
-	struct inode	*inode;
-
-	XFS_STATS_INC(vn_hold);
-
-	inode = igrab(vn_to_inode(vp));
-	ASSERT(inode);
-
-	return vp;
-}
-
 #ifdef	XFS_INODE_TRACE
 
 /*
@@ -158,7 +90,7 @@ vn_hold(
  */
 static inline int xfs_icount(struct xfs_inode *ip)
 {
-	bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
+	struct inode *vp = VFS_I(ip);
 
 	if (vp)
 		return vn_count(vp);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 25eb2a9e8d9b..683ce16210ff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -19,24 +19,9 @@
 #define __XFS_VNODE_H__
 
 struct file;
-struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-typedef struct inode	bhv_vnode_t;
-
-/*
- * Vnode to Linux inode mapping.
- */
-static inline bhv_vnode_t *vn_from_inode(struct inode *inode)
-{
-	return inode;
-}
-static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
-{
-	return vnode;
-}
-
 /*
  * Return values for xfs_inactive.  A return value of
  * VN_INACTIVE_NOCACHE implies that the file system behavior
@@ -66,87 +51,8 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
 					   Prevent VM access to the pages until
 					   the operation completes. */
 
-/*
- * Vnode attributes.  va_mask indicates those attributes the caller
- * wants to set or extract.
- */
-typedef struct bhv_vattr {
-	int		va_mask;	/* bit-mask of attributes present */
-	mode_t		va_mode;	/* file access mode and type */
-	xfs_nlink_t	va_nlink;	/* number of references to file */
-	uid_t		va_uid;		/* owner user id */
-	gid_t		va_gid;		/* owner group id */
-	xfs_ino_t	va_nodeid;	/* file id */
-	xfs_off_t	va_size;	/* file size in bytes */
-	u_long		va_blocksize;	/* blocksize preferred for i/o */
-	struct timespec	va_atime;	/* time of last access */
-	struct timespec	va_mtime;	/* time of last modification */
-	struct timespec	va_ctime;	/* time file changed */
-	u_int		va_gen;		/* generation number of file */
-	xfs_dev_t	va_rdev;	/* device the special file represents */
-	__int64_t	va_nblocks;	/* number of blocks allocated */
-	u_long		va_xflags;	/* random extended file flags */
-	u_long		va_extsize;	/* file extent size */
-	u_long		va_nextents;	/* number of extents in file */
-	u_long		va_anextents;	/* number of attr extents in file */
-	prid_t		va_projid;	/* project id */
-} bhv_vattr_t;
-
-/*
- * setattr or getattr attributes
- */
-#define XFS_AT_TYPE		0x00000001
-#define XFS_AT_MODE		0x00000002
-#define XFS_AT_UID		0x00000004
-#define XFS_AT_GID		0x00000008
-#define XFS_AT_FSID		0x00000010
-#define XFS_AT_NODEID		0x00000020
-#define XFS_AT_NLINK		0x00000040
-#define XFS_AT_SIZE		0x00000080
-#define XFS_AT_ATIME		0x00000100
-#define XFS_AT_MTIME		0x00000200
-#define XFS_AT_CTIME		0x00000400
-#define XFS_AT_RDEV		0x00000800
-#define XFS_AT_BLKSIZE		0x00001000
-#define XFS_AT_NBLOCKS		0x00002000
-#define XFS_AT_VCODE		0x00004000
-#define XFS_AT_MAC		0x00008000
-#define XFS_AT_UPDATIME		0x00010000
-#define XFS_AT_UPDMTIME		0x00020000
-#define XFS_AT_UPDCTIME		0x00040000
-#define XFS_AT_ACL		0x00080000
-#define XFS_AT_CAP		0x00100000
-#define XFS_AT_INF		0x00200000
-#define XFS_AT_XFLAGS		0x00400000
-#define XFS_AT_EXTSIZE		0x00800000
-#define XFS_AT_NEXTENTS		0x01000000
-#define XFS_AT_ANEXTENTS	0x02000000
-#define XFS_AT_PROJID		0x04000000
-#define XFS_AT_SIZE_NOPERM	0x08000000
-#define XFS_AT_GENCOUNT		0x10000000
-
-#define XFS_AT_ALL	(XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
-		XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
-		XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
-		XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
-		XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
-		XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
-
-#define XFS_AT_STAT	(XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
-		XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
-		XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
-		XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
-
-#define XFS_AT_TIMES	(XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
-
-#define XFS_AT_UPDTIMES	(XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
-
-#define XFS_AT_NOSET	(XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
-		XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
-		XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
 
 extern void	vn_init(void);
-extern int	vn_revalidate(bhv_vnode_t *);
 
 /*
  * Yeah, these don't take vnode anymore at all, all this should be
@@ -156,57 +62,52 @@ extern void	vn_iowait(struct xfs_inode *ip);
 extern void	vn_iowake(struct xfs_inode *ip);
 extern void	vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
 
-static inline int vn_count(bhv_vnode_t *vp)
+static inline int vn_count(struct inode *vp)
 {
-	return atomic_read(&vn_to_inode(vp)->i_count);
+	return atomic_read(&vp->i_count);
 }
 
-/*
- * Vnode reference counting functions (and macros for compatibility).
- */
-extern bhv_vnode_t	*vn_hold(bhv_vnode_t *);
+#define IHOLD(ip) \
+do { \
+	ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+	atomic_inc(&(VFS_I(ip)->i_count)); \
+	xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+} while (0)
 
-#if defined(XFS_INODE_TRACE)
-#define VN_HOLD(vp)		\
-	((void)vn_hold(vp),	\
-	  xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address))
-#define VN_RELE(vp)		\
-	  (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \
-	   iput(vn_to_inode(vp)))
-#else
-#define VN_HOLD(vp)		((void)vn_hold(vp))
-#define VN_RELE(vp)		(iput(vn_to_inode(vp)))
-#endif
+#define IRELE(ip) \
+do { \
+	xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+	iput(VFS_I(ip)); \
+} while (0)
 
-static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
+static inline struct inode *vn_grab(struct inode *vp)
 {
-	struct inode *inode = igrab(vn_to_inode(vp));
-	return inode ? vn_from_inode(inode) : NULL;
+	return igrab(vp);
 }
 
 /*
  * Dealing with bad inodes
  */
-static inline int VN_BAD(bhv_vnode_t *vp)
+static inline int VN_BAD(struct inode *vp)
 {
-	return is_bad_inode(vn_to_inode(vp));
+	return is_bad_inode(vp);
 }
 
 /*
  * Extracting atime values in various formats
  */
-static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
 {
 	bs_atime->tv_sec = vp->i_atime.tv_sec;
 	bs_atime->tv_nsec = vp->i_atime.tv_nsec;
 }
 
-static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
 {
 	*ts = vp->i_atime;
 }
 
-static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
+static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
 {
 	*tt = vp->i_atime.tv_sec;
 }
@@ -214,20 +115,11 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 /*
  * Some useful predicates.
  */
-#define VN_MAPPED(vp)	mapping_mapped(vn_to_inode(vp)->i_mapping)
-#define VN_CACHED(vp)	(vn_to_inode(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)	mapping_tagged(vn_to_inode(vp)->i_mapping, \
+#define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
+#define VN_CACHED(vp)	(vp->i_mapping->nrpages)
+#define VN_DIRTY(vp)	mapping_tagged(vp->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
 
-/*
- * Flags to vop_setattr/getattr.
- */
-#define	ATTR_UTIME	0x01	/* non-default utime(2) request */
-#define	ATTR_DMI	0x08	/* invocation from a DMI function */
-#define	ATTR_LAZY	0x80	/* set/get attributes lazily */
-#define	ATTR_NONBLOCK	0x100	/* return EAGAIN if operation would block */
-#define ATTR_NOLOCK	0x200	/* Don't grab any conflicting locks */
-#define ATTR_NOSIZETOK	0x400	/* Don't get the SIZE token */
 
 /*
  * Tracking vnode activity.
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
new file mode 100644
index 000000000000..964621fde6ed
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+#include "xfs_vnodeops.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+/*
+ * ACL handling.  Should eventually be moved into xfs_acl.c
+ */
+
+static int
+xfs_decode_acl(const char *name)
+{
+	if (strcmp(name, "posix_acl_access") == 0)
+		return _ACL_TYPE_ACCESS;
+	else if (strcmp(name, "posix_acl_default") == 0)
+		return _ACL_TYPE_DEFAULT;
+	return -EINVAL;
+}
+
+/*
+ * Get system extended attributes which at the moment only
+ * includes Posix ACLs.
+ */
+static int
+xfs_xattr_system_get(struct inode *inode, const char *name,
+		void *buffer, size_t size)
+{
+	int acl;
+
+	acl = xfs_decode_acl(name);
+	if (acl < 0)
+		return acl;
+
+	return xfs_acl_vget(inode, buffer, size, acl);
+}
+
+static int
+xfs_xattr_system_set(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags)
+{
+	int acl;
+
+	acl = xfs_decode_acl(name);
+	if (acl < 0)
+		return acl;
+	if (flags & XATTR_CREATE)
+		return -EINVAL;
+
+	if (!value)
+		return xfs_acl_vremove(inode, acl);
+
+	return xfs_acl_vset(inode, (void *)value, size, acl);
+}
+
+static struct xattr_handler xfs_xattr_system_handler = {
+	.prefix	= XATTR_SYSTEM_PREFIX,
+	.get	= xfs_xattr_system_get,
+	.set	= xfs_xattr_system_set,
+};
+
+
+/*
+ * Real xattr handling.  The only difference between the namespaces is
+ * a flag passed to the low-level attr code.
+ */
+
+static int
+__xfs_xattr_get(struct inode *inode, const char *name,
+		void *value, size_t size, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	int error, asize = size;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (!size) {
+		xflags |= ATTR_KERNOVAL;
+		value = NULL;
+	}
+
+	error = -xfs_attr_get(ip, name, value, &asize, xflags);
+	if (error)
+		return error;
+	return asize;
+}
+
+static int
+__xfs_xattr_set(struct inode *inode, const char *name, const void *value,
+		size_t size, int flags, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (flags & XATTR_CREATE)
+		xflags |= ATTR_CREATE;
+	if (flags & XATTR_REPLACE)
+		xflags |= ATTR_REPLACE;
+
+	if (!value)
+		return -xfs_attr_remove(ip, name, xflags);
+	return -xfs_attr_set(ip, name, (void *)value, size, xflags);
+}
+
+static int
+xfs_xattr_user_get(struct inode *inode, const char *name,
+		void *value, size_t size)
+{
+	return __xfs_xattr_get(inode, name, value, size, 0);
+}
+
+static int
+xfs_xattr_user_set(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags)
+{
+	return __xfs_xattr_set(inode, name, value, size, flags, 0);
+}
+
+static struct xattr_handler xfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.get	= xfs_xattr_user_get,
+	.set	= xfs_xattr_user_set,
+};
+
+
+static int
+xfs_xattr_trusted_get(struct inode *inode, const char *name,
+		void *value, size_t size)
+{
+	return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT);
+}
+
+static int
+xfs_xattr_trusted_set(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags)
+{
+	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT);
+}
+
+static struct xattr_handler xfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.get	= xfs_xattr_trusted_get,
+	.set	= xfs_xattr_trusted_set,
+};
+
+
+static int
+xfs_xattr_secure_get(struct inode *inode, const char *name,
+		void *value, size_t size)
+{
+	return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE);
+}
+
+static int
+xfs_xattr_secure_set(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags)
+{
+	return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE);
+}
+
+static struct xattr_handler xfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.get	= xfs_xattr_secure_get,
+	.set	= xfs_xattr_secure_set,
+};
+
+
+struct xattr_handler *xfs_xattr_handlers[] = {
+	&xfs_xattr_user_handler,
+	&xfs_xattr_trusted_handler,
+	&xfs_xattr_security_handler,
+	&xfs_xattr_system_handler,
+	NULL
+};
+
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return sizeof("security");
+	else if (flags & XFS_ATTR_ROOT)
+		return sizeof("trusted");
+	else
+		return sizeof("user");
+}
+
+static const char *xfs_xattr_prefix(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return xfs_xattr_security_handler.prefix;
+	else if (flags & XFS_ATTR_ROOT)
+		return xfs_xattr_trusted_handler.prefix;
+	else
+		return xfs_xattr_user_handler.prefix;
+}
+
+static int
+xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
+		char *name, int namelen, int valuelen, char *value)
+{
+	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+	char *offset;
+	int arraytop;
+
+	ASSERT(context->count >= 0);
+
+	/*
+	 * Only show root namespace entries if we are actually allowed to
+	 * see them.
+	 */
+	if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+		return 0;
+
+	arraytop = context->count + prefix_len + namelen + 1;
+	if (arraytop > context->firstu) {
+		context->count = -1;	/* insufficient space */
+		return 1;
+	}
+	offset = (char *)context->alist + context->count;
+	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+	offset += prefix_len;
+	strncpy(offset, name, namelen);			/* real name */
+	offset += namelen;
+	*offset = '\0';
+	context->count += prefix_len + namelen + 1;
+	return 0;
+}
+
+static int
+xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
+		char *name, int namelen, int valuelen, char *value)
+{
+	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+	return 0;
+}
+
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+		size_t size, ssize_t *result)
+{
+	char *p = data + *result;
+
+	*result += len;
+	if (!size)
+		return 0;
+	if (*result > size)
+		return -ERANGE;
+
+	strcpy(p, name);
+	return 0;
+}
+
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+	struct xfs_attr_list_context context;
+	struct attrlist_cursor_kern cursor = { 0 };
+	struct inode		*inode = dentry->d_inode;
+	int			error;
+
+	/*
+	 * First read the regular on-disk attributes.
+	 */
+	memset(&context, 0, sizeof(context));
+	context.dp = XFS_I(inode);
+	context.cursor = &cursor;
+	context.resynch = 1;
+	context.alist = data;
+	context.bufsize = size;
+	context.firstu = context.bufsize;
+
+	if (size)
+		context.put_listent = xfs_xattr_put_listent;
+	else
+		context.put_listent = xfs_xattr_put_listent_sizes;
+
+	xfs_attr_list_int(&context);
+	if (context.count < 0)
+		return -ERANGE;
+
+	/*
+	 * Then add the two synthetic ACL attributes.
+	 */
+	if (xfs_acl_vhasacl_access(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+				strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	if (xfs_acl_vhasacl_default(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+				strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	return context.count;
+}
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 85df3288efd5..f2705f2fd43c 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,11 +101,18 @@ xfs_qm_dqinit(
 	if (brandnewdquot) {
 		dqp->dq_flnext = dqp->dq_flprev = dqp;
 		mutex_init(&dqp->q_qlock);
-		initnsema(&dqp->q_flock, 1, "fdq");
 		sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
 
+		/*
+		 * Because we want to use a counting completion, complete
+		 * the flush completion once to allow a single access to
+		 * the flush completion without blocking.
+		 */
+		init_completion(&dqp->q_flush);
+		complete(&dqp->q_flush);
+
 #ifdef XFS_DQUOT_TRACE
-		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
+		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
 		xfs_dqtrace_entry(dqp, "DQINIT");
 #endif
 	} else {
@@ -150,7 +157,6 @@ xfs_qm_dqdestroy(
 	ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
 
 	mutex_destroy(&dqp->q_qlock);
-	freesema(&dqp->q_flock);
 	sv_destroy(&dqp->q_pinwait);
 
 #ifdef XFS_DQUOT_TRACE
@@ -431,7 +437,7 @@ xfs_qm_dqalloc(
 	 * when it unlocks the inode. Since we want to keep the quota
 	 * inode around, we bump the vnode ref count now.
 	 */
-	VN_HOLD(XFS_ITOV(quotip));
+	IHOLD(quotip);
 
 	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
 	nmaps = 1;
@@ -1211,7 +1217,7 @@ xfs_qm_dqflush(
 	int			error;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+	ASSERT(!completion_done(&dqp->q_flush));
 	xfs_dqtrace_entry(dqp, "DQFLUSH");
 
 	/*
@@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done(
 	xfs_dqfunlock(dqp);
 }
 
-
-int
-xfs_qm_dqflock_nowait(
-	xfs_dquot_t *dqp)
-{
-	int locked;
-
-	locked = cpsema(&((dqp)->q_flock));
-
-	/* XXX ifdef these out */
-	if (locked)
-		(dqp)->dq_flags |= XFS_DQ_FLOCKED;
-	return (locked);
-}
-
-
 int
 xfs_qm_dqlock_nowait(
 	xfs_dquot_t *dqp)
 {
-	return (mutex_trylock(&((dqp)->q_qlock)));
+	return mutex_trylock(&dqp->q_qlock);
 }
 
 void
 xfs_dqlock(
 	xfs_dquot_t *dqp)
 {
-	mutex_lock(&(dqp->q_qlock));
+	mutex_lock(&dqp->q_qlock);
 }
 
 void
@@ -1435,8 +1425,7 @@ xfs_dqlock2(
 /* ARGSUSED */
 int
 xfs_qm_dqpurge(
-	xfs_dquot_t	*dqp,
-	uint		flags)
+	xfs_dquot_t	*dqp)
 {
 	xfs_dqhash_t	*thishash;
 	xfs_mount_t	*mp = dqp->q_mount;
@@ -1469,7 +1458,7 @@ xfs_qm_dqpurge(
 	 * if we're turning off quotas. Basically, we need this flush
 	 * lock, and are willing to block on it.
 	 */
-	if (! xfs_qm_dqflock_nowait(dqp)) {
+	if (!xfs_dqflock_nowait(dqp)) {
 		/*
 		 * Block on the flush lock after nudging dquot buffer,
 		 * if it is incore.
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 5c371a92e3e2..8958d0faf8d3 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -82,7 +82,7 @@ typedef struct xfs_dquot {
 	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
 	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
 	mutex_t		 q_qlock;	/* quota lock */
-	sema_t		 q_flock;	/* flush lock */
+	struct completion q_flush;	/* flush completion queue */
 	uint		 q_pincount;	/* pin count for this dquot */
 	sv_t		 q_pinwait;	/* sync var for pinning */
 #ifdef XFS_DQUOT_TRACE
@@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 
 
 /*
- * The following three routines simply manage the q_flock
- * semaphore embedded in the dquot.  This semaphore synchronizes
- * processes attempting to flush the in-core dquot back to disk.
+ * Manage the q_flush completion queue embedded in the dquot.  This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
  */
-#define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
-				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp)	 { ASSERT(issemalocked(&((dqp)->q_flock))); \
-				   vsema(&((dqp)->q_flock)); \
-				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+	wait_for_completion(&dqp->q_flush);
+}
+
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+	return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+	complete(&dqp->q_flush);
+}
 
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
@@ -164,10 +172,9 @@ extern void		xfs_qm_dqprint(xfs_dquot_t *);
 
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int		xfs_qm_dqpurge(xfs_dquot_t *, uint);
+extern int		xfs_qm_dqpurge(xfs_dquot_t *);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern int		xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern int		xfs_qm_dqflock_nowait(xfs_dquot_t *);
 extern void		xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
 extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
 					xfs_disk_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 36e05ca78412..f028644caa5e 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push(
 	dqp = logitem->qli_dquot;
 
 	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+	ASSERT(!completion_done(&dqp->q_flush));
 
 	/*
 	 * Since we were able to lock the dquot's flush lock and
@@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if (!issemalocked(&(dqp->q_flock))  ||
+	if (completion_done(&dqp->q_flush)  ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
@@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  issemalocked(&(dqp->q_flock)));
+				  !completion_done(&dqp->q_flush));
 			qip->qli_pushbuf_flag = 0;
 			xfs_dqunlock(dqp);
 
@@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock(
 		return (XFS_ITEM_LOCKED);
 
 	retval = XFS_ITEM_SUCCESS;
-	if (! xfs_qm_dqflock_nowait(dqp)) {
+	if (!xfs_dqflock_nowait(dqp)) {
 		/*
 		 * The dquot is already being flushed.	It may have been
 		 * flushed delayed write, however, and we don't want to
@@ -576,8 +576,8 @@ xfs_qm_qoffend_logitem_committed(
 	 * xfs_trans_delete_ail() drops the AIL lock.
 	 */
 	xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs);
-	kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
-	kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
+	kmem_free(qfs);
+	kmem_free(qfe);
 	return (xfs_lsn_t)-1;
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index d31cce1165c5..df0ffef9775a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -192,8 +192,8 @@ xfs_qm_destroy(
 		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
 		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
 	}
-	kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
-	kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
+	kmem_free(xqm->qm_usr_dqhtable);
+	kmem_free(xqm->qm_grp_dqhtable);
 	xqm->qm_usr_dqhtable = NULL;
 	xqm->qm_grp_dqhtable = NULL;
 	xqm->qm_dqhashmask = 0;
@@ -201,7 +201,7 @@ xfs_qm_destroy(
 #ifdef DEBUG
 	mutex_destroy(&qcheck_lock);
 #endif
-	kmem_free(xqm, sizeof(xfs_qm_t));
+	kmem_free(xqm);
 }
 
 /*
@@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy(
  */
 void
 xfs_qm_mount_quotas(
-	xfs_mount_t	*mp,
-	int		mfsi_flags)
+	xfs_mount_t	*mp)
 {
 	int		error = 0;
 	uint		sbf;
@@ -346,8 +345,7 @@ xfs_qm_mount_quotas(
 	/*
 	 * If any of the quotas are not consistent, do a quotacheck.
 	 */
-	if (XFS_QM_NEED_QUOTACHECK(mp) &&
-	    !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
 		error = xfs_qm_quotacheck(mp);
 		if (error) {
 			/* Quotacheck failed and disabled quotas. */
@@ -445,11 +443,11 @@ xfs_qm_unmount_quotas(
 		}
 	}
 	if (uqp) {
-		 XFS_PURGE_INODE(uqp);
+		 IRELE(uqp);
 		 mp->m_quotainfo->qi_uquotaip = NULL;
 	}
 	if (gqp) {
-		XFS_PURGE_INODE(gqp);
+		IRELE(gqp);
 		mp->m_quotainfo->qi_gquotaip = NULL;
 	}
 out:
@@ -484,7 +482,7 @@ again:
 		xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
 		/* XXX a sentinel would be better */
 		recl = XFS_QI_MPLRECLAIMS(mp);
-		if (! xfs_qm_dqflock_nowait(dqp)) {
+		if (!xfs_dqflock_nowait(dqp)) {
 			/*
 			 * If we can't grab the flush lock then check
 			 * to see if the dquot has been flushed delayed
@@ -631,7 +629,7 @@ xfs_qm_dqpurge_int(
 		 * freelist in INACTIVE state.
 		 */
 		nextdqp = dqp->MPL_NEXT;
-		nmisses += xfs_qm_dqpurge(dqp, flags);
+		nmisses += xfs_qm_dqpurge(dqp);
 		dqp = nextdqp;
 	}
 	xfs_qm_mplist_unlock(mp);
@@ -1062,7 +1060,7 @@ xfs_qm_sync(
 
 		/* XXX a sentinel would be better */
 		recl = XFS_QI_MPLRECLAIMS(mp);
-		if (! xfs_qm_dqflock_nowait(dqp)) {
+		if (!xfs_dqflock_nowait(dqp)) {
 			if (nowait) {
 				xfs_dqunlock(dqp);
 				continue;
@@ -1134,7 +1132,7 @@ xfs_qm_init_quotainfo(
 	 * and change the superblock accordingly.
 	 */
 	if ((error = xfs_qm_init_quotainos(mp))) {
-		kmem_free(qinf, sizeof(xfs_quotainfo_t));
+		kmem_free(qinf);
 		mp->m_quotainfo = NULL;
 		return error;
 	}
@@ -1240,15 +1238,15 @@ xfs_qm_destroy_quotainfo(
 	xfs_qm_list_destroy(&qi->qi_dqlist);
 
 	if (qi->qi_uquotaip) {
-		XFS_PURGE_INODE(qi->qi_uquotaip);
+		IRELE(qi->qi_uquotaip);
 		qi->qi_uquotaip = NULL; /* paranoia */
 	}
 	if (qi->qi_gquotaip) {
-		XFS_PURGE_INODE(qi->qi_gquotaip);
+		IRELE(qi->qi_gquotaip);
 		qi->qi_gquotaip = NULL;
 	}
 	mutex_destroy(&qi->qi_quotaofflock);
-	kmem_free(qi, sizeof(xfs_quotainfo_t));
+	kmem_free(qi);
 	mp->m_quotainfo = NULL;
 }
 
@@ -1394,7 +1392,7 @@ xfs_qm_qino_alloc(
 	 * locked exclusively and joined to the transaction already.
 	 */
 	ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
-	VN_HOLD(XFS_ITOV((*ip)));
+	IHOLD(*ip);
 
 	/*
 	 * Make the changes in the superblock, and log those too.
@@ -1623,7 +1621,7 @@ xfs_qm_dqiterate(
 			break;
 	} while (nmaps > 0);
 
-	kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
+	kmem_free(map);
 
 	return error;
 }
@@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist(
 		 * Try to grab the flush lock. If this dquot is in the process of
 		 * getting flushed to disk, we don't want to reclaim it.
 		 */
-		if (! xfs_qm_dqflock_nowait(dqp)) {
+		if (!xfs_dqflock_nowait(dqp)) {
 			xfs_dqunlock(dqp);
 			dqp = dqp->dq_flnext;
 			continue;
@@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void)
 		 * Try to grab the flush lock. If this dquot is in the process of
 		 * getting flushed to disk, we don't want to reclaim it.
 		 */
-		if (! xfs_qm_dqflock_nowait(dqp)) {
+		if (!xfs_dqflock_nowait(dqp)) {
 			xfs_dqunlock(dqp);
 			continue;
 		}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index cd2300e374af..44f25349e478 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
 
 extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern void		xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void		xfs_qm_mount_quotas(xfs_mount_t *);
 extern int		xfs_qm_quotacheck(xfs_mount_t *);
 extern void		xfs_qm_unmount_quotadestroy(xfs_mount_t *);
 extern int		xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f4f6c4c861d7..eea2e60b456b 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -162,7 +162,7 @@ xfs_qm_newmount(
 			 * mounting, and get on with the boring life
 			 * without disk quotas.
 			 */
-			xfs_qm_mount_quotas(mp, 0);
+			xfs_qm_mount_quotas(mp);
 		} else {
 			/*
 			 * Clear the quota flags, but remember them. This
@@ -184,13 +184,12 @@ STATIC int
 xfs_qm_endmount(
 	xfs_mount_t	*mp,
 	uint		needquotamount,
-	uint		quotaflags,
-	int		mfsi_flags)
+	uint		quotaflags)
 {
 	if (needquotamount) {
 		ASSERT(mp->m_qflags == 0);
 		mp->m_qflags = quotaflags;
-		xfs_qm_mount_quotas(mp, mfsi_flags);
+		xfs_qm_mount_quotas(mp);
 	}
 
 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 768a3b27d2b6..1a3b803dfa55 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -362,11 +362,11 @@ xfs_qm_scall_quotaoff(
 	 * if we don't need them anymore.
 	 */
 	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
-		XFS_PURGE_INODE(XFS_QI_UQIP(mp));
+		IRELE(XFS_QI_UQIP(mp));
 		XFS_QI_UQIP(mp) = NULL;
 	}
 	if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
-		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
+		IRELE(XFS_QI_GQIP(mp));
 		XFS_QI_GQIP(mp) = NULL;
 	}
 out_error:
@@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes(
 {
 	xfs_inode_t	*ip, *topino;
 	uint		ireclaims;
-	bhv_vnode_t	*vp;
+	struct inode	*vp;
 	boolean_t	vnode_refd;
 
 	ASSERT(mp->m_quotainfo);
@@ -1059,7 +1059,7 @@ again:
 			ip = ip->i_mnext;
 			continue;
 		}
-		vp = XFS_ITOV_NULL(ip);
+		vp = VFS_I(ip);
 		if (!vp) {
 			ASSERT(ip->i_udquot == NULL);
 			ASSERT(ip->i_gdquot == NULL);
@@ -1449,14 +1449,14 @@ xfs_qm_internalqcheck(
 		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
 			xfs_dqtest_cmp(d);
 			e = (xfs_dqtest_t *) d->HL_NEXT;
-			kmem_free(d, sizeof(xfs_dqtest_t));
+			kmem_free(d);
 			d = e;
 		}
 		h1 = &qmtest_gdqtab[i];
 		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
 			xfs_dqtest_cmp(d);
 			e = (xfs_dqtest_t *) d->HL_NEXT;
-			kmem_free(d, sizeof(xfs_dqtest_t));
+			kmem_free(d);
 			d = e;
 		}
 	}
@@ -1467,8 +1467,8 @@ xfs_qm_internalqcheck(
 	} else {
 		cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
 	}
-	kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
-	kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+	kmem_free(qmtest_udqtab);
+	kmem_free(qmtest_gdqtab);
 	mutex_unlock(&qcheck_lock);
 	return (qmtest_nfails);
 }
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 5e4a40b1c565..c4fcea600bc2 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -158,9 +158,6 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
 #define XFS_IS_SUSER_DQUOT(dqp)		\
 	(!((dqp)->q_core.d_id))
 
-#define XFS_PURGE_INODE(ip)		\
-	IRELE(ip);
-
 #define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
 				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
 				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 0b75d302508f..a34ef05489b1 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -89,7 +89,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 		if (sleep & KM_SLEEP)
 			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
 
-		kmem_free(ktp, sizeof(*ktp));
+		kmem_free(ktp);
 
 		return NULL;
 	}
@@ -126,7 +126,7 @@ ktrace_free(ktrace_t *ktp)
 	} else {
 		entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
 
-		kmem_free(ktp->kt_entries, entries_size);
+		kmem_free(ktp->kt_entries);
 	}
 
 	kmem_zone_free(ktrace_hdr_zone, ktp);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 493a6ecf8590..5830c040ea7e 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,7 +17,7 @@
  */
 #include <xfs.h>
 
-static mutex_t	uuid_monitor;
+static DEFINE_MUTEX(uuid_monitor);
 static int	uuid_table_size;
 static uuid_t	*uuid_table;
 
@@ -132,9 +132,3 @@ uuid_table_remove(uuid_t *uuid)
 	ASSERT(i < uuid_table_size);
 	mutex_unlock(&uuid_monitor);
 }
-
-void __init
-uuid_init(void)
-{
-	mutex_init(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index b6f5922199ba..cff5b607d445 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,7 +22,6 @@ typedef struct {
 	unsigned char	__u_bits[16];
 } uuid_t;
 
-extern void uuid_init(void);
 extern void uuid_create_nil(uuid_t *uuid);
 extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index ebee3a4f703a..b2f639a1416f 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,15 +37,15 @@
 #include <linux/capability.h>
 #include <linux/posix_acl_xattr.h>
 
-STATIC int	xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
+STATIC int	xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
 STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
 STATIC void	xfs_acl_get_endian(xfs_acl_t *);
 STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
 STATIC int	xfs_acl_invalid(xfs_acl_t *);
 STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void	xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
-STATIC int	xfs_acl_allow_set(bhv_vnode_t *, int);
+STATIC void	xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(struct inode *, int);
 
 kmem_zone_t *xfs_acl_zone;
 
@@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
  */
 int
 xfs_acl_vhasacl_access(
-	bhv_vnode_t	*vp)
+	struct inode	*vp)
 {
 	int		error;
 
@@ -68,7 +68,7 @@ xfs_acl_vhasacl_access(
  */
 int
 xfs_acl_vhasacl_default(
-	bhv_vnode_t	*vp)
+	struct inode	*vp)
 {
 	int		error;
 
@@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr(
 
 int
 xfs_acl_vget(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	void		*acl,
 	size_t		size,
 	int		kind)
@@ -217,7 +217,6 @@ xfs_acl_vget(
 	posix_acl_xattr_header	*ext_acl = acl;
 	int			flags = 0;
 
-	VN_HOLD(vp);
 	if(size) {
 		if (!(_ACL_ALLOC(xfs_acl))) {
 			error = ENOMEM;
@@ -239,11 +238,10 @@ xfs_acl_vget(
 			goto out;
 		}
 		if (kind == _ACL_TYPE_ACCESS)
-			xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl);
+			xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
 		error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
 	}
 out:
-	VN_RELE(vp);
 	if(xfs_acl)
 		_ACL_FREE(xfs_acl);
 	return -error;
@@ -251,28 +249,26 @@ out:
 
 int
 xfs_acl_vremove(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	int		kind)
 {
 	int		error;
 
-	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
 	if (!error) {
-		error = xfs_attr_remove(xfs_vtoi(vp),
+		error = xfs_attr_remove(XFS_I(vp),
 						kind == _ACL_TYPE_DEFAULT?
 						SGI_ACL_DEFAULT: SGI_ACL_FILE,
 						ATTR_ROOT);
 		if (error == ENOATTR)
 			error = 0;	/* 'scool */
 	}
-	VN_RELE(vp);
 	return -error;
 }
 
 int
 xfs_acl_vset(
-	bhv_vnode_t		*vp,
+	struct inode		*vp,
 	void			*acl,
 	size_t			size,
 	int			kind)
@@ -298,7 +294,6 @@ xfs_acl_vset(
 		return 0;
 	}
 
-	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
 
 	/* Incoming ACL exists, set file mode based on its value */
@@ -321,7 +316,6 @@ xfs_acl_vset(
 	}
 
 out:
-	VN_RELE(vp);
 	_ACL_FREE(xfs_acl);
 	return -error;
 }
@@ -341,8 +335,7 @@ xfs_acl_iaccess(
 
 	/* If the file has no ACL return -1. */
 	rval = sizeof(xfs_acl_t);
-	if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval,
-					ATTR_ROOT | ATTR_KERNACCESS)) {
+	if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
 		_ACL_FREE(acl);
 		return -1;
 	}
@@ -364,7 +357,7 @@ xfs_acl_iaccess(
 
 STATIC int
 xfs_acl_allow_set(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	int		kind)
 {
 	if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
@@ -373,7 +366,7 @@ xfs_acl_allow_set(
 		return ENOTDIR;
 	if (vp->i_sb->s_flags & MS_RDONLY)
 		return EROFS;
-	if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
+	if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
 		return EPERM;
 	return 0;
 }
@@ -567,7 +560,7 @@ xfs_acl_get_endian(
  */
 STATIC void
 xfs_acl_get_attr(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		flags,
@@ -577,7 +570,7 @@ xfs_acl_get_attr(
 
 	ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
 	flags |= ATTR_ROOT;
-	*error = xfs_attr_get(xfs_vtoi(vp),
+	*error = xfs_attr_get(XFS_I(vp),
 					kind == _ACL_TYPE_ACCESS ?
 					SGI_ACL_FILE : SGI_ACL_DEFAULT,
 					(char *)aclp, &len, flags);
@@ -591,7 +584,7 @@ xfs_acl_get_attr(
  */
 STATIC void
 xfs_acl_set_attr(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		*error)
@@ -616,7 +609,7 @@ xfs_acl_set_attr(
 		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
 	}
 	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	*error = xfs_attr_set(xfs_vtoi(vp),
+	*error = xfs_attr_set(XFS_I(vp),
 				kind == _ACL_TYPE_ACCESS ?
 				SGI_ACL_FILE: SGI_ACL_DEFAULT,
 				(char *)newacl, len, ATTR_ROOT);
@@ -625,7 +618,7 @@ xfs_acl_set_attr(
 
 int
 xfs_acl_vtoacl(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	xfs_acl_t	*access_acl,
 	xfs_acl_t	*default_acl)
 {
@@ -640,7 +633,7 @@ xfs_acl_vtoacl(
 		if (error)
 			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
 		else /* We have a good ACL and the file mode, synchronize. */
-			xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl);
+			xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
 	}
 
 	if (default_acl) {
@@ -657,7 +650,7 @@ xfs_acl_vtoacl(
  */
 int
 xfs_acl_inherit(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	mode_t		mode,
 	xfs_acl_t	*pdaclp)
 {
@@ -716,11 +709,11 @@ out_error:
  */
 STATIC int
 xfs_acl_setmode(
-	bhv_vnode_t	*vp,
+	struct inode	*vp,
 	xfs_acl_t	*acl,
 	int		*basicperms)
 {
-	bhv_vattr_t	va;
+	struct iattr	iattr;
 	xfs_acl_entry_t	*ap;
 	xfs_acl_entry_t	*gap = NULL;
 	int		i, nomask = 1;
@@ -734,25 +727,25 @@ xfs_acl_setmode(
 	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
 	 * mode.  The m:: bits take precedence over the g:: bits.
 	 */
-	va.va_mask = XFS_AT_MODE;
-	va.va_mode = xfs_vtoi(vp)->i_d.di_mode;
-	va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
+	iattr.ia_valid = ATTR_MODE;
+	iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
+	iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
 	ap = acl->acl_entry;
 	for (i = 0; i < acl->acl_cnt; ++i) {
 		switch (ap->ae_tag) {
 		case ACL_USER_OBJ:
-			va.va_mode |= ap->ae_perm << 6;
+			iattr.ia_mode |= ap->ae_perm << 6;
 			break;
 		case ACL_GROUP_OBJ:
 			gap = ap;
 			break;
 		case ACL_MASK:	/* more than just standard modes */
 			nomask = 0;
-			va.va_mode |= ap->ae_perm << 3;
+			iattr.ia_mode |= ap->ae_perm << 3;
 			*basicperms = 0;
 			break;
 		case ACL_OTHER:
-			va.va_mode |= ap->ae_perm;
+			iattr.ia_mode |= ap->ae_perm;
 			break;
 		default:	/* more than just standard modes */
 			*basicperms = 0;
@@ -763,9 +756,9 @@ xfs_acl_setmode(
 
 	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
 	if (gap && nomask)
-		va.va_mode |= gap->ae_perm << 3;
+		iattr.ia_mode |= gap->ae_perm << 3;
 
-	return xfs_setattr(xfs_vtoi(vp), &va, 0, sys_cred);
+	return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
 }
 
 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 332a772461c4..a4e293b93efa 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -46,6 +46,8 @@ typedef struct xfs_acl {
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
+#define _ACL_TYPE_ACCESS	1
+#define _ACL_TYPE_DEFAULT	2
 
 #ifdef CONFIG_XFS_POSIX_ACL
 
@@ -57,17 +59,15 @@ extern struct kmem_zone *xfs_acl_zone;
 		(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
 
-extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *);
+extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(bhv_vnode_t *);
-extern int xfs_acl_vhasacl_default(bhv_vnode_t *);
-extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int);
-extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int);
-extern int xfs_acl_vremove(bhv_vnode_t *, int);
+extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct inode *);
+extern int xfs_acl_vhasacl_default(struct inode *);
+extern int xfs_acl_vset(struct inode *, void *, size_t, int);
+extern int xfs_acl_vget(struct inode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct inode *, int);
 
-#define _ACL_TYPE_ACCESS	1
-#define _ACL_TYPE_DEFAULT	2
 #define _ACL_PERM_INVALID(perm)	((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
 
 #define _ACL_INHERIT(c,m,d)	(xfs_acl_inherit(c,m,d))
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index f9472a2076d4..0b3b5efe848c 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -92,16 +92,6 @@
 	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
     }
 
-/* define generic INT_ macros */
-
-#define INT_GET(reference,arch) \
-    (((arch) == ARCH_NOCONVERT) \
-	? \
-	    (reference) \
-	: \
-	    INT_SWAP((reference),(reference)) \
-    )
-
 /* does not return a value */
 #define INT_SET(reference,arch,valueref) \
     (__builtin_constant_p(valueref) ? \
@@ -112,64 +102,6 @@
 	) \
     )
 
-/* does not return a value */
-#define INT_MOD_EXPR(reference,arch,code) \
-    (((arch) == ARCH_NOCONVERT) \
-	? \
-	    (void)((reference) code) \
-	: \
-	    (void)( \
-		(reference) = INT_GET((reference),arch) , \
-		((reference) code), \
-		INT_SET(reference, arch, reference) \
-	    ) \
-    )
-
-/* does not return a value */
-#define INT_MOD(reference,arch,delta) \
-    (void)( \
-	INT_MOD_EXPR(reference,arch,+=(delta)) \
-    )
-
-/*
- * INT_COPY - copy a value between two locations with the
- *	      _same architecture_ but _potentially different sizes_
- *
- *	    if the types of the two parameters are equal or they are
- *		in native architecture, a simple copy is done
- *
- *	    otherwise, architecture conversions are done
- *
- */
-
-/* does not return a value */
-#define INT_COPY(dst,src,arch) \
-    ( \
-	((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
-	    ? \
-		(void)((dst) = (src)) \
-	    : \
-		INT_SET(dst, arch, INT_GET(src, arch)) \
-    )
-
-/*
- * INT_XLATE - copy a value in either direction between two locations
- *	       with different architectures
- *
- *		    dir < 0	- copy from memory to buffer (native to arch)
- *		    dir > 0	- copy from buffer to memory (arch to native)
- */
-
-/* does not return a value */
-#define INT_XLATE(buf,mem,dir,arch) {\
-    ASSERT(dir); \
-    if (dir>0) { \
-	(mem)=INT_GET(buf, arch); \
-    } else { \
-	INT_SET(buf, arch, mem); \
-    } \
-}
-
 /*
  * In directories inode numbers are stored as unaligned arrays of unsigned
  * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index df151a859186..f7cdc28aff41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -16,8 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#include <linux/capability.h>
-
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
@@ -57,11 +55,6 @@
  * Provide the external interfaces to manage attribute lists.
  */
 
-#define ATTR_SYSCOUNT	2
-static struct attrnames posix_acl_access;
-static struct attrnames posix_acl_default;
-static struct attrnames *attr_system_names[ATTR_SYSCOUNT];
-
 /*========================================================================
  * Function prototypes for the kernel.
  *========================================================================*/
@@ -116,6 +109,17 @@ xfs_attr_name_to_xname(
 	return 0;
 }
 
+STATIC int
+xfs_inode_hasattr(
+	struct xfs_inode	*ip)
+{
+	if (!XFS_IFORK_Q(ip) ||
+	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_d.di_anextents == 0))
+		return 0;
+	return 1;
+}
+
 /*========================================================================
  * Overall external interface routines.
  *========================================================================*/
@@ -127,10 +131,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
 	xfs_da_args_t   args;
 	int             error;
 
-	if ((XFS_IFORK_Q(ip) == 0) ||
-	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     ip->i_d.di_anextents == 0))
-		return(ENOATTR);
+	if (!xfs_inode_hasattr(ip))
+		return ENOATTR;
 
 	/*
 	 * Fill in the arg structure for this request.
@@ -148,11 +150,7 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
-	if (XFS_IFORK_Q(ip) == 0 ||
-	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     ip->i_d.di_anextents == 0)) {
-		error = XFS_ERROR(ENOATTR);
-	} else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+	if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
 		error = xfs_attr_shortform_getvalue(&args);
 	} else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
 		error = xfs_attr_leaf_get(&args);
@@ -196,6 +194,46 @@ xfs_attr_get(
 	return(error);
 }
 
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+int
+xfs_attr_calc_size(
+	struct xfs_inode 	*ip,
+	int			namelen,
+	int			valuelen,
+	int			*local)
+{
+	struct xfs_mount 	*mp = ip->i_mount;
+	int			size;
+	int			nblks;
+
+	/*
+	 * Determine space new attribute will use, and if it would be
+	 * "local" or "remote" (note: local != inline).
+	 */
+	size = xfs_attr_leaf_newentsize(namelen, valuelen,
+					mp->m_sb.sb_blocksize, local);
+
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	if (*local) {
+		if (size > (mp->m_sb.sb_blocksize >> 1)) {
+			/* Double split possible */
+			nblks *= 2;
+		}
+	} else {
+		/*
+		 * Out of line attribute, cannot double split, but
+		 * make room for the attribute value itself.
+		 */
+		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		nblks += dblocks;
+		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+	}
+
+	return nblks;
+}
+
 STATIC int
 xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 		char *value, int valuelen, int flags)
@@ -204,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	xfs_fsblock_t	firstblock;
 	xfs_bmap_free_t flist;
 	int		error, err2, committed;
-	int		local, size;
-	uint		nblks;
 	xfs_mount_t	*mp = dp->i_mount;
 	int             rsvd = (flags & ATTR_ROOT) != 0;
+	int		local;
 
 	/*
 	 * Attach the dquots to the inode.
@@ -241,33 +278,10 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	args.firstblock = &firstblock;
 	args.flist = &flist;
 	args.whichfork = XFS_ATTR_FORK;
-	args.addname = 1;
-	args.oknoent = 1;
-
-	/*
-	 * Determine space new attribute will use, and if it would be
-	 * "local" or "remote" (note: local != inline).
-	 */
-	size = xfs_attr_leaf_newentsize(name->len, valuelen,
-					mp->m_sb.sb_blocksize, &local);
-
-	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
-	if (local) {
-		if (size > (mp->m_sb.sb_blocksize >> 1)) {
-			/* Double split possible */
-			nblks <<= 1;
-		}
-	} else {
-		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
-		/* Out of line attribute, cannot double split, but make
-		 * room for the attribute value itself.
-		 */
-		nblks += dblocks;
-		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
-	}
+	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
 
 	/* Size is now blocks for attribute data */
-	args.total = nblks;
+	args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
 
 	/*
 	 * Start our first transaction of the day.
@@ -289,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 	if (rsvd)
 		args.trans->t_flags |= XFS_TRANS_RESERVE;
 
-	if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
-				      XFS_ATTRSET_LOG_RES(mp, nblks),
-				      0, XFS_TRANS_PERM_LOG_RES,
-				      XFS_ATTRSET_LOG_COUNT))) {
+	if ((error = xfs_trans_reserve(args.trans, args.total,
+			XFS_ATTRSET_LOG_RES(mp, args.total), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
 		xfs_trans_cancel(args.trans, 0);
 		return(error);
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
-	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
-			 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
-				XFS_QMOPT_RES_REGBLKS);
+	error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+				rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+				       XFS_QMOPT_RES_REGBLKS);
 	if (error) {
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 		xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
@@ -387,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 		 * Commit the leaf transformation.  We'll need another (linked)
 		 * transaction to add the new attribute to the leaf.
 		 */
-		if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+
+		error = xfs_trans_roll(&args.trans, dp);
+		if (error)
 			goto out;
 
 	}
@@ -529,9 +544,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
-	if (XFS_IFORK_Q(dp) == 0 ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
+	if (!xfs_inode_hasattr(dp)) {
 		error = XFS_ERROR(ENOATTR);
 		goto out;
 	}
@@ -601,29 +614,33 @@ xfs_attr_remove(
 		return error;
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	if (XFS_IFORK_Q(dp) == 0 ||
-		   (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-		    dp->i_d.di_anextents == 0)) {
+	if (!xfs_inode_hasattr(dp)) {
 		xfs_iunlock(dp, XFS_ILOCK_SHARED);
-		return(XFS_ERROR(ENOATTR));
+		return XFS_ERROR(ENOATTR);
 	}
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
 
 	return xfs_attr_remove_int(dp, &xname, flags);
 }
 
-STATIC int
+int
 xfs_attr_list_int(xfs_attr_list_context_t *context)
 {
 	int error;
 	xfs_inode_t *dp = context->dp;
 
+	XFS_STATS_INC(xs_attr_list);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return EIO;
+
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	xfs_attr_trace_l_c("syscall start", context);
+
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
-	if (XFS_IFORK_Q(dp) == 0 ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
+	if (!xfs_inode_hasattr(dp)) {
 		error = 0;
 	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
 		error = xfs_attr_shortform_list(context);
@@ -632,6 +649,10 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
 	} else {
 		error = xfs_attr_node_list(context);
 	}
+
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+	xfs_attr_trace_l_c("syscall end", context);
+
 	return error;
 }
 
@@ -648,74 +669,50 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
  */
 /*ARGSUSED*/
 STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp,
+xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
 		     char *name, int namelen,
 		     int valuelen, char *value)
 {
+	struct attrlist *alist = (struct attrlist *)context->alist;
 	attrlist_ent_t *aep;
 	int arraytop;
 
 	ASSERT(!(context->flags & ATTR_KERNOVAL));
 	ASSERT(context->count >= 0);
 	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
-	ASSERT(context->firstu >= sizeof(*context->alist));
+	ASSERT(context->firstu >= sizeof(*alist));
 	ASSERT(context->firstu <= context->bufsize);
 
-	arraytop = sizeof(*context->alist) +
-			context->count * sizeof(context->alist->al_offset[0]);
+	/*
+	 * Only list entries in the right namespace.
+	 */
+	if (((context->flags & ATTR_SECURE) == 0) !=
+	    ((flags & XFS_ATTR_SECURE) == 0))
+		return 0;
+	if (((context->flags & ATTR_ROOT) == 0) !=
+	    ((flags & XFS_ATTR_ROOT) == 0))
+		return 0;
+
+	arraytop = sizeof(*alist) +
+			context->count * sizeof(alist->al_offset[0]);
 	context->firstu -= ATTR_ENTSIZE(namelen);
 	if (context->firstu < arraytop) {
 		xfs_attr_trace_l_c("buffer full", context);
-		context->alist->al_more = 1;
+		alist->al_more = 1;
 		context->seen_enough = 1;
 		return 1;
 	}
 
-	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+	aep = (attrlist_ent_t *)&context->alist[context->firstu];
 	aep->a_valuelen = valuelen;
 	memcpy(aep->a_name, name, namelen);
-	aep->a_name[ namelen ] = 0;
-	context->alist->al_offset[ context->count++ ] = context->firstu;
-	context->alist->al_count = context->count;
+	aep->a_name[namelen] = 0;
+	alist->al_offset[context->count++] = context->firstu;
+	alist->al_count = context->count;
 	xfs_attr_trace_l_c("add", context);
 	return 0;
 }
 
-STATIC int
-xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp,
-		     char *name, int namelen,
-		     int valuelen, char *value)
-{
-	char *offset;
-	int arraytop;
-
-	ASSERT(context->count >= 0);
-
-	arraytop = context->count + namesp->attr_namelen + namelen + 1;
-	if (arraytop > context->firstu) {
-		context->count = -1;	/* insufficient space */
-		return 1;
-	}
-	offset = (char *)context->alist + context->count;
-	strncpy(offset, namesp->attr_name, namesp->attr_namelen);
-	offset += namesp->attr_namelen;
-	strncpy(offset, name, namelen);			/* real name */
-	offset += namelen;
-	*offset = '\0';
-	context->count += namesp->attr_namelen + namelen + 1;
-	return 0;
-}
-
-/*ARGSUSED*/
-STATIC int
-xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp,
-		     char *name, int namelen,
-		     int valuelen, char *value)
-{
-	context->count += namesp->attr_namelen + namelen + 1;
-	return 0;
-}
-
 /*
  * Generate a list of extended attribute names and optionally
  * also value lengths.  Positive return value follows the XFS
@@ -732,10 +729,9 @@ xfs_attr_list(
 	attrlist_cursor_kern_t *cursor)
 {
 	xfs_attr_list_context_t context;
+	struct attrlist *alist;
 	int error;
 
-	XFS_STATS_INC(xs_attr_list);
-
 	/*
 	 * Validate the cursor.
 	 */
@@ -756,52 +752,23 @@ xfs_attr_list(
 	/*
 	 * Initialize the output buffer.
 	 */
+	memset(&context, 0, sizeof(context));
 	context.dp = dp;
 	context.cursor = cursor;
-	context.count = 0;
-	context.dupcnt = 0;
 	context.resynch = 1;
 	context.flags = flags;
-	context.seen_enough = 0;
-	context.alist = (attrlist_t *)buffer;
-	context.put_value = 0;
-
-	if (flags & ATTR_KERNAMELS) {
-		context.bufsize = bufsize;
-		context.firstu = context.bufsize;
-		if (flags & ATTR_KERNOVAL)
-			context.put_listent = xfs_attr_kern_list_sizes;
-		else
-			context.put_listent = xfs_attr_kern_list;
-	} else {
-		context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
-		context.firstu = context.bufsize;
-		context.alist->al_count = 0;
-		context.alist->al_more = 0;
-		context.alist->al_offset[0] = context.bufsize;
-		context.put_listent = xfs_attr_put_listent;
-	}
-
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return EIO;
+	context.alist = buffer;
+	context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
+	context.firstu = context.bufsize;
+	context.put_listent = xfs_attr_put_listent;
 
-	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall start", &context);
+	alist = (struct attrlist *)context.alist;
+	alist->al_count = 0;
+	alist->al_more = 0;
+	alist->al_offset[0] = context.bufsize;
 
 	error = xfs_attr_list_int(&context);
-
-	xfs_iunlock(dp, XFS_ILOCK_SHARED);
-	xfs_attr_trace_l_c("syscall end", &context);
-
-	if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) {
-		/* must return negated buffer size or the error */
-		if (context.count < 0)
-			error = XFS_ERROR(ERANGE);
-		else
-			error = -context.count;
-	} else
-		ASSERT(error >= 0);
-
+	ASSERT(error >= 0);
 	return error;
 }
 
@@ -816,12 +783,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	if ((XFS_IFORK_Q(dp) == 0) ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
+	if (!xfs_inode_hasattr(dp) ||
+	    dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
 		xfs_iunlock(dp, XFS_ILOCK_SHARED);
-		return(0);
+		return 0;
 	}
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
 
@@ -854,10 +819,8 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	/*
 	 * Decide on what work routines to call based on the inode size.
 	 */
-	if ((XFS_IFORK_Q(dp) == 0) ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
+	if (!xfs_inode_hasattr(dp) ||
+	    dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
 		error = 0;
 		goto out;
 	}
@@ -974,7 +937,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			xfs_da_brelse(args->trans, bp);
 			return(retval);
 		}
-		args->rename = 1;			/* an atomic rename */
+		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
@@ -1019,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Commit the current trans (including the inode) and start
 		 * a new one.
 		 */
-		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
 			return (error);
 
 		/*
@@ -1033,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * Commit the transaction that added the attr name so that
 	 * later routines can manage their own transactions.
 	 */
-	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
 		return (error);
 
 	/*
@@ -1054,7 +1019,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 * so that one disappears and one appears atomically.  Then we
 	 * must remove the "old" attribute/value pair.
 	 */
-	if (args->rename) {
+	if (args->op_flags & XFS_DA_OP_RENAME) {
 		/*
 		 * In a separate transaction, set the incomplete flag on the
 		 * "old" attr and clear the incomplete flag on the "new" attr.
@@ -1122,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		/*
 		 * Commit the remove and start the next trans in series.
 		 */
-		error = xfs_attr_rolltrans(&args->trans, dp);
+		error = xfs_trans_roll(&args->trans, dp);
 
 	} else if (args->rmtblkno > 0) {
 		/*
@@ -1307,7 +1272,7 @@ restart:
 	} else if (retval == EEXIST) {
 		if (args->flags & ATTR_CREATE)
 			goto out;
-		args->rename = 1;			/* atomic rename op */
+		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
@@ -1353,7 +1318,8 @@ restart:
 			 * Commit the node conversion and start the next
 			 * trans in the chain.
 			 */
-			if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			error = xfs_trans_roll(&args->trans, dp);
+			if (error)
 				goto out;
 
 			goto restart;
@@ -1404,7 +1370,8 @@ restart:
 	 * Commit the leaf addition or btree split and start the next
 	 * trans in the chain.
 	 */
-	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+	error = xfs_trans_roll(&args->trans, dp);
+	if (error)
 		goto out;
 
 	/*
@@ -1425,7 +1392,7 @@ restart:
 	 * so that one disappears and one appears atomically.  Then we
 	 * must remove the "old" attribute/value pair.
 	 */
-	if (args->rename) {
+	if (args->op_flags & XFS_DA_OP_RENAME) {
 		/*
 		 * In a separate transaction, set the incomplete flag on the
 		 * "old" attr and clear the incomplete flag on the "new" attr.
@@ -1504,7 +1471,8 @@ restart:
 		/*
 		 * Commit and start the next trans in the chain.
 		 */
-		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
 			goto out;
 
 	} else if (args->rmtblkno > 0) {
@@ -1636,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		/*
 		 * Commit the Btree join operation and start a new trans.
 		 */
-		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
 			goto out;
 	}
 
@@ -2137,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		/*
 		 * Start the next trans in the chain.
 		 */
-		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		error = xfs_trans_roll(&args->trans, dp);
+		if (error)
 			return (error);
 	}
 
@@ -2287,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * Close out trans and start the next one in the chain.
 		 */
-		if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
+		error = xfs_trans_roll(&args->trans, args->dp);
+		if (error)
 			return (error);
 	}
 	return(0);
@@ -2300,23 +2271,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 void
 xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
 {
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
-		(__psunsigned_t)context->dp,
-		(__psunsigned_t)context->cursor->hashval,
-		(__psunsigned_t)context->cursor->blkno,
-		(__psunsigned_t)context->cursor->offset,
-		(__psunsigned_t)context->alist,
-		(__psunsigned_t)context->bufsize,
-		(__psunsigned_t)context->count,
-		(__psunsigned_t)context->firstu,
-		(__psunsigned_t)
-			((context->count > 0) &&
-			!(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
-				? (ATTR_ENTRY(context->alist,
-					      context->count-1)->a_valuelen)
-				: 0,
-		(__psunsigned_t)context->dupcnt,
-		(__psunsigned_t)context->flags,
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context,
 		(__psunsigned_t)NULL,
 		(__psunsigned_t)NULL,
 		(__psunsigned_t)NULL);
@@ -2329,23 +2284,7 @@ void
 xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
 			 struct xfs_da_intnode *node)
 {
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
-		(__psunsigned_t)context->dp,
-		(__psunsigned_t)context->cursor->hashval,
-		(__psunsigned_t)context->cursor->blkno,
-		(__psunsigned_t)context->cursor->offset,
-		(__psunsigned_t)context->alist,
-		(__psunsigned_t)context->bufsize,
-		(__psunsigned_t)context->count,
-		(__psunsigned_t)context->firstu,
-		(__psunsigned_t)
-			((context->count > 0) &&
-			!(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
-				? (ATTR_ENTRY(context->alist,
-					      context->count-1)->a_valuelen)
-				: 0,
-		(__psunsigned_t)context->dupcnt,
-		(__psunsigned_t)context->flags,
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context,
 		(__psunsigned_t)be16_to_cpu(node->hdr.count),
 		(__psunsigned_t)be32_to_cpu(node->btree[0].hashval),
 		(__psunsigned_t)be32_to_cpu(node->btree[
@@ -2359,23 +2298,7 @@ void
 xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
 			  struct xfs_da_node_entry *btree)
 {
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
-		(__psunsigned_t)context->dp,
-		(__psunsigned_t)context->cursor->hashval,
-		(__psunsigned_t)context->cursor->blkno,
-		(__psunsigned_t)context->cursor->offset,
-		(__psunsigned_t)context->alist,
-		(__psunsigned_t)context->bufsize,
-		(__psunsigned_t)context->count,
-		(__psunsigned_t)context->firstu,
-		(__psunsigned_t)
-			((context->count > 0) &&
-			!(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
-				? (ATTR_ENTRY(context->alist,
-					      context->count-1)->a_valuelen)
-				: 0,
-		(__psunsigned_t)context->dupcnt,
-		(__psunsigned_t)context->flags,
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context,
 		(__psunsigned_t)be32_to_cpu(btree->hashval),
 		(__psunsigned_t)be32_to_cpu(btree->before),
 		(__psunsigned_t)NULL);
@@ -2388,23 +2311,7 @@ void
 xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
 			      struct xfs_attr_leafblock *leaf)
 {
-	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
-		(__psunsigned_t)context->dp,
-		(__psunsigned_t)context->cursor->hashval,
-		(__psunsigned_t)context->cursor->blkno,
-		(__psunsigned_t)context->cursor->offset,
-		(__psunsigned_t)context->alist,
-		(__psunsigned_t)context->bufsize,
-		(__psunsigned_t)context->count,
-		(__psunsigned_t)context->firstu,
-		(__psunsigned_t)
-			((context->count > 0) &&
-			!(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
-				? (ATTR_ENTRY(context->alist,
-					      context->count-1)->a_valuelen)
-				: 0,
-		(__psunsigned_t)context->dupcnt,
-		(__psunsigned_t)context->flags,
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context,
 		(__psunsigned_t)be16_to_cpu(leaf->hdr.count),
 		(__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval),
 		(__psunsigned_t)be32_to_cpu(leaf->entries[
@@ -2417,329 +2324,24 @@ xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
  */
 void
 xfs_attr_trace_enter(int type, char *where,
-			 __psunsigned_t a2, __psunsigned_t a3,
-			 __psunsigned_t a4, __psunsigned_t a5,
-			 __psunsigned_t a6, __psunsigned_t a7,
-			 __psunsigned_t a8, __psunsigned_t a9,
-			 __psunsigned_t a10, __psunsigned_t a11,
-			 __psunsigned_t a12, __psunsigned_t a13,
-			 __psunsigned_t a14, __psunsigned_t a15)
+			 struct xfs_attr_list_context *context,
+			 __psunsigned_t a13, __psunsigned_t a14,
+			 __psunsigned_t a15)
 {
 	ASSERT(xfs_attr_trace_buf);
 	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
-					 (void *)where,
-					 (void *)a2,  (void *)a3,  (void *)a4,
-					 (void *)a5,  (void *)a6,  (void *)a7,
-					 (void *)a8,  (void *)a9,  (void *)a10,
-					 (void *)a11, (void *)a12, (void *)a13,
-					 (void *)a14, (void *)a15);
+		(void *)((__psunsigned_t)where),
+		(void *)((__psunsigned_t)context->dp),
+		(void *)((__psunsigned_t)context->cursor->hashval),
+		(void *)((__psunsigned_t)context->cursor->blkno),
+		(void *)((__psunsigned_t)context->cursor->offset),
+		(void *)((__psunsigned_t)context->alist),
+		(void *)((__psunsigned_t)context->bufsize),
+		(void *)((__psunsigned_t)context->count),
+		(void *)((__psunsigned_t)context->firstu),
+		NULL,
+		(void *)((__psunsigned_t)context->dupcnt),
+		(void *)((__psunsigned_t)context->flags),
+		(void *)a13, (void *)a14, (void *)a15);
 }
 #endif	/* XFS_ATTR_TRACE */
-
-
-/*========================================================================
- * System (pseudo) namespace attribute interface routines.
- *========================================================================*/
-
-STATIC int
-posix_acl_access_set(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_remove(
-	bhv_vnode_t *vp, char *name, int xflags)
-{
-	return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_get(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_exists(
-	bhv_vnode_t *vp)
-{
-	return xfs_acl_vhasacl_access(vp);
-}
-
-STATIC int
-posix_acl_default_set(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_get(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_remove(
-	bhv_vnode_t *vp, char *name, int xflags)
-{
-	return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_exists(
-	bhv_vnode_t *vp)
-{
-	return xfs_acl_vhasacl_default(vp);
-}
-
-static struct attrnames posix_acl_access = {
-	.attr_name	= "posix_acl_access",
-	.attr_namelen	= sizeof("posix_acl_access") - 1,
-	.attr_get	= posix_acl_access_get,
-	.attr_set	= posix_acl_access_set,
-	.attr_remove	= posix_acl_access_remove,
-	.attr_exists	= posix_acl_access_exists,
-};
-
-static struct attrnames posix_acl_default = {
-	.attr_name	= "posix_acl_default",
-	.attr_namelen	= sizeof("posix_acl_default") - 1,
-	.attr_get	= posix_acl_default_get,
-	.attr_set	= posix_acl_default_set,
-	.attr_remove	= posix_acl_default_remove,
-	.attr_exists	= posix_acl_default_exists,
-};
-
-static struct attrnames *attr_system_names[] =
-	{ &posix_acl_access, &posix_acl_default };
-
-
-/*========================================================================
- * Namespace-prefix-style attribute name interface routines.
- *========================================================================*/
-
-STATIC int
-attr_generic_set(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	return -xfs_attr_set(xfs_vtoi(vp), name, data, size, xflags);
-}
-
-STATIC int
-attr_generic_get(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	int	error, asize = size;
-
-	error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags);
-	if (!error)
-		return asize;
-	return -error;
-}
-
-STATIC int
-attr_generic_remove(
-	bhv_vnode_t *vp, char *name, int xflags)
-{
-	return -xfs_attr_remove(xfs_vtoi(vp), name, xflags);
-}
-
-STATIC int
-attr_generic_listadd(
-	attrnames_t		*prefix,
-	attrnames_t		*namesp,
-	void			*data,
-	size_t			size,
-	ssize_t			*result)
-{
-	char			*p = data + *result;
-
-	*result += prefix->attr_namelen;
-	*result += namesp->attr_namelen + 1;
-	if (!size)
-		return 0;
-	if (*result > size)
-		return -ERANGE;
-	strcpy(p, prefix->attr_name);
-	p += prefix->attr_namelen;
-	strcpy(p, namesp->attr_name);
-	p += namesp->attr_namelen + 1;
-	return 0;
-}
-
-STATIC int
-attr_system_list(
-	bhv_vnode_t		*vp,
-	void			*data,
-	size_t			size,
-	ssize_t			*result)
-{
-	attrnames_t		*namesp;
-	int			i, error = 0;
-
-	for (i = 0; i < ATTR_SYSCOUNT; i++) {
-		namesp = attr_system_names[i];
-		if (!namesp->attr_exists || !namesp->attr_exists(vp))
-			continue;
-		error = attr_generic_listadd(&attr_system, namesp,
-						data, size, result);
-		if (error)
-			break;
-	}
-	return error;
-}
-
-int
-attr_generic_list(
-	bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result)
-{
-	attrlist_cursor_kern_t	cursor = { 0 };
-	int			error;
-
-	error = xfs_attr_list(xfs_vtoi(vp), data, size, xflags, &cursor);
-	if (error > 0)
-		return -error;
-	*result = -error;
-	return attr_system_list(vp, data, size, result);
-}
-
-attrnames_t *
-attr_lookup_namespace(
-	char			*name,
-	struct attrnames	**names,
-	int			nnames)
-{
-	int			i;
-
-	for (i = 0; i < nnames; i++)
-		if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen))
-			return names[i];
-	return NULL;
-}
-
-/*
- * Some checks to prevent people abusing EAs to get over quota:
- * - Don't allow modifying user EAs on devices/symlinks;
- * - Don't allow modifying user EAs if sticky bit set;
- */
-STATIC int
-attr_user_capable(
-	bhv_vnode_t	*vp,
-	cred_t		*cred)
-{
-	struct inode	*inode = vn_to_inode(vp);
-
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return -EPERM;
-	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-	    (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER))
-		return -EPERM;
-	return 0;
-}
-
-STATIC int
-attr_trusted_capable(
-	bhv_vnode_t	*vp,
-	cred_t		*cred)
-{
-	struct inode	*inode = vn_to_inode(vp);
-
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-		return -EPERM;
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	return 0;
-}
-
-STATIC int
-attr_system_set(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	attrnames_t	*namesp;
-	int		error;
-
-	if (xflags & ATTR_CREATE)
-		return -EINVAL;
-
-	namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	error = namesp->attr_set(vp, name, data, size, xflags);
-	if (!error)
-		error = vn_revalidate(vp);
-	return error;
-}
-
-STATIC int
-attr_system_get(
-	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
-	attrnames_t	*namesp;
-
-	namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	return namesp->attr_get(vp, name, data, size, xflags);
-}
-
-STATIC int
-attr_system_remove(
-	bhv_vnode_t *vp, char *name, int xflags)
-{
-	attrnames_t	*namesp;
-
-	namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
-	if (!namesp)
-		return -EOPNOTSUPP;
-	return namesp->attr_remove(vp, name, xflags);
-}
-
-struct attrnames attr_system = {
-	.attr_name	= "system.",
-	.attr_namelen	= sizeof("system.") - 1,
-	.attr_flag	= ATTR_SYSTEM,
-	.attr_get	= attr_system_get,
-	.attr_set	= attr_system_set,
-	.attr_remove	= attr_system_remove,
-	.attr_capable	= (attrcapable_t)fs_noerr,
-};
-
-struct attrnames attr_trusted = {
-	.attr_name	= "trusted.",
-	.attr_namelen	= sizeof("trusted.") - 1,
-	.attr_flag	= ATTR_ROOT,
-	.attr_get	= attr_generic_get,
-	.attr_set	= attr_generic_set,
-	.attr_remove	= attr_generic_remove,
-	.attr_capable	= attr_trusted_capable,
-};
-
-struct attrnames attr_secure = {
-	.attr_name	= "security.",
-	.attr_namelen	= sizeof("security.") - 1,
-	.attr_flag	= ATTR_SECURE,
-	.attr_get	= attr_generic_get,
-	.attr_set	= attr_generic_set,
-	.attr_remove	= attr_generic_remove,
-	.attr_capable	= (attrcapable_t)fs_noerr,
-};
-
-struct attrnames attr_user = {
-	.attr_name	= "user.",
-	.attr_namelen	= sizeof("user.") - 1,
-	.attr_get	= attr_generic_get,
-	.attr_set	= attr_generic_set,
-	.attr_remove	= attr_generic_remove,
-	.attr_capable	= attr_user_capable,
-};
-
-struct attrnames *attr_namespaces[] =
-	{ &attr_system, &attr_trusted, &attr_secure, &attr_user };
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 6cfc9384fe35..fb3b2a68b9b9 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -18,9 +18,11 @@
 #ifndef __XFS_ATTR_H__
 #define	__XFS_ATTR_H__
 
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
 /*
- * xfs_attr.h
- *
  * Large attribute lists are structured around Btrees where all the data
  * elements are in the leaf nodes.  Attribute names are hashed into an int,
  * then that int is used as the index into the Btree.  Since the hashval
@@ -35,35 +37,6 @@
  * External interfaces
  *========================================================================*/
 
-struct cred;
-struct xfs_attr_list_context;
-
-typedef int (*attrset_t)(bhv_vnode_t *, char *, void *, size_t, int);
-typedef int (*attrget_t)(bhv_vnode_t *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(bhv_vnode_t *, char *, int);
-typedef int (*attrexists_t)(bhv_vnode_t *);
-typedef int (*attrcapable_t)(bhv_vnode_t *, struct cred *);
-
-typedef struct attrnames {
-	char *		attr_name;
-	unsigned int	attr_namelen;
-	unsigned int	attr_flag;
-	attrget_t	attr_get;
-	attrset_t	attr_set;
-	attrremove_t	attr_remove;
-	attrexists_t	attr_exists;
-	attrcapable_t	attr_capable;
-} attrnames_t;
-
-#define ATTR_NAMECOUNT	4
-extern struct attrnames attr_user;
-extern struct attrnames attr_secure;
-extern struct attrnames attr_system;
-extern struct attrnames attr_trusted;
-extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
-
-extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *);
 
 #define ATTR_DONTFOLLOW	0x0001	/* -- unused, from IRIX -- */
 #define ATTR_ROOT	0x0002	/* use attrs in root (trusted) namespace */
@@ -71,16 +44,9 @@ extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *);
 #define ATTR_SECURE	0x0008	/* use attrs in security namespace */
 #define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
 #define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */
-#define ATTR_SYSTEM	0x0100	/* use attrs in system (pseudo) namespace */
 
-#define ATTR_KERNACCESS	0x0400	/* [kernel] iaccess, inode held io-locked */
 #define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
 #define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
-#define ATTR_KERNAMELS	0x4000	/* [kernel] list attr names (simple list) */
-
-#define ATTR_KERNORMALS	0x0800	/* [kernel] normal attr list: user+secure */
-#define ATTR_KERNROOTLS	0x8000	/* [kernel] include root in the attr list */
-#define ATTR_KERNFULLS	(ATTR_KERNORMALS|ATTR_KERNROOTLS)
 
 /*
  * The maximum size (into the kernel or returned from the kernel) of an
@@ -119,22 +85,6 @@ typedef struct attrlist_ent {	/* data from attr_list() */
 	 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
 
 /*
- * Multi-attribute operation vector.
- */
-typedef struct attr_multiop {
-	int	am_opcode;	/* operation to perform (ATTR_OP_GET, etc.) */
-	int	am_error;	/* [out arg] result of this sub-op (an errno) */
-	char	*am_attrname;	/* attribute name to work with */
-	char	*am_attrvalue;	/* [in/out arg] attribute value (raw bytes) */
-	int	am_length;	/* [in/out arg] length of value */
-	int	am_flags;	/* bitwise OR of attr API flags defined above */
-} attr_multiop_t;
-
-#define ATTR_OP_GET	1	/* return the indicated attr's value */
-#define ATTR_OP_SET	2	/* set/create the indicated attr/value pair */
-#define ATTR_OP_REMOVE	3	/* remove the indicated attr */
-
-/*
  * Kernel-internal version of the attrlist cursor.
  */
 typedef struct attrlist_cursor_kern {
@@ -148,20 +98,41 @@ typedef struct attrlist_cursor_kern {
 
 
 /*========================================================================
- * Function prototypes for the kernel.
+ * Structure used to pass context around among the routines.
  *========================================================================*/
 
-struct xfs_inode;
-struct attrlist_cursor_kern;
-struct xfs_da_args;
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+				      char *, int, int, char *);
+
+typedef struct xfs_attr_list_context {
+	struct xfs_inode		*dp;		/* inode */
+	struct attrlist_cursor_kern	*cursor;	/* position in list */
+	char				*alist;		/* output buffer */
+	int				seen_enough;	/* T/F: seen enough of list? */
+	ssize_t				count;		/* num used entries */
+	int				dupcnt;		/* count dup hashvals seen */
+	int				bufsize;	/* total buffer size */
+	int				firstu;		/* first used byte in buffer */
+	int				flags;		/* from VOP call */
+	int				resynch;	/* T/F: resynch with cursor */
+	int				put_value;	/* T/F: need value for listent */
+	put_listent_func_t		put_listent;	/* list output fmt function */
+	int				index;		/* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
 
 /*
  * Overall external interface routines.
  */
+int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
 int xfs_attr_inactive(struct xfs_inode *dp);
-
-int xfs_attr_shortform_getvalue(struct xfs_da_args *);
 int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
 int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
 
 #endif	/* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 303d41e4217b..79da6b2ea99e 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -94,13 +94,6 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
  * Namespace helper routines
  *========================================================================*/
 
-STATIC_INLINE attrnames_t *
-xfs_attr_flags_namesp(int flags)
-{
-	return ((flags & XFS_ATTR_SECURE) ? &attr_secure:
-		  ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user));
-}
-
 /*
  * If namespace bits don't match return 0.
  * If all match then return 1.
@@ -111,25 +104,6 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
 	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
 }
 
-/*
- * If namespace bits don't match and we don't have an override for it
- * then return 0.
- * If all match or are overridable then return 1.
- */
-STATIC_INLINE int
-xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags)
-{
-	if (((arg_flags & ATTR_SECURE) == 0) !=
-	    ((ondisk_flags & XFS_ATTR_SECURE) == 0) &&
-	    !(arg_flags & ATTR_KERNORMALS))
-		return 0;
-	if (((arg_flags & ATTR_ROOT) == 0) !=
-	    ((ondisk_flags & XFS_ATTR_ROOT) == 0) &&
-	    !(arg_flags & ATTR_KERNROOTLS))
-		return 0;
-	return 1;
-}
-
 
 /*========================================================================
  * External routines when attribute fork size < XFS_LITINO(mp).
@@ -369,9 +343,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 	 * Fix up the start offset of the attribute fork
 	 */
 	totsize -= size;
-	if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname &&
-	    (mp->m_flags & XFS_MOUNT_ATTR2) && 
-	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
+	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+				!(args->op_flags & XFS_DA_OP_ADDNAME) &&
+				(mp->m_flags & XFS_MOUNT_ATTR2) &&
+				(dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
 		/*
 		 * Last attribute now removed, revert to original
 		 * inode format making all literal area available
@@ -389,9 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
 		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
 		ASSERT(dp->i_d.di_forkoff);
-		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname ||
-			!(mp->m_flags & XFS_MOUNT_ATTR2) ||
-			dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+				(args->op_flags & XFS_DA_OP_ADDNAME) ||
+				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
+				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
 		dp->i_afp->if_ext_max =
 			XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
 		dp->i_df.if_ext_max =
@@ -531,7 +507,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 	nargs.total = args->total;
 	nargs.whichfork = XFS_ATTR_FORK;
 	nargs.trans = args->trans;
-	nargs.oknoent = 1;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
 
 	sfe = &sf->list[0];
 	for (i = 0; i < sf->hdr.count; i++) {
@@ -555,7 +531,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 out:
 	if(bp)
 		xfs_da_buf_done(bp);
-	kmem_free(tmpbuffer, size);
+	kmem_free(tmpbuffer);
 	return(error);
 }
 
@@ -624,15 +600,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	    (XFS_ISRESET_CURSOR(cursor) &&
              (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-			attrnames_t	*namesp;
-
-			if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
-				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-				continue;
-			}
-			namesp = xfs_attr_flags_namesp(sfe->flags);
 			error = context->put_listent(context,
-					   namesp,
+					   sfe->flags,
 					   (char *)sfe->nameval,
 					   (int)sfe->namelen,
 					   (int)sfe->valuelen,
@@ -676,13 +645,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe);
 			xfs_attr_trace_l_c("sf corrupted", context);
-			kmem_free(sbuf, sbsize);
+			kmem_free(sbuf);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-		if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
-			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-			continue;
-		}
+
 		sbp->entno = i;
 		sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
 		sbp->name = (char *)sfe->nameval;
@@ -717,7 +683,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		}
 	}
 	if (i == nsbuf) {
-		kmem_free(sbuf, sbsize);
+		kmem_free(sbuf);
 		xfs_attr_trace_l_c("blk end", context);
 		return(0);
 	}
@@ -726,16 +692,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	 * Loop putting entries into the user buffer.
 	 */
 	for ( ; i < nsbuf; i++, sbp++) {
-		attrnames_t	*namesp;
-
-		namesp = xfs_attr_flags_namesp(sbp->flags);
-
 		if (cursor->hashval != sbp->hash) {
 			cursor->hashval = sbp->hash;
 			cursor->offset = 0;
 		}
 		error = context->put_listent(context,
-					namesp,
+					sbp->flags,
 					sbp->name,
 					sbp->namelen,
 					sbp->valuelen,
@@ -747,7 +709,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 		cursor->offset++;
 	}
 
-	kmem_free(sbuf, sbsize);
+	kmem_free(sbuf);
 	xfs_attr_trace_l_c("sf E-O-F", context);
 	return(0);
 }
@@ -853,7 +815,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 	nargs.total = args->total;
 	nargs.whichfork = XFS_ATTR_FORK;
 	nargs.trans = args->trans;
-	nargs.oknoent = 1;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
 	entry = &leaf->entries[0];
 	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
@@ -873,7 +835,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 	error = 0;
 
 out:
-	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	kmem_free(tmpbuffer);
 	return(error);
 }
 
@@ -1155,7 +1117,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 	entry->hashval = cpu_to_be32(args->hashval);
 	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
 	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
-	if (args->rename) {
+	if (args->op_flags & XFS_DA_OP_RENAME) {
 		entry->flags |= XFS_ATTR_INCOMPLETE;
 		if ((args->blkno2 == args->blkno) &&
 		    (args->index2 <= args->index)) {
@@ -1271,7 +1233,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
 				be16_to_cpu(hdr_s->count), mp);
 	xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
 
-	kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+	kmem_free(tmpbuffer);
 }
 
 /*
@@ -1921,7 +1883,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 				be16_to_cpu(drop_hdr->count), mp);
 		}
 		memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
-		kmem_free(tmpbuffer, state->blocksize);
+		kmem_free(tmpbuffer);
 	}
 
 	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
@@ -2400,8 +2362,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	 */
 	retval = 0;
 	for (  ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
-		attrnames_t *namesp;
-
 		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
 			cursor->hashval = be32_to_cpu(entry->hashval);
 			cursor->offset = 0;
@@ -2409,17 +2369,13 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;		/* skip incomplete entries */
-		if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags))
-			continue;
-
-		namesp = xfs_attr_flags_namesp(entry->flags);
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
 			xfs_attr_leaf_name_local_t *name_loc =
 				XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
 
 			retval = context->put_listent(context,
-						namesp,
+						entry->flags,
 						(char *)name_loc->nameval,
 						(int)name_loc->namelen,
 						be16_to_cpu(name_loc->valuelen),
@@ -2446,16 +2402,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 				if (retval)
 					return retval;
 				retval = context->put_listent(context,
-						namesp,
+						entry->flags,
 						(char *)name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
 						(char*)args.value);
-				kmem_free(args.value, valuelen);
-			}
-			else {
+				kmem_free(args.value);
+			} else {
 				retval = context->put_listent(context,
-						namesp,
+						entry->flags,
 						(char *)name_rmt->name,
 						(int)name_rmt->namelen,
 						valuelen,
@@ -2543,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	error = xfs_attr_rolltrans(&args->trans, args->dp);
-
-	return(error);
+	return xfs_trans_roll(&args->trans, args->dp);
 }
 
 /*
@@ -2592,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	error = xfs_attr_rolltrans(&args->trans, args->dp);
-
-	return(error);
+	return xfs_trans_roll(&args->trans, args->dp);
 }
 
 /*
@@ -2710,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	/*
 	 * Commit the flag value change and start the next trans in series.
 	 */
-	error = xfs_attr_rolltrans(&args->trans, args->dp);
+	error = xfs_trans_roll(&args->trans, args->dp);
 
 	return(error);
 }
@@ -2768,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
 	/*
 	 * Commit the invalidate and start the next transaction.
 	 */
-	error = xfs_attr_rolltrans(trans, dp);
+	error = xfs_trans_roll(trans, dp);
 
 	return (error);
 }
@@ -2870,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
 		/*
 		 * Atomically commit the whole invalidate stuff.
 		 */
-		if ((error = xfs_attr_rolltrans(trans, dp)))
+		error = xfs_trans_roll(trans, dp);
+		if (error)
 			return (error);
 	}
 
@@ -2954,7 +2906,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
 			error = tmp;	/* save only the 1st errno */
 	}
 
-	kmem_free((xfs_caddr_t)list, size);
+	kmem_free((xfs_caddr_t)list);
 	return(error);
 }
 
@@ -3009,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 			/*
 			 * Roll to next transaction.
 			 */
-			if ((error = xfs_attr_rolltrans(trans, dp)))
+			error = xfs_trans_roll(trans, dp);
+			if (error)
 				return (error);
 		}
 
@@ -3019,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 
 	return(0);
 }
-
-
-/*
- * Roll from one trans in the sequence of PERMANENT transactions to the next.
- */
-int
-xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
-{
-	xfs_trans_t *trans;
-	unsigned int logres, count;
-	int	error;
-
-	/*
-	 * Ensure that the inode is always logged.
-	 */
-	trans = *transp;
-	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-
-	/*
-	 * Copy the critical parameters from one trans to the next.
-	 */
-	logres = trans->t_log_res;
-	count = trans->t_log_count;
-	*transp = xfs_trans_dup(trans);
-
-	/*
-	 * Commit the current transaction.
-	 * If this commit failed, then it'd just unlock those items that
-	 * are not marked ihold. That also means that a filesystem shutdown
-	 * is in progress. The caller takes the responsibility to cancel
-	 * the duplicate transaction that gets returned.
-	 */
-	if ((error = xfs_trans_commit(trans, 0)))
-		return (error);
-
-	trans = *transp;
-
-	/*
-	 * Reserve space in the log for th next transaction.
-	 * This also pushes items in the "AIL", the list of logged items,
-	 * out to disk if they are taking up space at the tail of the log
-	 * that we want to use.  This requires that either nothing be locked
-	 * across this call, or that anything that is locked be logged in
-	 * the prior and the next transactions.
-	 */
-	error = xfs_trans_reserve(trans, 0, logres, 0,
-				  XFS_TRANS_PERM_LOG_RES, count);
-	/*
-	 *  Ensure that the inode is in the new transaction and locked.
-	 */
-	if (!error) {
-		xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
-		xfs_trans_ihold(trans, dp);
-	}
-	return (error);
-
-}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 040f732ce1e2..83e9af417ca2 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -30,7 +30,7 @@
 
 struct attrlist;
 struct attrlist_cursor_kern;
-struct attrnames;
+struct xfs_attr_list_context;
 struct xfs_dabuf;
 struct xfs_da_args;
 struct xfs_da_state;
@@ -204,33 +204,6 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize)
 	return (((bsize) >> 1) + ((bsize) >> 2));
 }
 
-
-/*========================================================================
- * Structure used to pass context around among the routines.
- *========================================================================*/
-
-
-struct xfs_attr_list_context;
-
-typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *,
-				      char *, int, int, char *);
-
-typedef struct xfs_attr_list_context {
-	struct xfs_inode		*dp;		/* inode */
-	struct attrlist_cursor_kern	*cursor;	/* position in list */
-	struct attrlist			*alist;		/* output buffer */
-	int				seen_enough;	/* T/F: seen enough of list? */
-	int				count;		/* num used entries */
-	int				dupcnt;		/* count dup hashvals seen */
-	int				bufsize;	/* total buffer size */
-	int				firstu;		/* first used byte in buffer */
-	int				flags;		/* from VOP call */
-	int				resynch;	/* T/F: resynch with cursor */
-	int				put_value;	/* T/F: need value for listent */
-	put_listent_func_t		put_listent;	/* list output fmt function */
-	int				index;		/* index into output buffer */
-} xfs_attr_list_context_t;
-
 /*
  * Used to keep a list of "remote value" extents when unlinking an inode.
  */
@@ -301,6 +274,4 @@ int	xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
 				   struct xfs_dabuf *leaf2_bp);
 int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
 					int *local);
-int	xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
-
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index f67f917803b1..ea22839caed2 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -97,13 +97,9 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
 void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
 			      struct xfs_attr_leafblock *leaf);
 void xfs_attr_trace_enter(int type, char *where,
-			     __psunsigned_t a2, __psunsigned_t a3,
-			     __psunsigned_t a4, __psunsigned_t a5,
-			     __psunsigned_t a6, __psunsigned_t a7,
-			     __psunsigned_t a8, __psunsigned_t a9,
-			     __psunsigned_t a10, __psunsigned_t a11,
-			     __psunsigned_t a12, __psunsigned_t a13,
-			     __psunsigned_t a14, __psunsigned_t a15);
+			     struct xfs_attr_list_context *context,
+			     __psunsigned_t a13, __psunsigned_t a14,
+			     __psunsigned_t a15);
 #else
 #define	xfs_attr_trace_l_c(w,c)
 #define	xfs_attr_trace_l_cn(w,c,n)
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index fab0b6d5a41b..48228848f5ae 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,109 +25,6 @@
  * XFS bit manipulation routines, used in non-realtime code.
  */
 
-#ifndef HAVE_ARCH_HIGHBIT
-/*
- * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
- */
-static const char xfs_highbit[256] = {
-       -1, 0, 1, 1, 2, 2, 2, 2,			/* 00 .. 07 */
-	3, 3, 3, 3, 3, 3, 3, 3,			/* 08 .. 0f */
-	4, 4, 4, 4, 4, 4, 4, 4,			/* 10 .. 17 */
-	4, 4, 4, 4, 4, 4, 4, 4,			/* 18 .. 1f */
-	5, 5, 5, 5, 5, 5, 5, 5,			/* 20 .. 27 */
-	5, 5, 5, 5, 5, 5, 5, 5,			/* 28 .. 2f */
-	5, 5, 5, 5, 5, 5, 5, 5,			/* 30 .. 37 */
-	5, 5, 5, 5, 5, 5, 5, 5,			/* 38 .. 3f */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 40 .. 47 */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 48 .. 4f */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 50 .. 57 */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 58 .. 5f */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 60 .. 67 */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 68 .. 6f */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 70 .. 77 */
-	6, 6, 6, 6, 6, 6, 6, 6,			/* 78 .. 7f */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* 80 .. 87 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* 88 .. 8f */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* 90 .. 97 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* 98 .. 9f */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* a0 .. a7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* a8 .. af */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* b0 .. b7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* b8 .. bf */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* c0 .. c7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* c8 .. cf */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* d0 .. d7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* d8 .. df */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* e0 .. e7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* e8 .. ef */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* f0 .. f7 */
-	7, 7, 7, 7, 7, 7, 7, 7,			/* f8 .. ff */
-};
-#endif
-
-/*
- * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
- */
-inline int
-xfs_highbit32(
-	__uint32_t	v)
-{
-#ifdef HAVE_ARCH_HIGHBIT
-	return highbit32(v);
-#else
-	int		i;
-
-	if (v & 0xffff0000)
-		if (v & 0xff000000)
-			i = 24;
-		else
-			i = 16;
-	else if (v & 0x0000ffff)
-		if (v & 0x0000ff00)
-			i = 8;
-		else
-			i = 0;
-	else
-		return -1;
-	return i + xfs_highbit[(v >> i) & 0xff];
-#endif
-}
-
-/*
- * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_lowbit64(
-	__uint64_t	v)
-{
-	__uint32_t	w = (__uint32_t)v;
-	int		n = 0;
-
-	if (w) {	/* lower bits */
-		n = ffs(w);
-	} else {	/* upper bits */
-		w = (__uint32_t)(v >> 32);
-		if (w && (n = ffs(w)))
-			n += 32;
-	}
-	return n - 1;
-}
-
-/*
- * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_highbit64(
-	__uint64_t	v)
-{
-	__uint32_t	h = (__uint32_t)(v >> 32);
-
-	if (h)
-		return xfs_highbit32(h) + 32;
-	return xfs_highbit32((__uint32_t)v);
-}
-
-
 /*
  * Return whether bitmap is empty.
  * Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 082641a9782c..8e0e463dae2d 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n)
 }
 
 /* Get high bit set out of 32-bit argument, -1 if none set */
-extern int xfs_highbit32(__uint32_t v);
+static inline int xfs_highbit32(__uint32_t v)
+{
+	return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+	return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+	unsigned long	t = v;
+	return (v) ? find_first_bit(&t, 32) : -1;
+}
 
 /* Get low bit set out of 64-bit argument, -1 if none set */
-extern int xfs_lowbit64(__uint64_t v);
+static inline int xfs_lowbit64(__uint64_t v)
+{
+	__uint32_t	w = (__uint32_t)v;
+	int		n = 0;
 
-/* Get high bit set out of 64-bit argument, -1 if none set */
-extern int xfs_highbit64(__uint64_t);
+	if (w) {	/* lower bits */
+		n = ffs(w);
+	} else {	/* upper bits */
+		w = (__uint32_t)(v >> 32);
+		if (w && (n = ffs(w)))
+		n += 32;
+	}
+	return n - 1;
+}
 
 /* Return whether bitmap is empty (1 == empty) */
 extern int xfs_bitmap_empty(uint *map, uint size);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 53c259f5a5af..a1aab9275d5a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -384,14 +384,14 @@ xfs_bmap_count_tree(
 	int             levelin,
 	int		*count);
 
-STATIC int
+STATIC void
 xfs_bmap_count_leaves(
 	xfs_ifork_t		*ifp,
 	xfs_extnum_t		idx,
 	int			numrecs,
 	int			*count);
 
-STATIC int
+STATIC void
 xfs_bmap_disk_count_leaves(
 	xfs_extnum_t		idx,
 	xfs_bmbt_block_t	*block,
@@ -428,7 +428,8 @@ xfs_bmap_add_attrfork_btree(
 		cur->bc_private.b.firstblock = *firstblock;
 		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
 			goto error0;
-		ASSERT(stat == 1);	/* must be at least one entry */
+		/* must be at least one entry */
+		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
 		if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
 			goto error0;
 		if (stat == 0) {
@@ -816,13 +817,13 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_startblock,
 					RIGHT.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
@@ -860,7 +861,7 @@ xfs_bmap_add_extent_delay_real(
 					LEFT.br_startblock, LEFT.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
@@ -895,7 +896,7 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_startblock,
 					RIGHT.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
 					new->br_startblock,
 					PREV.br_blockcount +
@@ -928,11 +929,11 @@ xfs_bmap_add_extent_delay_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		*dnew = 0;
 		/* DELTA: The in-core extent described by new changed type. */
@@ -963,7 +964,7 @@ xfs_bmap_add_extent_delay_real(
 					LEFT.br_startblock, LEFT.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
 					LEFT.br_startblock,
 					LEFT.br_blockcount +
@@ -1004,11 +1005,11 @@ xfs_bmap_add_extent_delay_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1054,7 +1055,7 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_startblock,
 					RIGHT.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount +
@@ -1094,11 +1095,11 @@ xfs_bmap_add_extent_delay_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1149,11 +1150,11 @@ xfs_bmap_add_extent_delay_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1377,19 +1378,19 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_startblock,
 					RIGHT.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
 				LEFT.br_startblock,
 				LEFT.br_blockcount + PREV.br_blockcount +
@@ -1426,13 +1427,13 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock, PREV.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
 				LEFT.br_startblock,
 				LEFT.br_blockcount + PREV.br_blockcount,
@@ -1469,13 +1470,13 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_startblock,
 					RIGHT.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 				new->br_startblock,
 				new->br_blockcount + RIGHT.br_blockcount,
@@ -1508,7 +1509,7 @@ xfs_bmap_add_extent_unwritten_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 				new->br_startblock, new->br_blockcount,
 				newext)))
@@ -1549,7 +1550,7 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock, PREV.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur,
 				PREV.br_startoff + new->br_blockcount,
 				PREV.br_startblock + new->br_blockcount,
@@ -1596,7 +1597,7 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock, PREV.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur,
 				PREV.br_startoff + new->br_blockcount,
 				PREV.br_startblock + new->br_blockcount,
@@ -1606,7 +1607,7 @@ xfs_bmap_add_extent_unwritten_real(
 			cur->bc_rec.b = *new;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
@@ -1640,7 +1641,7 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock,
 					PREV.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
 				PREV.br_startblock,
 				PREV.br_blockcount - new->br_blockcount,
@@ -1682,7 +1683,7 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock, PREV.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
 				PREV.br_startblock,
 				PREV.br_blockcount - new->br_blockcount,
@@ -1692,11 +1693,11 @@ xfs_bmap_add_extent_unwritten_real(
 					new->br_startblock, new->br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		/* DELTA: One in-core extent is split in two. */
 		temp = PREV.br_startoff;
@@ -1732,27 +1733,34 @@ xfs_bmap_add_extent_unwritten_real(
 					PREV.br_startblock, PREV.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			/* new right extent - oldext */
 			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
 				r[1].br_startblock, r[1].br_blockcount,
 				r[1].br_state)))
 				goto done;
 			/* new left extent - oldext */
-			PREV.br_blockcount =
-				new->br_startoff - PREV.br_startoff;
 			cur->bc_rec.b = PREV;
+			cur->bc_rec.b.br_blockcount =
+				new->br_startoff - PREV.br_startoff;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
-			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+			/*
+			 * Reset the cursor to the position of the new extent
+			 * we are about to insert as we can't trust it after
+			 * the previous insert.
+			 */
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			/* new middle extent - newext */
-			cur->bc_rec.b = *new;
+			cur->bc_rec.b.br_state = new->br_state;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		/* DELTA: One in-core extent is split in three. */
 		temp = PREV.br_startoff;
@@ -2097,13 +2105,13 @@ xfs_bmap_add_extent_hole_real(
 					right.br_startblock,
 					right.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_delete(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
@@ -2139,7 +2147,7 @@ xfs_bmap_add_extent_hole_real(
 					left.br_startblock,
 					left.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
@@ -2174,7 +2182,7 @@ xfs_bmap_add_extent_hole_real(
 					right.br_startblock,
 					right.br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount +
@@ -2208,11 +2216,11 @@ xfs_bmap_add_extent_hole_real(
 					new->br_startblock,
 					new->br_blockcount, &i)))
 				goto done;
-			ASSERT(i == 0);
+			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = new->br_state;
 			if ((error = xfs_bmbt_insert(cur, &i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		/* DELTA: A new extent was added in a hole. */
 		temp = new->br_startoff;
@@ -3131,7 +3139,7 @@ xfs_bmap_del_extent(
 					got.br_startblock, got.br_blockcount,
 					&i)))
 				goto done;
-			ASSERT(i == 1);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
 		da_old = da_new = 0;
 	} else {
@@ -3164,7 +3172,7 @@ xfs_bmap_del_extent(
 		}
 		if ((error = xfs_bmbt_delete(cur, &i)))
 			goto done;
-		ASSERT(i == 1);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		break;
 
 	case 2:
@@ -3268,7 +3276,7 @@ xfs_bmap_del_extent(
 							got.br_startblock,
 							temp, &i)))
 						goto done;
-					ASSERT(i == 1);
+					XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 					/*
 					 * Update the btree record back
 					 * to the original value.
@@ -3289,7 +3297,7 @@ xfs_bmap_del_extent(
 					error = XFS_ERROR(ENOSPC);
 					goto done;
 				}
-				ASSERT(i == 1);
+				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			} else
 				flags |= XFS_ILOG_FEXT(whichfork);
 			XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -3992,7 +4000,7 @@ xfs_bmap_add_attrfork(
 		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 	}
 	ASSERT(ip->i_d.di_anextents == 0);
-	VN_HOLD(XFS_ITOV(ip));
+	IHOLD(ip);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	switch (ip->i_d.di_format) {
@@ -5970,7 +5978,7 @@ unlock_and_return:
 	xfs_iunlock_map_shared(ip, lock);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
-	kmem_free(map, subnex * sizeof(*map));
+	kmem_free(map);
 
 	return error;
 }
@@ -6088,7 +6096,7 @@ xfs_bmap_get_bp(
 		tp = cur->bc_tp;
 		licp = &tp->t_items;
 		while (!bp && licp != NULL) {
-			if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			if (xfs_lic_are_all_free(licp)) {
 				licp = licp->lic_next;
 				continue;
 			}
@@ -6098,11 +6106,11 @@ xfs_bmap_get_bp(
 				xfs_buf_log_item_t	*bip;
 				xfs_buf_t		*lbp;
 
-				if (XFS_LIC_ISFREE(licp, i)) {
+				if (xfs_lic_isfree(licp, i)) {
 					continue;
 				}
 
-				lidp = XFS_LIC_SLOT(licp, i);
+				lidp = xfs_lic_slot(licp, i);
 				lip = lidp->lid_item;
 				if (lip->li_type != XFS_LI_BUF)
 					continue;
@@ -6359,13 +6367,9 @@ xfs_bmap_count_blocks(
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
-		if (unlikely(xfs_bmap_count_leaves(ifp, 0,
+		xfs_bmap_count_leaves(ifp, 0,
 			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
-			count) < 0)) {
-			XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
-					 XFS_ERRLEVEL_LOW, mp);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
+			count);
 		return 0;
 	}
 
@@ -6446,13 +6450,7 @@ xfs_bmap_count_tree(
 		for (;;) {
 			nextbno = be64_to_cpu(block->bb_rightsib);
 			numrecs = be16_to_cpu(block->bb_numrecs);
-			if (unlikely(xfs_bmap_disk_count_leaves(0,
-					block, numrecs, count) < 0)) {
-				xfs_trans_brelse(tp, bp);
-				XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
-						 XFS_ERRLEVEL_LOW, mp);
-				return XFS_ERROR(EFSCORRUPTED);
-			}
+			xfs_bmap_disk_count_leaves(0, block, numrecs, count);
 			xfs_trans_brelse(tp, bp);
 			if (nextbno == NULLFSBLOCK)
 				break;
@@ -6470,7 +6468,7 @@ xfs_bmap_count_tree(
 /*
  * Count leaf blocks given a range of extent records.
  */
-STATIC int
+STATIC void
 xfs_bmap_count_leaves(
 	xfs_ifork_t		*ifp,
 	xfs_extnum_t		idx,
@@ -6483,14 +6481,13 @@ xfs_bmap_count_leaves(
 		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
 		*count += xfs_bmbt_get_blockcount(frp);
 	}
-	return 0;
 }
 
 /*
  * Count leaf blocks given a range of extent records originally
  * in btree format.
  */
-STATIC int
+STATIC void
 xfs_bmap_disk_count_leaves(
 	xfs_extnum_t		idx,
 	xfs_bmbt_block_t	*block,
@@ -6504,5 +6501,4 @@ xfs_bmap_disk_count_leaves(
 		frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
 		*count += xfs_bmbt_disk_get_blockcount(frp);
 	}
-	return 0;
 }
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 6ff70cda451c..9f3e3a836d15 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -54,12 +54,23 @@ typedef struct xfs_bmap_free_item
 
 /*
  * Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent.  In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs.  In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0.  If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
  */
 typedef	struct xfs_bmap_free
 {
 	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
 	int			xbf_count;	/* count of items on list */
-	int			xbf_low;	/* kludge: alloc in low mode */
+	int			xbf_low;	/* alloc in low mode */
 } xfs_bmap_free_t;
 
 #define	XFS_BMAP_MAX_NMAP	4
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 4f0e849d973e..23efad29a5cd 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -1493,12 +1493,27 @@ xfs_bmbt_split(
 	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
 	args.fsbno = cur->bc_private.b.firstblock;
 	args.firstblock = args.fsbno;
+	args.minleft = 0;
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = lbno;
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else
+		/*
+		 * Make sure there is sufficient room left in the AG to
+		 * complete a full tree split for an extent insert.  If
+		 * we are converting the middle part of an extent then
+		 * we may need space for two tree splits.
+		 *
+		 * We are relying on the caller to make the correct block
+		 * reservation for this operation to succeed.  If the
+		 * reservation amount is insufficient then we may fail a
+		 * block allocation here and corrupt the filesystem.
+		 */
+		args.minleft = xfs_trans_get_block_res(args.tp);
+	} else if (cur->bc_private.b.flist->xbf_low)
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-	args.mod = args.minleft = args.alignment = args.total = args.isfl =
+	args.mod = args.alignment = args.total = args.isfl =
 		args.userdata = args.minalignslop = 0;
 	args.minlen = args.maxlen = args.prod = 1;
 	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
@@ -1510,6 +1525,21 @@ xfs_bmbt_split(
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 		return error;
 	}
+	if (args.fsbno == NULLFSBLOCK && args.minleft) {
+		/*
+		 * Could not find an AG with enough free space to satisfy
+		 * a full btree split.  Try again without minleft and if
+		 * successful activate the lowspace algorithm.
+		 */
+		args.fsbno = 0;
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.minleft = 0;
+		if ((error = xfs_alloc_vextent(&args))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		cur->bc_private.b.flist->xbf_low = 1;
+	}
 	if (args.fsbno == NULLFSBLOCK) {
 		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 		*stat = 0;
@@ -2029,22 +2059,8 @@ xfs_bmbt_increment(
  * Insert the current record at the point referenced by cur.
  *
  * A multi-level split of the tree on insert will invalidate the original
- * cursor. It appears, however, that some callers assume that the cursor is
- * always valid. Hence if we do a multi-level split we need to revalidate the
- * cursor.
- *
- * When a split occurs, we will see a new cursor returned. Use that as a
- * trigger to determine if we need to revalidate the original cursor. If we get
- * a split, then use the original irec to lookup up the path of the record we
- * just inserted.
- *
- * Note that the fact that the btree root is in the inode means that we can
- * have the level of the tree change without a "split" occurring at the root
- * level. What happens is that the root is migrated to an allocated block and
- * the inode root is pointed to it. This means a single split can change the
- * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
- * the level change should be accounted as a split so as to correctly trigger a
- * revalidation of the old cursor.
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
  */
 int					/* error */
 xfs_bmbt_insert(
@@ -2057,14 +2073,11 @@ xfs_bmbt_insert(
 	xfs_fsblock_t	nbno;
 	xfs_btree_cur_t	*ncur;
 	xfs_bmbt_rec_t	nrec;
-	xfs_bmbt_irec_t	oirec;		/* original irec */
 	xfs_btree_cur_t	*pcur;
-	int		splits = 0;
 
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
 	level = 0;
 	nbno = NULLFSBLOCK;
-	oirec = cur->bc_rec.b;
 	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
 	ncur = NULL;
 	pcur = cur;
@@ -2073,13 +2086,11 @@ xfs_bmbt_insert(
 				&i))) {
 			if (pcur != cur)
 				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-			goto error0;
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
 		}
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
-			/* allocating a new root is effectively a split */
-			if (cur->bc_nlevels != pcur->bc_nlevels)
-				splits++;
 			cur->bc_nlevels = pcur->bc_nlevels;
 			cur->bc_private.b.allocated +=
 				pcur->bc_private.b.allocated;
@@ -2093,21 +2104,10 @@ xfs_bmbt_insert(
 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
 		}
 		if (ncur) {
-			splits++;
 			pcur = ncur;
 			ncur = NULL;
 		}
 	} while (nbno != NULLFSBLOCK);
-
-	if (splits > 1) {
-		/* revalidate the old cursor as we had a multi-level split */
-		error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
-				oirec.br_startblock, oirec.br_blockcount, &i);
-		if (error)
-			goto error0;
-		ASSERT(i == 1);
-	}
-
 	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 	*stat = i;
 	return 0;
@@ -2254,7 +2254,9 @@ xfs_bmbt_newroot(
 #endif
 		args.fsbno = be64_to_cpu(*pp);
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else
+	} else if (cur->bc_private.b.flist->xbf_low)
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	if ((error = xfs_alloc_vextent(&args))) {
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index aeb87ca69fcc..cc593a84c345 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -46,38 +46,11 @@ kmem_zone_t	*xfs_btree_cur_zone;
 /*
  * Btree magic numbers.
  */
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
-{
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
 	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 
 /*
- * Prototypes for internal routines.
- */
-
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int				/* number of records fitting in block */
-xfs_btree_maxrecs(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_block_t	*block);/* generic btree block pointer */
-
-/*
- * Internal routines.
- */
-
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
- */
-STATIC xfs_btree_block_t *			/* generic btree block pointer */
-xfs_btree_get_block(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level,	/* level in btree */
-	struct xfs_buf		**bpp);	/* buffer containing the block */
-
-/*
  * Checking routine: return maxrecs for the block.
  */
 STATIC int				/* number of records fitting in block */
@@ -457,35 +430,6 @@ xfs_btree_dup_cursor(
 }
 
 /*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
- */
-int					/* success=1, failure=0 */
-xfs_btree_firstrec(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	int			level)	/* level to change */
-{
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
-	xfs_buf_t		*bp;	/* buffer containing block */
-
-	/*
-	 * Get the block pointer for this level.
-	 */
-	block = xfs_btree_get_block(cur, level, &bp);
-	xfs_btree_check_block(cur, block, level, bp);
-	/*
-	 * It's empty, there is no such record.
-	 */
-	if (!block->bb_h.bb_numrecs)
-		return 0;
-	/*
-	 * Set the ptr value to 1, that's the first record/key.
-	 */
-	cur->bc_ptrs[level] = 1;
-	return 1;
-}
-
-/*
  * Retrieve the block pointer from the cursor at the given level.
  * This may be a bmap btree root or from a buffer.
  */
@@ -626,6 +570,13 @@ xfs_btree_init_cursor(
 		cur->bc_private.a.agbp = agbp;
 		cur->bc_private.a.agno = agno;
 		break;
+	case XFS_BTNUM_INO:
+		/*
+		 * Inode allocation btree fields.
+		 */
+		cur->bc_private.a.agbp = agbp;
+		cur->bc_private.a.agno = agno;
+		break;
 	case XFS_BTNUM_BMAP:
 		/*
 		 * Bmap btree fields.
@@ -638,13 +589,6 @@ xfs_btree_init_cursor(
 		cur->bc_private.b.flags = 0;
 		cur->bc_private.b.whichfork = whichfork;
 		break;
-	case XFS_BTNUM_INO:
-		/*
-		 * Inode allocation btree fields.
-		 */
-		cur->bc_private.i.agbp = agbp;
-		cur->bc_private.i.agno = agno;
-		break;
 	default:
 		ASSERT(0);
 	}
@@ -671,6 +615,35 @@ xfs_btree_islastblock(
 }
 
 /*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (!block->bb_h.bb_numrecs)
+		return 0;
+	/*
+	 * Set the ptr value to 1, that's the first record/key.
+	 */
+	cur->bc_ptrs[level] = 1;
+	return 1;
+}
+
+/*
  * Change the cursor to point to the last record in the current block
  * at the given level.  Other levels are unaffected.
  */
@@ -890,12 +863,12 @@ xfs_btree_readahead_core(
 	case XFS_BTNUM_INO:
 		i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
 		if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 				be32_to_cpu(i->bb_leftsib), 1);
 			rval++;
 		}
 		if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
 				be32_to_cpu(i->bb_rightsib), 1);
 			rval++;
 		}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7440b78f9cec..1f528a2a3754 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -158,8 +158,8 @@ typedef struct xfs_btree_cur
 	__uint8_t	bc_blocklog;	/* log2(blocksize) of btree blocks */
 	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
 	union {
-		struct {			/* needed for BNO, CNT */
-			struct xfs_buf	*agbp;	/* agf buffer pointer */
+		struct {			/* needed for BNO, CNT, INO */
+			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
 			xfs_agnumber_t	agno;	/* ag number */
 		} a;
 		struct {			/* needed for BMAP */
@@ -172,10 +172,6 @@ typedef struct xfs_btree_cur
 			char		flags;		/* flags */
 #define	XFS_BTCUR_BPRV_WASDEL	1			/* was delayed */
 		} b;
-		struct {			/* needed for INO */
-			struct xfs_buf	*agbp;	/* agi buffer pointer */
-			xfs_agnumber_t	agno;	/* ag number */
-		} i;
 	}		bc_private;	/* per-btree type data */
 } xfs_btree_cur_t;
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 53a71c62025d..002fc2617c8e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,12 +732,13 @@ xfs_buf_item_init(
 	bip->bli_item.li_ops = &xfs_buf_item_ops;
 	bip->bli_item.li_mountp = mp;
 	bip->bli_buf = bp;
+	xfs_buf_hold(bp);
 	bip->bli_format.blf_type = XFS_LI_BUF;
 	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
 	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
 	bip->bli_format.blf_map_size = map_size;
 #ifdef XFS_BLI_TRACE
-	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
+	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
 #endif
 
 #ifdef XFS_TRANS_DEBUG
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
 	return (bip->bli_flags & XFS_BLI_DIRTY);
 }
 
+STATIC void
+xfs_buf_item_free(
+	xfs_buf_log_item_t	*bip)
+{
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig);
+	kmem_free(bip->bli_logged);
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
 /*
  * This is called when the buf log item is no longer needed.  It should
  * free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
 	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
 		XFS_BUF_CLR_IODONE_FUNC(bp);
 	}
-
-#ifdef XFS_TRANS_DEBUG
-	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
-	bip->bli_orig = NULL;
-	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
-	bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
-	ktrace_free(bip->bli_trace);
-#endif
-	kmem_zone_free(xfs_buf_item_zone, bip);
+	xfs_buf_rele(bp);
+	xfs_buf_item_free(bip);
 }
 
 
@@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks(
 			   anyway. */
 			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
 			XFS_BUF_DONE(bp);
-			XFS_BUF_V_IODONESEMA(bp);
+			XFS_BUF_FINISH_IOWAIT(bp);
 		}
 		return;
 	}
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
 
 	ASSERT(bip->bli_buf == bp);
 
+	xfs_buf_rele(bp);
 	mp = bip->bli_item.li_mountp;
 
 	/*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
 	 * xfs_trans_delete_ail() drops the AIL lock.
 	 */
 	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
-
-#ifdef XFS_TRANS_DEBUG
-	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
-	bip->bli_orig = NULL;
-	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
-	bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
-	ktrace_free(bip->bli_trace);
-#endif
-	kmem_zone_free(xfs_buf_item_zone, bip);
+	xfs_buf_item_free(bip);
 }
 
 #if defined(XFS_BLI_TRACE)
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index d5d1e60ee224..d2ce5dd70d87 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -78,6 +78,7 @@ struct xfs_mount_args {
 #define XFSMNT_IOSIZE		0x00002000	/* optimize for I/O size */
 #define XFSMNT_OSYNCISOSYNC	0x00004000	/* o_sync is REALLY o_sync */
 						/* (osyncisdsync is default) */
+#define XFSMNT_NOATTR2		0x00008000	/* turn off ATTR2 EA format */
 #define XFSMNT_32BITINODES	0x00200000	/* restrict inodes to 32
 						 * bits of address space */
 #define XFSMNT_GQUOTA		0x00400000	/* group quota accounting */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 021a8f7e563f..9e561a9cefca 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1431,7 +1431,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 	}
 	if (level < 0) {
 		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
-		ASSERT(args->oknoent);
+		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 		return(0);
 	}
 
@@ -1530,6 +1530,28 @@ xfs_da_hashname(const uchar_t *name, int namelen)
 	}
 }
 
+enum xfs_dacmp
+xfs_da_compname(
+	struct xfs_da_args *args,
+	const char 	*name,
+	int 		len)
+{
+	return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+					XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+	struct xfs_name	*name)
+{
+	return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+	.hashname	= xfs_default_hashname,
+	.compname	= xfs_da_compname
+};
+
 /*
  * Add a block to the btree ahead of the file.
  * Return the new block number to the caller.
@@ -1598,7 +1620,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 					args->firstblock, args->total,
 					&mapp[mapi], &nmap, args->flist,
 					NULL))) {
-				kmem_free(mapp, sizeof(*mapp) * count);
+				kmem_free(mapp);
 				return error;
 			}
 			if (nmap < 1)
@@ -1620,11 +1642,11 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
 	    bno + count) {
 		if (mapp != &map)
-			kmem_free(mapp, sizeof(*mapp) * count);
+			kmem_free(mapp);
 		return XFS_ERROR(ENOSPC);
 	}
 	if (mapp != &map)
-		kmem_free(mapp, sizeof(*mapp) * count);
+		kmem_free(mapp);
 	*new_blkno = (xfs_dablk_t)bno;
 	return 0;
 }
@@ -2090,10 +2112,10 @@ xfs_da_do_buf(
 		}
 	}
 	if (bplist) {
-		kmem_free(bplist, sizeof(*bplist) * nmap);
+		kmem_free(bplist);
 	}
 	if (mapp != &map) {
-		kmem_free(mapp, sizeof(*mapp) * nfsb);
+		kmem_free(mapp);
 	}
 	if (bpp)
 		*bpp = rbp;
@@ -2102,11 +2124,11 @@ exit1:
 	if (bplist) {
 		for (i = 0; i < nbplist; i++)
 			xfs_trans_brelse(trans, bplist[i]);
-		kmem_free(bplist, sizeof(*bplist) * nmap);
+		kmem_free(bplist);
 	}
 exit0:
 	if (mapp != &map)
-		kmem_free(mapp, sizeof(*mapp) * nfsb);
+		kmem_free(mapp);
 	if (bpp)
 		*bpp = NULL;
 	return error;
@@ -2218,7 +2240,7 @@ xfs_da_state_free(xfs_da_state_t *state)
 
 #ifdef XFS_DABUF_DEBUG
 xfs_dabuf_t	*xfs_dabuf_global_list;
-spinlock_t	xfs_dabuf_global_lock;
+static DEFINE_SPINLOCK(xfs_dabuf_global_lock);
 #endif
 
 /*
@@ -2315,7 +2337,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf)
 	if (dabuf->dirty)
 		xfs_da_buf_clean(dabuf);
 	if (dabuf->nbuf > 1)
-		kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
+		kmem_free(dabuf->data);
 #ifdef XFS_DABUF_DEBUG
 	{
 		spin_lock(&xfs_dabuf_global_lock);
@@ -2332,7 +2354,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf)
 	if (dabuf->nbuf == 1)
 		kmem_zone_free(xfs_dabuf_zone, dabuf);
 	else
-		kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+		kmem_free(dabuf);
 }
 
 /*
@@ -2403,7 +2425,7 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
 	for (i = 0; i < nbuf; i++)
 		xfs_trans_brelse(tp, bplist[i]);
 	if (bplist != &bp)
-		kmem_free(bplist, nbuf * sizeof(*bplist));
+		kmem_free(bplist);
 }
 
 /*
@@ -2429,7 +2451,7 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
 	for (i = 0; i < nbuf; i++)
 		xfs_trans_binval(tp, bplist[i]);
 	if (bplist != &bp)
-		kmem_free(bplist, nbuf * sizeof(*bplist));
+		kmem_free(bplist);
 }
 
 /*
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 7facf86f74f9..8be0b00ede9a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -99,6 +99,15 @@ typedef struct xfs_da_node_entry xfs_da_node_entry_t;
  *========================================================================*/
 
 /*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+	XFS_CMP_DIFFERENT,	/* names are completely different */
+	XFS_CMP_EXACT,		/* names are exactly the same */
+	XFS_CMP_CASE		/* names are same but differ in case */
+};
+
+/*
  * Structure to ease passing around component names.
  */
 typedef struct xfs_da_args {
@@ -123,13 +132,20 @@ typedef struct xfs_da_args {
 	int		index2;		/* index of 2nd attr in blk */
 	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
 	int		rmtblkcnt2;	/* remote attr value block count */
-	unsigned char	justcheck;	/* T/F: check for ok with no space */
-	unsigned char	rename;		/* T/F: this is an atomic rename op */
-	unsigned char	addname;	/* T/F: this is an add operation */
-	unsigned char	oknoent;	/* T/F: ok to return ENOENT, else die */
+	int		op_flags;	/* operation flags */
+	enum xfs_dacmp	cmpresult;	/* name compare result for lookups */
 } xfs_da_args_t;
 
 /*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK	0x0001	/* check for ok with no space */
+#define XFS_DA_OP_RENAME	0x0002	/* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
+#define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+
+/*
  * Structure to describe buffer(s) for a block.
  * This is needed in the directory version 2 format case, when
  * multiple non-contiguous fsblocks might be needed to cover one
@@ -201,6 +217,14 @@ typedef struct xfs_da_state {
 		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
 		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
 
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+	xfs_dahash_t	(*hashname)(struct xfs_name *);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *, const char *, int);
+};
+
 
 #ifdef __KERNEL__
 /*========================================================================
@@ -249,6 +273,10 @@ int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  xfs_dabuf_t *dead_buf);
 
 uint xfs_da_hashname(const uchar_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+				const char *name, int len);
+
+
 xfs_da_state_t *xfs_da_state_alloc(void);
 void xfs_da_state_free(xfs_da_state_t *state);
 
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 5f3647cb9885..75b0cd4da0ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -116,7 +116,7 @@ xfs_swapext(
  out_put_file:
 	fput(file);
  out_free_sxp:
-	kmem_free(sxp, sizeof(xfs_swapext_t));
+	kmem_free(sxp);
  out:
 	return error;
 }
@@ -128,10 +128,8 @@ xfs_swap_extents(
 	xfs_swapext_t	*sxp)
 {
 	xfs_mount_t	*mp;
-	xfs_inode_t	*ips[2];
 	xfs_trans_t	*tp;
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
-	bhv_vnode_t	*vp, *tvp;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
 	int		ilf_fields, tilf_fields;
 	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -150,19 +148,15 @@ xfs_swap_extents(
 	}
 
 	sbp = &sxp->sx_stat;
-	vp = XFS_ITOV(ip);
-	tvp = XFS_ITOV(tip);
-
-	/* Lock in i_ino order */
-	if (ip->i_ino < tip->i_ino) {
-		ips[0] = ip;
-		ips[1] = tip;
-	} else {
-		ips[0] = tip;
-		ips[1] = ip;
-	}
 
-	xfs_lock_inodes(ips, 2, lock_flags);
+	/*
+	 * we have to do two separate lock calls here to keep lockdep
+	 * happy. If we try to get all the locks in one call, lock will
+	 * report false positives when we drop the ILOCK and regain them
+	 * below.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 	locked = 1;
 
 	/* Verify that both files have the same format */
@@ -184,7 +178,7 @@ xfs_swap_extents(
 		goto error0;
 	}
 
-	if (VN_CACHED(tvp) != 0) {
+	if (VN_CACHED(VFS_I(tip)) != 0) {
 		xfs_inval_cached_trace(tip, 0, -1, 0, -1);
 		error = xfs_flushinval_pages(tip, 0, -1,
 				FI_REMAPF_LOCKED);
@@ -193,7 +187,7 @@ xfs_swap_extents(
 	}
 
 	/* Verify O_DIRECT for ftmp */
-	if (VN_CACHED(tvp) != 0) {
+	if (VN_CACHED(VFS_I(tip)) != 0) {
 		error = XFS_ERROR(EINVAL);
 		goto error0;
 	}
@@ -237,7 +231,7 @@ xfs_swap_extents(
 	 * vop_read (or write in the case of autogrow) they block on the iolock
 	 * until we have switched the extents.
 	 */
-	if (VN_MAPPED(vp)) {
+	if (VN_MAPPED(VFS_I(ip))) {
 		error = XFS_ERROR(EBUSY);
 		goto error0;
 	}
@@ -265,7 +259,7 @@ xfs_swap_extents(
 		locked = 0;
 		goto error0;
 	}
-	xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 
 	/*
 	 * Count the number of extended attribute blocks
@@ -350,15 +344,11 @@ xfs_swap_extents(
 		break;
 	}
 
-	/*
-	 * Increment vnode ref counts since xfs_trans_commit &
-	 * xfs_trans_cancel will both unlock the inodes and
-	 * decrement the associated ref counts.
-	 */
-	VN_HOLD(vp);
-	VN_HOLD(tvp);
 
+	IHOLD(ip);
 	xfs_trans_ijoin(tp, ip, lock_flags);
+
+	IHOLD(tip);
 	xfs_trans_ijoin(tp, tip, lock_flags);
 
 	xfs_trans_log_inode(tp, ip,  ilf_fields);
@@ -381,6 +371,6 @@ xfs_swap_extents(
 		xfs_iunlock(tip, lock_flags);
 	}
 	if (tempifp != NULL)
-		kmem_free(tempifp, sizeof(xfs_ifork_t));
+		kmem_free(tempifp);
 	return error;
 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 7cb26529766b..80e0dc51361c 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,6 +46,54 @@
 
 struct xfs_name xfs_name_dotdot = {"..", 2};
 
+extern const struct xfs_nameops xfs_default_nameops;
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+	struct xfs_name	*name)
+{
+	xfs_dahash_t	hash;
+	int		i;
+
+	for (i = 0, hash = 0; i < name->len; i++)
+		hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+	return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+	struct xfs_da_args *args,
+	const char	*name,
+	int 		len)
+{
+	enum xfs_dacmp	result;
+	int		i;
+
+	if (args->namelen != len)
+		return XFS_CMP_DIFFERENT;
+
+	result = XFS_CMP_EXACT;
+	for (i = 0; i < len; i++) {
+		if (args->name[i] == name[i])
+			continue;
+		if (tolower(args->name[i]) != tolower(name[i]))
+			return XFS_CMP_DIFFERENT;
+		result = XFS_CMP_CASE;
+	}
+
+	return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+	.hashname	= xfs_ascii_ci_hashname,
+	.compname	= xfs_ascii_ci_compname,
+};
+
 void
 xfs_dir_mount(
 	xfs_mount_t	*mp)
@@ -65,6 +113,10 @@ xfs_dir_mount(
 		(mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
 		(uint)sizeof(xfs_da_node_entry_t);
 	mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+	if (xfs_sb_version_hasasciici(&mp->m_sb))
+		mp->m_dirnameops = &xfs_ascii_ci_nameops;
+	else
+		mp->m_dirnameops = &xfs_default_nameops;
 }
 
 /*
@@ -162,9 +214,10 @@ xfs_dir_createname(
 		return rval;
 	XFS_STATS_INC(xs_dir_create);
 
+	memset(&args, 0, sizeof(xfs_da_args_t));
 	args.name = name->name;
 	args.namelen = name->len;
-	args.hashval = xfs_da_hashname(name->name, name->len);
+	args.hashval = dp->i_mount->m_dirnameops->hashname(name);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -172,8 +225,7 @@ xfs_dir_createname(
 	args.total = total;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
+	args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
@@ -191,14 +243,43 @@ xfs_dir_createname(
 }
 
 /*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+	struct xfs_da_args *args,
+	const char	*name,
+	int		len)
+{
+	if (args->cmpresult == XFS_CMP_DIFFERENT)
+		return ENOENT;
+	if (args->cmpresult != XFS_CMP_CASE ||
+					!(args->op_flags & XFS_DA_OP_CILOOKUP))
+		return EEXIST;
+
+	args->value = kmem_alloc(len, KM_MAYFAIL);
+	if (!args->value)
+		return ENOMEM;
+
+	memcpy(args->value, name, len);
+	args->valuelen = len;
+	return EEXIST;
+}
+
+/*
  * Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
  */
+
 int
 xfs_dir_lookup(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*dp,
 	struct xfs_name	*name,
-	xfs_ino_t	*inum)		/* out: inode number */
+	xfs_ino_t	*inum,		/* out: inode number */
+	struct xfs_name *ci_name)	/* out: actual name if CI match */
 {
 	xfs_da_args_t	args;
 	int		rval;
@@ -206,15 +287,17 @@ xfs_dir_lookup(
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_lookup);
-	memset(&args, 0, sizeof(xfs_da_args_t));
 
+	memset(&args, 0, sizeof(xfs_da_args_t));
 	args.name = name->name;
 	args.namelen = name->len;
-	args.hashval = xfs_da_hashname(name->name, name->len);
+	args.hashval = dp->i_mount->m_dirnameops->hashname(name);
 	args.dp = dp;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.oknoent = 1;
+	args.op_flags = XFS_DA_OP_OKNOENT;
+	if (ci_name)
+		args.op_flags |= XFS_DA_OP_CILOOKUP;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_lookup(&args);
@@ -230,8 +313,13 @@ xfs_dir_lookup(
 		rval = xfs_dir2_node_lookup(&args);
 	if (rval == EEXIST)
 		rval = 0;
-	if (rval == 0)
+	if (!rval) {
 		*inum = args.inumber;
+		if (ci_name) {
+			ci_name->name = args.value;
+			ci_name->len = args.valuelen;
+		}
+	}
 	return rval;
 }
 
@@ -255,9 +343,10 @@ xfs_dir_removename(
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_remove);
 
+	memset(&args, 0, sizeof(xfs_da_args_t));
 	args.name = name->name;
 	args.namelen = name->len;
-	args.hashval = xfs_da_hashname(name->name, name->len);
+	args.hashval = dp->i_mount->m_dirnameops->hashname(name);
 	args.inumber = ino;
 	args.dp = dp;
 	args.firstblock = first;
@@ -265,7 +354,6 @@ xfs_dir_removename(
 	args.total = total;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = args.addname = args.oknoent = 0;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_removename(&args);
@@ -338,9 +426,10 @@ xfs_dir_replace(
 	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
 
+	memset(&args, 0, sizeof(xfs_da_args_t));
 	args.name = name->name;
 	args.namelen = name->len;
-	args.hashval = xfs_da_hashname(name->name, name->len);
+	args.hashval = dp->i_mount->m_dirnameops->hashname(name);
 	args.inumber = inum;
 	args.dp = dp;
 	args.firstblock = first;
@@ -348,7 +437,6 @@ xfs_dir_replace(
 	args.total = total;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = args.addname = args.oknoent = 0;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_replace(&args);
@@ -384,15 +472,16 @@ xfs_dir_canenter(
 		return 0;
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	memset(&args, 0, sizeof(xfs_da_args_t));
 
+	memset(&args, 0, sizeof(xfs_da_args_t));
 	args.name = name->name;
 	args.namelen = name->len;
-	args.hashval = xfs_da_hashname(name->name, name->len);
+	args.hashval = dp->i_mount->m_dirnameops->hashname(name);
 	args.dp = dp;
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
-	args.justcheck = args.addname = args.oknoent = 1;
+	args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+							XFS_DA_OP_OKNOENT;
 
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
@@ -493,7 +582,7 @@ xfs_dir2_grow_inode(
 					args->firstblock, args->total,
 					&mapp[mapi], &nmap, args->flist,
 					NULL))) {
-				kmem_free(mapp, sizeof(*mapp) * count);
+				kmem_free(mapp);
 				return error;
 			}
 			if (nmap < 1)
@@ -525,14 +614,14 @@ xfs_dir2_grow_inode(
 	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
 	    bno + count) {
 		if (mapp != &map)
-			kmem_free(mapp, sizeof(*mapp) * count);
+			kmem_free(mapp);
 		return XFS_ERROR(ENOSPC);
 	}
 	/*
 	 * Done with the temporary mapping table.
 	 */
 	if (mapp != &map)
-		kmem_free(mapp, sizeof(*mapp) * count);
+		kmem_free(mapp);
 	*dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
 	/*
 	 * Update file's size if this is the data space and it grew.
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 6392f939029f..1d9ef96f33aa 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -74,7 +74,8 @@ extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
 				xfs_fsblock_t *first,
 				struct xfs_bmap_free *flist, xfs_extlen_t tot);
 extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
-				struct xfs_name *name, xfs_ino_t *inum);
+				struct xfs_name *name, xfs_ino_t *inum,
+				struct xfs_name *ci_name);
 extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
 				struct xfs_name *name, xfs_ino_t ino,
 				xfs_fsblock_t *first,
@@ -99,4 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
 extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
 				struct xfs_dabuf *bp);
 
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name,
+				int len);
+
 #endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index fb5a556725b3..e2fa0a1d8e96 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -215,7 +215,7 @@ xfs_dir2_block_addname(
 	/*
 	 * If this isn't a real add, we're done with the buffer.
 	 */
-	if (args->justcheck)
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
 		xfs_da_brelse(tp, bp);
 	/*
 	 * If we don't have space for the new entry & leaf ...
@@ -225,7 +225,7 @@ xfs_dir2_block_addname(
 		 * Not trying to actually do anything, or don't have
 		 * a space reservation: return no-space.
 		 */
-		if (args->justcheck || args->total == 0)
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
 			return XFS_ERROR(ENOSPC);
 		/*
 		 * Convert to the next larger format.
@@ -240,7 +240,7 @@ xfs_dir2_block_addname(
 	/*
 	 * Just checking, and it would work, so say so.
 	 */
-	if (args->justcheck)
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
 		return 0;
 	needlog = needscan = 0;
 	/*
@@ -610,14 +610,15 @@ xfs_dir2_block_lookup(
 	/*
 	 * Get the offset from the leaf entry, to point to the data.
 	 */
-	dep = (xfs_dir2_data_entry_t *)
-	      ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+	dep = (xfs_dir2_data_entry_t *)((char *)block +
+		xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
 	/*
-	 * Fill in inode number, release the block.
+	 * Fill in inode number, CI name if appropriate, release the block.
 	 */
 	args->inumber = be64_to_cpu(dep->inumber);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_da_brelse(args->trans, bp);
-	return XFS_ERROR(EEXIST);
+	return XFS_ERROR(error);
 }
 
 /*
@@ -643,6 +644,7 @@ xfs_dir2_block_lookup_int(
 	int			mid;		/* binary search current idx */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
 
 	dp = args->dp;
 	tp = args->trans;
@@ -673,7 +675,7 @@ xfs_dir2_block_lookup_int(
 		else
 			high = mid - 1;
 		if (low > high) {
-			ASSERT(args->oknoent);
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 			xfs_da_brelse(tp, bp);
 			return XFS_ERROR(ENOENT);
 		}
@@ -697,20 +699,31 @@ xfs_dir2_block_lookup_int(
 		dep = (xfs_dir2_data_entry_t *)
 			((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
 		/*
-		 * Compare, if it's right give back buffer & entry number.
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
 		 */
-		if (dep->namelen == args->namelen &&
-		    dep->name[0] == args->name[0] &&
-		    memcmp(dep->name, args->name, args->namelen) == 0) {
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
 			*bpp = bp;
 			*entno = mid;
-			return 0;
+			if (cmp == XFS_CMP_EXACT)
+				return 0;
 		}
-	} while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash);
+	} while (++mid < be32_to_cpu(btp->count) &&
+			be32_to_cpu(blp[mid].hashval) == hash);
+
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was found earlier, return success.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE)
+		return 0;
 	/*
 	 * No match, release the buffer and return ENOENT.
 	 */
-	ASSERT(args->oknoent);
 	xfs_da_brelse(tp, bp);
 	return XFS_ERROR(ENOENT);
 }
@@ -1033,6 +1046,7 @@ xfs_dir2_sf_to_block(
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
 	__be16			*tagp;		/* end of data entry */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_name		name;
 
 	xfs_dir2_trace_args("sf_to_block", args);
 	dp = args->dp;
@@ -1071,7 +1085,7 @@ xfs_dir2_sf_to_block(
 	 */
 	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
 	if (error) {
-		kmem_free(buf, buf_len);
+		kmem_free(buf);
 		return error;
 	}
 	/*
@@ -1079,7 +1093,7 @@ xfs_dir2_sf_to_block(
 	 */
 	error = xfs_dir2_data_init(args, blkno, &bp);
 	if (error) {
-		kmem_free(buf, buf_len);
+		kmem_free(buf);
 		return error;
 	}
 	block = bp->data;
@@ -1187,8 +1201,10 @@ xfs_dir2_sf_to_block(
 		tagp = xfs_dir2_data_entry_tag_p(dep);
 		*tagp = cpu_to_be16((char *)dep - (char *)block);
 		xfs_dir2_data_log_entry(tp, bp, dep);
-		blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname(
-					(char *)sfep->name, sfep->namelen));
+		name.name = sfep->name;
+		name.len = sfep->namelen;
+		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+							hashname(&name));
 		blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
 						 (char *)dep - (char *)block));
 		offset = (int)((char *)(tagp + 1) - (char *)block);
@@ -1198,7 +1214,7 @@ xfs_dir2_sf_to_block(
 			sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 	}
 	/* Done with the temporary buffer */
-	kmem_free(buf, buf_len);
+	kmem_free(buf);
 	/*
 	 * Sort the leaf entries by hash value.
 	 */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index fb8c9e08b23d..498f8d694330 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -65,6 +65,7 @@ xfs_dir2_data_check(
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	char			*p;		/* current data position */
 	int			stale;		/* count of stale leaves */
+	struct xfs_name		name;
 
 	mp = dp->i_mount;
 	d = bp->data;
@@ -140,7 +141,9 @@ xfs_dir2_data_check(
 			addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)d));
-			hash = xfs_da_hashname((char *)dep->name, dep->namelen);
+			name.name = dep->name;
+			name.len = dep->namelen;
+			hash = mp->m_dirnameops->hashname(&name);
 			for (i = 0; i < be32_to_cpu(btp->count); i++) {
 				if (be32_to_cpu(lep[i].address) == addr &&
 				    be32_to_cpu(lep[i].hashval) == hash)
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index bc52b803d79b..93535992cb60 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -263,20 +263,21 @@ xfs_dir2_leaf_addname(
 	 * If we don't have enough free bytes but we can make enough
 	 * by compacting out stale entries, we'll do that.
 	 */
-	if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes &&
-	    be16_to_cpu(leaf->hdr.stale) > 1) {
+	if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
+				needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
 		compact = 1;
 	}
 	/*
 	 * Otherwise if we don't have enough free bytes we need to
 	 * convert to node form.
 	 */
-	else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
-		 needbytes) {
+	else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
+						leaf->hdr.count)] < needbytes) {
 		/*
 		 * Just checking or no space reservation, give up.
 		 */
-		if (args->justcheck || args->total == 0) {
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+							args->total == 0) {
 			xfs_da_brelse(tp, lbp);
 			return XFS_ERROR(ENOSPC);
 		}
@@ -301,7 +302,7 @@ xfs_dir2_leaf_addname(
 	 * If just checking, then it will fit unless we needed to allocate
 	 * a new data block.
 	 */
-	if (args->justcheck) {
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
 		xfs_da_brelse(tp, lbp);
 		return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
 	}
@@ -1110,7 +1111,7 @@ xfs_dir2_leaf_getdents(
 		*offset = XFS_DIR2_MAX_DATAPTR;
 	else
 		*offset = xfs_dir2_byte_to_dataptr(mp, curoff);
-	kmem_free(map, map_size * sizeof(*map));
+	kmem_free(map);
 	if (bp)
 		xfs_da_brelse(NULL, bp);
 	return error;
@@ -1298,12 +1299,13 @@ xfs_dir2_leaf_lookup(
 	      ((char *)dbp->data +
 	       xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
 	/*
-	 * Return the found inode number.
+	 * Return the found inode number & CI name if appropriate
 	 */
 	args->inumber = be64_to_cpu(dep->inumber);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
 	xfs_da_brelse(tp, dbp);
 	xfs_da_brelse(tp, lbp);
-	return XFS_ERROR(EEXIST);
+	return XFS_ERROR(error);
 }
 
 /*
@@ -1319,8 +1321,8 @@ xfs_dir2_leaf_lookup_int(
 	int			*indexp,	/* out: index in leaf block */
 	xfs_dabuf_t		**dbpp)		/* out: data buffer */
 {
-	xfs_dir2_db_t		curdb;		/* current data block number */
-	xfs_dabuf_t		*dbp;		/* data buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dabuf_t		*dbp = NULL;	/* data buffer */
 	xfs_dir2_data_entry_t	*dep;		/* data entry */
 	xfs_inode_t		*dp;		/* incore directory inode */
 	int			error;		/* error return code */
@@ -1331,6 +1333,8 @@ xfs_dir2_leaf_lookup_int(
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_dir2_db_t		newdb;		/* new data block number */
 	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		cidb = -1;	/* case match data block no. */
+	enum xfs_dacmp		cmp;		/* name compare result */
 
 	dp = args->dp;
 	tp = args->trans;
@@ -1338,11 +1342,10 @@ xfs_dir2_leaf_lookup_int(
 	/*
 	 * Read the leaf block into the buffer.
 	 */
-	if ((error =
-	    xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-		    XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+							XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
 	*lbpp = lbp;
 	leaf = lbp->data;
 	xfs_dir2_leaf_check(dp, lbp);
@@ -1354,9 +1357,9 @@ xfs_dir2_leaf_lookup_int(
 	 * Loop over all the entries with the right hash value
 	 * looking to match the name.
 	 */
-	for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
-	     index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
-	     lep++, index++) {
+	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+				be32_to_cpu(lep->hashval) == args->hashval;
+				lep++, index++) {
 		/*
 		 * Skip over stale leaf entries.
 		 */
@@ -1373,10 +1376,10 @@ xfs_dir2_leaf_lookup_int(
 		if (newdb != curdb) {
 			if (dbp)
 				xfs_da_brelse(tp, dbp);
-			if ((error =
-			    xfs_da_read_buf(tp, dp,
-				    xfs_dir2_db_to_da(mp, newdb), -1, &dbp,
-				    XFS_DATA_FORK))) {
+			error = xfs_da_read_buf(tp, dp,
+						xfs_dir2_db_to_da(mp, newdb),
+						-1, &dbp, XFS_DATA_FORK);
+			if (error) {
 				xfs_da_brelse(tp, lbp);
 				return error;
 			}
@@ -1386,24 +1389,50 @@ xfs_dir2_leaf_lookup_int(
 		/*
 		 * Point to the data entry.
 		 */
-		dep = (xfs_dir2_data_entry_t *)
-		      ((char *)dbp->data +
-		       xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+		dep = (xfs_dir2_data_entry_t *)((char *)dbp->data +
+			xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
 		/*
-		 * If it matches then return it.
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
 		 */
-		if (dep->namelen == args->namelen &&
-		    dep->name[0] == args->name[0] &&
-		    memcmp(dep->name, args->name, args->namelen) == 0) {
-			*dbpp = dbp;
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
 			*indexp = index;
-			return 0;
+			/* case exact match: return the current buffer. */
+			if (cmp == XFS_CMP_EXACT) {
+				*dbpp = dbp;
+				return 0;
+			}
+			cidb = curdb;
 		}
 	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or remove).
+	 * If a case-insensitive match was found earlier, re-read the
+	 * appropriate data block if required and return it.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE) {
+		ASSERT(cidb != -1);
+		if (cidb != curdb) {
+			xfs_da_brelse(tp, dbp);
+			error = xfs_da_read_buf(tp, dp,
+						xfs_dir2_db_to_da(mp, cidb),
+						-1, &dbp, XFS_DATA_FORK);
+			if (error) {
+				xfs_da_brelse(tp, lbp);
+				return error;
+			}
+		}
+		*dbpp = dbp;
+		return 0;
+	}
 	/*
 	 * No match found, return ENOENT.
 	 */
-	ASSERT(args->oknoent);
+	ASSERT(cidb == -1);
 	if (dbp)
 		xfs_da_brelse(tp, dbp);
 	xfs_da_brelse(tp, lbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 8dade711f099..fa6c3a5ddbc6 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -226,7 +226,7 @@ xfs_dir2_leafn_add(
 	ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
 	       be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
 
-	if (args->justcheck)
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
 		return 0;
 
 	/*
@@ -387,28 +387,26 @@ xfs_dir2_leafn_lasthash(
 }
 
 /*
- * Look up a leaf entry in a node-format leaf block.
- * If this is an addname then the extrablk in state is a freespace block,
- * otherwise it's a data block.
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
  */
-int
-xfs_dir2_leafn_lookup_int(
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
 	xfs_dabuf_t		*bp,		/* leaf buffer */
 	xfs_da_args_t		*args,		/* operation arguments */
 	int			*indexp,	/* out: leaf entry index */
 	xfs_da_state_t		*state)		/* state to fill in */
 {
-	xfs_dabuf_t		*curbp;		/* current data/free buffer */
-	xfs_dir2_db_t		curdb;		/* current data block number */
-	xfs_dir2_db_t		curfdb;		/* current free block number */
-	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_dabuf_t		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_db_t		curfdb = -1;	/* current free block number */
 	xfs_inode_t		*dp;		/* incore directory inode */
 	int			error;		/* error return value */
 	int			fi;		/* free entry index */
-	xfs_dir2_free_t		*free=NULL;	/* free block structure */
+	xfs_dir2_free_t		*free = NULL;	/* free block structure */
 	int			index;		/* leaf entry index */
 	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
-	int			length=0;	/* length of new data entry */
+	int			length;		/* length of new data entry */
 	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
 	xfs_mount_t		*mp;		/* filesystem mount point */
 	xfs_dir2_db_t		newdb;		/* new data block number */
@@ -431,33 +429,20 @@ xfs_dir2_leafn_lookup_int(
 	/*
 	 * Do we have a buffer coming in?
 	 */
-	if (state->extravalid)
+	if (state->extravalid) {
+		/* If so, it's a free block buffer, get the block number. */
 		curbp = state->extrablk.bp;
-	else
-		curbp = NULL;
-	/*
-	 * For addname, it's a free block buffer, get the block number.
-	 */
-	if (args->addname) {
-		curfdb = curbp ? state->extrablk.blkno : -1;
-		curdb = -1;
-		length = xfs_dir2_data_entsize(args->namelen);
-		if ((free = (curbp ? curbp->data : NULL)))
-			ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
-	}
-	/*
-	 * For others, it's a data block buffer, get the block number.
-	 */
-	else {
-		curfdb = -1;
-		curdb = curbp ? state->extrablk.blkno : -1;
+		curfdb = state->extrablk.blkno;
+		free = curbp->data;
+		ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
 	}
+	length = xfs_dir2_data_entsize(args->namelen);
 	/*
 	 * Loop over leaf entries with the right hash value.
 	 */
-	for (lep = &leaf->ents[index];
-	     index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
-	     lep++, index++) {
+	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+				be32_to_cpu(lep->hashval) == args->hashval;
+				lep++, index++) {
 		/*
 		 * Skip stale leaf entries.
 		 */
@@ -471,161 +456,244 @@ xfs_dir2_leafn_lookup_int(
 		 * For addname, we're looking for a place to put the new entry.
 		 * We want to use a data block with an entry of equal
 		 * hash value to ours if there is one with room.
+		 *
+		 * If this block isn't the data block we already have
+		 * in hand, take a look at it.
 		 */
-		if (args->addname) {
+		if (newdb != curdb) {
+			curdb = newdb;
 			/*
-			 * If this block isn't the data block we already have
-			 * in hand, take a look at it.
+			 * Convert the data block to the free block
+			 * holding its freespace information.
 			 */
-			if (newdb != curdb) {
-				curdb = newdb;
-				/*
-				 * Convert the data block to the free block
-				 * holding its freespace information.
-				 */
-				newfdb = xfs_dir2_db_to_fdb(mp, newdb);
-				/*
-				 * If it's not the one we have in hand,
-				 * read it in.
-				 */
-				if (newfdb != curfdb) {
-					/*
-					 * If we had one before, drop it.
-					 */
-					if (curbp)
-						xfs_da_brelse(tp, curbp);
-					/*
-					 * Read the free block.
-					 */
-					if ((error = xfs_da_read_buf(tp, dp,
-							xfs_dir2_db_to_da(mp,
-								newfdb),
-							-1, &curbp,
-							XFS_DATA_FORK))) {
-						return error;
-					}
-					free = curbp->data;
-					ASSERT(be32_to_cpu(free->hdr.magic) ==
-					       XFS_DIR2_FREE_MAGIC);
-					ASSERT((be32_to_cpu(free->hdr.firstdb) %
-						XFS_DIR2_MAX_FREE_BESTS(mp)) ==
-					       0);
-					ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
-					ASSERT(curdb <
-					       be32_to_cpu(free->hdr.firstdb) +
-					       be32_to_cpu(free->hdr.nvalid));
-				}
-				/*
-				 * Get the index for our entry.
-				 */
-				fi = xfs_dir2_db_to_fdindex(mp, curdb);
-				/*
-				 * If it has room, return it.
-				 */
-				if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
-					XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
-							 XFS_ERRLEVEL_LOW, mp);
-					if (curfdb != newfdb)
-						xfs_da_brelse(tp, curbp);
-					return XFS_ERROR(EFSCORRUPTED);
-				}
-				curfdb = newfdb;
-				if (be16_to_cpu(free->bests[fi]) >= length) {
-					*indexp = index;
-					state->extravalid = 1;
-					state->extrablk.bp = curbp;
-					state->extrablk.blkno = curfdb;
-					state->extrablk.index = fi;
-					state->extrablk.magic =
-						XFS_DIR2_FREE_MAGIC;
-					ASSERT(args->oknoent);
-					return XFS_ERROR(ENOENT);
-				}
-			}
-		}
-		/*
-		 * Not adding a new entry, so we really want to find
-		 * the name given to us.
-		 */
-		else {
+			newfdb = xfs_dir2_db_to_fdb(mp, newdb);
 			/*
-			 * If it's a different data block, go get it.
+			 * If it's not the one we have in hand, read it in.
 			 */
-			if (newdb != curdb) {
+			if (newfdb != curfdb) {
 				/*
-				 * If we had a block before, drop it.
+				 * If we had one before, drop it.
 				 */
 				if (curbp)
 					xfs_da_brelse(tp, curbp);
 				/*
-				 * Read the data block.
+				 * Read the free block.
 				 */
-				if ((error =
-				    xfs_da_read_buf(tp, dp,
-					    xfs_dir2_db_to_da(mp, newdb), -1,
-					    &curbp, XFS_DATA_FORK))) {
+				error = xfs_da_read_buf(tp, dp,
+						xfs_dir2_db_to_da(mp, newfdb),
+						-1, &curbp, XFS_DATA_FORK);
+				if (error)
 					return error;
-				}
-				xfs_dir2_data_check(dp, curbp);
-				curdb = newdb;
+				free = curbp->data;
+				ASSERT(be32_to_cpu(free->hdr.magic) ==
+					XFS_DIR2_FREE_MAGIC);
+				ASSERT((be32_to_cpu(free->hdr.firstdb) %
+					XFS_DIR2_MAX_FREE_BESTS(mp)) == 0);
+				ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
+				ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
+					be32_to_cpu(free->hdr.nvalid));
 			}
 			/*
-			 * Point to the data entry.
+			 * Get the index for our entry.
 			 */
-			dep = (xfs_dir2_data_entry_t *)
-			      ((char *)curbp->data +
-			       xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+			fi = xfs_dir2_db_to_fdindex(mp, curdb);
 			/*
-			 * Compare the entry, return it if it matches.
+			 * If it has room, return it.
 			 */
-			if (dep->namelen == args->namelen &&
-			    dep->name[0] == args->name[0] &&
-			    memcmp(dep->name, args->name, args->namelen) == 0) {
-				args->inumber = be64_to_cpu(dep->inumber);
-				*indexp = index;
-				state->extravalid = 1;
-				state->extrablk.bp = curbp;
-				state->extrablk.blkno = curdb;
-				state->extrablk.index =
-					(int)((char *)dep -
-					      (char *)curbp->data);
-				state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-				return XFS_ERROR(EEXIST);
+			if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
+				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+							XFS_ERRLEVEL_LOW, mp);
+				if (curfdb != newfdb)
+					xfs_da_brelse(tp, curbp);
+				return XFS_ERROR(EFSCORRUPTED);
 			}
+			curfdb = newfdb;
+			if (be16_to_cpu(free->bests[fi]) >= length)
+				goto out;
 		}
 	}
+	/* Didn't find any space */
+	fi = -1;
+out:
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	if (curbp) {
+		/* Giving back a free block. */
+		state->extravalid = 1;
+		state->extrablk.bp = curbp;
+		state->extrablk.index = fi;
+		state->extrablk.blkno = curfdb;
+		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+	} else {
+		state->extravalid = 0;
+	}
 	/*
-	 * Didn't find a match.
-	 * If we are holding a buffer, give it back in case our caller
-	 * finds it useful.
+	 * Return the index, that will be the insertion point.
 	 */
-	if ((state->extravalid = (curbp != NULL))) {
-		state->extrablk.bp = curbp;
-		state->extrablk.index = -1;
+	*indexp = index;
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	xfs_dabuf_t		*curbp = NULL;	/* current data/free buffer */
+	xfs_dir2_db_t		curdb = -1;	/* current data block number */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->data;
+	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+	ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
+#endif
+	xfs_dir2_leafn_check(dp, bp);
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid) {
+		curbp = state->extrablk.bp;
+		curdb = state->extrablk.blkno;
+	}
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+				be32_to_cpu(lep->hashval) == args->hashval;
+				lep++, index++) {
 		/*
-		 * For addname, giving back a free block.
+		 * Skip stale leaf entries.
 		 */
-		if (args->addname) {
-			state->extrablk.blkno = curfdb;
-			state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+		/*
+		 * Not adding a new entry, so we really want to find
+		 * the name given to us.
+		 *
+		 * If it's a different data block, go get it.
+		 */
+		if (newdb != curdb) {
+			/*
+			 * If we had a block before that we aren't saving
+			 * for a CI name, drop it
+			 */
+			if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+						curdb != state->extrablk.blkno))
+				xfs_da_brelse(tp, curbp);
+			/*
+			 * If needing the block that is saved with a CI match,
+			 * use it otherwise read in the new data block.
+			 */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+					newdb == state->extrablk.blkno) {
+				ASSERT(state->extravalid);
+				curbp = state->extrablk.bp;
+			} else {
+				error = xfs_da_read_buf(tp, dp,
+						xfs_dir2_db_to_da(mp, newdb),
+						-1, &curbp, XFS_DATA_FORK);
+				if (error)
+					return error;
+			}
+			xfs_dir2_data_check(dp, curbp);
+			curdb = newdb;
 		}
 		/*
-		 * For other callers, giving back a data block.
+		 * Point to the data entry.
 		 */
-		else {
+		dep = (xfs_dir2_data_entry_t *)((char *)curbp->data +
+			xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+		/*
+		 * Compare the entry and if it's an exact match, return
+		 * EEXIST immediately. If it's the first case-insensitive
+		 * match, store the block & inode number and continue looking.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			/* If there is a CI match block, drop it */
+			if (args->cmpresult != XFS_CMP_DIFFERENT &&
+						curdb != state->extrablk.blkno)
+				xfs_da_brelse(tp, state->extrablk.bp);
+			args->cmpresult = cmp;
+			args->inumber = be64_to_cpu(dep->inumber);
+			*indexp = index;
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
 			state->extrablk.blkno = curdb;
+			state->extrablk.index = (int)((char *)dep -
+							(char *)curbp->data);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			if (cmp == XFS_CMP_EXACT)
+				return XFS_ERROR(EEXIST);
 		}
 	}
-	/*
-	 * Return the final index, that will be the insertion point.
-	 */
+	ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
+					(args->op_flags & XFS_DA_OP_OKNOENT));
+	if (curbp) {
+		if (args->cmpresult == XFS_CMP_DIFFERENT) {
+			/* Giving back last used data block. */
+			state->extravalid = 1;
+			state->extrablk.bp = curbp;
+			state->extrablk.index = -1;
+			state->extrablk.blkno = curdb;
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+		} else {
+			/* If the curbp is not the CI match block, drop it */
+			if (state->extrablk.bp != curbp)
+				xfs_da_brelse(tp, curbp);
+		}
+	} else {
+		state->extravalid = 0;
+	}
 	*indexp = index;
-	ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent);
 	return XFS_ERROR(ENOENT);
 }
 
 /*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	if (args->op_flags & XFS_DA_OP_ADDNAME)
+		return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+							state);
+	return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
  * Move count leaf entries from source to destination leaf.
  * Log entries and headers.  Stale entries are preserved.
  */
@@ -823,9 +891,10 @@ xfs_dir2_leafn_rebalance(
 	 */
 	if (!state->inleaf)
 		blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
-	
-	/* 
-	 * Finally sanity check just to make sure we are not returning a negative index 
+
+	/*
+	 * Finally sanity check just to make sure we are not returning a
+	 * negative index
 	 */
 	if(blk2->index < 0) {
 		state->inleaf = 1;
@@ -1332,7 +1401,7 @@ xfs_dir2_node_addname(
 		/*
 		 * It worked, fix the hash values up the btree.
 		 */
-		if (!args->justcheck)
+		if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
 			xfs_da_fixhashpath(state, &state->path);
 	} else {
 		/*
@@ -1515,7 +1584,8 @@ xfs_dir2_node_addname_int(
 		/*
 		 * Not allowed to allocate, return failure.
 		 */
-		if (args->justcheck || args->total == 0) {
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+							args->total == 0) {
 			/*
 			 * Drop the freespace buffer unless it came from our
 			 * caller.
@@ -1661,7 +1731,7 @@ xfs_dir2_node_addname_int(
 		/*
 		 * If just checking, we succeeded.
 		 */
-		if (args->justcheck) {
+		if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
 			if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
 				xfs_da_buf_done(fbp);
 			return 0;
@@ -1767,6 +1837,14 @@ xfs_dir2_node_lookup(
 	error = xfs_da_node_lookup_int(state, &rval);
 	if (error)
 		rval = error;
+	else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
+		/* If a CI match, dup the actual name and return EEXIST */
+		xfs_dir2_data_entry_t	*dep;
+
+		dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp->
+						data + state->extrablk.index);
+		rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	}
 	/*
 	 * Release the btree blocks and leaf block.
 	 */
@@ -1810,9 +1888,8 @@ xfs_dir2_node_removename(
 	 * Look up the entry we're deleting, set up the cursor.
 	 */
 	error = xfs_da_node_lookup_int(state, &rval);
-	if (error) {
+	if (error)
 		rval = error;
-	}
 	/*
 	 * Didn't find it, upper layer screwed up.
 	 */
@@ -1829,9 +1906,8 @@ xfs_dir2_node_removename(
 	 */
 	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
 		&state->extrablk, &rval);
-	if (error) {
+	if (error)
 		return error;
-	}
 	/*
 	 * Fix the hash values up the btree.
 	 */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 919d275a1cef..b46af0013ec9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -255,7 +255,7 @@ xfs_dir2_block_to_sf(
 	xfs_dir2_sf_check(args);
 out:
 	xfs_trans_log_inode(args->trans, dp, logflags);
-	kmem_free(block, mp->m_dirblksize);
+	kmem_free(block);
 	return error;
 }
 
@@ -332,7 +332,7 @@ xfs_dir2_sf_addname(
 		/*
 		 * Just checking or no space reservation, it doesn't fit.
 		 */
-		if (args->justcheck || args->total == 0)
+		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
 			return XFS_ERROR(ENOSPC);
 		/*
 		 * Convert to block form then add the name.
@@ -345,7 +345,7 @@ xfs_dir2_sf_addname(
 	/*
 	 * Just checking, it fits.
 	 */
-	if (args->justcheck)
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
 		return 0;
 	/*
 	 * Do it the easy way - just add it at the end.
@@ -512,7 +512,7 @@ xfs_dir2_sf_addname_hard(
 		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 		memcpy(sfep, oldsfep, old_isize - nbytes);
 	}
-	kmem_free(buf, old_isize);
+	kmem_free(buf);
 	dp->i_d.di_size = new_isize;
 	xfs_dir2_sf_check(args);
 }
@@ -812,8 +812,11 @@ xfs_dir2_sf_lookup(
 {
 	xfs_inode_t		*dp;		/* incore directory inode */
 	int			i;		/* entry index */
+	int			error;
 	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
 	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	xfs_dir2_sf_entry_t	*ci_sfep;	/* case-insens. entry */
 
 	xfs_dir2_trace_args("sf_lookup", args);
 	xfs_dir2_sf_check(args);
@@ -836,6 +839,7 @@ xfs_dir2_sf_lookup(
 	 */
 	if (args->namelen == 1 && args->name[0] == '.') {
 		args->inumber = dp->i_ino;
+		args->cmpresult = XFS_CMP_EXACT;
 		return XFS_ERROR(EEXIST);
 	}
 	/*
@@ -844,28 +848,41 @@ xfs_dir2_sf_lookup(
 	if (args->namelen == 2 &&
 	    args->name[0] == '.' && args->name[1] == '.') {
 		args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+		args->cmpresult = XFS_CMP_EXACT;
 		return XFS_ERROR(EEXIST);
 	}
 	/*
 	 * Loop over all the entries trying to match ours.
 	 */
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-	     i < sfp->hdr.count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
-		if (sfep->namelen == args->namelen &&
-		    sfep->name[0] == args->name[0] &&
-		    memcmp(args->name, sfep->name, args->namelen) == 0) {
-			args->inumber =
-				xfs_dir2_sf_get_inumber(sfp,
-					xfs_dir2_sf_inumberp(sfep));
-			return XFS_ERROR(EEXIST);
+	ci_sfep = NULL;
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+				i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+		/*
+		 * Compare name and if it's an exact match, return the inode
+		 * number. If it's the first case-insensitive match, store the
+		 * inode number and continue looking for an exact match.
+		 */
+		cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+								sfep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			args->inumber = xfs_dir2_sf_get_inumber(sfp,
+						xfs_dir2_sf_inumberp(sfep));
+			if (cmp == XFS_CMP_EXACT)
+				return XFS_ERROR(EEXIST);
+			ci_sfep = sfep;
 		}
 	}
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 	/*
-	 * Didn't find it.
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was not found, return ENOENT.
 	 */
-	ASSERT(args->oknoent);
-	return XFS_ERROR(ENOENT);
+	if (!ci_sfep)
+		return XFS_ERROR(ENOENT);
+	/* otherwise process the CI match as required by the caller */
+	error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+	return XFS_ERROR(error);
 }
 
 /*
@@ -904,24 +921,21 @@ xfs_dir2_sf_removename(
 	 * Loop over the old directory entries.
 	 * Find the one we're deleting.
 	 */
-	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-	     i < sfp->hdr.count;
-	     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
-		if (sfep->namelen == args->namelen &&
-		    sfep->name[0] == args->name[0] &&
-		    memcmp(sfep->name, args->name, args->namelen) == 0) {
+	for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+				i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+		if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
 			ASSERT(xfs_dir2_sf_get_inumber(sfp,
-					xfs_dir2_sf_inumberp(sfep)) ==
-				args->inumber);
+						xfs_dir2_sf_inumberp(sfep)) ==
+								args->inumber);
 			break;
 		}
 	}
 	/*
 	 * Didn't find it.
 	 */
-	if (i == sfp->hdr.count) {
+	if (i == sfp->hdr.count)
 		return XFS_ERROR(ENOENT);
-	}
 	/*
 	 * Calculate sizes.
 	 */
@@ -1042,11 +1056,10 @@ xfs_dir2_sf_replace(
 	 */
 	else {
 		for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
-		     i < sfp->hdr.count;
-		     i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
-			if (sfep->namelen == args->namelen &&
-			    sfep->name[0] == args->name[0] &&
-			    memcmp(args->name, sfep->name, args->namelen) == 0) {
+				i < sfp->hdr.count;
+				i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+			if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+								XFS_CMP_EXACT) {
 #if XFS_BIG_INUMS || defined(DEBUG)
 				ino = xfs_dir2_sf_get_inumber(sfp,
 					xfs_dir2_sf_inumberp(sfep));
@@ -1061,7 +1074,7 @@ xfs_dir2_sf_replace(
 		 * Didn't find it.
 		 */
 		if (i == sfp->hdr.count) {
-			ASSERT(args->oknoent);
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
 #if XFS_BIG_INUMS
 			if (i8elevated)
 				xfs_dir2_sf_toino4(args);
@@ -1174,7 +1187,7 @@ xfs_dir2_sf_toino4(
 	/*
 	 * Clean up the inode.
 	 */
-	kmem_free(buf, oldsize);
+	kmem_free(buf);
 	dp->i_d.di_size = newsize;
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 }
@@ -1251,7 +1264,7 @@ xfs_dir2_sf_toino8(
 	/*
 	 * Clean up the inode.
 	 */
-	kmem_free(buf, oldsize);
+	kmem_free(buf);
 	dp->i_d.di_size = newsize;
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 }
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index 005629d702d2..deecc9d238f8 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -62,7 +62,7 @@ typedef union {
  * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
  * Only need 16 bits, this is the byte offset into the single block form.
  */
-typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
 
 /*
  * The parent directory has a dedicated field, and the self-pointer must
@@ -76,14 +76,14 @@ typedef struct xfs_dir2_sf_hdr {
 	__uint8_t		count;		/* count of entries */
 	__uint8_t		i8count;	/* count of 8-byte inode #s */
 	xfs_dir2_inou_t		parent;		/* parent dir inode number */
-} xfs_dir2_sf_hdr_t;
+} __arch_pack xfs_dir2_sf_hdr_t;
 
 typedef struct xfs_dir2_sf_entry {
 	__uint8_t		namelen;	/* actual name length */
 	xfs_dir2_sf_off_t	offset;		/* saved offset */
 	__uint8_t		name[1];	/* name, variable size */
 	xfs_dir2_inou_t		inumber;	/* inode number, var. offset */
-} xfs_dir2_sf_entry_t;
+} __arch_pack xfs_dir2_sf_entry_t; 
 
 typedef struct xfs_dir2_sf {
 	xfs_dir2_sf_hdr_t	hdr;		/* shortform header */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
index f3fb2ffd6f5c..6cc7c0c681ac 100644
--- a/fs/xfs/xfs_dir2_trace.c
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -85,7 +85,8 @@ xfs_dir2_trace_args(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck, NULL, NULL);
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+		NULL, NULL);
 }
 
 void
@@ -100,7 +101,7 @@ xfs_dir2_trace_args_b(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck,
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
 		(void *)(bp ? bp->bps[0] : NULL), NULL);
 }
 
@@ -117,7 +118,7 @@ xfs_dir2_trace_args_bb(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck,
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
 		(void *)(lbp ? lbp->bps[0] : NULL),
 		(void *)(dbp ? dbp->bps[0] : NULL));
 }
@@ -157,8 +158,8 @@ xfs_dir2_trace_args_db(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck, (void *)(long)db,
-		(void *)dbp);
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+		(void *)(long)db, (void *)dbp);
 }
 
 void
@@ -173,7 +174,7 @@ xfs_dir2_trace_args_i(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck,
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
 		(void *)((unsigned long)(i >> 32)),
 		(void *)((unsigned long)(i & 0xFFFFFFFF)));
 }
@@ -190,7 +191,8 @@ xfs_dir2_trace_args_s(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck, (void *)(long)s, NULL);
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+		(void *)(long)s, NULL);
 }
 
 void
@@ -208,7 +210,7 @@ xfs_dir2_trace_args_sb(
 		(void *)((unsigned long)(args->inumber >> 32)),
 		(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
 		(void *)args->dp, (void *)args->trans,
-		(void *)(unsigned long)args->justcheck, (void *)(long)s,
-		(void *)dbp);
+		(void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+		(void *)(long)s, (void *)dbp);
 }
 #endif	/* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index f71784ab6a60..2813cdd72375 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_DMAPI_H__
 #define __XFS_DMAPI_H__
 
-#include <linux/version.h>
 /*	Values used to define the on-disk version of dm_attrname_t. All
  *	on-disk attribute names start with the 8-byte string "SGI_DMI_".
  *
@@ -166,6 +165,6 @@ typedef enum {
 
 #define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
 			DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
+#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
 
 #endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 05e5365d3c31..f227ecd1a294 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,22 +58,11 @@ xfs_error_trap(int e)
 	}
 	return e;
 }
-#endif
-
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
 
 int	xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t	xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
 
-void
-xfs_error_test_init(void)
-{
-	memset(xfs_etest, 0, sizeof(xfs_etest));
-	memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid));
-	memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname));
-}
-
 int
 xfs_error_test(int error_tag, int *fsidp, char *expression,
 	       int line, char *file, unsigned long randfactor)
@@ -150,8 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 				xfs_etest[i]);
 			xfs_etest[i] = 0;
 			xfs_etest_fsid[i] = 0LL;
-			kmem_free(xfs_etest_fsname[i],
-				  strlen(xfs_etest_fsname[i]) + 1);
+			kmem_free(xfs_etest_fsname[i]);
 			xfs_etest_fsname[i] = NULL;
 		}
 	}
@@ -163,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 
 	return 0;
 }
-#endif /* DEBUG || INDUCE_IO_ERROR */
+#endif /* DEBUG */
 
 static void
 xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
@@ -175,7 +163,7 @@ xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
 		newfmt = kmem_alloc(len, KM_SLEEP);
 		sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
 		icmn_err(level, newfmt, ap);
-		kmem_free(newfmt, len);
+		kmem_free(newfmt);
 	} else {
 		icmn_err(level, fmt, ap);
 	}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 6490d2a9f8e1..11543f10b0c6 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -125,23 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
 #define XFS_RANDOM_DIOWRITE_IOERR			(XFS_RANDOM_DEFAULT/10)
 #define	XFS_RANDOM_BMAPIFORMAT				XFS_RANDOM_DEFAULT
 
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+#ifdef DEBUG
 extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
-extern void xfs_error_test_init(void);
 
 #define	XFS_NUM_INJECT_ERROR				10
-
-#ifdef __ANSI_CPP__
-#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
-	((expr) || \
-	 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
-			 (rf)))
-#else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)		\
 	((expr) || \
 	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
 			(rf)))
-#endif /* __ANSI_CPP__ */
 
 extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
 extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
@@ -149,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
 #define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
 #define xfs_errortag_add(tag, mp)		(ENOSYS)
 #define xfs_errortag_clearall(mp, loud)		(ENOSYS)
-#endif /* (DEBUG || INDUCE_IO_ERROR) */
+#endif /* DEBUG */
 
 /*
  * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 132bd07b9bb8..8aa28f751b2a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -41,8 +41,7 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
 	int nexts = efip->efi_format.efi_nextents;
 
 	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
-		kmem_free(efip, sizeof(xfs_efi_log_item_t) +
-				(nexts - 1) * sizeof(xfs_extent_t));
+		kmem_free(efip);
 	} else {
 		kmem_zone_free(xfs_efi_zone, efip);
 	}
@@ -374,8 +373,7 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
 	int nexts = efdp->efd_format.efd_nextents;
 
 	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
-		kmem_free(efdp, sizeof(xfs_efd_log_item_t) +
-				(nexts - 1) * sizeof(xfs_extent_t));
+		kmem_free(efdp);
 	} else {
 		kmem_zone_free(xfs_efd_zone, efdp);
 	}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3f3785b10804..f3bb75da384e 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -397,10 +397,12 @@ int
 xfs_filestream_init(void)
 {
 	item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
+	if (!item_zone)
+		return -ENOMEM;
 #ifdef XFS_FILESTREAMS_TRACE
-	xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP);
+	xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
 #endif
-	return item_zone ? 0 : -ENOMEM;
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 3bed6433d050..01c0cc88d3f3 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names */
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 
 
@@ -371,6 +372,9 @@ typedef struct xfs_fsop_attrlist_handlereq {
 
 typedef struct xfs_attr_multiop {
 	__u32		am_opcode;
+#define ATTR_OP_GET	1	/* return the indicated attr's value */
+#define ATTR_OP_SET	2	/* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE	3	/* remove the indicated attr */
 	__s32		am_error;
 	void		__user *am_attrname;
 	void		__user *am_attrvalue;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 381ebda4f7bc..84583cf73db3 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -95,6 +95,8 @@ xfs_fs_geometry(
 				XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
 			(xfs_sb_version_hassector(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
+			(xfs_sb_version_hasasciici(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
 			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
 			(xfs_sb_version_hasattr2(&mp->m_sb) ?
@@ -625,7 +627,7 @@ xfs_fs_goingdown(
 			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 			thaw_bdev(sb->s_bdev, sb);
 		}
-	
+
 		break;
 	}
 	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index e5310c90e50f..83502f3edef0 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -181,7 +181,7 @@ xfs_inobt_delrec(
 		 * then we can get rid of this level.
 		 */
 		if (numrecs == 1 && level > 0) {
-			agbp = cur->bc_private.i.agbp;
+			agbp = cur->bc_private.a.agbp;
 			agi = XFS_BUF_TO_AGI(agbp);
 			/*
 			 * pp is still set to the first pointer in the block.
@@ -194,7 +194,7 @@ xfs_inobt_delrec(
 			 * Free the block.
 			 */
 			if ((error = xfs_free_extent(cur->bc_tp,
-				XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
+				XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
 				return error;
 			xfs_trans_binval(cur->bc_tp, bp);
 			xfs_ialloc_log_agi(cur->bc_tp, agbp,
@@ -379,7 +379,7 @@ xfs_inobt_delrec(
 		rrecs = be16_to_cpu(right->bb_numrecs);
 		rbp = bp;
 		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.i.agno, lbno, 0, &lbp,
+				cur->bc_private.a.agno, lbno, 0, &lbp,
 				XFS_INO_BTREE_REF)))
 			return error;
 		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -401,7 +401,7 @@ xfs_inobt_delrec(
 		lrecs = be16_to_cpu(left->bb_numrecs);
 		lbp = bp;
 		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.i.agno, rbno, 0, &rbp,
+				cur->bc_private.a.agno, rbno, 0, &rbp,
 				XFS_INO_BTREE_REF)))
 			return error;
 		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -484,7 +484,7 @@ xfs_inobt_delrec(
 		xfs_buf_t		*rrbp;
 
 		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-				cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0,
+				cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
 				&rrbp, XFS_INO_BTREE_REF)))
 			return error;
 		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
@@ -497,7 +497,7 @@ xfs_inobt_delrec(
 	 * Free the deleting block.
 	 */
 	if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
-				     cur->bc_private.i.agno, rbno), 1)))
+				     cur->bc_private.a.agno, rbno), 1)))
 		return error;
 	xfs_trans_binval(cur->bc_tp, rbp);
 	/*
@@ -854,7 +854,7 @@ xfs_inobt_lookup(
 	{
 		xfs_agi_t	*agi;	/* a.g. inode header */
 
-		agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+		agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
 		agno = be32_to_cpu(agi->agi_seqno);
 		agbno = be32_to_cpu(agi->agi_root);
 	}
@@ -1089,7 +1089,7 @@ xfs_inobt_lshift(
 	 * Set up the left neighbor as "left".
 	 */
 	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib),
+			cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
 			0, &lbp, XFS_INO_BTREE_REF)))
 		return error;
 	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -1207,10 +1207,10 @@ xfs_inobt_newroot(
 	/*
 	 * Get a block & a buffer.
 	 */
-	agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+	agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
 	args.tp = cur->bc_tp;
 	args.mp = cur->bc_mp;
-	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
 		be32_to_cpu(agi->agi_root));
 	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
 		args.isfl = args.userdata = args.minalignslop = 0;
@@ -1233,7 +1233,7 @@ xfs_inobt_newroot(
 	 */
 	agi->agi_root = cpu_to_be32(args.agbno);
 	be32_add_cpu(&agi->agi_level, 1);
-	xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
+	xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
 		XFS_AGI_ROOT | XFS_AGI_LEVEL);
 	/*
 	 * At the previous root level there are now two blocks: the old
@@ -1376,7 +1376,7 @@ xfs_inobt_rshift(
 	 * Set up the right neighbor as "right".
 	 */
 	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-			cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib),
+			cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
 			0, &rbp, XFS_INO_BTREE_REF)))
 		return error;
 	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -1492,7 +1492,7 @@ xfs_inobt_split(
 	 * Allocate the new block.
 	 * If we can't do it, we're toast.  Give up.
 	 */
-	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
 	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
 		args.isfl = args.userdata = args.minalignslop = 0;
 	args.minlen = args.maxlen = args.prod = 1;
@@ -1725,7 +1725,7 @@ xfs_inobt_decrement(
 
 		agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
 		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-				cur->bc_private.i.agno, agbno, 0, &bp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
 				XFS_INO_BTREE_REF)))
 			return error;
 		lev--;
@@ -1897,7 +1897,7 @@ xfs_inobt_increment(
 
 		agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
 		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-				cur->bc_private.i.agno, agbno, 0, &bp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
 				XFS_INO_BTREE_REF)))
 			return error;
 		lev--;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b07604b94d9f..e229e9e001c2 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -216,7 +216,14 @@ finish_inode:
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	init_waitqueue_head(&ip->i_ipin_wait);
 	atomic_set(&ip->i_pincount, 0);
-	initnsema(&ip->i_flock, 1, "xfsfino");
+
+	/*
+	 * Because we want to use a counting completion, complete
+	 * the flush completion once to allow a single access to
+	 * the flush completion without blocking.
+	 */
+	init_completion(&ip->i_flush);
+	complete(&ip->i_flush);
 
 	if (lock_flags)
 		xfs_ilock(ip, lock_flags);
@@ -288,10 +295,17 @@ finish_inode:
 	*ipp = ip;
 
 	/*
+	 * Set up the Linux with the Linux inode.
+	 */
+	ip->i_vnode = inode;
+	inode->i_private = ip;
+
+	/*
 	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
-	xfs_initialize_vnode(mp, inode, ip);
+	if (ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
 	return 0;
 }
 
@@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t	*ip,
  * Special iput for brand-new inodes that are still locked
  */
 void
-xfs_iput_new(xfs_inode_t	*ip,
-	     uint		lock_flags)
+xfs_iput_new(
+	xfs_inode_t	*ip,
+	uint		lock_flags)
 {
-	struct inode	*inode = ip->i_vnode;
+	struct inode	*inode = VFS_I(ip);
 
 	xfs_itrace_entry(ip);
 
@@ -775,26 +790,3 @@ xfs_isilocked(
 }
 #endif
 
-/*
- * The following three routines simply manage the i_flock
- * semaphore embedded in the inode.  This semaphore synchronizes
- * processes attempting to flush the in-core inode back to disk.
- */
-void
-xfs_iflock(xfs_inode_t *ip)
-{
-	psema(&(ip->i_flock), PINOD|PLTWAIT);
-}
-
-int
-xfs_iflock_nowait(xfs_inode_t *ip)
-{
-	return (cpsema(&(ip->i_flock)));
-}
-
-void
-xfs_ifunlock(xfs_inode_t *ip)
-{
-	ASSERT(issemalocked(&(ip->i_flock)));
-	vsema(&(ip->i_flock));
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e569bf5d6cf0..dbd9cef852ec 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -580,8 +580,8 @@ xfs_iformat_extents(
 		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 		for (i = 0; i < nex; i++, dp++) {
 			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-			ep->l0 = be64_to_cpu(get_unaligned(&dp->l0));
-			ep->l1 = be64_to_cpu(get_unaligned(&dp->l1));
+			ep->l0 = get_unaligned_be64(&dp->l0);
+			ep->l1 = get_unaligned_be64(&dp->l1);
 		}
 		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
 		if (whichfork != XFS_DATA_FORK ||
@@ -835,22 +835,22 @@ xfs_iread(
 	 * Do this before xfs_iformat in case it adds entries.
 	 */
 #ifdef	XFS_INODE_TRACE
-	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP);
+	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_BMAP_TRACE
-	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
+	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_BMBT_TRACE
-	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
+	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_RW_TRACE
-	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
+	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_ILOCK_TRACE
-	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
+	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_DIR2_TRACE
-	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
+	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
 
 	/*
@@ -1046,9 +1046,9 @@ xfs_ialloc(
 {
 	xfs_ino_t	ino;
 	xfs_inode_t	*ip;
-	bhv_vnode_t	*vp;
 	uint		flags;
 	int		error;
+	timespec_t	tv;
 
 	/*
 	 * Call the space management code to pick
@@ -1077,13 +1077,12 @@ xfs_ialloc(
 	}
 	ASSERT(ip != NULL);
 
-	vp = XFS_ITOV(ip);
 	ip->i_d.di_mode = (__uint16_t)mode;
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
 	ASSERT(ip->i_d.di_nlink == nlink);
-	ip->i_d.di_uid = current_fsuid(cr);
-	ip->i_d.di_gid = current_fsgid(cr);
+	ip->i_d.di_uid = current_fsuid();
+	ip->i_d.di_gid = current_fsgid();
 	ip->i_d.di_projid = prid;
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1130,7 +1129,13 @@ xfs_ialloc(
 	ip->i_size = 0;
 	ip->i_d.di_nextents = 0;
 	ASSERT(ip->i_d.di_nblocks == 0);
-	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+
+	nanotime(&tv);
+	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+	ip->i_d.di_atime = ip->i_d.di_mtime;
+	ip->i_d.di_ctime = ip->i_d.di_mtime;
+
 	/*
 	 * di_gen will have been taken care of in xfs_iread.
 	 */
@@ -1220,7 +1225,7 @@ xfs_ialloc(
 	xfs_trans_log_inode(tp, ip, flags);
 
 	/* now that we have an i_mode we can setup inode ops and unlock */
-	xfs_initialize_vnode(tp->t_mountp, vp, ip);
+	xfs_setup_inode(ip);
 
 	*ipp = ip;
 	return 0;
@@ -1399,7 +1404,6 @@ xfs_itruncate_start(
 	xfs_fsize_t	last_byte;
 	xfs_off_t	toss_start;
 	xfs_mount_t	*mp;
-	bhv_vnode_t	*vp;
 	int		error = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1408,7 +1412,6 @@ xfs_itruncate_start(
 	       (flags == XFS_ITRUNC_MAYBE));
 
 	mp = ip->i_mount;
-	vp = XFS_ITOV(ip);
 
 	/* wait for the completion of any pending DIOs */
 	if (new_size < ip->i_size)
@@ -1457,7 +1460,7 @@ xfs_itruncate_start(
 
 #ifdef DEBUG
 	if (new_size == 0) {
-		ASSERT(VN_CACHED(vp) == 0);
+		ASSERT(VN_CACHED(VFS_I(ip)) == 0);
 	}
 #endif
 	return error;
@@ -1763,67 +1766,6 @@ xfs_itruncate_finish(
 	return 0;
 }
 
-
-/*
- * xfs_igrow_start
- *
- * Do the first part of growing a file: zero any data in the last
- * block that is beyond the old EOF.  We need to do this before
- * the inode is joined to the transaction to modify the i_size.
- * That way we can drop the inode lock and call into the buffer
- * cache to get the buffer mapping the EOF.
- */
-int
-xfs_igrow_start(
-	xfs_inode_t	*ip,
-	xfs_fsize_t	new_size,
-	cred_t		*credp)
-{
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-	ASSERT(new_size > ip->i_size);
-
-	/*
-	 * Zero any pages that may have been created by
-	 * xfs_write_file() beyond the end of the file
-	 * and any blocks between the old and new file sizes.
-	 */
-	return xfs_zero_eof(ip, new_size, ip->i_size);
-}
-
-/*
- * xfs_igrow_finish
- *
- * This routine is called to extend the size of a file.
- * The inode must have both the iolock and the ilock locked
- * for update and it must be a part of the current transaction.
- * The xfs_igrow_start() function must have been called previously.
- * If the change_flag is not zero, the inode change timestamp will
- * be updated.
- */
-void
-xfs_igrow_finish(
-	xfs_trans_t	*tp,
-	xfs_inode_t	*ip,
-	xfs_fsize_t	new_size,
-	int		change_flag)
-{
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-	ASSERT(ip->i_transp == tp);
-	ASSERT(new_size > ip->i_size);
-
-	/*
-	 * Update the file size.  Update the inode change timestamp
-	 * if change_flag set.
-	 */
-	ip->i_d.di_size = new_size;
-	ip->i_size = new_size;
-	if (change_flag)
-		xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-}
-
-
 /*
  * This is called when the inode's link count goes to 0.
  * We place the on-disk inode on a list in the AGI.  It
@@ -2258,7 +2200,7 @@ xfs_ifree_cluster(
 		xfs_trans_binval(tp, bp);
 	}
 
-	kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
+	kmem_free(ip_found);
 	xfs_put_perag(mp, pag);
 }
 
@@ -2470,7 +2412,7 @@ xfs_iroot_realloc(
 						     (int)new_size);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
 	}
-	kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
 	ASSERT(ifp->if_broot_bytes <=
@@ -2514,7 +2456,7 @@ xfs_idata_realloc(
 
 	if (new_size == 0) {
 		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
-			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			kmem_free(ifp->if_u1.if_data);
 		}
 		ifp->if_u1.if_data = NULL;
 		real_size = 0;
@@ -2529,7 +2471,7 @@ xfs_idata_realloc(
 			ASSERT(ifp->if_real_bytes != 0);
 			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
 			      new_size);
-			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			kmem_free(ifp->if_u1.if_data);
 			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 		}
 		real_size = 0;
@@ -2636,7 +2578,7 @@ xfs_idestroy_fork(
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if (ifp->if_broot != NULL) {
-		kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+		kmem_free(ifp->if_broot);
 		ifp->if_broot = NULL;
 	}
 
@@ -2650,7 +2592,7 @@ xfs_idestroy_fork(
 		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
 		    (ifp->if_u1.if_data != NULL)) {
 			ASSERT(ifp->if_real_bytes != 0);
-			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			kmem_free(ifp->if_u1.if_data);
 			ifp->if_u1.if_data = NULL;
 			ifp->if_real_bytes = 0;
 		}
@@ -2691,7 +2633,6 @@ xfs_idestroy(
 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 	mrfree(&ip->i_lock);
 	mrfree(&ip->i_iolock);
-	freesema(&ip->i_flock);
 
 #ifdef XFS_INODE_TRACE
 	ktrace_free(ip->i_trace);
@@ -3058,7 +2999,7 @@ xfs_iflush_cluster(
 
 out_free:
 	read_unlock(&pag->pag_ici_lock);
-	kmem_free(ilist, ilist_size);
+	kmem_free(ilist);
 	return 0;
 
 
@@ -3102,17 +3043,17 @@ cluster_corrupt_out:
 	 * Unlocks the flush lock
 	 */
 	xfs_iflush_abort(iq);
-	kmem_free(ilist, ilist_size);
+	kmem_free(ilist);
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
 /*
  * xfs_iflush() will write a modified inode's changes out to the
  * inode's on disk home.  The caller must have the inode lock held
- * in at least shared mode and the inode flush semaphore must be
- * held as well.  The inode lock will still be held upon return from
+ * in at least shared mode and the inode flush completion must be
+ * active as well.  The inode lock will still be held upon return from
  * the call and the caller is free to unlock it.
- * The inode flush lock will be unlocked when the inode reaches the disk.
+ * The inode flush will be completed when the inode reaches the disk.
  * The flags indicate how the inode's buffer should be written out.
  */
 int
@@ -3131,7 +3072,7 @@ xfs_iflush(
 	XFS_STATS_INC(xs_iflush_count);
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(issemalocked(&(ip->i_flock)));
+	ASSERT(!completion_done(&ip->i_flush));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
@@ -3143,8 +3084,6 @@ xfs_iflush(
 	 * flush lock and do nothing.
 	 */
 	if (xfs_inode_clean(ip)) {
-		ASSERT((iip != NULL) ?
-			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
 		xfs_ifunlock(ip);
 		return 0;
 	}
@@ -3296,7 +3235,7 @@ xfs_iflush_int(
 #endif
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-	ASSERT(issemalocked(&(ip->i_flock)));
+	ASSERT(!completion_done(&ip->i_flush));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
@@ -3528,7 +3467,6 @@ xfs_iflush_all(
 	xfs_mount_t	*mp)
 {
 	xfs_inode_t	*ip;
-	bhv_vnode_t	*vp;
 
  again:
 	XFS_MOUNT_ILOCK(mp);
@@ -3543,14 +3481,13 @@ xfs_iflush_all(
 			continue;
 		}
 
-		vp = XFS_ITOV_NULL(ip);
-		if (!vp) {
+		if (!VFS_I(ip)) {
 			XFS_MOUNT_IUNLOCK(mp);
 			xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
 			goto again;
 		}
 
-		ASSERT(vn_count(vp) == 0);
+		ASSERT(vn_count(VFS_I(ip)) == 0);
 
 		ip = ip->i_mnext;
 	} while (ip != mp->m_inodes);
@@ -3770,7 +3707,7 @@ xfs_iext_add_indirect_multi(
 	 * (all extents past */
 	if (nex2) {
 		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
-		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP);
+		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
 		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
 		erp->er_extcount -= nex2;
 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
@@ -3836,7 +3773,7 @@ xfs_iext_add_indirect_multi(
 			erp = xfs_iext_irec_new(ifp, erp_idx);
 		}
 		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
-		kmem_free(nex2_ep, byte_diff);
+		kmem_free(nex2_ep);
 		erp->er_extcount += nex2;
 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
 	}
@@ -4070,8 +4007,7 @@ xfs_iext_realloc_direct(
 			ifp->if_u1.if_extents =
 				kmem_realloc(ifp->if_u1.if_extents,
 						rnew_size,
-						ifp->if_real_bytes,
-						KM_SLEEP);
+						ifp->if_real_bytes, KM_NOFS);
 		}
 		if (rnew_size > ifp->if_real_bytes) {
 			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -4112,7 +4048,7 @@ xfs_iext_direct_to_inline(
 	 */
 	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
 		nextents * sizeof(xfs_bmbt_rec_t));
-	kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+	kmem_free(ifp->if_u1.if_extents);
 	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 	ifp->if_real_bytes = 0;
 }
@@ -4130,7 +4066,7 @@ xfs_iext_inline_to_direct(
 	xfs_ifork_t	*ifp,		/* inode fork pointer */
 	int		new_size)	/* number of extents in file */
 {
-	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP);
+	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
 	memset(ifp->if_u1.if_extents, 0, new_size);
 	if (ifp->if_bytes) {
 		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
@@ -4162,7 +4098,7 @@ xfs_iext_realloc_indirect(
 	} else {
 		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
 			kmem_realloc(ifp->if_u1.if_ext_irec,
-				new_size, size, KM_SLEEP);
+				new_size, size, KM_NOFS);
 	}
 }
 
@@ -4182,11 +4118,11 @@ xfs_iext_indirect_to_direct(
 	ASSERT(nextents <= XFS_LINEAR_EXTS);
 	size = nextents * sizeof(xfs_bmbt_rec_t);
 
-	xfs_iext_irec_compact_full(ifp);
+	xfs_iext_irec_compact_pages(ifp);
 	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
 
 	ep = ifp->if_u1.if_ext_irec->er_extbuf;
-	kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t));
+	kmem_free(ifp->if_u1.if_ext_irec);
 	ifp->if_flags &= ~XFS_IFEXTIREC;
 	ifp->if_u1.if_extents = ep;
 	ifp->if_bytes = size;
@@ -4212,7 +4148,7 @@ xfs_iext_destroy(
 		}
 		ifp->if_flags &= ~XFS_IFEXTIREC;
 	} else if (ifp->if_real_bytes) {
-		kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+		kmem_free(ifp->if_u1.if_extents);
 	} else if (ifp->if_bytes) {
 		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
 			sizeof(xfs_bmbt_rec_t));
@@ -4404,11 +4340,10 @@ xfs_iext_irec_init(
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	ASSERT(nextents <= XFS_LINEAR_EXTS);
 
-	erp = (xfs_ext_irec_t *)
-		kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);
+	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
 
 	if (nextents == 0) {
-		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
 	} else if (!ifp->if_real_bytes) {
 		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
 	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
@@ -4456,7 +4391,7 @@ xfs_iext_irec_new(
 
 	/* Initialize new extent record */
 	erp = ifp->if_u1.if_ext_irec;
-	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
 	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
 	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
 	erp[erp_idx].er_extcount = 0;
@@ -4483,7 +4418,7 @@ xfs_iext_irec_remove(
 	if (erp->er_extbuf) {
 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
 			-erp->er_extcount);
-		kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ);
+		kmem_free(erp->er_extbuf);
 	}
 	/* Compact extent records */
 	erp = ifp->if_u1.if_ext_irec;
@@ -4501,8 +4436,7 @@ xfs_iext_irec_remove(
 		xfs_iext_realloc_indirect(ifp,
 			nlists * sizeof(xfs_ext_irec_t));
 	} else {
-		kmem_free(ifp->if_u1.if_ext_irec,
-			sizeof(xfs_ext_irec_t));
+		kmem_free(ifp->if_u1.if_ext_irec);
 	}
 	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
 }
@@ -4515,8 +4449,7 @@ xfs_iext_irec_remove(
  * compaction policy is as follows:
  *
  *    Full Compaction: Extents fit into a single page (or inline buffer)
- *    Full Compaction: Extents occupy less than 10% of allocated space
- * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
+ * Partial Compaction: Extents occupy less than 50% of allocated space
  *      No Compaction: Extents occupy at least 50% of allocated space
  */
 void
@@ -4537,8 +4470,6 @@ xfs_iext_irec_compact(
 		xfs_iext_direct_to_inline(ifp, nextents);
 	} else if (nextents <= XFS_LINEAR_EXTS) {
 		xfs_iext_indirect_to_direct(ifp);
-	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
-		xfs_iext_irec_compact_full(ifp);
 	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
 		xfs_iext_irec_compact_pages(ifp);
 	}
@@ -4562,7 +4493,7 @@ xfs_iext_irec_compact_pages(
 		erp_next = erp + 1;
 		if (erp_next->er_extcount <=
 		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
-			memmove(&erp->er_extbuf[erp->er_extcount],
+			memcpy(&erp->er_extbuf[erp->er_extcount],
 				erp_next->er_extbuf, erp_next->er_extcount *
 				sizeof(xfs_bmbt_rec_t));
 			erp->er_extcount += erp_next->er_extcount;
@@ -4571,75 +4502,13 @@ xfs_iext_irec_compact_pages(
 			 * so er_extoffs don't get modified in
 			 * xfs_iext_irec_remove.
 			 */
-			kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ);
-			erp_next->er_extbuf = NULL;
-			xfs_iext_irec_remove(ifp, erp_idx + 1);
-			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-		} else {
-			erp_idx++;
-		}
-	}
-}
-
-/*
- * Fully compact the extent records managed by the indirection array.
- */
-void
-xfs_iext_irec_compact_full(
-	xfs_ifork_t	*ifp)			/* inode fork pointer */
-{
-	xfs_bmbt_rec_host_t *ep, *ep_next;	/* extent record pointers */
-	xfs_ext_irec_t	*erp, *erp_next;	/* extent irec pointers */
-	int		erp_idx = 0;		/* extent irec index */
-	int		ext_avail;		/* empty entries in ex list */
-	int		ext_diff;		/* number of exts to add */
-	int		nlists;			/* number of irec's (ex lists) */
-
-	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
-	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-	erp = ifp->if_u1.if_ext_irec;
-	ep = &erp->er_extbuf[erp->er_extcount];
-	erp_next = erp + 1;
-	ep_next = erp_next->er_extbuf;
-	while (erp_idx < nlists - 1) {
-		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
-		ext_diff = MIN(ext_avail, erp_next->er_extcount);
-		memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
-		erp->er_extcount += ext_diff;
-		erp_next->er_extcount -= ext_diff;
-		/* Remove next page */
-		if (erp_next->er_extcount == 0) {
-			/*
-			 * Free page before removing extent record
-			 * so er_extoffs don't get modified in
-			 * xfs_iext_irec_remove.
-			 */
-			kmem_free(erp_next->er_extbuf,
-				erp_next->er_extcount * sizeof(xfs_bmbt_rec_t));
+			kmem_free(erp_next->er_extbuf);
 			erp_next->er_extbuf = NULL;
 			xfs_iext_irec_remove(ifp, erp_idx + 1);
-			erp = &ifp->if_u1.if_ext_irec[erp_idx];
 			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
-		/* Update next page */
 		} else {
-			/* Move rest of page up to become next new page */
-			memmove(erp_next->er_extbuf, ep_next,
-				erp_next->er_extcount * sizeof(xfs_bmbt_rec_t));
-			ep_next = erp_next->er_extbuf;
-			memset(&ep_next[erp_next->er_extcount], 0,
-				(XFS_LINEAR_EXTS - erp_next->er_extcount) *
-				sizeof(xfs_bmbt_rec_t));
-		}
-		if (erp->er_extcount == XFS_LINEAR_EXTS) {
 			erp_idx++;
-			if (erp_idx < nlists)
-				erp = &ifp->if_u1.if_ext_irec[erp_idx];
-			else
-				break;
 		}
-		ep = &erp->er_extbuf[erp->er_extcount];
-		erp_next = erp + 1;
-		ep_next = erp_next->er_extbuf;
 	}
 }
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0a999fee4f03..1420c49674d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -87,8 +87,7 @@ typedef struct xfs_ifork {
  * Flags for xfs_ichgtime().
  */
 #define	XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
-#define	XFS_ICHGTIME_ACC	0x2	/* data fork access timestamp */
-#define	XFS_ICHGTIME_CHG	0x4	/* inode field change timestamp */
+#define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
 
 /*
  * Per-fork incore inode flags.
@@ -204,7 +203,7 @@ typedef struct xfs_inode {
 	struct xfs_inode	*i_mprev;	/* ptr to prev inode */
 	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
 	struct list_head	i_reclaim;	/* reclaim list */
-	bhv_vnode_t		*i_vnode;	/* vnode backpointer */
+	struct inode		*i_vnode;	/* vnode backpointer */
 	struct xfs_dquot	*i_udquot;	/* user dquot */
 	struct xfs_dquot	*i_gdquot;	/* group dquot */
 
@@ -223,7 +222,7 @@ typedef struct xfs_inode {
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_iolock;	/* inode IO lock */
-	sema_t			i_flock;	/* inode flush lock */
+	struct completion	i_flush;	/* inode flush completion q */
 	atomic_t		i_pincount;	/* inode pin count */
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
@@ -263,6 +262,18 @@ typedef struct xfs_inode {
 #define XFS_ISIZE(ip)	(((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
 				(ip)->i_size : (ip)->i_d.di_size;
 
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+	return (struct xfs_inode *)inode->i_private;
+}
+
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+	return (struct inode *)ip->i_vnode;
+}
+
 /*
  * i_flags helper functions
  */
@@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
 #define	XFS_ITRUNC_DEFINITE	0x1
 #define	XFS_ITRUNC_MAYBE	0x2
 
-#define	XFS_ITOV(ip)		((ip)->i_vnode)
-#define	XFS_ITOV_NULL(ip)	((ip)->i_vnode)
-
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
  * directory, group of new file is set to that of the parent, and
@@ -473,11 +481,8 @@ int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
 void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
-void		xfs_iflock(xfs_inode_t *);
-int		xfs_iflock_nowait(xfs_inode_t *);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_ifunlock(xfs_inode_t *);
 void		xfs_ireclaim(xfs_inode_t *);
 int		xfs_finish_reclaim(xfs_inode_t *, int, int);
 int		xfs_finish_reclaim_all(struct xfs_mount *, int);
@@ -507,9 +512,6 @@ int		xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
 int		xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
 				     xfs_fsize_t, int, int);
 int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-int		xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
-void		xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
-				 xfs_fsize_t, int);
 
 void		xfs_idestroy_fork(xfs_inode_t *, int);
 void		xfs_idestroy(xfs_inode_t *);
@@ -525,6 +527,7 @@ void		xfs_iflush_all(struct xfs_mount *);
 void		xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
+void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 void		xfs_synchronize_atime(xfs_inode_t *);
 void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
@@ -573,6 +576,26 @@ extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
 
+/*
+ * Manage the i_flush queue embedded in the inode.  This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+	wait_for_completion(&ip->i_flush);
+}
+
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+	return try_wait_for_completion(&ip->i_flush);
+}
+
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+	complete(&ip->i_flush);
+}
+
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 167b33f15772..97c7452e2620 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -686,7 +686,7 @@ xfs_inode_item_unlock(
 		ASSERT(ip->i_d.di_nextents > 0);
 		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
 		ASSERT(ip->i_df.if_bytes > 0);
-		kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
+		kmem_free(iip->ili_extents_buf);
 		iip->ili_extents_buf = NULL;
 	}
 	if (iip->ili_aextents_buf != NULL) {
@@ -694,7 +694,7 @@ xfs_inode_item_unlock(
 		ASSERT(ip->i_d.di_anextents > 0);
 		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
 		ASSERT(ip->i_afp->if_bytes > 0);
-		kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes);
+		kmem_free(iip->ili_aextents_buf);
 		iip->ili_aextents_buf = NULL;
 	}
 
@@ -779,11 +779,10 @@ xfs_inode_item_pushbuf(
 	ASSERT(iip->ili_push_owner == current_pid());
 
 	/*
-	 * If flushlock isn't locked anymore, chances are that the
-	 * inode flush completed and the inode was taken off the AIL.
-	 * So, just get out.
+	 * If a flush is not in progress anymore, chances are that the
+	 * inode was taken off the AIL. So, just get out.
 	 */
-	if (!issemalocked(&(ip->i_flock)) ||
+	if (completion_done(&ip->i_flush) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -805,7 +804,7 @@ xfs_inode_item_pushbuf(
 			 * If not, we can flush it async.
 			 */
 			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  issemalocked(&(ip->i_flock)));
+				  !completion_done(&ip->i_flush));
 			iip->ili_pushbuf_flag = 0;
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			xfs_buftrace("INODE ITEM PUSH", bp);
@@ -858,7 +857,7 @@ xfs_inode_item_push(
 	ip = iip->ili_inode;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-	ASSERT(issemalocked(&(ip->i_flock)));
+	ASSERT(!completion_done(&ip->i_flush));
 	/*
 	 * Since we were able to lock the inode's flush lock and
 	 * we found it on the AIL, the inode must be dirty.  This
@@ -957,8 +956,7 @@ xfs_inode_item_destroy(
 {
 #ifdef XFS_TRANS_DEBUG
 	if (ip->i_itemp->ili_root_size != 0) {
-		kmem_free(ip->i_itemp->ili_orig_root,
-			  ip->i_itemp->ili_root_size);
+		kmem_free(ip->i_itemp->ili_orig_root);
 	}
 #endif
 	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7edcde691d1a..67f22b2b44b3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -889,6 +889,16 @@ xfs_iomap_write_unwritten(
 	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
 
+	/*
+	 * Reserve enough blocks in this transaction for two complete extent
+	 * btree splits.  We may be converting the middle part of an unwritten
+	 * extent and in this case we will insert two new extents in the btree
+	 * each of which could cause a full split.
+	 *
+	 * This reservation amount will be used in the first call to
+	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
+	 * rest of the operation.
+	 */
 	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
 
 	do {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 419de15aeb43..cf6754a3c5b3 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,7 +59,6 @@ xfs_bulkstat_one_iget(
 {
 	xfs_icdinode_t	*dic;	/* dinode core info pointer */
 	xfs_inode_t	*ip;		/* incore inode pointer */
-	bhv_vnode_t	*vp;
 	int		error;
 
 	error = xfs_iget(mp, NULL, ino,
@@ -72,7 +71,6 @@ xfs_bulkstat_one_iget(
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_blkno != (xfs_daddr_t)0);
 
-	vp = XFS_ITOV(ip);
 	dic = &ip->i_d;
 
 	/* xfs_iget returns the following without needing
@@ -85,7 +83,7 @@ xfs_bulkstat_one_iget(
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
-	vn_atime_to_bstime(vp, &buf->bs_atime);
+	vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
 	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
 	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
 	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -257,7 +255,7 @@ xfs_bulkstat_one(
 		*ubused = error;
 
  out_free:
-	kmem_free(buf, sizeof(*buf));
+	kmem_free(buf);
 	return error;
 }
 
@@ -708,7 +706,7 @@ xfs_bulkstat(
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
-	kmem_free(irbuf, irbsize);
+	kmem_free(irbuf);
 	*ubcountp = ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
@@ -914,7 +912,7 @@ xfs_inumbers(
 		}
 		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
 	}
-	kmem_free(buffer, bcount * sizeof(*buffer));
+	kmem_free(buffer);
 	if (cur)
 		xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
 					   XFS_BTREE_NOERROR));
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ad3d26ddfe31..0b02c6443551 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
 STATIC int	xlog_iclogs_empty(xlog_t *log);
 
 #if defined(XFS_LOG_TRACE)
+
+#define XLOG_TRACE_LOGGRANT_SIZE	2048
+#define XLOG_TRACE_ICLOG_SIZE		256
+
+void
+xlog_trace_loggrant_alloc(xlog_t *log)
+{
+	log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_loggrant_dealloc(xlog_t *log)
+{
+	ktrace_free(log->l_grant_trace);
+}
+
 void
 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 {
 	unsigned long cnts;
 
-	if (!log->l_grant_trace) {
-		log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
-		if (!log->l_grant_trace)
-			return;
-	}
 	/* ticket counts are 1 byte each */
 	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
 
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 }
 
 void
+xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
+{
+	iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
+{
+	ktrace_free(iclog->ic_trace);
+}
+
+void
 xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 {
-	if (!iclog->ic_trace)
-		iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
 	ktrace_enter(iclog->ic_trace,
 		     (void *)((unsigned long)state),
 		     (void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
 		     (void *)NULL, (void *)NULL);
 }
 #else
+
+#define	xlog_trace_loggrant_alloc(log)
+#define	xlog_trace_loggrant_dealloc(log)
 #define	xlog_trace_loggrant(log,tic,string)
+
+#define	xlog_trace_iclog_alloc(iclog)
+#define	xlog_trace_iclog_dealloc(iclog)
 #define	xlog_trace_iclog(iclog,state)
+
 #endif /* XFS_LOG_TRACE */
 
 
@@ -226,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes)
 static void
 xlog_grant_add_space_write(struct log *log, int bytes)
 {
-	log->l_grant_write_bytes += bytes;
-	if (log->l_grant_write_bytes > log->l_logsize) {
-		log->l_grant_write_bytes -= log->l_logsize;
+	int tmp = log->l_logsize - log->l_grant_write_bytes;
+	if (tmp > bytes)
+		log->l_grant_write_bytes += bytes;
+	else {
 		log->l_grant_write_cycle++;
+		log->l_grant_write_bytes = bytes - tmp;
 	}
 }
 
 static void
 xlog_grant_add_space_reserve(struct log *log, int bytes)
 {
-	log->l_grant_reserve_bytes += bytes;
-	if (log->l_grant_reserve_bytes > log->l_logsize) {
-		log->l_grant_reserve_bytes -= log->l_logsize;
+	int tmp = log->l_logsize - log->l_grant_reserve_bytes;
+	if (tmp > bytes)
+		log->l_grant_reserve_bytes += bytes;
+	else {
 		log->l_grant_reserve_cycle++;
+		log->l_grant_reserve_bytes = bytes - tmp;
 	}
 }
 
@@ -332,15 +364,12 @@ xfs_log_done(xfs_mount_t	*mp,
 	} else {
 		xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
 		xlog_regrant_reserve_log_space(log, ticket);
-	}
-
-	/* If this ticket was a permanent reservation and we aren't
-	 * trying to release it, reset the inited flags; so next time
-	 * we write, a start record will be written out.
-	 */
-	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
-	    (flags & XFS_LOG_REL_PERM_RESERV) == 0)
+		/* If this ticket was a permanent reservation and we aren't
+		 * trying to release it, reset the inited flags; so next time
+		 * we write, a start record will be written out.
+		 */
 		ticket->t_flags |= XLOG_TIC_INITED;
+	}
 
 	return lsn;
 }	/* xfs_log_done */
@@ -353,11 +382,11 @@ xfs_log_done(xfs_mount_t	*mp,
  * Asynchronous forces are implemented by setting the WANT_SYNC
  * bit in the appropriate in-core log and then returning.
  *
- * Synchronous forces are implemented with a semaphore.  All callers
- * to force a given lsn to disk will wait on a semaphore attached to the
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
  * specific in-core log.  When given in-core log finally completes its
  * write to disk, that thread will wake up all threads waiting on the
- * semaphore.
+ * sv.
  */
 int
 _xfs_log_force(
@@ -584,12 +613,12 @@ error:
  * mp		- ubiquitous xfs mount point structure
  */
 int
-xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
+xfs_log_mount_finish(xfs_mount_t *mp)
 {
 	int	error;
 
 	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
-		error = xlog_recover_finish(mp->m_log, mfsi_flags);
+		error = xlog_recover_finish(mp->m_log);
 	else {
 		error = 0;
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -703,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
 		      iclog->ic_state == XLOG_STATE_DIRTY)) {
 			if (!XLOG_FORCED_SHUTDOWN(log)) {
-				sv_wait(&iclog->ic_forcesema, PMEM,
+				sv_wait(&iclog->ic_force_wait, PMEM,
 					&log->l_icloglock, s);
 			} else {
 				spin_unlock(&log->l_icloglock);
@@ -744,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 			|| iclog->ic_state == XLOG_STATE_DIRTY
 			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
 
-				sv_wait(&iclog->ic_forcesema, PMEM,
+				sv_wait(&iclog->ic_force_wait, PMEM,
 					&log->l_icloglock, s);
 		} else {
 			spin_unlock(&log->l_icloglock);
@@ -834,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 				break;
 			tail_lsn = 0;
 			free_bytes -= tic->t_unit_res;
-			sv_signal(&tic->t_sema);
+			sv_signal(&tic->t_wait);
 			tic = tic->t_next;
 		} while (tic != log->l_write_headq);
 	}
@@ -855,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 				break;
 			tail_lsn = 0;
 			free_bytes -= need_bytes;
-			sv_signal(&tic->t_sema);
+			sv_signal(&tic->t_wait);
 			tic = tic->t_next;
 		} while (tic != log->l_reserve_headq);
 	}
@@ -1004,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
 	l = iclog->ic_log;
 
 	/*
-	 * If the ordered flag has been removed by a lower
-	 * layer, it means the underlyin device no longer supports
+	 * If the _XFS_BARRIER_FAILED flag was set by a lower
+	 * layer, it means the underlying device no longer supports
 	 * barrier I/O. Warn loudly and turn off barriers.
 	 */
-	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) {
+	if (bp->b_flags & _XFS_BARRIER_FAILED) {
+		bp->b_flags &= ~_XFS_BARRIER_FAILED;
 		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		xfs_fs_cmn_err(CE_WARN, l->l_mp,
 				"xlog_iodone: Barriers are no longer supported"
@@ -1228,8 +1258,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
 
 	spin_lock_init(&log->l_icloglock);
 	spin_lock_init(&log->l_grant_lock);
-	initnsema(&log->l_flushsema, 0, "ic-flush");
+	sv_init(&log->l_flush_wait, 0, "flush_wait");
 
+	xlog_trace_loggrant_alloc(log);
 	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
 	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
 
@@ -1281,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t	*mp,
 
 		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
 		ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
-		sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
-		sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
+		sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
+		sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+
+		xlog_trace_iclog_alloc(iclog);
 
 		iclogp = &iclog->ic_next;
 	}
@@ -1561,33 +1594,21 @@ xlog_dealloc_log(xlog_t *log)
 
 	iclog = log->l_iclog;
 	for (i=0; i<log->l_iclog_bufs; i++) {
-		sv_destroy(&iclog->ic_forcesema);
-		sv_destroy(&iclog->ic_writesema);
+		sv_destroy(&iclog->ic_force_wait);
+		sv_destroy(&iclog->ic_write_wait);
 		xfs_buf_free(iclog->ic_bp);
-#ifdef XFS_LOG_TRACE
-		if (iclog->ic_trace != NULL) {
-			ktrace_free(iclog->ic_trace);
-		}
-#endif
+		xlog_trace_iclog_dealloc(iclog);
 		next_iclog = iclog->ic_next;
-		kmem_free(iclog, sizeof(xlog_in_core_t));
+		kmem_free(iclog);
 		iclog = next_iclog;
 	}
-	freesema(&log->l_flushsema);
 	spinlock_destroy(&log->l_icloglock);
 	spinlock_destroy(&log->l_grant_lock);
 
 	xfs_buf_free(log->l_xbuf);
-#ifdef XFS_LOG_TRACE
-	if (log->l_trace != NULL) {
-		ktrace_free(log->l_trace);
-	}
-	if (log->l_grant_trace != NULL) {
-		ktrace_free(log->l_grant_trace);
-	}
-#endif
+	xlog_trace_loggrant_dealloc(log);
 	log->l_mp->m_log = NULL;
-	kmem_free(log, sizeof(xlog_t));
+	kmem_free(log);
 }	/* xlog_dealloc_log */
 
 /*
@@ -1973,7 +1994,7 @@ xlog_write(xfs_mount_t *	mp,
 /* Clean iclogs starting from the head.  This ordering must be
  * maintained, so an iclog doesn't become ACTIVE beyond one that
  * is SYNCING.  This is also required to maintain the notion that we use
- * a counting semaphore to hold off would be writers to the log when every
+ * a ordered wait queue to hold off would be writers to the log when every
  * iclog is trying to sync to disk.
  *
  * State Change: DIRTY -> ACTIVE
@@ -2097,6 +2118,7 @@ xlog_state_do_callback(
 	int		   funcdidcallbacks; /* flag: function did callbacks */
 	int		   repeats;	/* for issuing console warnings if
 					 * looping too many times */
+	int		   wake = 0;
 
 	spin_lock(&log->l_icloglock);
 	first_iclog = iclog = log->l_iclog;
@@ -2236,7 +2258,7 @@ xlog_state_do_callback(
 			xlog_state_clean_log(log);
 
 			/* wake up threads waiting in xfs_log_force() */
-			sv_broadcast(&iclog->ic_forcesema);
+			sv_broadcast(&iclog->ic_force_wait);
 
 			iclog = iclog->ic_next;
 		} while (first_iclog != iclog);
@@ -2278,15 +2300,13 @@ xlog_state_do_callback(
 	}
 #endif
 
-	flushcnt = 0;
-	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
-		flushcnt = log->l_flushcnt;
-		log->l_flushcnt = 0;
-	}
+	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+		wake = 1;
 	spin_unlock(&log->l_icloglock);
-	while (flushcnt--)
-		vsema(&log->l_flushsema);
-}	/* xlog_state_do_callback */
+
+	if (wake)
+		sv_broadcast(&log->l_flush_wait);
+}
 
 
 /*
@@ -2300,8 +2320,7 @@ xlog_state_do_callback(
  * the second completion goes through.
  *
  * Callbacks could take time, so they are done outside the scope of the
- * global state machine log lock.  Assume that the calls to cvsema won't
- * take a long time.  At least we know it won't sleep.
+ * global state machine log lock.
  */
 STATIC void
 xlog_state_done_syncing(
@@ -2337,7 +2356,7 @@ xlog_state_done_syncing(
 	 * iclog buffer, we wake them all, one will get to do the
 	 * I/O, the others get to wait for the result.
 	 */
-	sv_broadcast(&iclog->ic_writesema);
+	sv_broadcast(&iclog->ic_write_wait);
 	spin_unlock(&log->l_icloglock);
 	xlog_state_do_callback(log, aborted, iclog);	/* also cleans log */
 }	/* xlog_state_done_syncing */
@@ -2345,11 +2364,9 @@ xlog_state_done_syncing(
 
 /*
  * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
- * sleep.  The flush semaphore is set to the number of in-core buffers and
- * decremented around disk syncing.  Therefore, if all buffers are syncing,
- * this semaphore will cause new writes to sleep until a sync completes.
- * Otherwise, this code just does p() followed by v().  This approximates
- * a sleep/wakeup except we can't race.
+ * sleep.  We wait on the flush queue on the head iclog as that should be
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
+ * we will wait here and all new writes will sleep until a sync completes.
  *
  * The in-core logs are used in a circular fashion. They are not used
  * out-of-order even when an iclog past the head is free.
@@ -2384,16 +2401,15 @@ restart:
 	}
 
 	iclog = log->l_iclog;
-	if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
-		log->l_flushcnt++;
-		spin_unlock(&log->l_icloglock);
+	if (iclog->ic_state != XLOG_STATE_ACTIVE) {
 		xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
 		XFS_STATS_INC(xs_log_noiclogs);
-		/* Ensure that log writes happen */
-		psema(&log->l_flushsema, PINOD);
+
+		/* Wait for log writes to have flushed */
+		sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
 		goto restart;
 	}
-	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+
 	head = &iclog->ic_header;
 
 	atomic_inc(&iclog->ic_refcnt);	/* prevents sync */
@@ -2507,7 +2523,7 @@ xlog_grant_log_space(xlog_t	   *log,
 			goto error_return;
 
 		XFS_STATS_INC(xs_sleep_logspace);
-		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 		/*
 		 * If we got an error, and the filesystem is shutting down,
 		 * we'll catch it down below. So just continue...
@@ -2533,7 +2549,7 @@ redo:
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: sleep 2");
 		XFS_STATS_INC(xs_sleep_logspace);
-		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		if (XLOG_FORCED_SHUTDOWN(log)) {
 			spin_lock(&log->l_grant_lock);
@@ -2632,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 			if (free_bytes < ntic->t_unit_res)
 				break;
 			free_bytes -= ntic->t_unit_res;
-			sv_signal(&ntic->t_sema);
+			sv_signal(&ntic->t_wait);
 			ntic = ntic->t_next;
 		} while (ntic != log->l_write_headq);
 
@@ -2643,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: sleep 1");
 			XFS_STATS_INC(xs_sleep_logspace);
-			sv_wait(&tic->t_sema, PINOD|PLTWAIT,
+			sv_wait(&tic->t_wait, PINOD|PLTWAIT,
 				&log->l_grant_lock, s);
 
 			/* If we're shutting down, this tic is already
@@ -2672,7 +2688,7 @@ redo:
 		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 			xlog_ins_ticketq(&log->l_write_headq, tic);
 		XFS_STATS_INC(xs_sleep_logspace);
-		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		/* If we're shutting down, this tic is already off the queue */
 		if (XLOG_FORCED_SHUTDOWN(log)) {
@@ -2915,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t		*log,
  *	2. the current iclog is drity, and the previous iclog is in the
  *		active or dirty state.
  *
- * We may sleep (call psema) if:
+ * We may sleep if:
  *
  *	1. the current iclog is not in the active nor dirty state.
  *	2. the current iclog dirty, and the previous iclog is not in the
@@ -3012,7 +3028,7 @@ maybe_sleep:
 			return XFS_ERROR(EIO);
 		}
 		XFS_STATS_INC(xs_log_force_sleep);
-		sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
+		sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
 		/*
 		 * No need to grab the log lock here since we're
 		 * only deciding whether or not to return EIO
@@ -3095,7 +3111,7 @@ try_again:
 						 XLOG_STATE_SYNCING))) {
 			ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
 			XFS_STATS_INC(xs_log_force_sleep);
-			sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
+			sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
 				&log->l_icloglock, s);
 			*log_flushed = 1;
 			already_slept = 1;
@@ -3115,7 +3131,7 @@ try_again:
 	    !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
 
 		/*
-		 * Don't wait on the forcesema if we know that we've
+		 * Don't wait on completion if we know that we've
 		 * gotten a log write error.
 		 */
 		if (iclog->ic_state & XLOG_STATE_IOERROR) {
@@ -3123,7 +3139,7 @@ try_again:
 			return XFS_ERROR(EIO);
 		}
 		XFS_STATS_INC(xs_log_force_sleep);
-		sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
+		sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
 		/*
 		 * No need to grab the log lock here since we're
 		 * only deciding whether or not to return EIO
@@ -3179,7 +3195,7 @@ STATIC void
 xlog_ticket_put(xlog_t		*log,
 		xlog_ticket_t	*ticket)
 {
-	sv_destroy(&ticket->t_sema);
+	sv_destroy(&ticket->t_wait);
 	kmem_zone_free(xfs_log_ticket_zone, ticket);
 }	/* xlog_ticket_put */
 
@@ -3269,7 +3285,7 @@ xlog_ticket_get(xlog_t		*log,
 	tic->t_trans_type	= 0;
 	if (xflags & XFS_LOG_PERM_RESERV)
 		tic->t_flags |= XLOG_TIC_PERM_RESERV;
-	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+	sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
 
 	xlog_tic_reset_res(tic);
 
@@ -3556,14 +3572,14 @@ xfs_log_force_umount(
 	 */
 	if ((tic = log->l_reserve_headq)) {
 		do {
-			sv_signal(&tic->t_sema);
+			sv_signal(&tic->t_wait);
 			tic = tic->t_next;
 		} while (tic != log->l_reserve_headq);
 	}
 
 	if ((tic = log->l_write_headq)) {
 		do {
-			sv_signal(&tic->t_sema);
+			sv_signal(&tic->t_wait);
 			tic = tic->t_next;
 		} while (tic != log->l_write_headq);
 	}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d1d678ecb63e..d47b91f10822 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -149,7 +149,7 @@ int	  xfs_log_mount(struct xfs_mount	*mp,
 			struct xfs_buftarg	*log_target,
 			xfs_daddr_t		start_block,
 			int		 	num_bblocks);
-int	  xfs_log_mount_finish(struct xfs_mount *mp, int);
+int	  xfs_log_mount_finish(struct xfs_mount *mp);
 void	  xfs_log_move_tail(struct xfs_mount	*mp,
 			    xfs_lsn_t		tail_lsn);
 int	  xfs_log_notify(struct xfs_mount	*mp,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8952a392b5f3..e7d8f84443fa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -241,7 +241,7 @@ typedef struct xlog_res {
 } xlog_res_t;
 
 typedef struct xlog_ticket {
-	sv_t		   t_sema;	 /* sleep on this semaphore      : 20 */
+	sv_t		   t_wait;	 /* ticket wait queue            : 20 */
 	struct xlog_ticket *t_next;	 /*			         :4|8 */
 	struct xlog_ticket *t_prev;	 /*				 :4|8 */
 	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4  */
@@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header {
  *	xlog_rec_header_t into the reserved space.
  * - ic_data follows, so a write to disk can start at the beginning of
  *	the iclog.
- * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
+ * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
  * - ic_next is the pointer to the next iclog in the ring.
  * - ic_bp is a pointer to the buffer used to write this incore log to disk.
  * - ic_log is a pointer back to the global log structure.
@@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header {
  * and move everything else out to subsequent cachelines.
  */
 typedef struct xlog_iclog_fields {
-	sv_t			ic_forcesema;
-	sv_t			ic_writesema;
+	sv_t			ic_force_wait;
+	sv_t			ic_write_wait;
 	struct xlog_in_core	*ic_next;
 	struct xlog_in_core	*ic_prev;
 	struct xfs_buf		*ic_bp;
@@ -377,8 +377,8 @@ typedef struct xlog_in_core {
 /*
  * Defines to save our code from this glop.
  */
-#define	ic_forcesema	hic_fields.ic_forcesema
-#define ic_writesema	hic_fields.ic_writesema
+#define	ic_force_wait	hic_fields.ic_force_wait
+#define ic_write_wait	hic_fields.ic_write_wait
 #define	ic_next		hic_fields.ic_next
 #define	ic_prev		hic_fields.ic_prev
 #define	ic_bp		hic_fields.ic_bp
@@ -423,10 +423,8 @@ typedef struct log {
 	int			l_logBBsize;    /* size of log in BB chunks */
 
 	/* The following block of fields are changed while holding icloglock */
-	sema_t			l_flushsema ____cacheline_aligned_in_smp;
-						/* iclog flushing semaphore */
-	int			l_flushcnt;	/* # of procs waiting on this
-						 * sema */
+	sv_t			l_flush_wait ____cacheline_aligned_in_smp;
+						/* waiting for iclog flush */
 	int			l_covered_state;/* state of "covering disk
 						 * log entries" */
 	xlog_in_core_t		*l_iclog;       /* head log queue	*/
@@ -450,7 +448,6 @@ typedef struct log {
 	int			l_grant_write_bytes;
 
 #ifdef XFS_LOG_TRACE
-	struct ktrace		*l_trace;
 	struct ktrace		*l_grant_trace;
 #endif
 
@@ -470,7 +467,7 @@ extern int	 xlog_find_tail(xlog_t	*log,
 				xfs_daddr_t *head_blk,
 				xfs_daddr_t *tail_blk);
 extern int	 xlog_recover(xlog_t *log);
-extern int	 xlog_recover_finish(xlog_t *log, int mfsi_flags);
+extern int	 xlog_recover_finish(xlog_t *log);
 extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
 extern void	 xlog_recover_process_iunlinks(xlog_t *log);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e65ab4af0955..82d46ce69d5f 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1715,8 +1715,7 @@ xlog_check_buffer_cancelled(
 					} else {
 						prevp->bc_next = bcp->bc_next;
 					}
-					kmem_free(bcp,
-						  sizeof(xfs_buf_cancel_t));
+					kmem_free(bcp);
 				}
 			}
 			return 1;
@@ -2519,7 +2518,7 @@ write_inode_buffer:
 
 error:
 	if (need_free)
-		kmem_free(in_f, sizeof(*in_f));
+		kmem_free(in_f);
 	return XFS_ERROR(error);
 }
 
@@ -2830,16 +2829,14 @@ xlog_recover_free_trans(
 		item = item->ri_next;
 		 /* Free the regions in the item. */
 		for (i = 0; i < free_item->ri_cnt; i++) {
-			kmem_free(free_item->ri_buf[i].i_addr,
-				  free_item->ri_buf[i].i_len);
+			kmem_free(free_item->ri_buf[i].i_addr);
 		}
 		/* Free the item itself */
-		kmem_free(free_item->ri_buf,
-			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
-		kmem_free(free_item, sizeof(xlog_recover_item_t));
+		kmem_free(free_item->ri_buf);
+		kmem_free(free_item);
 	} while (first_item != item);
 	/* Free the transaction recover structure */
-	kmem_free(trans, sizeof(xlog_recover_t));
+	kmem_free(trans);
 }
 
 STATIC int
@@ -3786,8 +3783,7 @@ xlog_do_log_recovery(
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS1);
 	if (error != 0) {
-		kmem_free(log->l_buf_cancel_table,
-			  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+		kmem_free(log->l_buf_cancel_table);
 		log->l_buf_cancel_table = NULL;
 		return error;
 	}
@@ -3806,8 +3802,7 @@ xlog_do_log_recovery(
 	}
 #endif	/* DEBUG */
 
-	kmem_free(log->l_buf_cancel_table,
-		  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+	kmem_free(log->l_buf_cancel_table);
 	log->l_buf_cancel_table = NULL;
 
 	return error;
@@ -3945,8 +3940,7 @@ xlog_recover(
  */
 int
 xlog_recover_finish(
-	xlog_t		*log,
-	int		mfsi_flags)
+	xlog_t		*log)
 {
 	/*
 	 * Now we're ready to do the transactions needed for the
@@ -3974,9 +3968,7 @@ xlog_recover_finish(
 		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
 			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
 
-		if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) {
-			xlog_recover_process_iunlinks(log);
-		}
+		xlog_recover_process_iunlinks(log);
 
 		xlog_recover_check_summary(log);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da3988453b71..a4503f5e9497 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,12 +47,10 @@
 
 STATIC int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
-STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
 
 #ifdef HAVE_PERCPU_SB
-STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
 STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
 						int);
 STATIC void	xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
@@ -63,7 +61,6 @@ STATIC void	xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
 
 #else
 
-#define xfs_icsb_destroy_counters(mp)			do { } while (0)
 #define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
 #define xfs_icsb_balance_counter_locked(mp, a, b)	do { } while (0)
 #define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
@@ -126,34 +123,12 @@ static const struct {
 };
 
 /*
- * Return a pointer to an initialized xfs_mount structure.
- */
-xfs_mount_t *
-xfs_mount_init(void)
-{
-	xfs_mount_t *mp;
-
-	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
-
-	if (xfs_icsb_init_counters(mp)) {
-		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
-	}
-
-	spin_lock_init(&mp->m_sb_lock);
-	mutex_init(&mp->m_ilock);
-	mutex_init(&mp->m_growlock);
-	atomic_set(&mp->m_active_trans, 0);
-
-	return mp;
-}
-
-/*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
  * initialized.
  */
-void
-xfs_mount_free(
+STATIC void
+xfs_free_perag(
 	xfs_mount_t	*mp)
 {
 	if (mp->m_perag) {
@@ -161,28 +136,9 @@ xfs_mount_free(
 
 		for (agno = 0; agno < mp->m_maxagi; agno++)
 			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list,
-						sizeof(xfs_perag_busy_t) *
-							XFS_PAGB_NUM_SLOTS);
-		kmem_free(mp->m_perag,
-			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
+				kmem_free(mp->m_perag[agno].pagb_list);
+		kmem_free(mp->m_perag);
 	}
-
-	spinlock_destroy(&mp->m_ail_lock);
-	spinlock_destroy(&mp->m_sb_lock);
-	mutex_destroy(&mp->m_ilock);
-	mutex_destroy(&mp->m_growlock);
-	if (mp->m_quotainfo)
-		XFS_QM_DONE(mp);
-
-	if (mp->m_fsname != NULL)
-		kmem_free(mp->m_fsname, mp->m_fsname_len);
-	if (mp->m_rtname != NULL)
-		kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
-	if (mp->m_logname != NULL)
-		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
-
-	xfs_icsb_destroy_counters(mp);
 }
 
 /*
@@ -288,6 +244,19 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
+	/*
+	 * Until this is fixed only page-sized or smaller data blocks work.
+	 */
+	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+		xfs_fs_mount_cmn_err(flags,
+			"file system with blocksize %d bytes",
+			sbp->sb_blocksize);
+		xfs_fs_mount_cmn_err(flags,
+			"only pagesize (%ld) or less will currently work.",
+			PAGE_SIZE);
+		return XFS_ERROR(ENOSYS);
+	}
+
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
@@ -309,19 +278,6 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(ENOSYS);
 	}
 
-	/*
-	 * Until this is fixed only page-sized or smaller data blocks work.
-	 */
-	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-		xfs_fs_mount_cmn_err(flags,
-			"file system with blocksize %d bytes",
-			sbp->sb_blocksize);
-		xfs_fs_mount_cmn_err(flags,
-			"only pagesize (%ld) or less will currently work.",
-			PAGE_SIZE);
-		return XFS_ERROR(ENOSYS);
-	}
-
 	return 0;
 }
 
@@ -734,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
  * Update alignment values based on mount options and sb values
  */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 
-	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
+	if (mp->m_dalign) {
 		/*
 		 * If stripe unit and stripe width are not multiples
 		 * of the fs blocksize turn off alignment.
@@ -894,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
  * Check that the data (and log if separate) are an ok size.
  */
 STATIC int
-xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
+xfs_check_sizes(xfs_mount_t *mp)
 {
 	xfs_buf_t	*bp;
 	xfs_daddr_t	d;
@@ -917,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
 		return error;
 	}
 
-	if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
-	    mp->m_logdev_targp != mp->m_ddev_targp) {
+	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
 			cmn_err(CE_WARN, "XFS: size check 3 failed");
@@ -953,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
  */
 int
 xfs_mountfs(
-	xfs_mount_t	*mp,
-	int		mfsi_flags)
+	xfs_mount_t	*mp)
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
 	__int64_t	update_flags = 0LL;
 	uint		quotamount, quotaflags;
-	int		agno;
 	int		uuid_mounted = 0;
 	int		error = 0;
 
@@ -994,9 +947,19 @@ xfs_mountfs(
 		 * Re-check for ATTR2 in case it was found in bad_features2
 		 * slot.
 		 */
-		if (xfs_sb_version_hasattr2(&mp->m_sb))
+		if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+		   !(mp->m_flags & XFS_MOUNT_NOATTR2))
 			mp->m_flags |= XFS_MOUNT_ATTR2;
+	}
+
+	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+	   (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+		xfs_sb_version_removeattr2(&mp->m_sb);
+		update_flags |= XFS_SB_FEATURES2;
 
+		/* update sb_versionnum for the clearing of the morebits */
+		if (!sbp->sb_features2)
+			update_flags |= XFS_SB_VERSIONNUM;
 	}
 
 	/*
@@ -1005,7 +968,7 @@ xfs_mountfs(
 	 * allocator alignment is within an ag, therefore ag has
 	 * to be aligned at stripe boundary.
 	 */
-	error = xfs_update_alignment(mp, mfsi_flags, &update_flags);
+	error = xfs_update_alignment(mp, &update_flags);
 	if (error)
 		goto error1;
 
@@ -1024,8 +987,7 @@ xfs_mountfs(
 	 * since a single partition filesystem is identical to a single
 	 * partition volume/filesystem.
 	 */
-	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
-	    (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
+	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
 		if (xfs_uuid_mount(mp)) {
 			error = XFS_ERROR(EINVAL);
 			goto error1;
@@ -1053,7 +1015,7 @@ xfs_mountfs(
 	/*
 	 * Check that the data (and log if separate) are an ok size.
 	 */
-	error = xfs_check_sizes(mp, mfsi_flags);
+	error = xfs_check_sizes(mp);
 	if (error)
 		goto error1;
 
@@ -1067,13 +1029,6 @@ xfs_mountfs(
 	}
 
 	/*
-	 * For client case we are done now
-	 */
-	if (mfsi_flags & XFS_MFSI_CLIENT) {
-		return 0;
-	}
-
-	/*
 	 *  Copies the low order bits of the timestamp and the randomly
 	 *  set "sequence" number out of a UUID.
 	 */
@@ -1097,8 +1052,10 @@ xfs_mountfs(
 	 * Allocate and initialize the per-ag data.
 	 */
 	init_rwsem(&mp->m_peraglock);
-	mp->m_perag =
-		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
+	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
+				  KM_MAYFAIL);
+	if (!mp->m_perag)
+		goto error1;
 
 	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
 
@@ -1210,7 +1167,7 @@ xfs_mountfs(
 	 * delayed until after the root and real-time bitmap inodes
 	 * were consistently read in.
 	 */
-	error = xfs_log_mount_finish(mp, mfsi_flags);
+	error = xfs_log_mount_finish(mp);
 	if (error) {
 		cmn_err(CE_WARN, "XFS: log mount finish failed");
 		goto error4;
@@ -1219,7 +1176,7 @@ xfs_mountfs(
 	/*
 	 * Complete the quota initialisation, post-log-replay component.
 	 */
-	error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags);
+	error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
 	if (error)
 		goto error4;
 
@@ -1253,31 +1210,25 @@ xfs_mountfs(
  error3:
 	xfs_log_unmount_dealloc(mp);
  error2:
-	for (agno = 0; agno < sbp->sb_agcount; agno++)
-		if (mp->m_perag[agno].pagb_list)
-			kmem_free(mp->m_perag[agno].pagb_list,
-			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
-	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
-	mp->m_perag = NULL;
-	/* FALLTHROUGH */
+	xfs_free_perag(mp);
  error1:
 	if (uuid_mounted)
-		xfs_uuid_unmount(mp);
-	xfs_freesb(mp);
+		uuid_table_remove(&mp->m_sb.sb_uuid);
 	return error;
 }
 
 /*
- * xfs_unmountfs
- *
  * This flushes out the inodes,dquots and the superblock, unmounts the
  * log and makes sure that incore structures are freed.
  */
-int
-xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
+void
+xfs_unmountfs(
+	struct xfs_mount	*mp)
 {
-	__uint64_t	resblks;
-	int		error = 0;
+	__uint64_t		resblks;
+	int			error;
+
+	IRELE(mp->m_rootip);
 
 	/*
 	 * We can potentially deadlock here if we have an inode cluster
@@ -1334,32 +1285,20 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
 	xfs_log_unmount(mp);			/* Done! No more fs ops. */
 
-	xfs_freesb(mp);
-
 	/*
 	 * All inodes from this mount point should be freed.
 	 */
 	ASSERT(mp->m_inodes == NULL);
 
-	xfs_unmountfs_close(mp, cr);
 	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
-		xfs_uuid_unmount(mp);
+		uuid_table_remove(&mp->m_sb.sb_uuid);
 
-#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+#if defined(DEBUG)
 	xfs_errortag_clearall(mp, 0);
 #endif
-	xfs_mount_free(mp);
-	return 0;
-}
-
-void
-xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
-{
-	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
-		xfs_free_buftarg(mp->m_logdev_targp, 1);
-	if (mp->m_rtdev_targp)
-		xfs_free_buftarg(mp->m_rtdev_targp, 1);
-	xfs_free_buftarg(mp->m_ddev_targp, 0);
+	xfs_free_perag(mp);
+	if (mp->m_quotainfo)
+		XFS_QM_DONE(mp);
 }
 
 STATIC void
@@ -1905,16 +1844,6 @@ xfs_uuid_mount(
 }
 
 /*
- * Remove filesystem from the UUID table.
- */
-STATIC void
-xfs_uuid_unmount(
-	xfs_mount_t	*mp)
-{
-	uuid_table_remove(&mp->m_sb.sb_uuid);
-}
-
-/*
  * Used to log changes to the superblock unit and width fields which could
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
@@ -1928,7 +1857,8 @@ xfs_mount_log_sb(
 	int		error;
 
 	ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
-			 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
+			 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
+			 XFS_SB_VERSIONNUM));
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
 	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
@@ -2109,7 +2039,7 @@ xfs_icsb_reinit_counters(
 	xfs_icsb_unlock(mp);
 }
 
-STATIC void
+void
 xfs_icsb_destroy_counters(
 	xfs_mount_t	*mp)
 {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 63e0693a358a..f3c1024b1241 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,6 +61,7 @@ struct xfs_bmap_free;
 struct xfs_extdelta;
 struct xfs_swapext;
 struct xfs_mru_cache;
+struct xfs_nameops;
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -113,7 +114,7 @@ struct xfs_dqtrxops;
 struct xfs_quotainfo;
 
 typedef int	(*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int);
+typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
 typedef int	(*xfs_qmunmount_t)(struct xfs_mount *);
 typedef void	(*xfs_qmdone_t)(struct xfs_mount *);
 typedef void	(*xfs_dqrele_t)(struct xfs_dquot *);
@@ -157,8 +158,8 @@ typedef struct xfs_qmops {
 
 #define XFS_QM_INIT(mp, mnt, fl) \
 	(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \
-	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags)
+#define XFS_QM_MOUNT(mp, mnt, fl) \
+	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
 #define XFS_QM_UNMOUNT(mp) \
 	(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
 #define XFS_QM_DONE(mp) \
@@ -210,12 +211,14 @@ typedef struct xfs_icsb_cnts {
 
 extern int	xfs_icsb_init_counters(struct xfs_mount *);
 extern void	xfs_icsb_reinit_counters(struct xfs_mount *);
+extern void	xfs_icsb_destroy_counters(struct xfs_mount *);
 extern void	xfs_icsb_sync_counters(struct xfs_mount *, int);
 extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 
 #else
-#define xfs_icsb_init_counters(mp)	(0)
-#define xfs_icsb_reinit_counters(mp)	do { } while (0)
+#define xfs_icsb_init_counters(mp)		(0)
+#define xfs_icsb_destroy_counters(mp)		do { } while (0)
+#define xfs_icsb_reinit_counters(mp)		do { } while (0)
 #define xfs_icsb_sync_counters(mp, flags)	do { } while (0)
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
 #endif
@@ -313,6 +316,7 @@ typedef struct xfs_mount {
 	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
 						   field governed by m_ilock */
 	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
+	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
 	int			m_dirblksize;	/* directory block sz--bytes */
 	int			m_dirblkfsbs;	/* directory block sz--fsbs */
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
@@ -378,6 +382,7 @@ typedef struct xfs_mount {
 						   counters */
 #define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
 						   allocator */
+#define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
 
 
 /*
@@ -437,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 /*
  * Flags for xfs_mountfs
  */
-#define XFS_MFSI_SECOND		0x01	/* Secondary mount -- skip stuff */
-#define XFS_MFSI_CLIENT		0x02	/* Is a client -- skip lots of stuff */
-/*	XFS_MFSI_RRINODES	*/
-#define XFS_MFSI_NOUNLINK	0x08	/* Skip unlinked inode processing in */
-					/* log recovery */
-#define XFS_MFSI_NO_QUOTACHECK	0x10	/* Skip quotacheck processing */
-/*	XFS_MFSI_CONVERT_SUNIT	*/
 #define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
 
 #define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
@@ -510,15 +508,12 @@ typedef struct xfs_mod_sb {
 #define	XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock))
 #define	XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))
 
-extern xfs_mount_t *xfs_mount_init(void);
 extern void	xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern int	xfs_log_sbcount(xfs_mount_t *, uint);
-extern void	xfs_mount_free(xfs_mount_t *mp);
-extern int	xfs_mountfs(xfs_mount_t *mp, int);
+extern int	xfs_mountfs(xfs_mount_t *mp);
 extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
-extern int	xfs_unmountfs(xfs_mount_t *, struct cred *);
-extern void	xfs_unmountfs_close(xfs_mount_t *, struct cred *);
+extern void	xfs_unmountfs(xfs_mount_t *);
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_unmount_flush(xfs_mount_t *, int);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -544,9 +539,6 @@ extern void	xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
-extern int	xfs_init(void);
-extern void	xfs_cleanup(void);
-
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index a0b2c0a2589a..afee7eb24323 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -307,15 +307,18 @@ xfs_mru_cache_init(void)
 	xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
 	                                 "xfs_mru_cache_elem");
 	if (!xfs_mru_elem_zone)
-		return ENOMEM;
+		goto out;
 
 	xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
-	if (!xfs_mru_reap_wq) {
-		kmem_zone_destroy(xfs_mru_elem_zone);
-		return ENOMEM;
-	}
+	if (!xfs_mru_reap_wq)
+		goto out_destroy_mru_elem_zone;
 
 	return 0;
+
+ out_destroy_mru_elem_zone:
+	kmem_zone_destroy(xfs_mru_elem_zone);
+ out:
+	return -ENOMEM;
 }
 
 void
@@ -382,9 +385,9 @@ xfs_mru_cache_create(
 
 exit:
 	if (err && mru && mru->lists)
-		kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
+		kmem_free(mru->lists);
 	if (err && mru)
-		kmem_free(mru, sizeof(*mru));
+		kmem_free(mru);
 
 	return err;
 }
@@ -424,8 +427,8 @@ xfs_mru_cache_destroy(
 
 	xfs_mru_cache_flush(mru);
 
-	kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
-	kmem_free(mru, sizeof(*mru));
+	kmem_free(mru->lists);
+	kmem_free(mru);
 }
 
 /*
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d8063e1ad298..d700dacdb10e 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -336,22 +336,18 @@ xfs_rename(
 		ASSERT(error != EEXIST);
 		if (error)
 			goto abort_return;
-		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	} else {
-		/*
-		 * We always want to hit the ctime on the source inode.
-		 * We do it in the if clause above for the 'new_parent &&
-		 * src_is_directory' case, and here we get all the other
-		 * cases.  This isn't strictly required by the standards
-		 * since the source inode isn't really being changed,
-		 * but old unix file systems did it and some incremental
-		 * backup programs won't work without it.
-		 */
-		xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
 	}
 
 	/*
+	 * We always want to hit the ctime on the source inode.
+	 *
+	 * This isn't strictly required by the standards since the source
+	 * inode isn't really being changed, but old unix file systems did
+	 * it and some incremental backup programs won't work without it.
+	 */
+	xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+
+	/*
 	 * Adjust the link count on src_dp.  This is necessary when
 	 * renaming a directory, either within one parent when
 	 * the target existed, or across two parent directories.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a0dc6e5bc5b9..e2f68de16159 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
  */
 
 /*
- * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
- */
-STATIC int
-xfs_lowbit32(
-	__uint32_t	v)
-{
-	if (v)
-		return ffs(v) - 1;
-	return -1;
-}
-
-/*
  * Allocate space to the bitmap or summary file, and zero it, for growfs.
  */
 STATIC int				/* error */
@@ -450,6 +438,7 @@ xfs_rtallocate_extent_near(
 	}
 	bbno = XFS_BITTOBLOCK(mp, bno);
 	i = 0;
+	ASSERT(minlen != 0);
 	log2len = xfs_highbit32(minlen);
 	/*
 	 * Loop over all bitmap blocks (bbno + i is current block).
@@ -618,6 +607,8 @@ xfs_rtallocate_extent_size(
 	xfs_suminfo_t	sum;		/* summary information for extents */
 
 	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	ASSERT(maxlen != 0);
+
 	/*
 	 * Loop over all the levels starting with maxlen.
 	 * At each level, look at all the bitmap blocks, to see if there
@@ -675,6 +666,9 @@ xfs_rtallocate_extent_size(
 		*rtblock = NULLRTBLOCK;
 		return 0;
 	}
+	ASSERT(minlen != 0);
+	ASSERT(maxlen != 0);
+
 	/*
 	 * Loop over sizes, from maxlen down to minlen.
 	 * This time, when we do the allocations, allow smaller ones
@@ -1961,6 +1955,7 @@ xfs_growfs_rt(
 				  nsbp->sb_blocksize * nsbp->sb_rextsize);
 		nsbp->sb_rextents = nsbp->sb_rblocks;
 		do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+		ASSERT(nsbp->sb_rextents != 0);
 		nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
 		nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
 		nrsumsize =
@@ -2062,7 +2057,7 @@ xfs_growfs_rt(
 	/*
 	 * Free the fake mp structure.
 	 */
-	kmem_free(nmp, sizeof(*nmp));
+	kmem_free(nmp);
 
 	return error;
 }
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index b0f31c09a76d..3a82576dde9a 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -314,7 +314,7 @@ xfs_bioerror_relse(
 		 * ASYNC buffers.
 		 */
 		XFS_BUF_ERROR(bp, EIO);
-		XFS_BUF_V_IODONESEMA(bp);
+		XFS_BUF_FINISH_IOWAIT(bp);
 	} else {
 		xfs_buf_relse(bp);
 	}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index d904efe7f871..3f8cf1587f4c 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -46,10 +46,12 @@ struct xfs_mount;
 #define XFS_SB_VERSION_SECTORBIT	0x0800
 #define	XFS_SB_VERSION_EXTFLGBIT	0x1000
 #define	XFS_SB_VERSION_DIRV2BIT		0x2000
+#define	XFS_SB_VERSION_BORGBIT		0x4000	/* ASCII only case-insens. */
 #define	XFS_SB_VERSION_MOREBITSBIT	0x8000
 #define	XFS_SB_VERSION_OKSASHFBITS	\
 	(XFS_SB_VERSION_EXTFLGBIT | \
-	 XFS_SB_VERSION_DIRV2BIT)
+	 XFS_SB_VERSION_DIRV2BIT | \
+	 XFS_SB_VERSION_BORGBIT)
 #define	XFS_SB_VERSION_OKREALFBITS	\
 	(XFS_SB_VERSION_ATTRBIT | \
 	 XFS_SB_VERSION_NLINKBIT | \
@@ -437,6 +439,12 @@ static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
 		((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
 }
 
+static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+{
+	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+		(sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
 static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
 {
 	return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
@@ -473,6 +481,13 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
 		((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
 }
 
+static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+{
+	sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+	if (!sbp->sb_features2)
+		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
 /*
  * end of superblock version macros
  */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 140386434aa3..4e1c22a23be5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -43,6 +43,7 @@
 #include "xfs_quota.h"
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
 
 
 STATIC void	xfs_trans_apply_sb_deltas(xfs_trans_t *);
@@ -253,7 +254,7 @@ _xfs_trans_alloc(
 	tp->t_mountp = mp;
 	tp->t_items_free = XFS_LIC_NUM_SLOTS;
 	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
-	XFS_LIC_INIT(&(tp->t_items));
+	xfs_lic_init(&(tp->t_items));
 	XFS_LBC_INIT(&(tp->t_busy));
 	return tp;
 }
@@ -282,7 +283,7 @@ xfs_trans_dup(
 	ntp->t_mountp = tp->t_mountp;
 	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
 	ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
-	XFS_LIC_INIT(&(ntp->t_items));
+	xfs_lic_init(&(ntp->t_items));
 	XFS_LBC_INIT(&(ntp->t_busy));
 
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -889,7 +890,7 @@ shut_us_down:
 
 	tp->t_commit_lsn = commit_lsn;
 	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
-		kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
+		kmem_free(log_vector);
 	}
 
 	/*
@@ -1169,7 +1170,7 @@ xfs_trans_cancel(
 		while (licp != NULL) {
 			lidp = licp->lic_descs;
 			for (i = 0; i < licp->lic_unused; i++, lidp++) {
-				if (XFS_LIC_ISFREE(licp, i)) {
+				if (xfs_lic_isfree(licp, i)) {
 					continue;
 				}
 
@@ -1216,6 +1217,68 @@ xfs_trans_free(
 	kmem_zone_free(xfs_trans_zone, tp);
 }
 
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_trans	*trans;
+	unsigned int		logres, count;
+	int			error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *tpp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 */
+	logres = trans->t_log_res;
+	count = trans->t_log_count;
+	*tpp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If this commit failed, then it'd just unlock those items that
+	 * are not marked ihold. That also means that a filesystem shutdown
+	 * is in progress. The caller takes the responsibility to cancel
+	 * the duplicate transaction that gets returned.
+	 */
+	error = xfs_trans_commit(trans, 0);
+	if (error)
+		return (error);
+
+	trans = *tpp;
+
+	/*
+	 * Reserve space in the log for th next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.  This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	error = xfs_trans_reserve(trans, 0, logres, 0,
+				  XFS_TRANS_PERM_LOG_RES, count);
+	/*
+	 *  Ensure that the inode is in the new transaction and locked.
+	 */
+	if (error)
+		return error;
+
+	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(trans, dp);
+	return 0;
+}
 
 /*
  * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
@@ -1253,7 +1316,7 @@ xfs_trans_committed(
 	 * Special case the chunk embedded in the transaction.
 	 */
 	licp = &(tp->t_items);
-	if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
+	if (!(xfs_lic_are_all_free(licp))) {
 		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
 	}
 
@@ -1262,10 +1325,10 @@ xfs_trans_committed(
 	 */
 	licp = licp->lic_next;
 	while (licp != NULL) {
-		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		ASSERT(!xfs_lic_are_all_free(licp));
 		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
 		next_licp = licp->lic_next;
-		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		kmem_free(licp);
 		licp = next_licp;
 	}
 
@@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed(
 
 	lidp = licp->lic_descs;
 	for (i = 0; i < licp->lic_unused; i++, lidp++) {
-		if (XFS_LIC_ISFREE(licp, i)) {
+		if (xfs_lic_isfree(licp, i)) {
 			continue;
 		}
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0804207c7391..74c80bd2b0ec 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk {
  * lic_unused to the right value (0 matches all free).  The
  * lic_descs.lid_index values are set up as each desc is allocated.
  */
-#define	XFS_LIC_INIT(cp)	xfs_lic_init(cp)
 static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
 {
 	cp->lic_free = XFS_LIC_FREEMASK;
 }
 
-#define	XFS_LIC_INIT_SLOT(cp,slot)	xfs_lic_init_slot(cp, slot)
 static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
 {
 	cp->lic_descs[slot].lid_index = (unsigned char)(slot);
 }
 
-#define	XFS_LIC_VACANCY(cp)		xfs_lic_vacancy(cp)
 static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
 {
 	return cp->lic_free & XFS_LIC_FREEMASK;
 }
 
-#define	XFS_LIC_ALL_FREE(cp)		xfs_lic_all_free(cp)
 static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
 {
 	cp->lic_free = XFS_LIC_FREEMASK;
 }
 
-#define	XFS_LIC_ARE_ALL_FREE(cp)	xfs_lic_are_all_free(cp)
 static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
 {
 	return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
 }
 
-#define	XFS_LIC_ISFREE(cp,slot)	xfs_lic_isfree(cp,slot)
 static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
 {
 	return (cp->lic_free & (1 << slot));
 }
 
-#define	XFS_LIC_CLAIM(cp,slot)		xfs_lic_claim(cp,slot)
 static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
 {
 	cp->lic_free &= ~(1 << slot);
 }
 
-#define	XFS_LIC_RELSE(cp,slot)		xfs_lic_relse(cp,slot)
 static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
 {
 	cp->lic_free |= 1 << slot;
 }
 
-#define	XFS_LIC_SLOT(cp,slot)		xfs_lic_slot(cp,slot)
 static inline xfs_log_item_desc_t *
 xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
 {
 	return &(cp->lic_descs[slot]);
 }
 
-#define	XFS_LIC_DESC_TO_SLOT(dp)	xfs_lic_desc_to_slot(dp)
 static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
 {
 	return (uint)dp->lid_index;
@@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
  * All of this yields the address of the chunk, which is
  * cast to a chunk pointer.
  */
-#define	XFS_LIC_DESC_TO_CHUNK(dp)	xfs_lic_desc_to_chunk(dp)
 static inline xfs_log_item_chunk_t *
 xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
 {
@@ -986,6 +975,7 @@ int		_xfs_trans_commit(xfs_trans_t *,
 				  int *);
 #define xfs_trans_commit(tp, flags)	_xfs_trans_commit(tp, flags, NULL)
 void		xfs_trans_cancel(xfs_trans_t *, int);
+int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 int		xfs_trans_ail_init(struct xfs_mount *);
 void		xfs_trans_ail_destroy(struct xfs_mount *);
 void		xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index cb0c5839154b..4e855b5ced66 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match(
 	bp = NULL;
 	len = BBTOB(len);
 	licp = &tp->t_items;
-	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+	if (!xfs_lic_are_all_free(licp)) {
 		for (i = 0; i < licp->lic_unused; i++) {
 			/*
 			 * Skip unoccupied slots.
 			 */
-			if (XFS_LIC_ISFREE(licp, i)) {
+			if (xfs_lic_isfree(licp, i)) {
 				continue;
 			}
 
-			lidp = XFS_LIC_SLOT(licp, i);
+			lidp = xfs_lic_slot(licp, i);
 			blip = (xfs_buf_log_item_t *)lidp->lid_item;
 			if (blip->bli_item.li_type != XFS_LI_BUF) {
 				continue;
@@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all(
 	bp = NULL;
 	len = BBTOB(len);
 	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
-		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+		if (xfs_lic_are_all_free(licp)) {
 			ASSERT(licp == &tp->t_items);
 			ASSERT(licp->lic_next == NULL);
 			return NULL;
@@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all(
 			/*
 			 * Skip unoccupied slots.
 			 */
-			if (XFS_LIC_ISFREE(licp, i)) {
+			if (xfs_lic_isfree(licp, i)) {
 				continue;
 			}
 
-			lidp = XFS_LIC_SLOT(licp, i);
+			lidp = xfs_lic_slot(licp, i);
 			blip = (xfs_buf_log_item_t *)lidp->lid_item;
 			if (blip->bli_item.li_type != XFS_LI_BUF) {
 				continue;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 4c70bf5e9985..2a1c0f071f91 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -291,7 +291,7 @@ xfs_trans_inode_broot_debug(
 	iip = ip->i_itemp;
 	if (iip->ili_root_size != 0) {
 		ASSERT(iip->ili_orig_root != NULL);
-		kmem_free(iip->ili_orig_root, iip->ili_root_size);
+		kmem_free(iip->ili_orig_root);
 		iip->ili_root_size = 0;
 		iip->ili_orig_root = NULL;
 	}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 66a09f0d894b..3c666e8317f8 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
 		 * Initialize the chunk, and then
 		 * claim the first slot in the newly allocated chunk.
 		 */
-		XFS_LIC_INIT(licp);
-		XFS_LIC_CLAIM(licp, 0);
+		xfs_lic_init(licp);
+		xfs_lic_claim(licp, 0);
 		licp->lic_unused = 1;
-		XFS_LIC_INIT_SLOT(licp, 0);
-		lidp = XFS_LIC_SLOT(licp, 0);
+		xfs_lic_init_slot(licp, 0);
+		lidp = xfs_lic_slot(licp, 0);
 
 		/*
 		 * Link in the new chunk and update the free count.
@@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
 	 */
 	licp = &tp->t_items;
 	while (licp != NULL) {
-		if (XFS_LIC_VACANCY(licp)) {
+		if (xfs_lic_vacancy(licp)) {
 			if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
 				i = licp->lic_unused;
-				ASSERT(XFS_LIC_ISFREE(licp, i));
+				ASSERT(xfs_lic_isfree(licp, i));
 				break;
 			}
 			for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
-				if (XFS_LIC_ISFREE(licp, i))
+				if (xfs_lic_isfree(licp, i))
 					break;
 			}
 			ASSERT(i <= XFS_LIC_MAX_SLOT);
@@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
 	 * If we find a free descriptor, claim it,
 	 * initialize it, and return it.
 	 */
-	XFS_LIC_CLAIM(licp, i);
+	xfs_lic_claim(licp, i);
 	if (licp->lic_unused <= i) {
 		licp->lic_unused = i + 1;
-		XFS_LIC_INIT_SLOT(licp, i);
+		xfs_lic_init_slot(licp, i);
 	}
-	lidp = XFS_LIC_SLOT(licp, i);
+	lidp = xfs_lic_slot(licp, i);
 	tp->t_items_free--;
 	lidp->lid_item = lip;
 	lidp->lid_flags = 0;
@@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t	*tp, xfs_log_item_desc_t *lidp)
 	xfs_log_item_chunk_t	*licp;
 	xfs_log_item_chunk_t	**licpp;
 
-	slot = XFS_LIC_DESC_TO_SLOT(lidp);
-	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
-	XFS_LIC_RELSE(licp, slot);
+	slot = xfs_lic_desc_to_slot(lidp);
+	licp = xfs_lic_desc_to_chunk(lidp);
+	xfs_lic_relse(licp, slot);
 	lidp->lid_item->li_desc = NULL;
 	tp->t_items_free++;
 
@@ -154,14 +154,14 @@ xfs_trans_free_item(xfs_trans_t	*tp, xfs_log_item_desc_t *lidp)
 	 * Also decrement the transaction structure's count of free items
 	 * by the number in a chunk since we are freeing an empty chunk.
 	 */
-	if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
+	if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
 		licpp = &(tp->t_items.lic_next);
 		while (*licpp != licp) {
 			ASSERT(*licpp != NULL);
 			licpp = &((*licpp)->lic_next);
 		}
 		*licpp = licp->lic_next;
-		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		kmem_free(licp);
 		tp->t_items_free -= XFS_LIC_NUM_SLOTS;
 	}
 }
@@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp)
 	/*
 	 * If it's not in the first chunk, skip to the second.
 	 */
-	if (XFS_LIC_ARE_ALL_FREE(licp)) {
+	if (xfs_lic_are_all_free(licp)) {
 		licp = licp->lic_next;
 	}
 
 	/*
 	 * Return the first non-free descriptor in the chunk.
 	 */
-	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	ASSERT(!xfs_lic_are_all_free(licp));
 	for (i = 0; i < licp->lic_unused; i++) {
-		if (XFS_LIC_ISFREE(licp, i)) {
+		if (xfs_lic_isfree(licp, i)) {
 			continue;
 		}
 
-		return XFS_LIC_SLOT(licp, i);
+		return xfs_lic_slot(licp, i);
 	}
 	cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
 	return NULL;
@@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 	xfs_log_item_chunk_t	*licp;
 	int			i;
 
-	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+	licp = xfs_lic_desc_to_chunk(lidp);
 
 	/*
 	 * First search the rest of the chunk. The for loop keeps us
 	 * from referencing things beyond the end of the chunk.
 	 */
-	for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
-		if (XFS_LIC_ISFREE(licp, i)) {
+	for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
+		if (xfs_lic_isfree(licp, i)) {
 			continue;
 		}
 
-		return XFS_LIC_SLOT(licp, i);
+		return xfs_lic_slot(licp, i);
 	}
 
 	/*
@@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 	}
 
 	licp = licp->lic_next;
-	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	ASSERT(!xfs_lic_are_all_free(licp));
 	for (i = 0; i < licp->lic_unused; i++) {
-		if (XFS_LIC_ISFREE(licp, i)) {
+		if (xfs_lic_isfree(licp, i)) {
 			continue;
 		}
 
-		return XFS_LIC_SLOT(licp, i);
+		return xfs_lic_slot(licp, i);
 	}
 	ASSERT(0);
 	/* NOTREACHED */
@@ -300,9 +300,9 @@ xfs_trans_free_items(
 	/*
 	 * Special case the embedded chunk so we don't free it below.
 	 */
-	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+	if (!xfs_lic_are_all_free(licp)) {
 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
-		XFS_LIC_ALL_FREE(licp);
+		xfs_lic_all_free(licp);
 		licp->lic_unused = 0;
 	}
 	licp = licp->lic_next;
@@ -311,10 +311,10 @@ xfs_trans_free_items(
 	 * Unlock each item in each chunk and free the chunks.
 	 */
 	while (licp != NULL) {
-		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		ASSERT(!xfs_lic_are_all_free(licp));
 		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
 		next_licp = licp->lic_next;
-		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		kmem_free(licp);
 		licp = next_licp;
 	}
 
@@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
 	/*
 	 * Special case the embedded chunk so we don't free.
 	 */
-	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+	if (!xfs_lic_are_all_free(licp)) {
 		freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
 	}
 	licpp = &(tp->t_items.lic_next);
@@ -358,12 +358,12 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
 	 * and free empty chunks.
 	 */
 	while (licp != NULL) {
-		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		ASSERT(!xfs_lic_are_all_free(licp));
 		freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
 		next_licp = licp->lic_next;
-		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+		if (xfs_lic_are_all_free(licp)) {
 			*licpp = next_licp;
-			kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+			kmem_free(licp);
 			freed -= XFS_LIC_NUM_SLOTS;
 		} else {
 			licpp = &(licp->lic_next);
@@ -402,7 +402,7 @@ xfs_trans_unlock_chunk(
 	freed = 0;
 	lidp = licp->lic_descs;
 	for (i = 0; i < licp->lic_unused; i++, lidp++) {
-		if (XFS_LIC_ISFREE(licp, i)) {
+		if (xfs_lic_isfree(licp, i)) {
 			continue;
 		}
 		lip = lidp->lid_item;
@@ -421,7 +421,7 @@ xfs_trans_unlock_chunk(
 		 */
 		if (!(freeing_chunk) &&
 		    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
-			XFS_LIC_RELSE(licp, i);
+			xfs_lic_relse(licp, i);
 			freed++;
 		}
 	}
@@ -530,7 +530,7 @@ xfs_trans_free_busy(xfs_trans_t *tp)
 	lbcp = tp->t_busy.lbc_next;
 	while (lbcp != NULL) {
 		lbcq = lbcp->lbc_next;
-		kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
+		kmem_free(lbcp);
 		lbcp = lbcq;
 	}
 
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 98e5f110ba5f..35d4d414bcc2 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -237,7 +237,7 @@ xfs_droplink(
 
 	ASSERT (ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink--;
-	drop_nlink(ip->i_vnode);
+	drop_nlink(VFS_I(ip));
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	error = 0;
@@ -301,7 +301,7 @@ xfs_bumplink(
 
 	ASSERT(ip->i_d.di_nlink > 0);
 	ip->i_d.di_nlink++;
-	inc_nlink(ip->i_vnode);
+	inc_nlink(VFS_I(ip));
 	if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
 	    (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
 		/*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f316cb85d8e2..ef321225d269 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,9 +18,6 @@
 #ifndef __XFS_UTILS_H__
 #define __XFS_UTILS_H__
 
-#define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
-#define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
-
 extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
 				xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 30bacd8bb0e5..439dd3939dda 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -58,586 +58,6 @@
 #include "xfs_utils.h"
 
 
-int __init
-xfs_init(void)
-{
-#ifdef XFS_DABUF_DEBUG
-	extern spinlock_t        xfs_dabuf_global_lock;
-	spin_lock_init(&xfs_dabuf_global_lock);
-#endif
-
-	/*
-	 * Initialize all of the zone allocators we use.
-	 */
-	xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
-						"xfs_log_ticket");
-	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-						"xfs_bmap_free_item");
-	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-						"xfs_btree_cur");
-	xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
-						"xfs_da_state");
-	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
-	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
-	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
-	xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
-	xfs_mru_cache_init();
-	xfs_filestream_init();
-
-	/*
-	 * The size of the zone allocated buf log item is the maximum
-	 * size possible under XFS.  This wastes a little bit of memory,
-	 * but it is much faster.
-	 */
-	xfs_buf_item_zone =
-		kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
-				  NBWORD) * sizeof(int))),
-			       "xfs_buf_item");
-	xfs_efd_zone =
-		kmem_zone_init((sizeof(xfs_efd_log_item_t) +
-			       ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
-				 sizeof(xfs_extent_t))),
-				      "xfs_efd_item");
-	xfs_efi_zone =
-		kmem_zone_init((sizeof(xfs_efi_log_item_t) +
-			       ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
-				 sizeof(xfs_extent_t))),
-				      "xfs_efi_item");
-
-	/*
-	 * These zones warrant special memory allocator hints
-	 */
-	xfs_inode_zone =
-		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-					KM_ZONE_SPREAD, NULL);
-	xfs_ili_zone =
-		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
-					KM_ZONE_SPREAD, NULL);
-
-	/*
-	 * Allocate global trace buffers.
-	 */
-#ifdef XFS_ALLOC_TRACE
-	xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_BMAP_TRACE
-	xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_BMBT_TRACE
-	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_ATTR_TRACE
-	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_DIR2_TRACE
-	xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
-#endif
-
-	xfs_dir_startup();
-
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
-	xfs_error_test_init();
-#endif /* DEBUG || INDUCE_IO_ERROR */
-
-	xfs_init_procfs();
-	xfs_sysctl_register();
-	return 0;
-}
-
-void __exit
-xfs_cleanup(void)
-{
-	extern kmem_zone_t	*xfs_inode_zone;
-	extern kmem_zone_t	*xfs_efd_zone;
-	extern kmem_zone_t	*xfs_efi_zone;
-
-	xfs_cleanup_procfs();
-	xfs_sysctl_unregister();
-	xfs_filestream_uninit();
-	xfs_mru_cache_uninit();
-	xfs_acl_zone_destroy(xfs_acl_zone);
-
-#ifdef XFS_DIR2_TRACE
-	ktrace_free(xfs_dir2_trace_buf);
-#endif
-#ifdef XFS_ATTR_TRACE
-	ktrace_free(xfs_attr_trace_buf);
-#endif
-#ifdef XFS_BMBT_TRACE
-	ktrace_free(xfs_bmbt_trace_buf);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ktrace_free(xfs_bmap_trace_buf);
-#endif
-#ifdef XFS_ALLOC_TRACE
-	ktrace_free(xfs_alloc_trace_buf);
-#endif
-
-	kmem_zone_destroy(xfs_bmap_free_item_zone);
-	kmem_zone_destroy(xfs_btree_cur_zone);
-	kmem_zone_destroy(xfs_inode_zone);
-	kmem_zone_destroy(xfs_trans_zone);
-	kmem_zone_destroy(xfs_da_state_zone);
-	kmem_zone_destroy(xfs_dabuf_zone);
-	kmem_zone_destroy(xfs_buf_item_zone);
-	kmem_zone_destroy(xfs_efd_zone);
-	kmem_zone_destroy(xfs_efi_zone);
-	kmem_zone_destroy(xfs_ifork_zone);
-	kmem_zone_destroy(xfs_ili_zone);
-	kmem_zone_destroy(xfs_log_ticket_zone);
-}
-
-/*
- * xfs_start_flags
- *
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- */
-STATIC int
-xfs_start_flags(
-	struct xfs_mount_args	*ap,
-	struct xfs_mount	*mp)
-{
-	/* Values are in BBs */
-	if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
-		/*
-		 * At this point the superblock has not been read
-		 * in, therefore we do not know the block size.
-		 * Before the mount call ends we will convert
-		 * these to FSBs.
-		 */
-		mp->m_dalign = ap->sunit;
-		mp->m_swidth = ap->swidth;
-	}
-
-	if (ap->logbufs != -1 &&
-	    ap->logbufs != 0 &&
-	    (ap->logbufs < XLOG_MIN_ICLOGS ||
-	     ap->logbufs > XLOG_MAX_ICLOGS)) {
-		cmn_err(CE_WARN,
-			"XFS: invalid logbufs value: %d [not %d-%d]",
-			ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-		return XFS_ERROR(EINVAL);
-	}
-	mp->m_logbufs = ap->logbufs;
-	if (ap->logbufsize != -1 &&
-	    ap->logbufsize !=  0 &&
-	    (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
-	     ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
-	     !is_power_of_2(ap->logbufsize))) {
-		cmn_err(CE_WARN,
-	"XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
-			ap->logbufsize);
-		return XFS_ERROR(EINVAL);
-	}
-	mp->m_logbsize = ap->logbufsize;
-	mp->m_fsname_len = strlen(ap->fsname) + 1;
-	mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
-	strcpy(mp->m_fsname, ap->fsname);
-	if (ap->rtname[0]) {
-		mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP);
-		strcpy(mp->m_rtname, ap->rtname);
-	}
-	if (ap->logname[0]) {
-		mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP);
-		strcpy(mp->m_logname, ap->logname);
-	}
-
-	if (ap->flags & XFSMNT_WSYNC)
-		mp->m_flags |= XFS_MOUNT_WSYNC;
-#if XFS_BIG_INUMS
-	if (ap->flags & XFSMNT_INO64) {
-		mp->m_flags |= XFS_MOUNT_INO64;
-		mp->m_inoadd = XFS_INO64_OFFSET;
-	}
-#endif
-	if (ap->flags & XFSMNT_RETERR)
-		mp->m_flags |= XFS_MOUNT_RETERR;
-	if (ap->flags & XFSMNT_NOALIGN)
-		mp->m_flags |= XFS_MOUNT_NOALIGN;
-	if (ap->flags & XFSMNT_SWALLOC)
-		mp->m_flags |= XFS_MOUNT_SWALLOC;
-	if (ap->flags & XFSMNT_OSYNCISOSYNC)
-		mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
-	if (ap->flags & XFSMNT_32BITINODES)
-		mp->m_flags |= XFS_MOUNT_32BITINODES;
-
-	if (ap->flags & XFSMNT_IOSIZE) {
-		if (ap->iosizelog > XFS_MAX_IO_LOG ||
-		    ap->iosizelog < XFS_MIN_IO_LOG) {
-			cmn_err(CE_WARN,
-		"XFS: invalid log iosize: %d [not %d-%d]",
-				ap->iosizelog, XFS_MIN_IO_LOG,
-				XFS_MAX_IO_LOG);
-			return XFS_ERROR(EINVAL);
-		}
-
-		mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
-		mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
-	}
-
-	if (ap->flags & XFSMNT_IKEEP)
-		mp->m_flags |= XFS_MOUNT_IKEEP;
-	if (ap->flags & XFSMNT_DIRSYNC)
-		mp->m_flags |= XFS_MOUNT_DIRSYNC;
-	if (ap->flags & XFSMNT_ATTR2)
-		mp->m_flags |= XFS_MOUNT_ATTR2;
-
-	if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
-		mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
-	/*
-	 * no recovery flag requires a read-only mount
-	 */
-	if (ap->flags & XFSMNT_NORECOVERY) {
-		if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-			cmn_err(CE_WARN,
-	"XFS: tried to mount a FS read-write without recovery!");
-			return XFS_ERROR(EINVAL);
-		}
-		mp->m_flags |= XFS_MOUNT_NORECOVERY;
-	}
-
-	if (ap->flags & XFSMNT_NOUUID)
-		mp->m_flags |= XFS_MOUNT_NOUUID;
-	if (ap->flags & XFSMNT_BARRIER)
-		mp->m_flags |= XFS_MOUNT_BARRIER;
-	else
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
-	if (ap->flags2 & XFSMNT2_FILESTREAMS)
-		mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-
-	if (ap->flags & XFSMNT_DMAPI)
-		mp->m_flags |= XFS_MOUNT_DMAPI;
-	return 0;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock _has_ now been read in.
- */
-STATIC int
-xfs_finish_flags(
-	struct xfs_mount_args	*ap,
-	struct xfs_mount	*mp)
-{
-	int			ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
-	/* Fail a mount where the logbuf is smaller then the log stripe */
-	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-		if ((ap->logbufsize <= 0) &&
-		    (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
-			mp->m_logbsize = mp->m_sb.sb_logsunit;
-		} else if (ap->logbufsize > 0 &&
-			   ap->logbufsize < mp->m_sb.sb_logsunit) {
-			cmn_err(CE_WARN,
-	"XFS: logbuf size must be greater than or equal to log stripe size");
-			return XFS_ERROR(EINVAL);
-		}
-	} else {
-		/* Fail a mount if the logbuf is larger than 32K */
-		if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
-			cmn_err(CE_WARN,
-	"XFS: logbuf size for version 1 logs must be 16K or 32K");
-			return XFS_ERROR(EINVAL);
-		}
-	}
-
-	if (xfs_sb_version_hasattr2(&mp->m_sb))
-		mp->m_flags |= XFS_MOUNT_ATTR2;
-
-	/*
-	 * prohibit r/w mounts of read-only filesystems
-	 */
-	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
-		cmn_err(CE_WARN,
-	"XFS: cannot mount a read-only filesystem as read-write");
-		return XFS_ERROR(EROFS);
-	}
-
-	/*
-	 * check for shared mount.
-	 */
-	if (ap->flags & XFSMNT_SHARED) {
-		if (!xfs_sb_version_hasshared(&mp->m_sb))
-			return XFS_ERROR(EINVAL);
-
-		/*
-		 * For IRIX 6.5, shared mounts must have the shared
-		 * version bit set, have the persistent readonly
-		 * field set, must be version 0 and can only be mounted
-		 * read-only.
-		 */
-		if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
-		     (mp->m_sb.sb_shared_vn != 0))
-			return XFS_ERROR(EINVAL);
-
-		mp->m_flags |= XFS_MOUNT_SHARED;
-
-		/*
-		 * Shared XFS V0 can't deal with DMI.  Return EINVAL.
-		 */
-		if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
-			return XFS_ERROR(EINVAL);
-	}
-
-	if (ap->flags & XFSMNT_UQUOTA) {
-		mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
-		if (ap->flags & XFSMNT_UQUOTAENF)
-			mp->m_qflags |= XFS_UQUOTA_ENFD;
-	}
-
-	if (ap->flags & XFSMNT_GQUOTA) {
-		mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-		if (ap->flags & XFSMNT_GQUOTAENF)
-			mp->m_qflags |= XFS_OQUOTA_ENFD;
-	} else if (ap->flags & XFSMNT_PQUOTA) {
-		mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-		if (ap->flags & XFSMNT_PQUOTAENF)
-			mp->m_qflags |= XFS_OQUOTA_ENFD;
-	}
-
-	return 0;
-}
-
-/*
- * xfs_mount
- *
- * The file system configurations are:
- *	(1) device (partition) with data and internal log
- *	(2) logical volume with data and log subvolumes.
- *	(3) logical volume with data, log, and realtime subvolumes.
- *
- * We only have to handle opening the log and realtime volumes here if
- * they are present.  The data subvolume has already been opened by
- * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev.
- */
-int
-xfs_mount(
-	struct xfs_mount	*mp,
-	struct xfs_mount_args	*args,
-	cred_t			*credp)
-{
-	struct block_device	*ddev, *logdev, *rtdev;
-	int			flags = 0, error;
-
-	ddev = mp->m_super->s_bdev;
-	logdev = rtdev = NULL;
-
-	error = xfs_dmops_get(mp, args);
-	if (error)
-		return error;
-	error = xfs_qmops_get(mp, args);
-	if (error)
-		return error;
-
-	if (args->flags & XFSMNT_QUIET)
-		flags |= XFS_MFSI_QUIET;
-
-	/*
-	 * Open real time and log devices - order is important.
-	 */
-	if (args->logname[0]) {
-		error = xfs_blkdev_get(mp, args->logname, &logdev);
-		if (error)
-			return error;
-	}
-	if (args->rtname[0]) {
-		error = xfs_blkdev_get(mp, args->rtname, &rtdev);
-		if (error) {
-			xfs_blkdev_put(logdev);
-			return error;
-		}
-
-		if (rtdev == ddev || rtdev == logdev) {
-			cmn_err(CE_WARN,
-	"XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
-			xfs_blkdev_put(logdev);
-			xfs_blkdev_put(rtdev);
-			return EINVAL;
-		}
-	}
-
-	/*
-	 * Setup xfs_mount buffer target pointers
-	 */
-	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
-	if (!mp->m_ddev_targp) {
-		xfs_blkdev_put(logdev);
-		xfs_blkdev_put(rtdev);
-		return error;
-	}
-	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
-		if (!mp->m_rtdev_targp) {
-			xfs_blkdev_put(logdev);
-			xfs_blkdev_put(rtdev);
-			goto error0;
-		}
-	}
-	mp->m_logdev_targp = (logdev && logdev != ddev) ?
-				xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp;
-	if (!mp->m_logdev_targp) {
-		xfs_blkdev_put(logdev);
-		xfs_blkdev_put(rtdev);
-		goto error0;
-	}
-
-	/*
-	 * Setup flags based on mount(2) options and then the superblock
-	 */
-	error = xfs_start_flags(args, mp);
-	if (error)
-		goto error1;
-	error = xfs_readsb(mp, flags);
-	if (error)
-		goto error1;
-	error = xfs_finish_flags(args, mp);
-	if (error)
-		goto error2;
-
-	/*
-	 * Setup xfs_mount buffer target pointers based on superblock
-	 */
-	error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
-				    mp->m_sb.sb_sectsize);
-	if (!error && logdev && logdev != ddev) {
-		unsigned int	log_sector_size = BBSIZE;
-
-		if (xfs_sb_version_hassector(&mp->m_sb))
-			log_sector_size = mp->m_sb.sb_logsectsize;
-		error = xfs_setsize_buftarg(mp->m_logdev_targp,
-					    mp->m_sb.sb_blocksize,
-					    log_sector_size);
-	}
-	if (!error && rtdev)
-		error = xfs_setsize_buftarg(mp->m_rtdev_targp,
-					    mp->m_sb.sb_blocksize,
-					    mp->m_sb.sb_sectsize);
-	if (error)
-		goto error2;
-
-	if (mp->m_flags & XFS_MOUNT_BARRIER)
-		xfs_mountfs_check_barriers(mp);
-
-	if ((error = xfs_filestream_mount(mp)))
-		goto error2;
-
-	error = xfs_mountfs(mp, flags);
-	if (error)
-		goto error2;
-
-	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
-
-	return 0;
-
-error2:
-	if (mp->m_sb_bp)
-		xfs_freesb(mp);
-error1:
-	xfs_binval(mp->m_ddev_targp);
-	if (logdev && logdev != ddev)
-		xfs_binval(mp->m_logdev_targp);
-	if (rtdev)
-		xfs_binval(mp->m_rtdev_targp);
-error0:
-	xfs_unmountfs_close(mp, credp);
-	xfs_qmops_put(mp);
-	xfs_dmops_put(mp);
-	return error;
-}
-
-int
-xfs_unmount(
-	xfs_mount_t	*mp,
-	int		flags,
-	cred_t		*credp)
-{
-	xfs_inode_t	*rip;
-	bhv_vnode_t	*rvp;
-	int		unmount_event_wanted = 0;
-	int		unmount_event_flags = 0;
-	int		xfs_unmountfs_needed = 0;
-	int		error;
-
-	rip = mp->m_rootip;
-	rvp = XFS_ITOV(rip);
-
-#ifdef HAVE_DMAPI
-	if (mp->m_flags & XFS_MOUNT_DMAPI) {
-		error = XFS_SEND_PREUNMOUNT(mp,
-				rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
-				NULL, NULL, 0, 0,
-				(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
-					0:DM_FLAGS_UNWANTED);
-			if (error)
-				return XFS_ERROR(error);
-		unmount_event_wanted = 1;
-		unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
-					0 : DM_FLAGS_UNWANTED;
-	}
-#endif
-
-	/*
-	 * Blow away any referenced inode in the filestreams cache.
-	 * This can and will cause log traffic as inodes go inactive
-	 * here.
-	 */
-	xfs_filestream_unmount(mp);
-
-	XFS_bflush(mp->m_ddev_targp);
-	error = xfs_unmount_flush(mp, 0);
-	if (error)
-		goto out;
-
-	ASSERT(vn_count(rvp) == 1);
-
-	/*
-	 * Drop the reference count
-	 */
-	IRELE(rip);
-
-	/*
-	 * If we're forcing a shutdown, typically because of a media error,
-	 * we want to make sure we invalidate dirty pages that belong to
-	 * referenced vnodes as well.
-	 */
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
-		ASSERT(error != EFSCORRUPTED);
-	}
-	xfs_unmountfs_needed = 1;
-
-out:
-	/*	Send DMAPI event, if required.
-	 *	Then do xfs_unmountfs() if needed.
-	 *	Then return error (or zero).
-	 */
-	if (unmount_event_wanted) {
-		/* Note: mp structure must still exist for
-		 * XFS_SEND_UNMOUNT() call.
-		 */
-		XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
-			DM_RIGHT_NULL, 0, error, unmount_event_flags);
-	}
-	if (xfs_unmountfs_needed) {
-		/*
-		 * Call common unmount function to flush to disk
-		 * and free the super block buffer & mount structures.
-		 */
-		xfs_unmountfs(mp, credp);
-		xfs_qmops_put(mp);
-		xfs_dmops_put(mp);
-		kmem_free(mp, sizeof(xfs_mount_t));
-	}
-
-	return XFS_ERROR(error);
-}
-
 STATIC void
 xfs_quiesce_fs(
 	xfs_mount_t		*mp)
@@ -694,30 +114,6 @@ xfs_attr_quiesce(
 	xfs_unmountfs_writesb(mp);
 }
 
-int
-xfs_mntupdate(
-	struct xfs_mount		*mp,
-	int				*flags,
-	struct xfs_mount_args		*args)
-{
-	if (!(*flags & MS_RDONLY)) {			/* rw/ro -> rw */
-		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			mp->m_flags &= ~XFS_MOUNT_RDONLY;
-		if (args->flags & XFSMNT_BARRIER) {
-			mp->m_flags |= XFS_MOUNT_BARRIER;
-			xfs_mountfs_check_barriers(mp);
-		} else {
-			mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		}
-	} else if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {	/* rw -> ro */
-		xfs_filestream_flush(mp);
-		xfs_sync(mp, SYNC_DATA_QUIESCE);
-		xfs_attr_quiesce(mp);
-		mp->m_flags |= XFS_MOUNT_RDONLY;
-	}
-	return 0;
-}
-
 /*
  * xfs_unmount_flush implements a set of flush operation on special
  * inodes, which are needed as a separate set of operations so that
@@ -732,7 +128,6 @@ xfs_unmount_flush(
 	xfs_inode_t	*rip = mp->m_rootip;
 	xfs_inode_t	*rbmip;
 	xfs_inode_t	*rsumip = NULL;
-	bhv_vnode_t	*rvp = XFS_ITOV(rip);
 	int		error;
 
 	xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -750,7 +145,7 @@ xfs_unmount_flush(
 		if (error == EFSCORRUPTED)
 			goto fscorrupt_out;
 
-		ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
+		ASSERT(vn_count(VFS_I(rbmip)) == 1);
 
 		rsumip = mp->m_rsumip;
 		xfs_ilock(rsumip, XFS_ILOCK_EXCL);
@@ -761,7 +156,7 @@ xfs_unmount_flush(
 		if (error == EFSCORRUPTED)
 			goto fscorrupt_out;
 
-		ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
+		ASSERT(vn_count(VFS_I(rsumip)) == 1);
 	}
 
 	/*
@@ -771,7 +166,7 @@ xfs_unmount_flush(
 	if (error == EFSCORRUPTED)
 		goto fscorrupt_out2;
 
-	if (vn_count(rvp) != 1 && !relocation) {
+	if (vn_count(VFS_I(rip)) != 1 && !relocation) {
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
 		return XFS_ERROR(EBUSY);
 	}
@@ -888,7 +283,7 @@ xfs_sync_inodes(
 	int             *bypassed)
 {
 	xfs_inode_t	*ip = NULL;
-	bhv_vnode_t	*vp = NULL;
+	struct inode	*vp = NULL;
 	int		error;
 	int		last_error;
 	uint64_t	fflag;
@@ -1008,7 +403,7 @@ xfs_sync_inodes(
 			continue;
 		}
 
-		vp = XFS_ITOV_NULL(ip);
+		vp = VFS_I(ip);
 
 		/*
 		 * If the vnode is gone then this is being torn down,
@@ -1048,7 +443,7 @@ xfs_sync_inodes(
 
 		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
 			XFS_MOUNT_IUNLOCK(mp);
-			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			kmem_free(ipointer);
 			return 0;
 		}
 
@@ -1083,7 +478,7 @@ xfs_sync_inodes(
 			IPOINTER_INSERT(ip, mp);
 			xfs_ilock(ip, lock_flags);
 
-			ASSERT(vp == XFS_ITOV(ip));
+			ASSERT(vp == VFS_I(ip));
 			ASSERT(ip->i_mount == mp);
 
 			vnode_refed = B_TRUE;
@@ -1194,7 +589,7 @@ xfs_sync_inodes(
 			}
 			XFS_MOUNT_IUNLOCK(mp);
 			ASSERT(ipointer_in == B_FALSE);
-			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			kmem_free(ipointer);
 			return XFS_ERROR(error);
 		}
 
@@ -1224,7 +619,7 @@ xfs_sync_inodes(
 
 	ASSERT(ipointer_in == B_FALSE);
 
-	kmem_free(ipointer, sizeof(xfs_iptr_t));
+	kmem_free(ipointer);
 	return XFS_ERROR(last_error);
 }
 
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
index 1688817c55ed..a74b05087da4 100644
--- a/fs/xfs/xfs_vfsops.h
+++ b/fs/xfs/xfs_vfsops.h
@@ -8,11 +8,6 @@ struct kstatfs;
 struct xfs_mount;
 struct xfs_mount_args;
 
-int xfs_mount(struct xfs_mount *mp, struct xfs_mount_args *args,
-		struct cred *credp);
-int xfs_unmount(struct xfs_mount *mp, int flags, struct cred *credp);
-int xfs_mntupdate(struct xfs_mount *mp, int *flags,
-		struct xfs_mount_args *args);
 int xfs_sync(struct xfs_mount *mp, int flags);
 void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 		int lnnum);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index e475e3717eb3..8b6812f66a15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -75,26 +75,23 @@ xfs_open(
 	return 0;
 }
 
-/*
- * xfs_setattr
- */
 int
 xfs_setattr(
-	xfs_inode_t		*ip,
-	bhv_vattr_t		*vap,
+	struct xfs_inode	*ip,
+	struct iattr		*iattr,
 	int			flags,
 	cred_t			*credp)
 {
 	xfs_mount_t		*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	int			mask = iattr->ia_valid;
 	xfs_trans_t		*tp;
-	int			mask;
 	int			code;
 	uint			lock_flags;
 	uint			commit_flags=0;
 	uid_t			uid=0, iuid=0;
 	gid_t			gid=0, igid=0;
 	int			timeflags = 0;
-	xfs_prid_t		projid=0, iprojid=0;
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
 	int			file_owner;
 	int			need_iolock = 1;
@@ -104,30 +101,9 @@ xfs_setattr(
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
 		return XFS_ERROR(EROFS);
 
-	/*
-	 * Cannot set certain attributes.
-	 */
-	mask = vap->va_mask;
-	if (mask & XFS_AT_NOSET) {
-		return XFS_ERROR(EINVAL);
-	}
-
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
-	/*
-	 * Timestamps do not need to be logged and hence do not
-	 * need to be done within a transaction.
-	 */
-	if (mask & XFS_AT_UPDTIMES) {
-		ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
-		timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
-			    ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
-			    ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
-		xfs_ichgtime(ip, timeflags);
-		return 0;
-	}
-
 	olddquot1 = olddquot2 = NULL;
 	udqp = gdqp = NULL;
 
@@ -139,28 +115,22 @@ xfs_setattr(
 	 * If the IDs do change before we take the ilock, we're covered
 	 * because the i_*dquot fields will get updated anyway.
 	 */
-	if (XFS_IS_QUOTA_ON(mp) &&
-	    (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
+	if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
 		uint	qflags = 0;
 
-		if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
-			uid = vap->va_uid;
+		if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+			uid = iattr->ia_uid;
 			qflags |= XFS_QMOPT_UQUOTA;
 		} else {
 			uid = ip->i_d.di_uid;
 		}
-		if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
-			gid = vap->va_gid;
+		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+			gid = iattr->ia_gid;
 			qflags |= XFS_QMOPT_GQUOTA;
 		}  else {
 			gid = ip->i_d.di_gid;
 		}
-		if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
-			projid = vap->va_projid;
-			qflags |= XFS_QMOPT_PQUOTA;
-		}  else {
-			projid = ip->i_d.di_projid;
-		}
+
 		/*
 		 * We take a reference when we initialize udqp and gdqp,
 		 * so it is important that we never blindly double trip on
@@ -168,8 +138,8 @@ xfs_setattr(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
-					 &udqp, &gdqp);
+		code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+					 qflags, &udqp, &gdqp);
 		if (code)
 			return code;
 	}
@@ -180,10 +150,10 @@ xfs_setattr(
 	 */
 	tp = NULL;
 	lock_flags = XFS_ILOCK_EXCL;
-	if (flags & ATTR_NOLOCK)
+	if (flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
-	if (!(mask & XFS_AT_SIZE)) {
-		if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
+	if (!(mask & ATTR_SIZE)) {
+		if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
 		    (mp->m_flags & XFS_MOUNT_WSYNC)) {
 			tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 			commit_flags = 0;
@@ -196,10 +166,10 @@ xfs_setattr(
 		}
 	} else {
 		if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
-		    !(flags & ATTR_DMI)) {
+		    !(flags & XFS_ATTR_DMI)) {
 			int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
 			code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
-				vap->va_size, 0, dmflags, NULL);
+				iattr->ia_size, 0, dmflags, NULL);
 			if (code) {
 				lock_flags = 0;
 				goto error_return;
@@ -212,16 +182,14 @@ xfs_setattr(
 	xfs_ilock(ip, lock_flags);
 
 	/* boolean: are we the file owner? */
-	file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
+	file_owner = (current_fsuid() == ip->i_d.di_uid);
 
 	/*
 	 * Change various properties of a file.
 	 * Only the owner or users with CAP_FOWNER
 	 * capability may do these things.
 	 */
-	if (mask &
-	    (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
-	     XFS_AT_GID|XFS_AT_PROJID)) {
+	if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
 		/*
 		 * CAP_FOWNER overrides the following restrictions:
 		 *
@@ -245,21 +213,21 @@ xfs_setattr(
 		 * IDs of the calling process shall match the group owner of
 		 * the file when setting the set-group-ID bit on that file
 		 */
-		if (mask & XFS_AT_MODE) {
+		if (mask & ATTR_MODE) {
 			mode_t m = 0;
 
-			if ((vap->va_mode & S_ISUID) && !file_owner)
+			if ((iattr->ia_mode & S_ISUID) && !file_owner)
 				m |= S_ISUID;
-			if ((vap->va_mode & S_ISGID) &&
+			if ((iattr->ia_mode & S_ISGID) &&
 			    !in_group_p((gid_t)ip->i_d.di_gid))
 				m |= S_ISGID;
 #if 0
 			/* Linux allows this, Irix doesn't. */
-			if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
+			if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
 				m |= S_ISVTX;
 #endif
 			if (m && !capable(CAP_FSETID))
-				vap->va_mode &= ~m;
+				iattr->ia_mode &= ~m;
 		}
 	}
 
@@ -270,7 +238,7 @@ xfs_setattr(
 	 * and can change the group id only to a group of which he
 	 * or she is a member.
 	 */
-	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
+	if (mask & (ATTR_UID|ATTR_GID)) {
 		/*
 		 * These IDs could have changed since we last looked at them.
 		 * But, we're assured that if the ownership did change
@@ -278,12 +246,9 @@ xfs_setattr(
 		 * would have changed also.
 		 */
 		iuid = ip->i_d.di_uid;
-		iprojid = ip->i_d.di_projid;
 		igid = ip->i_d.di_gid;
-		gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
-		uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
-		projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
-			 iprojid;
+		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
+		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
 		/*
 		 * CAP_CHOWN overrides the following restrictions:
@@ -303,11 +268,10 @@ xfs_setattr(
 			goto error_return;
 		}
 		/*
-		 * Do a quota reservation only if uid/projid/gid is actually
+		 * Do a quota reservation only if uid/gid is actually
 		 * going to change.
 		 */
 		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-		    (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
 		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
 			ASSERT(tp);
 			code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
@@ -321,13 +285,13 @@ xfs_setattr(
 	/*
 	 * Truncate file.  Must have write permission and not be a directory.
 	 */
-	if (mask & XFS_AT_SIZE) {
+	if (mask & ATTR_SIZE) {
 		/* Short circuit the truncate case for zero length files */
-		if ((vap->va_size == 0) &&
-		   (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
+		if (iattr->ia_size == 0 &&
+		    ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
-			if (mask & XFS_AT_CTIME)
+			if (mask & ATTR_CTIME)
 				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 			code = 0;
 			goto error_return;
@@ -350,9 +314,9 @@ xfs_setattr(
 	/*
 	 * Change file access or modified times.
 	 */
-	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
+	if (mask & (ATTR_ATIME|ATTR_MTIME)) {
 		if (!file_owner) {
-			if ((flags & ATTR_UTIME) &&
+			if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
 			    !capable(CAP_FOWNER)) {
 				code = XFS_ERROR(EPERM);
 				goto error_return;
@@ -361,90 +325,23 @@ xfs_setattr(
 	}
 
 	/*
-	 * Change extent size or realtime flag.
-	 */
-	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
-		/*
-		 * Can't change extent size if any extents are allocated.
-		 */
-		if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
-		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-		     vap->va_extsize) ) {
-			code = XFS_ERROR(EINVAL);	/* EFBIG? */
-			goto error_return;
-		}
-
-		/*
-		 * Can't change realtime flag if any extents are allocated.
-		 */
-		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-		    (mask & XFS_AT_XFLAGS) &&
-		    (XFS_IS_REALTIME_INODE(ip)) !=
-		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
-			code = XFS_ERROR(EINVAL);	/* EFBIG? */
-			goto error_return;
-		}
-		/*
-		 * Extent size must be a multiple of the appropriate block
-		 * size, if set at all.
-		 */
-		if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
-			xfs_extlen_t	size;
-
-			if (XFS_IS_REALTIME_INODE(ip) ||
-			    ((mask & XFS_AT_XFLAGS) &&
-			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
-				size = mp->m_sb.sb_rextsize <<
-				       mp->m_sb.sb_blocklog;
-			} else {
-				size = mp->m_sb.sb_blocksize;
-			}
-			if (vap->va_extsize % size) {
-				code = XFS_ERROR(EINVAL);
-				goto error_return;
-			}
-		}
-		/*
-		 * If realtime flag is set then must have realtime data.
-		 */
-		if ((mask & XFS_AT_XFLAGS) &&
-		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
-			if ((mp->m_sb.sb_rblocks == 0) ||
-			    (mp->m_sb.sb_rextsize == 0) ||
-			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-				code = XFS_ERROR(EINVAL);
-				goto error_return;
-			}
-		}
-
-		/*
-		 * Can't modify an immutable/append-only file unless
-		 * we have appropriate permission.
-		 */
-		if ((mask & XFS_AT_XFLAGS) &&
-		    (ip->i_d.di_flags &
-				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
-		     (vap->va_xflags &
-				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
-		    !capable(CAP_LINUX_IMMUTABLE)) {
-			code = XFS_ERROR(EPERM);
-			goto error_return;
-		}
-	}
-
-	/*
 	 * Now we can make the changes.  Before we join the inode
-	 * to the transaction, if XFS_AT_SIZE is set then take care of
+	 * to the transaction, if ATTR_SIZE is set then take care of
 	 * the part of the truncation that must be done without the
 	 * inode lock.  This needs to be done before joining the inode
 	 * to the transaction, because the inode cannot be unlocked
 	 * once it is a part of the transaction.
 	 */
-	if (mask & XFS_AT_SIZE) {
+	if (mask & ATTR_SIZE) {
 		code = 0;
-		if ((vap->va_size > ip->i_size) &&
-		    (flags & ATTR_NOSIZETOK) == 0) {
-			code = xfs_igrow_start(ip, vap->va_size, credp);
+		if (iattr->ia_size > ip->i_size) {
+			/*
+			 * Do the first part of growing a file: zero any data
+			 * in the last block that is beyond the old EOF.  We
+			 * need to do this before the inode is joined to the
+			 * transaction to modify the i_size.
+			 */
+			code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
 		}
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -461,10 +358,10 @@ xfs_setattr(
 		 * not within the range we care about here.
 		 */
 		if (!code &&
-		    (ip->i_size != ip->i_d.di_size) &&
-		    (vap->va_size > ip->i_d.di_size)) {
+		    ip->i_size != ip->i_d.di_size &&
+		    iattr->ia_size > ip->i_d.di_size) {
 			code = xfs_flush_pages(ip,
-					ip->i_d.di_size, vap->va_size,
+					ip->i_d.di_size, iattr->ia_size,
 					XFS_B_ASYNC, FI_NONE);
 		}
 
@@ -472,7 +369,7 @@ xfs_setattr(
 		vn_iowait(ip);
 
 		if (!code)
-			code = xfs_itruncate_data(ip, vap->va_size);
+			code = xfs_itruncate_data(ip, iattr->ia_size);
 		if (code) {
 			ASSERT(tp == NULL);
 			lock_flags &= ~XFS_ILOCK_EXCL;
@@ -501,28 +398,30 @@ xfs_setattr(
 	/*
 	 * Truncate file.  Must have write permission and not be a directory.
 	 */
-	if (mask & XFS_AT_SIZE) {
+	if (mask & ATTR_SIZE) {
 		/*
 		 * Only change the c/mtime if we are changing the size
 		 * or we are explicitly asked to change it. This handles
 		 * the semantic difference between truncate() and ftruncate()
 		 * as implemented in the VFS.
 		 */
-		if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+		if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
 			timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
 
-		if (vap->va_size > ip->i_size) {
-			xfs_igrow_finish(tp, ip, vap->va_size,
-			    !(flags & ATTR_DMI));
-		} else if ((vap->va_size <= ip->i_size) ||
-			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
+		if (iattr->ia_size > ip->i_size) {
+			ip->i_d.di_size = iattr->ia_size;
+			ip->i_size = iattr->ia_size;
+			if (!(flags & XFS_ATTR_DMI))
+				xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		} else if (iattr->ia_size <= ip->i_size ||
+			   (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
 			/*
 			 * signal a sync transaction unless
 			 * we're truncating an already unlinked
 			 * file on a wsync filesystem
 			 */
-			code = xfs_itruncate_finish(&tp, ip,
-					    (xfs_fsize_t)vap->va_size,
+			code = xfs_itruncate_finish(&tp, ip, iattr->ia_size,
 					    XFS_DATA_FORK,
 					    ((ip->i_d.di_nlink != 0 ||
 					      !(mp->m_flags & XFS_MOUNT_WSYNC))
@@ -544,9 +443,12 @@ xfs_setattr(
 	/*
 	 * Change file access modes.
 	 */
-	if (mask & XFS_AT_MODE) {
+	if (mask & ATTR_MODE) {
 		ip->i_d.di_mode &= S_IFMT;
-		ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
+		ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
+
+		inode->i_mode &= S_IFMT;
+		inode->i_mode |= iattr->ia_mode & ~S_IFMT;
 
 		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 		timeflags |= XFS_ICHGTIME_CHG;
@@ -559,7 +461,7 @@ xfs_setattr(
 	 * and can change the group id only to a group of which he
 	 * or she is a member.
 	 */
-	if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
+	if (mask & (ATTR_UID|ATTR_GID)) {
 		/*
 		 * CAP_FSETID overrides the following restrictions:
 		 *
@@ -577,39 +479,24 @@ xfs_setattr(
 		 */
 		if (iuid != uid) {
 			if (XFS_IS_UQUOTA_ON(mp)) {
-				ASSERT(mask & XFS_AT_UID);
+				ASSERT(mask & ATTR_UID);
 				ASSERT(udqp);
 				olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 							&ip->i_udquot, udqp);
 			}
 			ip->i_d.di_uid = uid;
+			inode->i_uid = uid;
 		}
 		if (igid != gid) {
 			if (XFS_IS_GQUOTA_ON(mp)) {
 				ASSERT(!XFS_IS_PQUOTA_ON(mp));
-				ASSERT(mask & XFS_AT_GID);
+				ASSERT(mask & ATTR_GID);
 				ASSERT(gdqp);
 				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
 			ip->i_d.di_gid = gid;
-		}
-		if (iprojid != projid) {
-			if (XFS_IS_PQUOTA_ON(mp)) {
-				ASSERT(!XFS_IS_GQUOTA_ON(mp));
-				ASSERT(mask & XFS_AT_PROJID);
-				ASSERT(gdqp);
-				olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
-							&ip->i_gdquot, gdqp);
-			}
-			ip->i_d.di_projid = projid;
-			/*
-			 * We may have to rev the inode as well as
-			 * the superblock version number since projids didn't
-			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
-			 */
-			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
-				xfs_bump_ino_vers2(tp, ip);
+			inode->i_gid = gid;
 		}
 
 		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
@@ -620,82 +507,33 @@ xfs_setattr(
 	/*
 	 * Change file access or modified times.
 	 */
-	if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
-		if (mask & XFS_AT_ATIME) {
-			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
-			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
+	if (mask & (ATTR_ATIME|ATTR_MTIME)) {
+		if (mask & ATTR_ATIME) {
+			inode->i_atime = iattr->ia_atime;
+			ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+			ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
 			ip->i_update_core = 1;
-			timeflags &= ~XFS_ICHGTIME_ACC;
 		}
-		if (mask & XFS_AT_MTIME) {
-			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
-			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
+		if (mask & ATTR_MTIME) {
+			inode->i_mtime = iattr->ia_mtime;
+			ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+			ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 			timeflags &= ~XFS_ICHGTIME_MOD;
 			timeflags |= XFS_ICHGTIME_CHG;
 		}
-		if (tp && (flags & ATTR_UTIME))
+		if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
 			xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
 	}
 
 	/*
-	 * Change XFS-added attributes.
-	 */
-	if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
-		if (mask & XFS_AT_EXTSIZE) {
-			/*
-			 * Converting bytes to fs blocks.
-			 */
-			ip->i_d.di_extsize = vap->va_extsize >>
-				mp->m_sb.sb_blocklog;
-		}
-		if (mask & XFS_AT_XFLAGS) {
-			uint	di_flags;
-
-			/* can't set PREALLOC this way, just preserve it */
-			di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
-			if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
-				di_flags |= XFS_DIFLAG_IMMUTABLE;
-			if (vap->va_xflags & XFS_XFLAG_APPEND)
-				di_flags |= XFS_DIFLAG_APPEND;
-			if (vap->va_xflags & XFS_XFLAG_SYNC)
-				di_flags |= XFS_DIFLAG_SYNC;
-			if (vap->va_xflags & XFS_XFLAG_NOATIME)
-				di_flags |= XFS_DIFLAG_NOATIME;
-			if (vap->va_xflags & XFS_XFLAG_NODUMP)
-				di_flags |= XFS_DIFLAG_NODUMP;
-			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
-				di_flags |= XFS_DIFLAG_PROJINHERIT;
-			if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
-				di_flags |= XFS_DIFLAG_NODEFRAG;
-			if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
-				di_flags |= XFS_DIFLAG_FILESTREAM;
-			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
-				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
-					di_flags |= XFS_DIFLAG_RTINHERIT;
-				if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
-					di_flags |= XFS_DIFLAG_NOSYMLINKS;
-				if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
-					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-			} else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
-				if (vap->va_xflags & XFS_XFLAG_REALTIME)
-					di_flags |= XFS_DIFLAG_REALTIME;
-				if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
-					di_flags |= XFS_DIFLAG_EXTSIZE;
-			}
-			ip->i_d.di_flags = di_flags;
-		}
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-		timeflags |= XFS_ICHGTIME_CHG;
-	}
-
-	/*
-	 * Change file inode change time only if XFS_AT_CTIME set
+	 * Change file inode change time only if ATTR_CTIME set
 	 * AND we have been called by a DMI function.
 	 */
 
-	if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
-		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
-		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
+	if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
+		inode->i_ctime = iattr->ia_ctime;
+		ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+		ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 		ip->i_update_core = 1;
 		timeflags &= ~XFS_ICHGTIME_CHG;
 	}
@@ -704,7 +542,7 @@ xfs_setattr(
 	 * Send out timestamp changes that need to be set to the
 	 * current time.  Not done when called by a DMI function.
 	 */
-	if (timeflags && !(flags & ATTR_DMI))
+	if (timeflags && !(flags & XFS_ATTR_DMI))
 		xfs_ichgtime(ip, timeflags);
 
 	XFS_STATS_INC(xs_ig_attrchg);
@@ -742,7 +580,7 @@ xfs_setattr(
 	}
 
 	if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
-	    !(flags & ATTR_DMI)) {
+	    !(flags & XFS_ATTR_DMI)) {
 		(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL, NULL, NULL,
 					0, 0, AT_DELAY_FLAG(flags));
@@ -875,7 +713,7 @@ xfs_fsync(
 		return XFS_ERROR(EIO);
 
 	/* capture size updates in I/O completion before writing the inode. */
-	error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+	error = filemap_fdatawait(VFS_I(ip)->i_mapping);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -1321,7 +1159,6 @@ int
 xfs_release(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
 	xfs_mount_t	*mp = ip->i_mount;
 	int		error;
 
@@ -1356,13 +1193,13 @@ xfs_release(
 		 * be exposed to that problem.
 		 */
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
-		if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+		if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
 			xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
 	}
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-		     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+		     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
 		       ip->i_delayed_blks > 0)) &&
 		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
 		    (!(ip->i_d.di_flags &
@@ -1388,7 +1225,6 @@ int
 xfs_inactive(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
 	xfs_bmap_free_t	free_list;
 	xfs_fsblock_t	first_block;
 	int		committed;
@@ -1403,7 +1239,7 @@ xfs_inactive(
 	 * If the inode is already free, then there can be nothing
 	 * to clean up here.
 	 */
-	if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
+	if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
 		ASSERT(ip->i_df.if_real_bytes == 0);
 		ASSERT(ip->i_df.if_broot_bytes == 0);
 		return VN_INACTIVE_CACHE;
@@ -1433,7 +1269,7 @@ xfs_inactive(
 
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
-                     ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
                        ip->i_delayed_blks > 0)) &&
 		      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
 		     (!(ip->i_d.di_flags &
@@ -1601,12 +1437,18 @@ xfs_inactive(
 	return VN_INACTIVE_CACHE;
 }
 
-
+/*
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
 int
 xfs_lookup(
 	xfs_inode_t		*dp,
 	struct xfs_name		*name,
-	xfs_inode_t		**ipp)
+	xfs_inode_t		**ipp,
+	struct xfs_name		*ci_name)
 {
 	xfs_ino_t		inum;
 	int			error;
@@ -1618,7 +1460,7 @@ xfs_lookup(
 		return XFS_ERROR(EIO);
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	error = xfs_dir_lookup(NULL, dp, name, &inum);
+	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 	xfs_iunlock_map_shared(dp, lock_mode);
 
 	if (error)
@@ -1626,12 +1468,15 @@ xfs_lookup(
 
 	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
 	if (error)
-		goto out;
+		goto out_free_name;
 
 	xfs_itrace_ref(*ipp);
 	return 0;
 
- out:
+out_free_name:
+	if (ci_name)
+		kmem_free(ci_name->name);
+out:
 	*ipp = NULL;
 	return error;
 }
@@ -1688,7 +1533,7 @@ xfs_create(
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(credp), current_fsgid(credp), prid,
+			current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
@@ -1860,111 +1705,6 @@ std_return:
 }
 
 #ifdef DEBUG
-/*
- * Some counters to see if (and how often) we are hitting some deadlock
- * prevention code paths.
- */
-
-int xfs_rm_locks;
-int xfs_rm_lock_delays;
-int xfs_rm_attempts;
-#endif
-
-/*
- * The following routine will lock the inodes associated with the
- * directory and the named entry in the directory. The locks are
- * acquired in increasing inode number.
- *
- * If the entry is "..", then only the directory is locked. The
- * vnode ref count will still include that from the .. entry in
- * this case.
- *
- * There is a deadlock we need to worry about. If the locked directory is
- * in the AIL, it might be blocking up the log. The next inode we lock
- * could be already locked by another thread waiting for log space (e.g
- * a permanent log reservation with a long running transaction (see
- * xfs_itruncate_finish)). To solve this, we must check if the directory
- * is in the ail and use lock_nowait. If we can't lock, we need to
- * drop the inode lock on the directory and try again. xfs_iunlock will
- * potentially push the tail if we were holding up the log.
- */
-STATIC int
-xfs_lock_dir_and_entry(
-	xfs_inode_t	*dp,
-	xfs_inode_t	*ip)	/* inode of entry 'name' */
-{
-	int		attempts;
-	xfs_ino_t	e_inum;
-	xfs_inode_t	*ips[2];
-	xfs_log_item_t	*lp;
-
-#ifdef DEBUG
-	xfs_rm_locks++;
-#endif
-	attempts = 0;
-
-again:
-	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-
-	e_inum = ip->i_ino;
-
-	xfs_itrace_ref(ip);
-
-	/*
-	 * We want to lock in increasing inum. Since we've already
-	 * acquired the lock on the directory, we may need to release
-	 * if if the inum of the entry turns out to be less.
-	 */
-	if (e_inum > dp->i_ino) {
-		/*
-		 * We are already in the right order, so just
-		 * lock on the inode of the entry.
-		 * We need to use nowait if dp is in the AIL.
-		 */
-
-		lp = (xfs_log_item_t *)dp->i_itemp;
-		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
-			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				attempts++;
-#ifdef DEBUG
-				xfs_rm_attempts++;
-#endif
-
-				/*
-				 * Unlock dp and try again.
-				 * xfs_iunlock will try to push the tail
-				 * if the inode is in the AIL.
-				 */
-
-				xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-				if ((attempts % 5) == 0) {
-					delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
-					xfs_rm_lock_delays++;
-#endif
-				}
-				goto again;
-			}
-		} else {
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-		}
-	} else if (e_inum < dp->i_ino) {
-		xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-		ips[0] = ip;
-		ips[1] = dp;
-		xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
-	}
-	/* else	 e_inum == dp->i_ino */
-	/*     This can happen if we're asked to lock /x/..
-	 *     the entry is "..", which is also the parent directory.
-	 */
-
-	return 0;
-}
-
-#ifdef DEBUG
 int xfs_locked_n;
 int xfs_small_retries;
 int xfs_middle_retries;
@@ -2098,12 +1838,52 @@ again:
 #endif
 }
 
-#ifdef	DEBUG
-#define	REMOVE_DEBUG_TRACE(x)	{remove_which_error_return = (x);}
-int remove_which_error_return = 0;
-#else /* ! DEBUG */
-#define	REMOVE_DEBUG_TRACE(x)
-#endif	/* ! DEBUG */
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
+ */
+void
+xfs_lock_two_inodes(
+	xfs_inode_t		*ip0,
+	xfs_inode_t		*ip1,
+	uint			lock_mode)
+{
+	xfs_inode_t		*temp;
+	int			attempts = 0;
+	xfs_log_item_t		*lp;
+
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+	ASSERT(ip0->i_ino != ip1->i_ino);
+
+	if (ip0->i_ino > ip1->i_ino) {
+		temp = ip0;
+		ip0 = ip1;
+		ip1 = temp;
+	}
+
+ again:
+	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+
+	/*
+	 * If the first lock we have locked is in the AIL, we must TRY to get
+	 * the second lock. If we can't get it, we must release the first one
+	 * and try again.
+	 */
+	lp = (xfs_log_item_t *)ip0->i_itemp;
+	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+			xfs_iunlock(ip0, lock_mode);
+			if ((++attempts % 5) == 0)
+				delay(1); /* Don't just spin the CPU */
+			goto again;
+		}
+	} else {
+		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+	}
+}
 
 int
 xfs_remove(
@@ -2113,6 +1893,7 @@ xfs_remove(
 {
 	xfs_mount_t		*mp = dp->i_mount;
 	xfs_trans_t             *tp = NULL;
+	int			is_dir = S_ISDIR(ip->i_d.di_mode);
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
@@ -2120,8 +1901,10 @@ xfs_remove(
 	int			committed;
 	int			link_zero;
 	uint			resblks;
+	uint			log_count;
 
 	xfs_itrace_entry(dp);
+	xfs_itrace_entry(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
@@ -2134,19 +1917,23 @@ xfs_remove(
 			return error;
 	}
 
-	xfs_itrace_entry(ip);
-	xfs_itrace_ref(ip);
-
 	error = XFS_QM_DQATTACH(mp, dp, 0);
-	if (!error)
-		error = XFS_QM_DQATTACH(mp, ip, 0);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
+	if (error)
 		goto std_return;
-	}
 
-	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+	error = XFS_QM_DQATTACH(mp, ip, 0);
+	if (error)
+		goto std_return;
+
+	if (is_dir) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+		log_count = XFS_DEFAULT_LOG_COUNT;
+	} else {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+		log_count = XFS_REMOVE_LOG_COUNT;
+	}
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
 	/*
 	 * We try to get the real space reservation first,
 	 * allowing for directory btree deletion(s) implying
@@ -2158,25 +1945,19 @@ xfs_remove(
 	 */
 	resblks = XFS_REMOVE_SPACE_RES(mp);
 	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+				  XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		resblks = 0;
 		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+					  XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error) {
 		ASSERT(error != ENOSPC);
-		REMOVE_DEBUG_TRACE(__LINE__);
-		xfs_trans_cancel(tp, 0);
-		return error;
+		cancel_flags = 0;
+		goto out_trans_cancel;
 	}
 
-	error = xfs_lock_dir_and_entry(dp, ip);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		xfs_trans_cancel(tp, cancel_flags);
-		goto std_return;
-	}
+	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
 	/*
 	 * At this point, we've gotten both the directory and the entry
@@ -2189,46 +1970,83 @@ xfs_remove(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
-	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
+	 * If we're removing a directory perform some additional validation.
 	 */
+	if (is_dir) {
+		ASSERT(ip->i_d.di_nlink >= 2);
+		if (ip->i_d.di_nlink != 2) {
+			error = XFS_ERROR(ENOTEMPTY);
+			goto out_trans_cancel;
+		}
+		if (!xfs_dir_isempty(ip)) {
+			error = XFS_ERROR(ENOTEMPTY);
+			goto out_trans_cancel;
+		}
+	}
+
 	XFS_BMAP_INIT(&free_list, &first_block);
 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error) {
 		ASSERT(error != ENOENT);
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto error1;
+		goto out_bmap_cancel;
 	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
+	/*
+	 * Bump the in memory generation count on the parent
+	 * directory so that other can know that it has changed.
+	 */
 	dp->i_gen++;
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
-	error = xfs_droplink(tp, ip);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto error1;
+	if (is_dir) {
+		/*
+		 * Drop the link from ip's "..".
+		 */
+		error = xfs_droplink(tp, dp);
+		if (error)
+			goto out_bmap_cancel;
+
+		/*
+		 * Drop the link from dp to ip.
+		 */
+		error = xfs_droplink(tp, ip);
+		if (error)
+			goto out_bmap_cancel;
+	} else {
+		/*
+		 * When removing a non-directory we need to log the parent
+		 * inode here for the i_gen update.  For a directory this is
+		 * done implicitly by the xfs_droplink call for the ".." entry.
+		 */
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 	}
 
-	/* Determine if this is the last link while
+	/*
+	 * Drop the "." link from ip to self.
+	 */
+	error = xfs_droplink(tp, ip);
+	if (error)
+		goto out_bmap_cancel;
+
+	/*
+	 * Determine if this is the last link while
 	 * we are in the transaction.
 	 */
-	link_zero = (ip)->i_d.di_nlink==0;
+	link_zero = (ip->i_d.di_nlink == 0);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * remove transaction goes to disk before returning to
 	 * the user.
 	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
-	}
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto error_rele;
-	}
+	if (error)
+		goto out_bmap_cancel;
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 	if (error)
@@ -2240,38 +2058,26 @@ xfs_remove(
 	 * will get killed on last close in xfs_close() so we don't
 	 * have to worry about that.
 	 */
-	if (link_zero && xfs_inode_is_filestream(ip))
+	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
 		xfs_filestream_deassociate(ip);
 
 	xfs_itrace_exit(ip);
+	xfs_itrace_exit(dp);
 
-/*	Fall through to std_return with error = 0 */
  std_return:
 	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-				dp, DM_RIGHT_NULL,
-				NULL, DM_RIGHT_NULL,
-				name->name, NULL, ip->i_d.di_mode, error, 0);
+		XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL, name->name, NULL,
+				ip->i_d.di_mode, error, 0);
 	}
-	return error;
 
- error1:
-	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
-	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
+	return error;
 
- error_rele:
-	/*
-	 * In this case make sure to not release the inode until after
-	 * the current transaction is aborted.  Releasing it beforehand
-	 * can cause us to go to xfs_inactive and start a recursive
-	 * transaction which can easily deadlock with the current one.
-	 */
+ out_bmap_cancel:
 	xfs_bmap_cancel(&free_list);
 	cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
-
 	goto std_return;
 }
 
@@ -2283,7 +2089,6 @@ xfs_link(
 {
 	xfs_mount_t		*mp = tdp->i_mount;
 	xfs_trans_t		*tp;
-	xfs_inode_t		*ips[2];
 	int			error;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
@@ -2331,15 +2136,7 @@ xfs_link(
 		goto error_return;
 	}
 
-	if (sip->i_ino < tdp->i_ino) {
-		ips[0] = sip;
-		ips[1] = tdp;
-	} else {
-		ips[0] = tdp;
-		ips[1] = sip;
-	}
-
-	xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
+	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * Increment vnode ref counts since xfs_trans_commit &
@@ -2480,7 +2277,7 @@ xfs_mkdir(
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(credp), current_fsgid(credp), prid,
+			current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
@@ -2638,186 +2435,6 @@ std_return:
 }
 
 int
-xfs_rmdir(
-	xfs_inode_t             *dp,
-	struct xfs_name		*name,
-	xfs_inode_t		*cdp)
-{
-	xfs_mount_t		*mp = dp->i_mount;
-	xfs_trans_t             *tp;
-	int                     error;
-	xfs_bmap_free_t         free_list;
-	xfs_fsblock_t           first_block;
-	int			cancel_flags;
-	int			committed;
-	int			last_cdp_link;
-	uint			resblks;
-
-	xfs_itrace_entry(dp);
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
-
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
-		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
-					dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL, name->name,
-					NULL, cdp->i_d.di_mode, 0, 0);
-		if (error)
-			return XFS_ERROR(error);
-	}
-
-	/*
-	 * Get the dquots for the inodes.
-	 */
-	error = XFS_QM_DQATTACH(mp, dp, 0);
-	if (!error)
-		error = XFS_QM_DQATTACH(mp, cdp, 0);
-	if (error) {
-		REMOVE_DEBUG_TRACE(__LINE__);
-		goto std_return;
-	}
-
-	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
-	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
-	/*
-	 * We try to get the real space reservation first,
-	 * allowing for directory btree deletion(s) implying
-	 * possible bmap insert(s).  If we can't get the space
-	 * reservation then we use 0 instead, and avoid the bmap
-	 * btree insert(s) in the directory code by, if the bmap
-	 * insert tries to happen, instead trimming the LAST
-	 * block from the directory.
-	 */
-	resblks = XFS_REMOVE_SPACE_RES(mp);
-	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
-	if (error == ENOSPC) {
-		resblks = 0;
-		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
-				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
-	}
-	if (error) {
-		ASSERT(error != ENOSPC);
-		cancel_flags = 0;
-		goto error_return;
-	}
-	XFS_BMAP_INIT(&free_list, &first_block);
-
-	/*
-	 * Now lock the child directory inode and the parent directory
-	 * inode in the proper order.  This will take care of validating
-	 * that the directory entry for the child directory inode has
-	 * not changed while we were obtaining a log reservation.
-	 */
-	error = xfs_lock_dir_and_entry(dp, cdp);
-	if (error) {
-		xfs_trans_cancel(tp, cancel_flags);
-		goto std_return;
-	}
-
-	IHOLD(dp);
-	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
-	IHOLD(cdp);
-	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
-
-	ASSERT(cdp->i_d.di_nlink >= 2);
-	if (cdp->i_d.di_nlink != 2) {
-		error = XFS_ERROR(ENOTEMPTY);
-		goto error_return;
-	}
-	if (!xfs_dir_isempty(cdp)) {
-		error = XFS_ERROR(ENOTEMPTY);
-		goto error_return;
-	}
-
-	error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
-					&first_block, &free_list, resblks);
-	if (error)
-		goto error1;
-
-	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
-	/*
-	 * Bump the in memory generation count on the parent
-	 * directory so that other can know that it has changed.
-	 */
-	dp->i_gen++;
-
-	/*
-	 * Drop the link from cdp's "..".
-	 */
-	error = xfs_droplink(tp, dp);
-	if (error) {
-		goto error1;
-	}
-
-	/*
-	 * Drop the link from dp to cdp.
-	 */
-	error = xfs_droplink(tp, cdp);
-	if (error) {
-		goto error1;
-	}
-
-	/*
-	 * Drop the "." link from cdp to self.
-	 */
-	error = xfs_droplink(tp, cdp);
-	if (error) {
-		goto error1;
-	}
-
-	/* Determine these before committing transaction */
-	last_cdp_link = (cdp)->i_d.di_nlink==0;
-
-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * rmdir transaction goes to disk before returning to
-	 * the user.
-	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-		xfs_trans_set_sync(tp);
-	}
-
-	error = xfs_bmap_finish (&tp, &free_list, &committed);
-	if (error) {
-		xfs_bmap_cancel(&free_list);
-		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
-				 XFS_TRANS_ABORT));
-		goto std_return;
-	}
-
-	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error) {
-		goto std_return;
-	}
-
-
-	/* Fall through to std_return with error = 0 or the errno
-	 * from xfs_trans_commit. */
- std_return:
-	if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
-		(void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
-					dp, DM_RIGHT_NULL,
-					NULL, DM_RIGHT_NULL,
-					name->name, NULL, cdp->i_d.di_mode,
-					error, 0);
-	}
-	return error;
-
- error1:
-	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
-	/* FALLTHROUGH */
-
- error_return:
-	xfs_trans_cancel(tp, cancel_flags);
-	goto std_return;
-}
-
-int
 xfs_symlink(
 	xfs_inode_t		*dp,
 	struct xfs_name		*link_name,
@@ -2886,7 +2503,7 @@ xfs_symlink(
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = XFS_QM_DQVOPALLOC(mp, dp,
-			current_fsuid(credp), current_fsgid(credp), prid,
+			current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
@@ -3181,14 +2798,13 @@ int
 xfs_reclaim(
 	xfs_inode_t	*ip)
 {
-	bhv_vnode_t	*vp = XFS_ITOV(ip);
 
 	xfs_itrace_entry(ip);
 
-	ASSERT(!VN_MAPPED(vp));
+	ASSERT(!VN_MAPPED(VFS_I(ip)));
 
 	/* bad inode, get out here ASAP */
-	if (VN_BAD(vp)) {
+	if (VN_BAD(VFS_I(ip))) {
 		xfs_ireclaim(ip);
 		return 0;
 	}
@@ -3225,7 +2841,7 @@ xfs_reclaim(
 		XFS_MOUNT_ILOCK(mp);
 		spin_lock(&ip->i_flags_lock);
 		__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-		vn_to_inode(vp)->i_private = NULL;
+		VFS_I(ip)->i_private = NULL;
 		ip->i_vnode = NULL;
 		spin_unlock(&ip->i_flags_lock);
 		list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
@@ -3241,8 +2857,7 @@ xfs_finish_reclaim(
 	int		sync_mode)
 {
 	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-	bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
-	int		error;
+	struct inode	*vp = VFS_I(ip);
 
 	if (vp && VN_BAD(vp))
 		goto reclaim;
@@ -3285,29 +2900,16 @@ xfs_finish_reclaim(
 		xfs_iflock(ip);
 	}
 
-	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		if (ip->i_update_core ||
-		    ((ip->i_itemp != NULL) &&
-		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
-			error = xfs_iflush(ip, sync_mode);
-			/*
-			 * If we hit an error, typically because of filesystem
-			 * shutdown, we don't need to let vn_reclaim to know
-			 * because we're gonna reclaim the inode anyway.
-			 */
-			if (error) {
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				goto reclaim;
-			}
-			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
-		}
-
-		ASSERT(ip->i_update_core == 0);
-		ASSERT(ip->i_itemp == NULL ||
-		       ip->i_itemp->ili_format.ilf_fields == 0);
+	/*
+	 * In the case of a forced shutdown we rely on xfs_iflush() to
+	 * wait for the inode to be unpinned before returning an error.
+	 */
+	if (xfs_iflush(ip, sync_mode) == 0) {
+		/* synchronize with xfs_iflush_done */
+		xfs_iflock(ip);
+		xfs_ifunlock(ip);
 	}
 
-	xfs_ifunlock(ip);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
  reclaim:
@@ -3418,7 +3020,7 @@ xfs_alloc_file_space(
 
 	/*	Generate a DMAPI event if needed.	*/
 	if (alloc_type != 0 && offset < ip->i_size &&
-			(attr_flags&ATTR_DMI) == 0  &&
+			(attr_flags & XFS_ATTR_DMI) == 0  &&
 			DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
 		xfs_off_t           end_dmi_offset;
 
@@ -3532,7 +3134,7 @@ retry:
 		allocatesize_fsb -= allocated_fsb;
 	}
 dmapi_enospc_check:
-	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
+	if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
 				ip, DM_RIGHT_NULL,
@@ -3558,6 +3160,13 @@ error1:	/* Just cancel transaction */
 /*
  * Zero file bytes between startoff and endoff inclusive.
  * The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
  */
 STATIC int
 xfs_zero_remaining_bytes(
@@ -3574,6 +3183,17 @@ xfs_zero_remaining_bytes(
 	int			nimap;
 	int			error = 0;
 
+	/*
+	 * Avoid doing I/O beyond eof - it's not necessary
+	 * since nothing can read beyond eof.  The space will
+	 * be zeroed when the file is extended anyway.
+	 */
+	if (startoff >= ip->i_size)
+		return 0;
+
+	if (endoff > ip->i_size)
+		endoff = ip->i_size;
+
 	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
 				XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp);
@@ -3643,7 +3263,6 @@ xfs_free_file_space(
 	xfs_off_t		len,
 	int			attr_flags)
 {
-	bhv_vnode_t		*vp;
 	int			committed;
 	int			done;
 	xfs_off_t		end_dmi_offset;
@@ -3663,7 +3282,6 @@ xfs_free_file_space(
 	xfs_trans_t		*tp;
 	int			need_iolock = 1;
 
-	vp = XFS_ITOV(ip);
 	mp = ip->i_mount;
 
 	xfs_itrace_entry(ip);
@@ -3679,7 +3297,7 @@ xfs_free_file_space(
 	end_dmi_offset = offset + len;
 	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
 
-	if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
+	if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
 		if (end_dmi_offset > ip->i_size)
 			end_dmi_offset = ip->i_size;
@@ -3690,7 +3308,7 @@ xfs_free_file_space(
 			return error;
 	}
 
-	if (attr_flags & ATTR_NOLOCK)
+	if (attr_flags & XFS_ATTR_NOLOCK)
 		need_iolock = 0;
 	if (need_iolock) {
 		xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -3700,7 +3318,7 @@ xfs_free_file_space(
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
 
-	if (VN_CACHED(vp) != 0) {
+	if (VN_CACHED(VFS_I(ip)) != 0) {
 		xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
 		error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
 		if (error)
@@ -3867,7 +3485,7 @@ xfs_change_file_space(
 	xfs_off_t	startoffset;
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
-	bhv_vattr_t	va;
+	struct iattr	iattr;
 
 	xfs_itrace_entry(ip);
 
@@ -3941,10 +3559,10 @@ xfs_change_file_space(
 				break;
 		}
 
-		va.va_mask = XFS_AT_SIZE;
-		va.va_size = startoffset;
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = startoffset;
 
-		error = xfs_setattr(ip, &va, attr_flags, credp);
+		error = xfs_setattr(ip, &iattr, attr_flags, credp);
 
 		if (error)
 			return error;
@@ -3974,7 +3592,7 @@ xfs_change_file_space(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_ihold(tp, ip);
 
-	if ((attr_flags & ATTR_DMI) == 0) {
+	if ((attr_flags & XFS_ATTR_DMI) == 0) {
 		ip->i_d.di_mode &= ~S_ISUID;
 
 		/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 57335ba4ce53..e932a96bec54 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,9 +2,9 @@
 #define _XFS_VNODEOPS_H 1
 
 struct attrlist_cursor_kern;
-struct bhv_vattr;
 struct cred;
 struct file;
+struct iattr;
 struct inode;
 struct iovec;
 struct kiocb;
@@ -15,14 +15,18 @@ struct xfs_iomap;
 
 
 int xfs_open(struct xfs_inode *ip);
-int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
 		struct cred *credp);
+#define	XFS_ATTR_DMI		0x01	/* invocation from a DMI function */
+#define	XFS_ATTR_NONBLOCK	0x02	/* return EAGAIN if operation would block */
+#define XFS_ATTR_NOLOCK		0x04	/* Don't grab any conflicting locks */
+
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_fsync(struct xfs_inode *ip);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode **ipp);
+		struct xfs_inode **ipp, struct xfs_name *ci_name);
 int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
 		xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
 int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
@@ -31,8 +35,6 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
 int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
 		mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
-		struct xfs_inode *cdp);
 int xfs_readdir(struct xfs_inode	*dp, void *dirent, size_t bufsize,
 		       xfs_off_t *offset, filldir_t filldir);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,